# DATA LOADING + DATA CLEANING 

In [None]:
# imdb_chunked_merge_azure.py
import csv
import gc

import pandas as pd

# Setting this option to None turns off pandas' chained-assignment check for the whole session.
pd.options.mode.chained_assignment = None  # suppress SettingWithCopyWarning

# =======================
# 1️⃣ Azure URLs for your IMDb .tsv.gz files
# Replace <account>, <container>, and SAS token if needed
# Each constant holds the full HTTPS location of one gzip-compressed TSV file;
# the suffix after URL_ names the IMDb table the file contains.
# =======================
URL_NAME_BASICS    = "https://workspace4824871889.blob.core.windows.net/azureml-blobstore-84f516da-0fe5-4f33-8f3c-f18ec8e2b4f7/UI/2025-10-22_104122_UTC/name.basics.tsv.gz"
URL_TITLE_AKAS     = "https://workspace4824871889.blob.core.windows.net/azureml-blobstore-84f516da-0fe5-4f33-8f3c-f18ec8e2b4f7/UI/2025-10-22_104546_UTC/title.akas.tsv.gz"
URL_TITLE_BASICS   = "https://workspace4824871889.blob.core.windows.net/azureml-blobstore-84f516da-0fe5-4f33-8f3c-f18ec8e2b4f7/UI/2025-10-22_104810_UTC/title.basics.tsv.gz"
URL_TITLE_CREW     = "https://workspace4824871889.blob.core.windows.net/azureml-blobstore-84f516da-0fe5-4f33-8f3c-f18ec8e2b4f7/UI/2025-10-22_104937_UTC/title.crew.tsv.gz"
URL_TITLE_EPISODE  = "https://workspace4824871889.blob.core.windows.net/azureml-blobstore-84f516da-0fe5-4f33-8f3c-f18ec8e2b4f7/UI/2025-10-22_105103_UTC/title.episode.tsv.gz"
URL_TITLE_PRINC    = "https://workspace4824871889.blob.core.windows.net/azureml-blobstore-84f516da-0fe5-4f33-8f3c-f18ec8e2b4f7/UI/2025-10-22_105225_UTC/title.principals.tsv.gz"
URL_TITLE_RATINGS  = "https://workspace4824871889.blob.core.windows.net/azureml-blobstore-84f516da-0fe5-4f33-8f3c-f18ec8e2b4f7/UI/2025-10-22_105430_UTC/title.ratings.tsv.gz"

# =======================
# 2️⃣ Helper function to read TSV from URL
# =======================
def load_tsv(url, **kwargs):
    r"""Read a gzip-compressed, tab-separated file into a DataFrame.

    Cells equal to ``\N`` are parsed as missing values.

    Quote handling is switched off by default (``csv.QUOTE_NONE``): the files
    are plain tab-delimited text, and a field that merely starts with a
    double-quote character would otherwise be treated as the opening of a
    quoted field, silently gluing the following rows into one cell.

    Args:
        url: Path or URL of the ``.tsv.gz`` file.
        **kwargs: Extra options forwarded to ``pandas.read_csv`` (for example
            ``usecols`` or ``chunksize``). Passing ``quoting`` overrides the
            default described above.

    Returns:
        Whatever ``pandas.read_csv`` returns for the given options (a
        DataFrame unless an iterating option such as ``chunksize`` is used).
    """
    kwargs.setdefault('quoting', csv.QUOTE_NONE)
    return pd.read_csv(url, sep='\t', na_values='\\N', compression='gzip', low_memory=False, **kwargs)

# =======================
# 3️⃣ Load smaller tables into memory first
# These are manageable in RAM: basics, ratings, crew, episode, name.basics
# =======================
print("Loading smaller datasets...")
# Fetch the five tables that are loaded whole, in the same order as before.
title_basics, title_ratings, title_crew, title_episode, name_basics = (
    load_tsv(source)
    for source in (
        URL_TITLE_BASICS,
        URL_TITLE_RATINGS,
        URL_TITLE_CREW,
        URL_TITLE_EPISODE,
        URL_NAME_BASICS,
    )
)

# =======================
# 4️⃣ Build the master table
# title.basics is the left side; ratings, crew and episode are attached one
# after another with left joins on tconst, so every basics row is kept.
# =======================
print("Merging basics + ratings + crew + episode...")
merged = title_basics
for per_title_table in (title_ratings, title_crew, title_episode):
    merged = merged.merge(per_title_table, on='tconst', how='left')

# Drop every reference to the inputs so their memory can be reclaimed.
del title_basics, title_ratings, title_crew, title_episode, per_title_table
gc.collect()

# =======================
# 5️⃣ Collect the distinct tconst values of the master table
# Used below to filter the chunked reads of title.akas and title.principals.
# =======================
tconst_set = {*merged['tconst'].unique()}
print(f"Master table has {len(tconst_set)} unique titles.")

# =======================
# 6️⃣ Process title.akas in chunks
# Only keep rows where titleId is in tconst_set
# =======================
print("Processing title.akas in chunks...")
akas_cols = ['titleId','ordering','title','region','language','types','attributes','isOriginalTitle']
akas_parts = []
# quoting=csv.QUOTE_NONE: the file is plain tab-delimited text, so a title that
# begins with a double quote must not be treated as the start of a quoted field
# (that would merge the following rows into one cell).
# The reader is used as a context manager so the underlying stream is closed
# even if a chunk fails to parse.
with pd.read_csv(URL_TITLE_AKAS, sep='\t', na_values='\\N', compression='gzip',
                 usecols=akas_cols, chunksize=300_000, low_memory=False,
                 quoting=csv.QUOTE_NONE) as akas_chunks:
    for chunk in akas_chunks:
        chunk = chunk[chunk['titleId'].isin(tconst_set)]  # filter relevant titles
        if not chunk.empty:
            akas_parts.append(chunk)
        del chunk
gc.collect()

if akas_parts:
    akas_df = pd.concat(akas_parts, ignore_index=True)
    # One-to-many merge: a title with several alternate titles yields one
    # output row per alternate title.
    merged = merged.merge(akas_df, left_on='tconst', right_on='titleId', how='left')
    del akas_df, akas_parts
gc.collect()

# =======================
# 7️⃣ Process title.principals in chunks
# Only keep rows where tconst exists in master table
# =======================
print("Processing title.principals in chunks...")
princ_cols = ['tconst','ordering','nconst','category','job','characters']
princ_parts = []
# quoting=csv.QUOTE_NONE: fields are not quoted in this file, so double-quote
# characters inside a cell must be read literally instead of opening a quoted
# field that swallows the following rows.
# The reader is used as a context manager so the underlying stream is closed
# even if a chunk fails to parse.
with pd.read_csv(URL_TITLE_PRINC, sep='\t', na_values='\\N', compression='gzip',
                 usecols=princ_cols, chunksize=300_000, low_memory=False,
                 quoting=csv.QUOTE_NONE) as princ_chunks:
    for chunk in princ_chunks:
        chunk = chunk[chunk['tconst'].isin(tconst_set)]  # filter relevant titles
        if not chunk.empty:
            princ_parts.append(chunk)
        del chunk
gc.collect()

if princ_parts:
    principals_df = pd.concat(princ_parts, ignore_index=True)
    # One-to-many merge: every existing row of a title is repeated once per
    # principal of that title.
    # NOTE(review): if `merged` already holds one row per alternate title, the
    # result is (alternate titles x principals) rows per title - confirm this
    # row explosion is intended before running on the full data.
    merged = merged.merge(principals_df, on='tconst', how='left')
    del principals_df, princ_parts
gc.collect()

# =======================
# 8️⃣ Merge name.basics on nconst to get person info for principals
# =======================
print("Merging name.basics for principals...")
# 'nconst' only exists if the principals merge above actually ran; without it
# the merge would fail with a KeyError, so skip it with a clear message.
if 'nconst' in merged.columns:
    merged = merged.merge(name_basics, on='nconst', how='left')
else:
    print("Skipping name.basics merge: no 'nconst' column (no principals were merged).")
del name_basics
gc.collect()

# =======================
# 9️⃣ Final dataset info
# =======================
print("Final merged shape:", merged.shape)
# DataFrame.info() writes its report itself and returns None, so it must not be
# wrapped in print() (that would append a stray "None" line).
merged.info(memory_usage='deep')

# =======================
# 10️⃣ Save final merged dataset
# Only a snappy-compressed Parquet file is written.
# =======================
print("Saving merged dataset...")
merged.to_parquet("imdb_merged_chunked.parquet", engine='pyarrow', compression='snappy', index=False)
print("✅ Done! You can reload Parquet fast with: pd.read_parquet('imdb_merged_chunked.parquet')")


In [None]:
import duckdb
import time
from tqdm import tqdm

# =======================
#  Azure URLs
#  Full HTTPS locations of the gzip-compressed IMDb TSV files, one constant per
#  table. The values repeat the ones defined in the first cell, so this cell
#  does not depend on that cell having been run.
# =======================
URL_NAME_BASICS    = "https://workspace4824871889.blob.core.windows.net/azureml-blobstore-84f516da-0fe5-4f33-8f3c-f18ec8e2b4f7/UI/2025-10-22_104122_UTC/name.basics.tsv.gz"
URL_TITLE_AKAS     = "https://workspace4824871889.blob.core.windows.net/azureml-blobstore-84f516da-0fe5-4f33-8f3c-f18ec8e2b4f7/UI/2025-10-22_104546_UTC/title.akas.tsv.gz"
URL_TITLE_BASICS   = "https://workspace4824871889.blob.core.windows.net/azureml-blobstore-84f516da-0fe5-4f33-8f3c-f18ec8e2b4f7/UI/2025-10-22_104810_UTC/title.basics.tsv.gz"
URL_TITLE_CREW     = "https://workspace4824871889.blob.core.windows.net/azureml-blobstore-84f516da-0fe5-4f33-8f3c-f18ec8e2b4f7/UI/2025-10-22_104937_UTC/title.crew.tsv.gz"
URL_TITLE_EPISODE  = "https://workspace4824871889.blob.core.windows.net/azureml-blobstore-84f516da-0fe5-4f33-8f3c-f18ec8e2b4f7/UI/2025-10-22_105103_UTC/title.episode.tsv.gz"
URL_TITLE_PRINC    = "https://workspace4824871889.blob.core.windows.net/azureml-blobstore-84f516da-0fe5-4f33-8f3c-f18ec8e2b4f7/UI/2025-10-22_105225_UTC/title.principals.tsv.gz"
URL_TITLE_RATINGS  = "https://workspace4824871889.blob.core.windows.net/azureml-blobstore-84f516da-0fe5-4f33-8f3c-f18ec8e2b4f7/UI/2025-10-22_105430_UTC/title.ratings.tsv.gz"

# =======================
#  Start DuckDB connection
# =======================
con = duckdb.connect(database=':memory:')
start_time = time.time()
print(" Starting IMDb data merge using DuckDB...\n")

# =======================
#  Register TSV.GZ files as virtual tables
# =======================
print("1. Registering remote files as views...")
# Template for one view body; '{}' is replaced by the file URL.
#  - nullstr: the Python literal '\\N' reaches DuckDB as \N (backslash + N),
#    which is the missing-value marker used in the files. The previous
#    '\\\\N' produced a two-backslash string that never matched, so missing
#    values were kept as literal text.
#  - quote='': disables quote processing; the files are plain tab-delimited
#    text and may contain unbalanced double quotes inside a field.
base_read_csv = "SELECT * FROM read_csv('{}', delim='\\t', quote='', nullstr='\\N', header=True, compression='gzip', auto_detect=True, parallel=True)"

# One view per source file, created in a fixed order.
for view_name, source_url in (
    ("name_basics", URL_NAME_BASICS),
    ("title_basics", URL_TITLE_BASICS),
    ("title_ratings", URL_TITLE_RATINGS),
    ("title_crew", URL_TITLE_CREW),
    ("title_episode", URL_TITLE_EPISODE),
    ("title_akas", URL_TITLE_AKAS),
    ("title_principals", URL_TITLE_PRINC),
):
    con.execute(f"CREATE OR REPLACE VIEW {view_name} AS {base_read_csv.format(source_url)};")

# =======================
#  ⭐️ NEW: Create a FILTERED view of title_basics
# =======================
print("2. Filtering title_basics to main titles (movie, tvSeries, etc.)...")
# View restricted to the four title types listed in the WHERE clause.
filter_view_sql = """
CREATE OR REPLACE VIEW basics_filtered AS
SELECT *
FROM title_basics
WHERE titleType IN ('movie', 'tvSeries', 'tvMiniSeries', 'tvSpecial');
"""
con.execute(filter_view_sql)
# Single-row result; element 0 is the number of titles that survive the filter.
count_cursor = con.execute("SELECT COUNT(*) FROM basics_filtered")
count_result = count_cursor.fetchone()
print(f"   Filtered down to {count_result[0]:,} titles (from ~12 million).")

# =======================
#  NEW: Run ONE optimized query
# =======================
print("3. Running optimized join query...")

# One COPY statement: joins the filtered titles with every other table and
# streams the result straight into a Parquet file. Columns are listed
# explicitly (with aliases for the two `ordering` columns) so the output has
# no duplicate column names.
# NOTE(review): title_akas and title_principals are both joined on tconst, so
# a title gets (alternate titles x principals) rows - confirm this is intended.
# NOTE(review): basics_filtered keeps only movie/tvSeries/tvMiniSeries/tvSpecial
# rows; if title_episode.tconst identifies episodes, the episode join can never
# match and the three e.* columns are always NULL - verify.
final_query = """
COPY (
    SELECT 
        -- From title_basics (aliased as 'b')
        b.tconst, 
        b.titleType, 
        b.primaryTitle, 
        b.originalTitle, 
        b.isAdult, 
        b.startYear, 
        b.endYear, 
        b.runtimeMinutes, 
        b.genres,
        
        -- From title_ratings (aliased as 'r')
        r.averageRating, 
        r.numVotes,
        
        -- From title_crew (aliased as 'c')
        c.directors, 
        c.writers,
        
        -- From title_episode (aliased as 'e')
        e.parentTconst, 
        e.seasonNumber, 
        e.episodeNumber,
        
        -- From title_akas (aliased as 'a')
        a.ordering AS ordering_akas, 
        a.title AS title_akas, 
        a.region, 
        a.language, 
        a.types, 
        a.attributes, 
        a.isOriginalTitle,
        
        -- From title_principals (aliased as 'p')
        p.ordering AS ordering_principal, 
        p.nconst, 
        p.category, 
        p.job, 
        p.characters,
        
        -- From name_basics (aliased as 'n')
        n.primaryName, 
        n.birthYear, 
        n.deathYear, 
        n.primaryProfession, 
        n.knownForTitles

    FROM basics_filtered AS b
    LEFT JOIN title_ratings AS r ON b.tconst = r.tconst
    LEFT JOIN title_crew AS c ON b.tconst = c.tconst
    LEFT JOIN title_episode AS e ON b.tconst = e.tconst
    LEFT JOIN title_akas AS a ON b.tconst = a.titleId
    LEFT JOIN title_principals AS p ON b.tconst = p.tconst
    LEFT JOIN name_basics AS n ON p.nconst = n.nconst
) 
TO 'imdb_merged_duckdb_FILTERED.parquet' (FORMAT PARQUET, COMPRESSION 'SNAPPY', ROW_GROUP_SIZE 100000);
"""

# =======================
#  Run the query, then clean up
# =======================
# try/finally: the connection is closed even when the query fails, so a failed
# run does not leave the in-memory database open in the notebook kernel.
try:
    con.execute(final_query)
finally:
    con.close()
elapsed = (time.time() - start_time)
print(f"\n✅ Done! Merged dataset saved as imdb_merged_duckdb_FILTERED.parquet (Elapsed: {elapsed:.2f} seconds)")
print("   This file is filtered and ready for your cleaning pipeline.")

In [None]:
import duckdb
import time
from tqdm import tqdm

# =======================
#  Azure URLs
#  Full HTTPS locations of the gzip-compressed IMDb TSV files, one constant per
#  table. The values are identical to those in the previous cells; they are
#  repeated so this cell can be executed on its own.
# =======================
URL_NAME_BASICS    = "https://workspace4824871889.blob.core.windows.net/azureml-blobstore-84f516da-0fe5-4f33-8f3c-f18ec8e2b4f7/UI/2025-10-22_104122_UTC/name.basics.tsv.gz"
URL_TITLE_AKAS     = "https://workspace4824871889.blob.core.windows.net/azureml-blobstore-84f516da-0fe5-4f33-8f3c-f18ec8e2b4f7/UI/2025-10-22_104546_UTC/title.akas.tsv.gz"
URL_TITLE_BASICS   = "https://workspace4824871889.blob.core.windows.net/azureml-blobstore-84f516da-0fe5-4f33-8f3c-f18ec8e2b4f7/UI/2025-10-22_104810_UTC/title.basics.tsv.gz"
URL_TITLE_CREW     = "https://workspace4824871889.blob.core.windows.net/azureml-blobstore-84f516da-0fe5-4f33-8f3c-f18ec8e2b4f7/UI/2025-10-22_104937_UTC/title.crew.tsv.gz"
URL_TITLE_EPISODE  = "https://workspace4824871889.blob.core.windows.net/azureml-blobstore-84f516da-0fe5-4f33-8f3c-f18ec8e2b4f7/UI/2025-10-22_105103_UTC/title.episode.tsv.gz"
URL_TITLE_PRINC    = "https://workspace4824871889.blob.core.windows.net/azureml-blobstore-84f516da-0fe5-4f33-8f3c-f18ec8e2b4f7/UI/2025-10-22_105225_UTC/title.principals.tsv.gz"
URL_TITLE_RATINGS  = "https://workspace4824871889.blob.core.windows.net/azureml-blobstore-84f516da-0fe5-4f33-8f3c-f18ec8e2b4f7/UI/2025-10-22_105430_UTC/title.ratings.tsv.gz"

# =======================
#  Start DuckDB connection
# =======================
# Connect to an in-memory database or specify a file: database='imdb_merge.duckdb'
con = duckdb.connect(database=':memory:')

start_time = time.time()
print(" Starting IMDb data merge using DuckDB...\n")

# =======================
#  Stage progress tracker
# =======================
# One entry per pipeline stage; the bar advances by one after each stage and
# its description is switched to the stage that is about to run.
stages = [
    "Registering IMDb files",
    "Joining basics + ratings + crew + episode",
    "Joining akas",
    "Joining principals",
    "Joining name.basics",
    "Exporting to Parquet"
]
progress = tqdm(total=len(stages), desc="Progress", ncols=80, bar_format='{l_bar}{bar}| {n_fmt}/{total_fmt}')

# =======================
#  Register TSV.GZ files as virtual tables
# =======================
# Template for one view body; '{}' is replaced by the file URL.
#  - nullstr: the Python literal '\\N' reaches DuckDB as \N (backslash + N),
#    which is the missing-value marker used in the files. The previous
#    '\\\\N' produced a two-backslash string that never matched, so missing
#    values were kept as literal text.
#  - quote='': disables quote processing; the files are plain tab-delimited
#    text and may contain unbalanced double quotes inside a field.
base_read_csv = "SELECT * FROM read_csv('{}', delim='\\t', quote='', nullstr='\\N', header=True, compression='gzip', auto_detect=True, parallel=True)"

# One view per source file, created in a fixed order.
for view_name, source_url in (
    ("name_basics", URL_NAME_BASICS),
    ("title_basics", URL_TITLE_BASICS),
    ("title_ratings", URL_TITLE_RATINGS),
    ("title_crew", URL_TITLE_CREW),
    ("title_episode", URL_TITLE_EPISODE),
    ("title_akas", URL_TITLE_AKAS),
    ("title_principals", URL_TITLE_PRINC),
):
    con.execute(f"CREATE OR REPLACE VIEW {view_name} AS {base_read_csv.format(source_url)};")

progress.update(1)
progress.set_description(stages[1])

# =======================
#  Perform joins step by step
# =======================
# This step-by-step materialization helps manage memory
# Each statement materialises one intermediate table on top of the previous
# one; they must run in exactly this order.
join_statements = [
    """
CREATE OR REPLACE TABLE merged_core AS
SELECT *
FROM title_basics b
LEFT JOIN title_ratings r USING (tconst)
LEFT JOIN title_crew c USING (tconst)
LEFT JOIN title_episode e USING (tconst);
""",
    """
CREATE OR REPLACE TABLE merged_with_akas AS
SELECT *
FROM merged_core mc
LEFT JOIN title_akas a ON mc.tconst = a.titleId;
""",
    """
CREATE OR REPLACE TABLE merged_with_principals AS
SELECT *
FROM merged_with_akas ma
LEFT JOIN title_principals p ON ma.tconst = p.tconst;
""",
    """
CREATE OR REPLACE TABLE imdb_final AS
SELECT *
FROM merged_with_principals mp
LEFT JOIN name_basics n ON mp.nconst = n.nconst;
""",
]

# After statement k finishes, advance the bar and show the label of the next
# stage (stages[2] after the first join, up to stages[5] after the last).
for next_stage_idx, statement in enumerate(join_statements, start=2):
    con.execute(statement)
    progress.update(1)
    progress.set_description(stages[next_stage_idx])

# =======================
#  Export to Parquet
# =======================
# Write the final table to a snappy-compressed Parquet file.
# try/finally: the progress bar and the connection are released even when the
# export fails, so a failed run does not leave the in-memory database open.
try:
    con.execute("""
COPY (SELECT * FROM imdb_final) 
TO 'imdb_merged_duckdb.parquet' (FORMAT PARQUET, COMPRESSION 'SNAPPY', ROW_GROUP_SIZE 100000);
""")
    progress.update(1)
finally:
    # =======================
    #  Clean up
    # =======================
    progress.close()
    con.close()
elapsed = (time.time() - start_time)
print(f"\n✅ Done! Merged dataset saved as imdb_merged_duckdb.parquet (Elapsed: {elapsed:.2f} seconds)")
print("   Reload it fast with: pd.read_parquet('imdb_merged_duckdb.parquet')")

# Brief Summary of Our Whole Process of Merging IMDb Datasets

## 1. GOAL
Merge all datasets into one master table, so that:
- Each row contains all information for a title (or exploded rows for multiple actors or alternate titles).
- Later, we can clean and analyze the data.

## 2. CHALLENGES
- Some files (`title.akas` and `title.principals`) are very large (5–10 GB).
- Merging everything at once may use too much RAM and crash the computer.
- Many rows in `akas` and `principals` may not be relevant to the titles in the main dataset.

## 3. APPROACH (CHUNKED MERGE)
1. Load small datasets into memory first: `title.basics`, `title.ratings`, `title.crew`, `title.episode`, `name.basics`.
2. Merge small datasets on `tconst` → master table with one row per title.
3. Build a set of relevant titles (`tconst_set`) for fast filtering.
4. Process big datasets (`title.akas` and `title.principals`) in chunks (~300k rows), filter by `tconst_set`, then merge.
5. Merge `name.basics` to attach actor/director info using `nconst`.
6. Save final dataset (Parquet recommended, CSV optional).

## 4. HOW MERGING WORKS
- `df_left.merge(df_right, left_on="colA", right_on="colB", how="left")`:
  - Left table = main/master table.
  - Right table = extra info (ratings, actors, alternate titles).
  - `how="left"` keeps all rows from the left table; missing matches → NaN.

## 5. ADVANTAGES OF CHUNKED MERGE
- Memory-efficient – never load huge tables fully.
- Fast filtering – only process rows relevant to `tconst_set`.
- Safe – reduces risk of crashing.
- Complete – preserves all relevant info, including multiple actors and alternate titles.

## 6. NOTES AFTER MERGE
- Multiple rows per title are normal:
  - One row per actor (`title.principals`).
  - One row per alternate title (`title.akas`).
- Later during data cleaning, you can:
  - Deduplicate rows.
  - Aggregate actors or alternate titles into lists.
  - Normalize genres or other fields.

# First steps on Data Exploration

In [None]:
import pandas as pd
import numpy as np
import json
from scipy import stats
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

In [None]:
# Load the dataset you created in the first script
try:
    df_raw = pd.read_parquet('imdb_merged_chunked.parquet')
except FileNotFoundError:
    print("Error: 'imdb_merged_chunked.parquet' not found.")
    print("Please run your 'imdb_chunked_merge_azure.py' script first.")
    # Stand-in so the rest of the notebook can still execute: a single made-up
    # record, expanded below into a one-row DataFrame (one column per key).
    fallback_row = {
        'tconst': 'tt000001',
        'primaryTitle': 'Movie Title',
        'startYear': 2000,
        'averageRating': 8.0,
        'numVotes': 100,
        'genres': 'Action,Drama',
        'directors': 'nm000001',
        'writers': 'nm000002',
        'parentTconst': np.nan,
        'seasonNumber': np.nan,
        'episodeNumber': np.nan,
        'titleId': 'tt000001',
        'ordering_x': 1,
        'title': 'movie title',
        'region': 'US',
        'language': 'en',
        'isOriginalTitle': 0,
        'ordering_y': 1,
        'nconst': 'nm000003',
        'category': 'actor',
        'job': np.nan,
        'characters': '["Lead Role"]',
        'primaryName': 'Actor Name',
        'birthYear': 1980,
        'deathYear': np.nan,
    }
    df_raw = pd.DataFrame({name: [value] for name, value in fallback_row.items()})

section_rule = "=" * 30  # horizontal rule printed around every section title

print(section_rule)
print("1. DATA INFO")
print(section_rule)
# verbose=True lists every column instead of a truncated summary
df_raw.info(verbose=True, memory_usage='deep')

print("\n" + section_rule)
print("2. MISSING VALUES (Top 20)")
print(section_rule)
# Null count per column, plus the same figure as a percentage of all rows.
missing_values = df_raw.isnull().sum()
missing_percent = missing_values.div(len(df_raw)).mul(100).round(2)
missing_df = pd.DataFrame({'count': missing_values, 'percent': missing_percent})
has_missing = missing_df['count'] > 0
print(missing_df[has_missing].sort_values(by='count', ascending=False).head(20))

print("\n" + section_rule)
print("3. DUPLICATE ROWS")
print(section_rule)
# Counts rows that are exact repeats of an earlier row (all columns equal).
num_dupes = df_raw.duplicated().sum()
print(f"Number of duplicate rows: {num_dupes}")

print("\n" + section_rule)
print("4. CATEGORICAL CARDINALITY (Top 20)")
print(section_rule)
categorical_cols = df_raw.select_dtypes(include=['object', 'category']).columns
cardinality = {}
for col in categorical_cols:
    cardinality[col] = df_raw[col].nunique()
print("Number of unique values in categorical columns:")
# Highest cardinality first, so the most problematic columns are on top.
sorted_cardinality = sorted(cardinality.items(), key=lambda pair: pair[1], reverse=True)
for col, count in sorted_cardinality[:20]:
    print(f"{col}: {count}")

print("\n" + section_rule)
print("5. NUMERIC DISTRIBUTION")
print(section_rule)
# Summary statistics per numeric column (one row per column after transposing).
numeric_cols = df_raw.select_dtypes(include=np.number).columns
numeric_summary = df_raw[numeric_cols].describe()
print(numeric_summary.T)

# Cleaning Pipeline 

In [None]:
# ==================================
# 1. DEFINE CUSTOM TRANSFORMERS
# ==================================

class DropColumnTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer that removes a fixed list of columns.

    Names in ``columns`` that are not present in the input are ignored.
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        # Nothing is learned from the data.
        return self

    def transform(self, X):
        # Returns a new frame; the input is left untouched.
        pruned = X.drop(columns=self.columns, errors='ignore')
        return pruned

class StringCleaner(BaseEstimator, TransformerMixin):
    """Strips whitespace and lowercases the text of the given columns.

    Non-missing values are converted to ``str`` first. Missing values stay
    missing: converting them with ``astype(str)`` would turn them into the
    literal text ``'nan'`` and hide them from later null checks and imputation.
    Columns named in ``columns`` that are absent from the input are skipped.
    """
    def __init__(self, columns):
        self.columns = columns
    def fit(self, X, y=None):
        # Stateless: nothing to learn.
        return self
    def transform(self, X):
        X_transformed = X.copy()
        for col in self.columns:
            if col in X_transformed.columns:
                values = X_transformed[col]
                cleaned = values.astype(str).str.strip().str.lower()
                # Keep the cleaned text only where the original was present.
                X_transformed[col] = cleaned.where(values.notna())
        return X_transformed

class ListStringCleaner(BaseEstimator, TransformerMixin):
    """Normalises separator-delimited list strings such as ``genres``.

    Every item is stripped and lowercased, then the items are joined again
    with the same separator. Missing or empty cells become ``''``.
    """

    def __init__(self, columns, separator=','):
        self.columns = columns
        self.separator = separator

    def fit(self, X, y=None):
        # Stateless: nothing to learn.
        return self

    def transform(self, X):
        sep = self.separator

        def _normalise(cell):
            # Falsy cells (the '' written by fillna below) stay empty.
            if not cell:
                return ''
            pieces = str(cell).split(sep)
            return sep.join(piece.strip().lower() for piece in pieces)

        result = X.copy()
        for name in self.columns:
            if name not in result.columns:
                continue
            # Missing values are replaced by '' before the per-cell clean-up.
            result[name] = result[name].fillna('')
            result[name] = result[name].apply(_normalise)
        return result

class JSONStringParser(BaseEstimator, TransformerMixin):
    """Turns JSON-list strings into comma-separated, lowercased strings.

    Example: ``'["Walter White", "Heisenberg"]'`` -> ``'walter white,heisenberg'``.
    Every cell of the processed columns ends up as a ``str``: missing values,
    values that are not valid JSON and JSON values that are not lists all
    become ``''``.
    """
    def __init__(self, columns):
        self.columns = columns
    def fit(self, X, y=None):
        # Stateless: nothing to learn.
        return self
    def _parse(self, item):
        """Return the cleaned, comma-joined form of one cell (always a str)."""
        if pd.isna(item):
            # Same type as every other branch (previously a list, which left
            # the column with mixed list/str values).
            return ''
        try:
            parsed_list = json.loads(item)
            if isinstance(parsed_list, list):
                # Clean and join the list elements
                return ','.join([str(i).strip().lower() for i in parsed_list])
            return '' # Not a list
        except (json.JSONDecodeError, TypeError, SyntaxError):
            return '' # Return empty string on parsing error

    def transform(self, X):
        X_transformed = X.copy()
        for col in self.columns:
            if col in X_transformed.columns:
                X_transformed[col] = X_transformed[col].apply(self._parse)
        return X_transformed

class CustomOutlierRemover(BaseEstimator, TransformerMixin):
    """Removes rows whose absolute Z-score reaches ``threshold`` in any column.

    Only the columns named in ``columns`` that exist in the input and are
    numeric are inspected. A Z-score that cannot be computed (missing value,
    or a column without variance) never marks a row as an outlier. The rows
    removed by the most recent ``transform`` call are available through the
    ``outliers`` property.
    """
    def __init__(self, columns, threshold=3):
        self.threshold = threshold
        self.columns = columns
        self._outliers = None
    def fit(self, X, y=None):
        # Stateless: the Z-scores are computed on the data passed to transform.
        return self
    def transform(self, X):
        X_transformed = X.copy()
        # Ensure columns exist and are numeric
        valid_cols = [col for col in self.columns if col in X.columns and pd.api.types.is_numeric_dtype(X[col])]
        if not valid_cols:
            print("Warning: No valid numeric columns found for outlier removal.")
            # Nothing was removed; keep `outliers` usable with len().
            self._outliers = X_transformed.iloc[0:0]
            return X_transformed

        z_scores = np.abs(np.asarray(
            stats.zscore(X_transformed[valid_cols], nan_policy='omit'), dtype=float
        ))
        # Comparisons against NaN are False, so undefined Z-scores are never
        # flagged. (The previous `z < threshold` test was also False for NaN,
        # which dropped every row containing a missing value.)
        with np.errstate(invalid='ignore'):
            is_outlier = z_scores >= self.threshold
        # Boolean mask for rows *without* any flagged column
        mask = ~is_outlier.any(axis=1)
        self._outliers = X_transformed[~mask]
        return X_transformed[mask]
    @property
    def outliers(self):
        """Rows removed by the last ``transform`` call (``None`` before any call)."""
        return self._outliers

# ==================================
# 2. DEFINE COLUMN GROUPS
# (Based on the merge script)
# ==================================

# Columns to remove: Redundant IDs, low-value, or messy
# Columns removed before any other step
DROP_COLS = [
    'titleId',       # Redundant with tconst
    'ordering_x',    # ordering from akas
    'ordering_y',    # ordering from principals
    'isOriginalTitle', # akas flag, not used downstream
    'attributes',    # Often sparse or noisy
    'job'            # Often sparse, 'category' is more useful
]

# Simple strings to clean (lowercase, strip)
CLEAN_STRING_COLS = [
    'primaryTitle',
    'originalTitle',
    'title',
    'primaryName'
]

# Numeric columns to impute (with median)
NUMERIC_IMPUTE_COLS = [
    'startYear',
    'endYear',
    'runtimeMinutes',  # was garbled as axr'runtimeMinutes' (a syntax error)
    'averageRating',
    'numVotes',
    'seasonNumber',
    'episodeNumber',
    'birthYear',
    'deathYear'
]

# Categorical columns to impute (with 'unknown')
CATEGORICAL_IMPUTE_COLS = [
    'titleType',
    'isAdult',
    'region',
    'language',
    'types',
    'category'
]

# Comma-separated list-strings
LIST_STRING_COLS = [
    'genres',
    'directors',
    'writers',
    'primaryProfession',
    'knownForTitles'
]

# JSON-like list-strings
JSON_STRING_COLS = [
    'characters'
]

# Numeric columns to check for outliers
OUTLIER_COLS = [
    'runtimeMinutes',
    'numVotes',
    'averageRating',
    'startYear'
]

# ==================================
# 3. BUILD THE PREPROCESSING PIPELINE
# ==================================

# Define imputers
# Imputers: median for numeric columns, the constant 'unknown' for categoricals
numeric_imputer = SimpleImputer(strategy='median')
categorical_imputer = SimpleImputer(strategy='constant', fill_value='unknown')

# Last pipeline step: impute the two column groups and pass every other
# column through unchanged.
imputation_step = ColumnTransformer(
    transformers=[
        ('num_impute', numeric_imputer, NUMERIC_IMPUTE_COLS),
        ('cat_impute', categorical_imputer, CATEGORICAL_IMPUTE_COLS),
    ],
    remainder='passthrough',
)

# Full preprocessing chain, executed top to bottom.
pipeline_steps = [
    ('drop_cols', DropColumnTransformer(columns=DROP_COLS)),
    ('clean_strings', StringCleaner(columns=CLEAN_STRING_COLS)),
    ('clean_list_strings', ListStringCleaner(columns=LIST_STRING_COLS)),
    ('parse_json_strings', JSONStringParser(columns=JSON_STRING_COLS)),
    ('impute_features', imputation_step),
]
preprocessing_pipeline = Pipeline(steps=pipeline_steps)

# ⚠️ WARNING: outlier removal is kept out of the pipeline on purpose; it should
# generally be applied *only* to the training set *after* splitting, to avoid
# data leakage.
outlier_remover = CustomOutlierRemover(columns=OUTLIER_COLS, threshold=3)


# ==================================
# 4. EXAMPLE USAGE
# ==================================

print("\n" + "="*30)
print("6. APPLYING CLEANING PIPELINE")
print("="*30)

# We use the 'df_raw' loaded in the exploration step
print(f"Shape before cleaning: {df_raw.shape}")

# The final ColumnTransformer returns a NumPy array whose columns are ordered:
#   1. numeric-imputed columns, 2. categorical-imputed columns, 3. remainder.
# The DataFrame therefore has to be rebuilt with matching column names.
col_transformer = preprocessing_pipeline.named_steps['impute_features']

imputed_cols = NUMERIC_IMPUTE_COLS + CATEGORICAL_IMPUTE_COLS
# The steps before imputation only drop DROP_COLS and rewrite values in place,
# so the columns reaching the ColumnTransformer are the raw columns minus
# DROP_COLS, in their original order. Deriving the names this way avoids
# running four transformers over the whole dataset just to read `.columns`.
all_cols_after_parse = df_raw.columns[~df_raw.columns.isin(DROP_COLS)]
remainder_cols = [col for col in all_cols_after_parse if col not in imputed_cols]

# Define the final column order
final_cols = NUMERIC_IMPUTE_COLS + CATEGORICAL_IMPUTE_COLS + remainder_cols

# Apply the pipeline
df_cleaned_data = preprocessing_pipeline.fit_transform(df_raw)
if df_cleaned_data.shape[1] != len(final_cols):
    # Fail with an explicit message instead of a bare shape error from pandas.
    raise ValueError(
        f"Pipeline returned {df_cleaned_data.shape[1]} columns but {len(final_cols)} "
        "names were expected; an imputer may have discarded a column that "
        "contains only missing values."
    )
# Keep the original row labels so the cleaned frame still lines up with df_raw.
df_cleaned = pd.DataFrame(df_cleaned_data, columns=final_cols, index=df_raw.index)

# Convert numeric columns back to numeric (imputer output is object)
for col in NUMERIC_IMPUTE_COLS:
    df_cleaned[col] = pd.to_numeric(df_cleaned[col])

print(f"Shape after cleaning: {df_cleaned.shape}")
print("\nCleaned Data Info:")
df_cleaned.info()

print("\nMissing values after cleaning:")
missing_after = df_cleaned.isnull().sum()
print(missing_after[missing_after > 0])

# Example of using the outlier remover (e.g., on a training set)
# df_train_no_outliers = outlier_remover.fit_transform(df_cleaned)
# print(f"\nShape after outlier removal: {df_train_no_outliers.shape}")
# print(f"Removed {len(outlier_remover.outliers)} outlier rows.")

# Key transformation
1. Drop Columns (DropColumnTransformer)
- It removes `titleId`, `ordering_x`, `ordering_y`, `isOriginalTitle`, `attributes`, and `job` (the columns listed in `DROP_COLS`).
- These are redundant keys or ordering columns from the original files that are not needed after merging on tconst and nconst. They add noise and increase memory usage

2. Standardize Strings (StringCleaner)
- Lowercases and strips whitespace from free-text columns like primaryTitle and primaryName
- Ensures consistency. "The Matrix", " the matrix ", and "the matrix" become identical ("the matrix"), which prevents the model from treating them as different categories.

3. Clean List-Strings (ListStringCleaner)
- Handles comma-separated columns like genres and directors. It turns 'Action, Adventure' into 'action,adventure'
- This is crucial for feature engineering.

4. Parse JSON-Strings (JSONStringParser)
- Specifically targets the characters column, which is often a messy string like '["Walter White", "Heisenberg"]'. It parses this into a clean, comma-separated string: 'walter white,heisenberg'
- This "unpacks" the valuable information (who an actor played) from a format that is otherwise unusable.

5. Impute Missing Values (SimpleImputer)
* Fills in missing data.
- For numeric columns the strategy is "median": each missing value is replaced by the column median.
- For categorical columns the strategy is "constant": each missing value is replaced by the string "unknown".

**Note:** `CustomOutlierRemover` is defined as well, but it is kept separate from the main pipeline.

In [None]:
import duckdb
#main part
# Full merged dataset
url = "https://workspace4824871889.blob.core.windows.net/azureml-blobstore-84f516da-0fe5-4f33-8f3c-f18ec8e2b4f7/UI/2025-10-22_105430_UTC/imdb_merged_duckdb.parquet"

# 100k-row sample (uncomment to use it instead of the full dataset)
#url = "https://workspace4824871889.blob.core.windows.net/azureml-blobstore-84f516da-0fe5-4f33-8f3c-f18ec8e2b4f7/UI/2025-10-25_160707_UTC/imdb_sample_100k.parquet"
con = duckdb.connect()

print("Loading data into pandas DataFrame using DuckDB...")

# Read the Parquet file through DuckDB and materialise it as a DataFrame.
# try/finally: the connection is closed even if the read fails.
try:
    df = con.execute(f"SELECT * FROM read_parquet('{url}')").df()
finally:
    con.close()

print("✅ Data loaded successfully.")
print(df.head())
print(f"DataFrame shape: {df.shape}")

# Cleaning done with pandas

In [None]:
import pandas as pd
import numpy as np
import json
from scipy import stats
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
import warnings

# Suppress warnings for cleaner output:
#  - turns off pandas' chained-assignment check for the whole session
#  - hides every warning of category FutureWarning
pd.options.mode.chained_assignment = None
warnings.filterwarnings('ignore', category=FutureWarning)

# ==================================
# 1. DEFINE CUSTOM TRANSFORMERS
# ==================================

class DropColumnTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer that removes a fixed list of columns.

    Names in ``columns`` that are not present in the input are ignored.
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        # Nothing is learned from the data.
        return self

    def transform(self, X):
        # Returns a new frame; the input is left untouched.
        pruned = X.drop(columns=self.columns, errors='ignore')
        return pruned

class StringCleaner(BaseEstimator, TransformerMixin):
    """Lowercases and strips surrounding whitespace from text columns.

    Non-missing values are converted to ``str`` before cleaning. Missing
    values stay missing instead of being turned into the literal text
    ``'nan'`` / ``'none'``.

    Parameters
    ----------
    columns : list of str
        Columns to clean; names absent from the input are ignored.
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        # Stateless: nothing to learn from the data.
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the configured columns normalised."""
        X_transformed = X.copy()
        for col in self.columns:
            if col not in X_transformed.columns:
                continue
            original = X_transformed[col]
            cleaned = original.astype(str).str.strip().str.lower()
            # astype(str) stringifies NaN/None as well; mask those positions
            # back to missing so real gaps are not disguised as text.
            X_transformed[col] = cleaned.where(original.notna())
        return X_transformed

class ListStringCleaner(BaseEstimator, TransformerMixin):
    """Normalises delimited list-strings such as ``genres``.

    Every value is split on ``separator``; each piece is stripped and
    lowercased, and the pieces are joined back with the same separator.
    Missing (or otherwise falsy) values become the empty string.
    """

    def __init__(self, columns, separator=','):
        self.columns = columns
        self.separator = separator

    def fit(self, X, y=None):
        # Stateless: nothing to learn from the data.
        return self

    def _normalise(self, value):
        """Clean one cell; falsy cells collapse to ''."""
        if not value:
            return ''
        pieces = str(value).split(self.separator)
        return self.separator.join([piece.strip().lower() for piece in pieces])

    def transform(self, X):
        result = X.copy()
        present = [name for name in self.columns if name in result.columns]
        for name in present:
            # Nulls are replaced first so the per-cell helper only sees values.
            result[name] = result[name].fillna('')
            result[name] = result[name].apply(self._normalise)
        return result

class JSONStringParser(BaseEstimator, TransformerMixin):
    """Flattens JSON-list strings into clean comma-separated strings.

    ``'["Walter White", "Heisenberg"]'`` becomes ``'walter white,heisenberg'``.

    Parameters
    ----------
    columns : list of str
        Columns to parse; names absent from the input are ignored.
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        # Stateless: nothing to learn from the data.
        return self

    def _parse(self, item):
        """Return the cleaned text for one cell.

        * missing value or JSON ``null``  -> ``''``
        * JSON list                       -> items stripped, lowercased, comma-joined
        * JSON string                     -> the unquoted string, stripped and lowercased
        * anything else (invalid JSON, numbers, objects, non-string input)
                                          -> the raw value as stripped, lowercased text
        """
        if pd.isna(item):
            return ''
        try:
            parsed = json.loads(item)
        except (ValueError, TypeError):
            # ValueError covers json.JSONDecodeError (plain text such as
            # ActorName); TypeError covers non-string cells.
            return str(item).strip().lower()

        if isinstance(parsed, list):
            return ','.join([str(entry).strip().lower() for entry in parsed])
        if parsed is None:
            return ''
        if isinstance(parsed, str):
            return parsed.strip().lower()
        # Valid JSON but not a list (number, bool, object): keep the original
        # text instead of silently discarding the value.
        return str(item).strip().lower()

    def transform(self, X):
        """Return a copy of ``X`` with the configured columns parsed."""
        X_transformed = X.copy()
        for col in self.columns:
            if col in X_transformed.columns:
                X_transformed[col] = X_transformed[col].apply(self._parse)
        return X_transformed

class CustomOutlierRemover(BaseEstimator, TransformerMixin):
    """Removes rows whose absolute Z-score reaches a threshold in any chosen column.

    The Z-scores are computed from the data passed to ``transform`` itself;
    ``fit`` learns nothing. Only columns that exist in the input and have a
    numeric dtype are checked.

    Parameters
    ----------
    columns : list of str
        Candidate columns to check.
    threshold : float, default 3
        A row is kept only if ``|z| < threshold`` in every checked column.

    Attributes
    ----------
    outliers : DataFrame or None
        Rows removed by the most recent ``transform`` call (empty when
        nothing was checked); ``None`` before the first call.
    """

    def __init__(self, columns, threshold=3):
        self.threshold = threshold
        self.columns = columns
        self._outliers = None

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        """Return the rows of ``X`` that are not flagged as outliers."""
        X_transformed = X.copy()
        valid_cols = [col for col in self.columns if col in X.columns and pd.api.types.is_numeric_dtype(X[col])]
        if not valid_cols:
            print("Warning: No valid numeric columns found for outlier removal.")
            # Nothing was checked, so nothing was removed. Expose an empty
            # frame so callers can still use len(remover.outliers).
            self._outliers = X_transformed.iloc[0:0]
            return X_transformed

        # nan_policy='omit' keeps NaNs from affecting the mean/std used for scoring.
        z_scores = np.abs(stats.zscore(X_transformed[valid_cols], nan_policy='omit'))

        # A NaN score is replaced by 0 so that it is never counted as an outlier.
        z_scores_filled = np.nan_to_num(z_scores, nan=0)

        mask = (z_scores_filled < self.threshold).all(axis=1)
        self._outliers = X_transformed[~mask]
        return X_transformed[mask]

    @property
    def outliers(self):
        return self._outliers

# ==================================
# 2. DEFINE COLUMN GROUPS
# ==================================

# Columns removed by DropColumnTransformer before any other step runs.
DROP_COLS = [
    'ordering_akas', 
    'ordering_principal',
    'attributes',
    'job'
]

# Free-text columns handled by StringCleaner (strip + lowercase).
CLEAN_STRING_COLS = [
    'primaryTitle',
    'originalTitle',
    'title_akas', # Renamed from 'title'
    'primaryName'
]

# Columns sent to the median SimpleImputer.
# The ColumnTransformer selects these by name, so each one must be present in the data.
NUMERIC_IMPUTE_COLS = [
    'startYear',
    'endYear',
    'runtimeMinutes',
    'averageRating',
    'numVotes',
    'seasonNumber',
    'episodeNumber',
    'birthYear',
    'deathYear'
]

# Columns sent to the constant SimpleImputer (missing -> 'unknown').
# NOTE(review): 'isAdult' is listed here, yet the execution section later passes it
# to pd.to_numeric(); any imputed 'unknown' value would make that call raise.
CATEGORICAL_IMPUTE_COLS = [
    'titleType',
    'isAdult',
    'region',
    'language',
    'types',
    'category'
]

# Comma-separated list-strings handled by ListStringCleaner.
LIST_STRING_COLS = [
    'genres',
    'directors',
    'writers',
    'primaryProfession',
    'knownForTitles'
]

# JSON-like list-strings handled by JSONStringParser.
JSON_STRING_COLS = [
    'characters'
]

# Numeric columns whose Z-scores CustomOutlierRemover inspects.
OUTLIER_COLS = [
    'runtimeMinutes',
    'numVotes',
    'averageRating',
    'startYear'
]

# ==================================
# 3. BUILD THE CLEANING PIPELINE
# ==================================

print("Building cleaning pipeline...")

# Imputers: the median for numeric columns, the constant 'unknown' for categorical ones.
numeric_imputer = SimpleImputer(strategy='median')
categorical_imputer = SimpleImputer(strategy='constant', fill_value='unknown')

# Main cleaning pipeline. Steps run in the listed order:
# drop -> clean simple strings -> clean list-strings -> parse JSON strings -> impute.
cleaning_pipeline = Pipeline(steps=[
    ('drop_cols', DropColumnTransformer(columns=DROP_COLS)),
    ('clean_strings', StringCleaner(columns=CLEAN_STRING_COLS)),
    ('clean_list_strings', ListStringCleaner(columns=LIST_STRING_COLS)),
    ('parse_json_strings', JSONStringParser(columns=JSON_STRING_COLS)),
    
    # The execution section rebuilds the column names by hand and assumes this step
    # emits the numeric block, then the categorical block, then the passthrough columns.
    ('impute_features', ColumnTransformer(
        transformers=[
            ('num_impute', numeric_imputer, NUMERIC_IMPUTE_COLS),
            ('cat_impute', categorical_imputer, CATEGORICAL_IMPUTE_COLS)
        ],
        remainder='passthrough' # Keep all other columns
    ))
])

# Outlier removal drops rows, so it is kept outside the pipeline and applied afterwards.
outlier_remover = CustomOutlierRemover(columns=OUTLIER_COLS, threshold=3)

# ==================================
# 4. EXECUTE THE PIPELINE
# ==================================

print("\n" + "="*30)
print("1. APPLYING MAIN CLEANING PIPELINE")
print("="*30)

# We assume 'df' is already loaded in memory
print(f"Shape before cleaning: {df.shape}")

# --- Run the pipeline in two stages so the column names can be rebuilt ---
# The final ColumnTransformer returns a bare array, so the column names have to be
# reconstructed from the frame that enters it. The cleaning steps therefore run once
# here and their output is fed straight to the imputer, instead of transforming the
# whole dataset a second time through cleaning_pipeline.fit_transform(df).

# 1. Apply every step except the final imputer (same step objects, not copies)
pre_impute_steps = Pipeline(steps=cleaning_pipeline.steps[:-1])
temp_df = pre_impute_steps.fit_transform(df)

# 2. Get the 'remainder' columns (everything the imputers do not touch)
imputed_cols = NUMERIC_IMPUTE_COLS + CATEGORICAL_IMPUTE_COLS
remainder_cols = [col for col in temp_df.columns if col not in imputed_cols]

# 3. Define the final column order: numeric block, categorical block, passthrough
final_cols = NUMERIC_IMPUTE_COLS + CATEGORICAL_IMPUTE_COLS + remainder_cols

# 4. Fit and apply the imputation step on the already-cleaned frame
imputer_step = cleaning_pipeline.named_steps['impute_features']
df_cleaned_data = imputer_step.fit_transform(temp_df)
df_cleaned = pd.DataFrame(df_cleaned_data, columns=final_cols)

# 5. Convert numeric columns back to numeric types
for col in NUMERIC_IMPUTE_COLS:
    df_cleaned[col] = pd.to_numeric(df_cleaned[col])
# Convert isAdult back to numeric/int
# NOTE(review): isAdult goes through the categorical imputer; if any value was
# missing it now holds 'unknown' and this conversion raises. Confirm the column
# has no nulls, or move it to the numeric group.
if 'isAdult' in df_cleaned.columns:
    df_cleaned['isAdult'] = pd.to_numeric(df_cleaned['isAdult'])

print(f"Shape after cleaning: {df_cleaned.shape}")

# ==================================
# 5. ⭐️ SAVE THE CLEANED FILE FOR COLLEAGUES ⭐️
# ==================================
# The file is written BEFORE outlier removal; df_no_outliers below is not saved.
OUTPUT_FILE_PARQUET = 'imdb_cleaned_for_colleagues.parquet'
print(f"\nSaving cleaned data to {OUTPUT_FILE_PARQUET}...")
df_cleaned.to_parquet(OUTPUT_FILE_PARQUET, index=False)
print("✅ Save complete.")
# ==================================

print("\nCleaned Data Info:")
df_cleaned.info(memory_usage='deep')

print("\n" + "="*30)
print("2. APPLYING OUTLIER REMOVAL")
print("="*30)

df_no_outliers = outlier_remover.fit_transform(df_cleaned)

print(f"Shape before outlier removal: {df_cleaned.shape}")
print(f"Shape after outlier removal:  {df_no_outliers.shape}")
print(f"Removed {len(outlier_remover.outliers)} outlier rows.")

print("\n✅ Full cleaning pipeline complete! File saved for colleagues.")

# DUCK DUCK QUERY 

In [None]:
pip install duckdb tqdm


///////

In [None]:
# clean_imdb_duckdb.py
import duckdb
import os

# === CONFIG ===
# First SQL-only attempt at the cleaning job: register the merged parquet as a view,
# compute medians and numVotes bounds, stack two cleaning views, export to parquet.
INPUT_URL = "https://workspace4824871889.blob.core.windows.net/azureml-blobstore-84f516da-0fe5-4f33-8f3c-f18ec8e2b4f7/UI/2025-10-22_105430_UTC/imdb_merged_duckdb.parquet"  # can be local path or https URL (if accessible)
OUTPUT = "imdb_merged_cleaned_duckdb.parquet"
DROP_COLS = ["ordering_x", "ordering_y", "attributes", "job", "ordering_akas", "ordering_principal"]
CATEGORICAL_FILL = "unknown"

con = duckdb.connect()

# register parquet as a view
print("Registering parquet...")
con.execute(f"CREATE OR REPLACE VIEW imdb_raw AS SELECT * FROM read_parquet('{INPUT_URL}');")

# compute median(s) for numeric imputation (duckdb provides percentile_cont, use .5)
# Any failure (e.g. a missing column) is swallowed and the median silently becomes 0.
medians = {}
for col in ['averageRating', 'numVotes', 'seasonNumber', 'episodeNumber']:
    try:
        res = con.execute(f"SELECT percentile_cont(0.5) WITHIN GROUP (ORDER BY {col}) as med FROM imdb_raw WHERE {col} IS NOT NULL").fetchone()
        med = res[0] if res else 0
        medians[col] = med if med is not None else 0
    except Exception as e:
        medians[col] = 0

# compute outlier thresholds on numVotes (1st and 99.9th percentiles)
# The defaults below stay in force if the query fails; the failure is not reported.
low_q, high_q = 0, 10**9
try:
    low, high = con.execute("SELECT percentile_cont(0.01) WITHIN GROUP (ORDER BY numVotes) as low, percentile_cont(0.999) WITHIN GROUP (ORDER BY numVotes) as high FROM imdb_raw WHERE numVotes IS NOT NULL").fetchone()
    if low is not None: low_q = low
    if high is not None: high_q = high
except Exception:
    pass

# Build a cleaned view with SQL transforms
# string normalization: lower(trim(...))
# list cleans: regexp_replace to normalize spaces around commas
# characters: remove [ ] and " characters via regexp_replace
# impute numerics with COALESCE(median)
# fill categoricals via COALESCE(col, 'unknown')

# NOTE(review): DuckDB documents `* EXCLUDE (...)` for dropping columns from a star
# expression; confirm that `* EXCEPT (...)` is accepted by the installed version, and
# that every name in DROP_COLS actually exists in imdb_raw.
print("Creating cleaned view (this is a streaming SQL operation)...")
con.execute(f"""
CREATE OR REPLACE VIEW imdb_cleaned AS
SELECT
    -- drop columns by not selecting them; select explicitly every needed column or use * EXCEPT (duckdb 0.7+ supports EXCEPT)
    -- For brevity, use SELECT * EXCEPT(...) if supported; otherwise list columns you want.
    -- We'll use * EXCEPT to drop the unwanted cols
    * EXCEPT ({', '.join(DROP_COLS)})
FROM imdb_raw
""")

# Now do a second view that applies column-level transforms:
# Build transform SQL dynamically
# In the non-raw (f-)strings below, '\\\\' puts TWO backslash characters into the SQL text.
# NOTE(review): the regex engine then presumably sees an escaped backslash rather than
# \s or \[ — verify that the patterns match what is intended.
transforms = []
# normalize string columns (no existence check is performed here)
for col in ['primaryTitle', 'primaryName', 'title', 'region', 'language']:
    transforms.append(f"CASE WHEN {col} IS NULL THEN '{CATEGORICAL_FILL}' ELSE lower(trim({col})) END as {col}")

# list-like columns
for col in ['genres','directors','writers']:
    transforms.append(f"CASE WHEN {col} IS NULL THEN '{CATEGORICAL_FILL}' ELSE lower(regexp_replace({col}, '\\\\s*,\\\\s*', ',', 'g')) END as {col}")

# characters JSON-clean
transforms.append("CASE WHEN characters IS NULL THEN '' ELSE lower(regexp_replace(regexp_replace(characters, '\\\\[|\\\\]|\"', '', 'g'), '\\\\s*,\\\\s*', ',', 'g')) END as characters")

# numerics imputation
for col, med in medians.items():
    transforms.append(f"COALESCE({col}, {med}) as {col}")

# years coercion (startYear) — try to cast numeric, else null
# NOTE(review): if these columns are already numeric, comparing them with a string
# literal may raise a conversion error — confirm against the actual schema.
for col in ['startYear','birthYear','deathYear']:
    transforms.append(f"CASE WHEN {col} IS NULL OR {col} = '\\\\N' THEN NULL ELSE CAST({col} AS BIGINT) END as {col}")

# Now assemble final select — include all other columns unchanged by using imdb_cleaned.* and then overwrite with transformed ones via FROM ... SELECT
# NOTE(review): selecting imdb_cleaned.* AND the aliased transforms lists each transformed
# column twice; a plain SELECT does not override earlier columns. Check how the duplicate
# names end up in the view and in the parquet output.
transforms_sql = ",\n    ".join(transforms)

# The WHERE clause filters on the raw numVotes, so rows with NULL numVotes are excluded.
final_sql = f"""
CREATE OR REPLACE VIEW imdb_cleaned_final AS
SELECT
    -- start with all original columns from imdb_cleaned (note: transformed columns will be overridden below)
    imdb_cleaned.*,
    {transforms_sql}
FROM imdb_cleaned
WHERE numVotes BETWEEN {int(low_q)} AND {int(high_q)}
;
"""
con.execute(final_sql)

# Export to parquet
print("Writing cleaned parquet to:", OUTPUT)
con.execute(f"COPY (SELECT * FROM imdb_cleaned_final) TO '{OUTPUT}' (FORMAT PARQUET, COMPRESSION 'SNAPPY');")
print("Done. Output:", OUTPUT)

con.close()


In [None]:
# clean_imdb_duckdb_final_safe.py
import duckdb
import math

# CONFIG
# Source parquet and output file for the type-aware cleaning script.
INPUT_URL = "https://workspace4824871889.blob.core.windows.net/azureml-blobstore-84f516da-0fe5-4f33-8f3c-f18ec8e2b4f7/UI/2025-10-22_105430_UTC/imdb_merged_duckdb.parquet"  # local path or http(s) URL, if accessible
OUTPUT = "imdb_merged_cleaned_duckdb.parquet"
# Columns left out of the output (a set: only membership is tested).
DROP_COLS = {"ordering_x", "ordering_y", "attributes", "job", "ordering_akas", "ordering_principal"}
# Placeholder written into string / list columns when the value is missing.
CATEGORICAL_FILL = "unknown"
# Numeric columns whose missing values are replaced by the column median.
NUMERIC_IMPUTE_COLS = ['averageRating', 'numVotes', 'seasonNumber', 'episodeNumber']
# Plain text columns: lower(trim(...)), missing -> CATEGORICAL_FILL.
TRANSFORM_STRING_COLS = ['primaryTitle', 'primaryName', 'title', 'region', 'language']
# Comma-separated list columns.
LIST_COLS = ['genres', 'directors', 'writers']
# JSON-like list column.
CHAR_COL = 'characters'
# Year columns, cast to BIGINT (NULL when not castable).
YEAR_COLS = ['startYear', 'birthYear', 'deathYear']

# Helper: classifies DuckDB types simply
def is_numeric_type(duck_type: str) -> bool:
    """Return True when *duck_type* names a scalar numeric DuckDB type.

    Matching is by keyword, so parameterised names such as ``DECIMAL(18,3)``
    and unsigned/huge variants (``UBIGINT``, ``HUGEINT``) are recognised.
    Nested types (``INTEGER[]``, ``STRUCT(...)``, ``MAP(...)``, ``UNION(...)``)
    and ``INTERVAL`` contain numeric keywords as substrings but are not
    numbers, so they are rejected explicitly.
    """
    t = duck_type.strip().lower()
    if t.endswith("]") or t.startswith(("struct", "map", "union", "interval")):
        return False
    # "int" also covers tinyint / smallint / integer / bigint.
    return any(token in t for token in ("int", "decimal", "numeric", "float", "double"))

def is_int_type(duck_type: str) -> bool:
    """Return True when *duck_type* names a scalar integer DuckDB type.

    Nested types (``BIGINT[]``, ``STRUCT(...)``, ``MAP(...)``, ``UNION(...)``)
    and ``INTERVAL`` contain the keyword "int" but are not integers, so they
    are rejected explicitly.
    """
    t = duck_type.strip().lower()
    if t.endswith("]") or t.startswith(("struct", "map", "union", "interval")):
        return False
    # "int" covers tinyint / smallint / integer / bigint and their variants.
    return "int" in t

def is_string_type(duck_type: str) -> bool:
    """Return True when *duck_type* names a scalar text DuckDB type.

    Nested types that merely contain text (``VARCHAR[]``,
    ``STRUCT(a VARCHAR)``, ``MAP(VARCHAR, ...)``, ``UNION(...)``) are rejected,
    because string functions such as ``lower``/``trim`` do not apply to them.
    """
    t = duck_type.strip().lower()
    if t.endswith("]") or t.startswith(("struct", "map", "union")):
        return False
    # "char" also covers varchar.
    return any(token in t for token in ("char", "text", "string"))

# Connection used for every query in this script; closed at the very end.
con = duckdb.connect()

# 1) Register parquet as a view
con.execute(f"CREATE OR REPLACE VIEW imdb_raw AS SELECT * FROM read_parquet('{INPUT_URL}');")

# 2) Get actual column names and types
# Only the first two fields of each DESCRIBE row (name, declared type) are kept.
cols_info = con.execute("DESCRIBE imdb_raw").fetchall()  # returns list of (name, type, null?)
all_cols = [(r[0], r[1]) for r in cols_info]

# 3) compute medians for imputable numeric columns using TRY_CAST
# One query shape serves every declared type: TRY_CAST yields NULL for anything that
# is not a number, so filtering on it removes NULLs and unparsable text alike.
# A numeric column must not be compared with a string literal such as '\N': DuckDB
# tries to cast the literal to the column type, fails, and the resulting error would
# turn the "median" into 0 without any visible sign.
medians = {}
for col, coltype in all_cols:
    if col not in NUMERIC_IMPUTE_COLS:
        continue
    med = None
    try:
        res = con.execute(
            f"SELECT percentile_cont(0.5) WITHIN GROUP (ORDER BY TRY_CAST({col} AS DOUBLE)) "
            f"FROM imdb_raw WHERE TRY_CAST({col} AS DOUBLE) IS NOT NULL"
        ).fetchone()
        if res:
            med = res[0]
    except Exception as exc:
        # Best effort: keep going with 0, but report the failure.
        print(f"Warning: could not compute median for '{col}' ({exc}); imputing with 0.")
    # No usable values (or a non-finite result) -> fall back to 0.
    if med is None or (isinstance(med, float) and (math.isnan(med) or math.isinf(med))):
        med = 0
    medians[col] = med

# 4) compute numVotes quantile bounds for outlier filtering using TRY_CAST
# Bounds are the 1st and 99.9th percentiles, widened outwards to whole numbers.
# The defaults are deliberately wide so that they filter nothing if the query fails.
low_q, high_q = 0, 10**12
if any(c == 'numVotes' for c, _ in all_cols):
    try:
        low, high = con.execute(
            "SELECT percentile_cont(0.01) WITHIN GROUP (ORDER BY TRY_CAST(numVotes AS DOUBLE)), percentile_cont(0.999) WITHIN GROUP (ORDER BY TRY_CAST(numVotes AS DOUBLE)) FROM imdb_raw WHERE TRY_CAST(numVotes AS DOUBLE) IS NOT NULL"
        ).fetchone()
        if low is not None:
            low_q = int(max(0, math.floor(low)))
        if high is not None:
            high_q = int(math.ceil(high))
    except Exception as exc:
        # Best effort: continue with the current bounds, but report the failure.
        print(f"Warning: could not compute numVotes quantile bounds ({exc}); using {low_q}..{high_q}.")

# 5) Build safe SELECT expressions per column (explicit)
#
# The SQL fragments below are raw strings on purpose. DuckDB does not treat the
# backslash as an escape character inside ordinary '...' literals, so the SQL text
# must contain exactly ONE backslash for regex classes such as \s and for IMDb's \N
# null marker. Doubling it would make the regex look for a literal backslash.
NULL_MARKER_SQL = r"'\N'"                # IMDb's textual "missing" token
COMMA_SPACING_SQL = r"'\s*,\s*'"         # a comma with optional whitespace around it
EDGE_COMMAS_SQL = "'^,+|,+$'"            # leading / trailing commas
JSON_PUNCTUATION_SQL = r"""'[\[\]"]'"""  # square brackets and double quotes


def _text_expr(col, coltype):
    """Return SQL giving *col* as VARCHAR, or NULL when missing / unusable.

    Text columns only lose the \\N marker. Other columns are rendered through
    TRY_CAST(... AS DOUBLE) and are never compared with the marker, because
    DuckDB would try (and fail) to cast the marker to the column type.
    """
    if is_string_type(coltype):
        return f"NULLIF({col}, {NULL_MARKER_SQL})"
    return f"CAST(TRY_CAST({col} AS DOUBLE) AS VARCHAR)"


def _tidy_commas(expr):
    """Return SQL that removes whitespace around commas and strips edge commas."""
    spaced = f"regexp_replace({expr}, {COMMA_SPACING_SQL}, ',', 'g')"
    return f"regexp_replace({spaced}, {EDGE_COMMAS_SQL}, '', 'g')"


select_parts = []

for col, coltype in all_cols:
    if col in DROP_COLS:
        continue

    # Explicit per-group transforms take priority over the type-based defaults.
    if col in NUMERIC_IMPUTE_COLS:
        # Numeric DOUBLE output; NULL / non-numeric values fall back to the median.
        med = medians.get(col, 0)
        select_parts.append(f"COALESCE(TRY_CAST({col} AS DOUBLE), {med}) AS {col}")

    elif col in YEAR_COLS:
        # Integer year, or NULL when the value cannot be cast.
        select_parts.append(f"TRY_CAST({col} AS BIGINT) AS {col}")

    elif col in TRANSFORM_STRING_COLS:
        # lower(trim(...)); NULL or \N -> 'unknown'.
        text = _text_expr(col, coltype)
        select_parts.append(f"COALESCE(lower(trim({text})), '{CATEGORICAL_FILL}') AS {col}")

    elif col in LIST_COLS:
        # Normalise commas and lowercase; NULL or \N -> 'unknown'.
        text = _text_expr(col, coltype)
        select_parts.append(f"COALESCE(lower({_tidy_commas(text)}), '{CATEGORICAL_FILL}') AS {col}")

    elif col == CHAR_COL:
        # Strip the JSON brackets/quotes first, then tidy commas; NULL or \N -> ''.
        text = _text_expr(col, coltype)
        unwrapped = f"regexp_replace({text}, {JSON_PUNCTUATION_SQL}, '', 'g')"
        select_parts.append(f"COALESCE(lower({_tidy_commas(unwrapped)}), '') AS {col}")

    # Default handling based on declared type:
    elif is_numeric_type(coltype):
        # Ensure numeric output: TRY_CAST to DOUBLE (NULL if not castable).
        select_parts.append(f"TRY_CAST({col} AS DOUBLE) AS {col}")

    elif is_string_type(coltype):
        # Replace the literal \N marker with NULL; keep the string otherwise as-is.
        select_parts.append(f"NULLIF({col}, {NULL_MARKER_SQL}) AS {col}")

    else:
        # Other types (BOOLEAN, DATE, nested, ...) cannot hold the text marker, and
        # mixing them with DOUBLE in one CASE has no common type: pass through unchanged.
        select_parts.append(f"{col} AS {col}")

# 6) Assemble final SQL
final_select_sql = ",\n    ".join(select_parts)
# Without a numVotes column no row filter is applied ("1=1" keeps every row).
where_clause = "1=1"
if any(c == 'numVotes' for c, _ in all_cols):
    # use TRY_CAST in where to avoid conversion errors
    # The filter reads the raw column: BETWEEN is not true for NULL, so rows whose
    # numVotes is missing or not castable are dropped here rather than imputed.
    where_clause = f"TRY_CAST(numVotes AS DOUBLE) BETWEEN {low_q} AND {high_q}"

create_view_sql = f"""
CREATE OR REPLACE VIEW imdb_cleaned_final AS
SELECT
    {final_select_sql}
FROM imdb_raw
WHERE {where_clause}
;
"""

# Optional: print a snippet for debugging
# print(create_view_sql[:2000])

# 7) Execute view creation & export
con.execute(create_view_sql)

print(f"Writing cleaned parquet to: {OUTPUT}")
# COPY evaluates the view and writes the result to OUTPUT as Snappy-compressed parquet
con.execute(f"COPY (SELECT * FROM imdb_cleaned_final) TO '{OUTPUT}' (FORMAT PARQUET, COMPRESSION 'SNAPPY');")
print("Done. Output:", OUTPUT)

con.close()


In [None]:
import duckdb
import pandas as pd
import numpy as np
import json
import time
import os
from scipy import stats
from sklearn.base import BaseEstimator, TransformerMixin

# ==================================
# 0. CONFIGURATION
# ==================================

# ⭐️ ASSUMPTION: This is the file you created with the *filtered merge* script
INPUT_FILE = "imdb_merged_duckdb_FILTERED.parquet" 

# This is the new, clean file we will create
OUTPUT_FILE = "imdb_cleaned_for_colleagues.parquet"

# ==================================
# PART 1: DUCKDB CLEANING PIPELINE
# ==================================
# Builds ONE query (median CTE + per-column expressions) and lets DuckDB write the
# result directly to OUTPUT_FILE.

print(f"🦆 Starting DuckDB cleaning pipeline on '{INPUT_FILE}'...")
start_time = time.time()

# Connect to an in-memory database
con = duckdb.connect(database=':memory:')

# --- Register the raw Parquet file as a virtual table ---
# NOTE(review): exit() ends the interpreter; inside a notebook this presumably stops
# or restarts the kernel — confirm that is the intended way to abort the cell.
try:
    con.execute(f"CREATE VIEW raw_data AS SELECT * FROM read_parquet('{INPUT_FILE}');")
except Exception as e:
    print(f"\n❌ ERROR: Could not read '{INPUT_FILE}'.")
    print("   Did you run the filtered merge script first? Is the file name correct?")
    print(f"   Details: {e}")
    exit()

print("1. Building DuckDB cleaning query...")

# --- Define Column Groups (for SQL) ---
# These lists help build the query
# Note: DROP_COLS are handled by not SELECT-ing them
# Every column named here (and the key IDs below) is referenced unconditionally, so
# each one must exist in raw_data.

CLEAN_STRING_COLS = ['primaryTitle', 'originalTitle', 'title_akas', 'primaryName']
NUMERIC_IMPUTE_COLS = ['startYear', 'endYear', 'runtimeMinutes', 'averageRating', 'numVotes', 'seasonNumber', 'episodeNumber', 'birthYear', 'deathYear']
CATEGORICAL_IMPUTE_COLS = ['titleType', 'isAdult', 'region', 'language', 'types', 'category']
LIST_STRING_COLS = ['genres', 'directors', 'writers', 'primaryProfession', 'knownForTitles']
JSON_STRING_COLS = ['characters']

# --- Build the Main Cleaning Query ---

# 1. Create the CTE for medians (for numeric imputation)
# A single-row CTE holding one med_<col> value per numeric column.
median_calculations = []
for col in NUMERIC_IMPUTE_COLS:
    median_calculations.append(f"median({col}) AS med_{col}")
median_cte = f"WITH NumericMedians AS (SELECT {', '.join(median_calculations)} FROM raw_data)"

# 2. Create the main SELECT statements
select_statements = [
    # --- Keep Key IDs ---
    "tconst",
    "nconst",
    "parentTconst", # From episode table
    "isOriginalTitle", # From akas table
]

# --- String Cleaning (LOWER, TRIM) ---
# NULLs stay NULL here: there is no COALESCE for these columns.
for col in CLEAN_STRING_COLS:
    select_statements.append(f"LOWER(TRIM({col})) AS {col}")

# --- List-String Cleaning (LOWER, TRIM, COALESCE) ---
# Only the outer whitespace of the whole string is trimmed; whitespace around the
# individual comma-separated items is left untouched. NULL becomes ''.
for col in LIST_STRING_COLS:
    select_statements.append(f"LOWER(TRIM(COALESCE({col}, ''))) AS {col}")

# --- JSON-String Parsing (REGEXP_REPLACE) ---
for col in JSON_STRING_COLS:
    # Removes every double quote and square bracket, then trims and lowers
    select_statements.append(f"LOWER(TRIM(regexp_replace(COALESCE({col}, ''), '[\"\\[\\]]', '', 'g'))) AS {col}")

# --- Categorical Imputation (COALESCE 'unknown') ---
# NOTE(review): TRIM/LOWER are applied to every column in this group, including
# 'isAdult'; if that column is stored as a number this may need an explicit cast.
for col in CATEGORICAL_IMPUTE_COLS:
    select_statements.append(f"COALESCE(LOWER(TRIM({col})), 'unknown') AS {col}")
    
# --- Numeric Imputation (COALESCE with median) ---
for col in NUMERIC_IMPUTE_COLS:
    select_statements.append(f"COALESCE({col}, (SELECT med_{col} FROM NumericMedians)) AS {col}")

# --- Combine all parts into the final query ---
final_cleaning_query = f"""
{median_cte}
SELECT
    {', '.join(select_statements)}
FROM raw_data
"""

# 3. Execute the query and save the result
print(f"2. Running query and saving to '{OUTPUT_FILE}'...")
try:
    con.execute(f"""
    COPY (
        {final_cleaning_query}
    ) 
    TO '{OUTPUT_FILE}' (FORMAT PARQUET, COMPRESSION 'SNAPPY');
    """)
except Exception as e:
    print(f"\n❌ ERROR: The DuckDB cleaning query failed.")
    print(f"   Details: {e}")
    con.close()
    exit()

con.close()
elapsed = (time.time() - start_time)
print(f"✅ DuckDB cleaning complete. File saved. (Elapsed: {elapsed:.2f}s)")


# ==================================
# PART 2: PANDAS OUTLIER REMOVAL
# ==================================

print("\n" + "="*40)
print(" PANDAS OUTLIER REMOVAL STAGE")
print("="*40)

# --- Load the NEW, CLEAN file we just saved ---
# Unlike Part 1, this reads the whole cleaned file into memory as a DataFrame.
print(f"Loading '{OUTPUT_FILE}' into pandas...")
try:
    df_cleaned = pd.read_parquet(OUTPUT_FILE)
except FileNotFoundError:
    print(f"❌ ERROR: Cannot find '{OUTPUT_FILE}'. The cleaning script must have failed.")
    exit()

# memory_usage='deep' also measures the contents of object (string) columns.
print("Cleaned Data Info:")
df_cleaned.info(memory_usage='deep')

# --- Define the Outlier Remover Class (Unchanged) ---
class CustomOutlierRemover(BaseEstimator, TransformerMixin):
    """Removes rows whose absolute Z-score reaches a threshold in any chosen column.

    The Z-scores are computed from the data passed to ``transform`` itself;
    ``fit`` learns nothing. Only columns that exist in the input and have a
    numeric dtype are checked.

    Parameters
    ----------
    columns : list of str
        Candidate columns to check.
    threshold : float, default 3
        A row is kept only if ``|z| < threshold`` in every checked column.

    Attributes
    ----------
    outliers : DataFrame or None
        Rows removed by the most recent ``transform`` call (empty when
        nothing was checked); ``None`` before the first call.
    """

    def __init__(self, columns, threshold=3):
        self.threshold = threshold
        self.columns = columns
        self._outliers = None

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        """Return the rows of ``X`` that are not flagged as outliers."""
        X_transformed = X.copy()
        valid_cols = [col for col in self.columns if col in X.columns and pd.api.types.is_numeric_dtype(X[col])]
        if not valid_cols:
            print("Warning: No valid numeric columns found for outlier removal.")
            # Nothing was checked, so nothing was removed. Expose an empty
            # frame so callers can still use len(remover.outliers).
            self._outliers = X_transformed.iloc[0:0]
            return X_transformed
        # nan_policy='omit' keeps NaNs from affecting the mean/std used for scoring.
        z_scores = np.abs(stats.zscore(X_transformed[valid_cols], nan_policy='omit'))
        # A NaN score is replaced by 0 so that it is never counted as an outlier.
        z_scores_filled = np.nan_to_num(z_scores, nan=0)
        mask = (z_scores_filled < self.threshold).all(axis=1)
        self._outliers = X_transformed[~mask]
        return X_transformed[mask]

    @property
    def outliers(self):
        return self._outliers

# --- Define Outlier Columns (Unchanged) ---
# Numeric columns whose Z-scores decide whether a row is dropped.
OUTLIER_COLS = [
    'runtimeMinutes',
    'numVotes',
    'averageRating',
    'startYear'
]

# --- Run the Outlier Remover ---
print("\nApplying outlier removal...")
outlier_remover = CustomOutlierRemover(columns=OUTLIER_COLS, threshold=3)
df_no_outliers = outlier_remover.fit_transform(df_cleaned)

# df_no_outliers exists only in memory; the file written earlier still contains the
# rows that were removed here.
print(f"Shape before outlier removal: {df_cleaned.shape}")
print(f"Shape after outlier removal:  {df_no_outliers.shape}")
print(f"Removed {len(outlier_remover.outliers)} outlier rows.")

print("\n✅ Full pipeline complete! File saved for colleagues.")

# Filtered DataSet

In [None]:
import duckdb
# Full merged dataset (large).
url = "https://workspace4824871889.blob.core.windows.net/azureml-blobstore-84f516da-0fe5-4f33-8f3c-f18ec8e2b4f7/UI/2025-10-22_105430_UTC/imdb_merged_duckdb.parquet"

print("Loading data into pandas DataFrame using DuckDB...")

con = duckdb.connect()
try:
    # .df() materialises the complete query result as a pandas DataFrame.
    df = con.execute(f"SELECT * FROM read_parquet('{url}')").df()
finally:
    # Release the connection even when the download or the query fails.
    con.close()

print("✅ Data loaded successfully.")
print(df.head())
print(f"DataFrame shape: {df.shape}")

# We did it
I ran this step locally, from the terminal.

In [None]:
# clean_imdb_duckdb_final_safe.py
import duckdb
import math

# CONFIG
# Source parquet and output file for the type-aware cleaning script.
INPUT_URL = "https://workspace4824871889.blob.core.windows.net/azureml-blobstore-84f516da-0fe5-4f33-8f3c-f18ec8e2b4f7/UI/2025-10-22_105430_UTC/imdb_merged_duckdb.parquet"  # local path or http(s) URL, if accessible
OUTPUT = "imdb_merged_cleaned_duckdb.parquet"
# Columns left out of the output (a set: only membership is tested).
DROP_COLS = {"ordering_x", "ordering_y", "attributes", "job", "ordering_akas", "ordering_principal"}
# Placeholder written into string / list columns when the value is missing.
CATEGORICAL_FILL = "unknown"
# Numeric columns whose missing values are replaced by the column median.
NUMERIC_IMPUTE_COLS = ['averageRating', 'numVotes', 'seasonNumber', 'episodeNumber']
# Plain text columns: lower(trim(...)), missing -> CATEGORICAL_FILL.
TRANSFORM_STRING_COLS = ['primaryTitle', 'primaryName', 'title', 'region', 'language']
# Comma-separated list columns.
LIST_COLS = ['genres', 'directors', 'writers']
# JSON-like list column.
CHAR_COL = 'characters'
# Year columns, cast to BIGINT (NULL when not castable).
YEAR_COLS = ['startYear', 'birthYear', 'deathYear']

# Helper: classifies DuckDB types simply
def is_numeric_type(duck_type: str) -> bool:
    """Return True when *duck_type* names a scalar numeric DuckDB type.

    Matching is by keyword, so parameterised names such as ``DECIMAL(18,3)``
    and unsigned/huge variants (``UBIGINT``, ``HUGEINT``) are recognised.
    Nested types (``INTEGER[]``, ``STRUCT(...)``, ``MAP(...)``, ``UNION(...)``)
    and ``INTERVAL`` contain numeric keywords as substrings but are not
    numbers, so they are rejected explicitly.
    """
    t = duck_type.strip().lower()
    if t.endswith("]") or t.startswith(("struct", "map", "union", "interval")):
        return False
    # "int" also covers tinyint / smallint / integer / bigint.
    return any(token in t for token in ("int", "decimal", "numeric", "float", "double"))

def is_int_type(duck_type: str) -> bool:
    """Return True when *duck_type* names a scalar integer DuckDB type.

    Nested types (``BIGINT[]``, ``STRUCT(...)``, ``MAP(...)``, ``UNION(...)``)
    and ``INTERVAL`` contain the keyword "int" but are not integers, so they
    are rejected explicitly.
    """
    t = duck_type.strip().lower()
    if t.endswith("]") or t.startswith(("struct", "map", "union", "interval")):
        return False
    # "int" covers tinyint / smallint / integer / bigint and their variants.
    return "int" in t

def is_string_type(duck_type: str) -> bool:
    """Return True when *duck_type* names a scalar text DuckDB type.

    Nested types that merely contain text (``VARCHAR[]``,
    ``STRUCT(a VARCHAR)``, ``MAP(VARCHAR, ...)``, ``UNION(...)``) are rejected,
    because string functions such as ``lower``/``trim`` do not apply to them.
    """
    t = duck_type.strip().lower()
    if t.endswith("]") or t.startswith(("struct", "map", "union")):
        return False
    # "char" also covers varchar.
    return any(token in t for token in ("char", "text", "string"))

# Connection used for every query in this script; closed at the very end.
con = duckdb.connect()

# 1) Register parquet as a view
con.execute(f"CREATE OR REPLACE VIEW imdb_raw AS SELECT * FROM read_parquet('{INPUT_URL}');")

# 2) Get actual column names and types
# Only the first two fields of each DESCRIBE row (name, declared type) are kept.
cols_info = con.execute("DESCRIBE imdb_raw").fetchall()  # returns list of (name, type, null?)
all_cols = [(r[0], r[1]) for r in cols_info]

# 3) compute medians for imputable numeric columns using TRY_CAST
# One query shape serves every declared type: TRY_CAST yields NULL for anything that
# is not a number, so filtering on it removes NULLs and unparsable text alike.
# A numeric column must not be compared with a string literal such as '\N': DuckDB
# tries to cast the literal to the column type, fails, and the resulting error would
# turn the "median" into 0 without any visible sign.
medians = {}
for col, coltype in all_cols:
    if col not in NUMERIC_IMPUTE_COLS:
        continue
    med = None
    try:
        res = con.execute(
            f"SELECT percentile_cont(0.5) WITHIN GROUP (ORDER BY TRY_CAST({col} AS DOUBLE)) "
            f"FROM imdb_raw WHERE TRY_CAST({col} AS DOUBLE) IS NOT NULL"
        ).fetchone()
        if res:
            med = res[0]
    except Exception as exc:
        # Best effort: keep going with 0, but report the failure.
        print(f"Warning: could not compute median for '{col}' ({exc}); imputing with 0.")
    # No usable values (or a non-finite result) -> fall back to 0.
    if med is None or (isinstance(med, float) and (math.isnan(med) or math.isinf(med))):
        med = 0
    medians[col] = med

# 4) compute numVotes quantile bounds for outlier filtering using TRY_CAST
# Bounds are the 1st and 99.9th percentiles, widened outwards to whole numbers.
# The defaults are deliberately wide so that they filter nothing if the query fails.
low_q, high_q = 0, 10**12
if any(c == 'numVotes' for c, _ in all_cols):
    try:
        low, high = con.execute(
            "SELECT percentile_cont(0.01) WITHIN GROUP (ORDER BY TRY_CAST(numVotes AS DOUBLE)), percentile_cont(0.999) WITHIN GROUP (ORDER BY TRY_CAST(numVotes AS DOUBLE)) FROM imdb_raw WHERE TRY_CAST(numVotes AS DOUBLE) IS NOT NULL"
        ).fetchone()
        if low is not None:
            low_q = int(max(0, math.floor(low)))
        if high is not None:
            high_q = int(math.ceil(high))
    except Exception as exc:
        # Best effort: continue with the current bounds, but report the failure.
        print(f"Warning: could not compute numVotes quantile bounds ({exc}); using {low_q}..{high_q}.")

# 5) Build safe SELECT expressions per column (explicit)
#
# The SQL fragments below are raw strings on purpose. DuckDB does not treat the
# backslash as an escape character inside ordinary '...' literals, so the SQL text
# must contain exactly ONE backslash for regex classes such as \s and for IMDb's \N
# null marker. Doubling it would make the regex look for a literal backslash.
NULL_MARKER_SQL = r"'\N'"                # IMDb's textual "missing" token
COMMA_SPACING_SQL = r"'\s*,\s*'"         # a comma with optional whitespace around it
EDGE_COMMAS_SQL = "'^,+|,+$'"            # leading / trailing commas
JSON_PUNCTUATION_SQL = r"""'[\[\]"]'"""  # square brackets and double quotes


def _text_expr(col, coltype):
    """Return SQL giving *col* as VARCHAR, or NULL when missing / unusable.

    Text columns only lose the \\N marker. Other columns are rendered through
    TRY_CAST(... AS DOUBLE) and are never compared with the marker, because
    DuckDB would try (and fail) to cast the marker to the column type.
    """
    if is_string_type(coltype):
        return f"NULLIF({col}, {NULL_MARKER_SQL})"
    return f"CAST(TRY_CAST({col} AS DOUBLE) AS VARCHAR)"


def _tidy_commas(expr):
    """Return SQL that removes whitespace around commas and strips edge commas."""
    spaced = f"regexp_replace({expr}, {COMMA_SPACING_SQL}, ',', 'g')"
    return f"regexp_replace({spaced}, {EDGE_COMMAS_SQL}, '', 'g')"


select_parts = []

for col, coltype in all_cols:
    if col in DROP_COLS:
        continue

    # Explicit per-group transforms take priority over the type-based defaults.
    if col in NUMERIC_IMPUTE_COLS:
        # Numeric DOUBLE output; NULL / non-numeric values fall back to the median.
        med = medians.get(col, 0)
        select_parts.append(f"COALESCE(TRY_CAST({col} AS DOUBLE), {med}) AS {col}")

    elif col in YEAR_COLS:
        # Integer year, or NULL when the value cannot be cast.
        select_parts.append(f"TRY_CAST({col} AS BIGINT) AS {col}")

    elif col in TRANSFORM_STRING_COLS:
        # lower(trim(...)); NULL or \N -> 'unknown'.
        text = _text_expr(col, coltype)
        select_parts.append(f"COALESCE(lower(trim({text})), '{CATEGORICAL_FILL}') AS {col}")

    elif col in LIST_COLS:
        # Normalise commas and lowercase; NULL or \N -> 'unknown'.
        text = _text_expr(col, coltype)
        select_parts.append(f"COALESCE(lower({_tidy_commas(text)}), '{CATEGORICAL_FILL}') AS {col}")

    elif col == CHAR_COL:
        # Strip the JSON brackets/quotes first, then tidy commas; NULL or \N -> ''.
        text = _text_expr(col, coltype)
        unwrapped = f"regexp_replace({text}, {JSON_PUNCTUATION_SQL}, '', 'g')"
        select_parts.append(f"COALESCE(lower({_tidy_commas(unwrapped)}), '') AS {col}")

    # Default handling based on declared type:
    elif is_numeric_type(coltype):
        # Ensure numeric output: TRY_CAST to DOUBLE (NULL if not castable).
        select_parts.append(f"TRY_CAST({col} AS DOUBLE) AS {col}")

    elif is_string_type(coltype):
        # Replace the literal \N marker with NULL; keep the string otherwise as-is.
        select_parts.append(f"NULLIF({col}, {NULL_MARKER_SQL}) AS {col}")

    else:
        # Other types (BOOLEAN, DATE, nested, ...) cannot hold the text marker, and
        # mixing them with DOUBLE in one CASE has no common type: pass through unchanged.
        select_parts.append(f"{col} AS {col}")

# 6) Assemble final SQL
final_select_sql = ",\n    ".join(select_parts)
# Without a numVotes column no row filter is applied ("1=1" keeps every row).
where_clause = "1=1"
if any(c == 'numVotes' for c, _ in all_cols):
    # use TRY_CAST in where to avoid conversion errors
    # The filter reads the raw column: BETWEEN is not true for NULL, so rows whose
    # numVotes is missing or not castable are dropped here rather than imputed.
    where_clause = f"TRY_CAST(numVotes AS DOUBLE) BETWEEN {low_q} AND {high_q}"

create_view_sql = f"""
CREATE OR REPLACE VIEW imdb_cleaned_final AS
SELECT
    {final_select_sql}
FROM imdb_raw
WHERE {where_clause}
;
"""

# Optional: print a snippet for debugging
# print(create_view_sql[:2000])

# 7) Execute view creation & export
con.execute(create_view_sql)

print(f"Writing cleaned parquet to: {OUTPUT}")
# COPY evaluates the view and writes the result to OUTPUT as Snappy-compressed parquet
con.execute(f"COPY (SELECT * FROM imdb_cleaned_final) TO '{OUTPUT}' (FORMAT PARQUET, COMPRESSION 'SNAPPY');")
print("Done. Output:", OUTPUT)

con.close()


In [None]:
import duckdb

# Location of the cleaned parquet export in blob storage.
url = "https://workspace4824871889.blob.core.windows.net/azureml-blobstore-84f516da-0fe5-4f33-8f3c-f18ec8e2b4f7/UI/2025-10-26_112727_UTC/imdb_merged_cleaned_duckdb.parquet"

# In-memory DuckDB database: nothing is persisted to disk by this connection.
con = duckdb.connect(':memory:')

# Step 1: httpfs is the extension used here so DuckDB can read from URLs.
print("Loading httpfs extension...")
for extension_stmt in ("INSTALL httpfs;", "LOAD httpfs;"):
    con.execute(extension_stmt)

# Step 2: expose the remote parquet file under the name 'imdb_cleaned'.
# A view only stores the query, so rows are fetched when it is queried.
print(f"Registering URL as 'imdb_cleaned' view...")
register_view_sql = f"CREATE OR REPLACE VIEW imdb_cleaned AS SELECT * FROM '{url}'"
con.execute(register_view_sql)

print("\n✅ Done! The view 'imdb_cleaned' is ready to be queried.")

# ----------------------------------------------------------
#  Example queries against the view
# ----------------------------------------------------------

# Example 1: pull a 10-row sample to inspect the columns.
print("\n--- Example 1: Grabbing 10 rows ---")
sample_relation = con.sql("SELECT * FROM imdb_cleaned LIMIT 10")
df_sample = sample_relation.df()
print(df_sample)


# Example 2: row count per titleType.
# The aggregation runs inside DuckDB; only the grouped result becomes a DataFrame.
print("\n--- Example 2: Running a safe aggregation ---")
query = """
SELECT 
    titleType, 
    COUNT(*) as total_rows
FROM imdb_cleaned 
GROUP BY titleType
"""
aggregate_relation = con.sql(query)
df_agg = aggregate_relation.df()

print(df_agg)

# 'con' is intentionally left open so further queries can be run on 'imdb_cleaned'.
# con.close() # Close it when you're all done

# Checking what our cleaned data looks like on a sample of the cleaned data

In [1]:
import pandas as pd

# Read the cleaned parquet export into a pandas DataFrame for a quick look.
url = "https://workspace4824871889.blob.core.windows.net/azureml-blobstore-84f516da-0fe5-4f33-8f3c-f18ec8e2b4f7/UI/2025-10-26_114928_UTC/imdb_merged_cleaned_duckdb.parquet"
df = pd.read_parquet(url, engine="pyarrow")

print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() emitted the report first and then a stray "None".
# Print the separating blank line explicitly and call info() on its own.
print()
df.info()


       tconst  titleType          primaryTitle       originalTitle  isAdult  \
0   tt0207871   tvSeries             buccaneer           Buccaneer      0.0   
1   tt0118694      movie  in the mood for love    Fa yeung nin wah      0.0   
2  tt13586826  tvEpisode    allumer le camping  Allumer le Camping      0.0   
3   tt2226407      movie         the landlords     Padroni di casa      0.0   
4  tt28636869  tvEpisode          episode #2.4        Episode #2.4      0.0   

   startYear endYear  runtimeMinutes                  genres  averageRating  \
0     1980.0    None            50.0         adventure,drama            6.8   
1     2000.0    None            98.0           drama,romance            8.0   
2     2020.0    None            45.0                  comedy            5.4   
3     2012.0    None            90.0                   drama            6.2   
4     2023.0    None             NaN  action,adventure,drama            8.0   

   ...  ordering_1     nconst  category           