# DATA LOADING + DATA CLEANING 

In [None]:
import duckdb
import time
from tqdm import tqdm

# =======================
#  Azure URLs
# =======================
URL_NAME_BASICS    = "https://workspace4824871889.blob.core.windows.net/azureml-blobstore-84f516da-0fe5-4f33-8f3c-f18ec8e2b4f7/UI/2025-10-22_104122_UTC/name.basics.tsv.gz"
URL_TITLE_AKAS     = "https://workspace4824871889.blob.core.windows.net/azureml-blobstore-84f516da-0fe5-4f33-8f3c-f18ec8e2b4f7/UI/2025-10-22_104546_UTC/title.akas.tsv.gz"
URL_TITLE_BASICS   = "https://workspace4824871889.blob.core.windows.net/azureml-blobstore-84f516da-0fe5-4f33-8f3c-f18ec8e2b4f7/UI/2025-10-22_104810_UTC/title.basics.tsv.gz"
URL_TITLE_CREW     = "https://workspace4824871889.blob.core.windows.net/azureml-blobstore-84f516da-0fe5-4f33-8f3c-f18ec8e2b4f7/UI/2025-10-22_104937_UTC/title.crew.tsv.gz"
URL_TITLE_EPISODE  = "https://workspace4824871889.blob.core.windows.net/azureml-blobstore-84f516da-0fe5-4f33-8f3c-f18ec8e2b4f7/UI/2025-10-22_105103_UTC/title.episode.tsv.gz"
URL_TITLE_PRINC    = "https://workspace4824871889.blob.core.windows.net/azureml-blobstore-84f516da-0fe5-4f33-8f3c-f18ec8e2b4f7/UI/2025-10-22_105225_UTC/title.principals.tsv.gz"
URL_TITLE_RATINGS  = "https://workspace4824871889.blob.core.windows.net/azureml-blobstore-84f516da-0fe5-4f33-8f3c-f18ec8e2b4f7/UI/2025-10-22_105430_UTC/title.ratings.tsv.gz"

# =======================
#  Start DuckDB connection
# =======================
# Connect to an in-memory database or specify a file: database='imdb_merge.duckdb'
con = duckdb.connect(database=':memory:')

start_time = time.time()
print(" Starting IMDb data merge using DuckDB...\n")

# =======================
#  Stage progress tracker
# =======================
stages = [
    "Registering IMDb files",
    "Joining basics + ratings + crew + episode",
    "Joining akas",
    "Joining principals",
    "Joining name.basics",
    "Exporting to Parquet"
]
progress = tqdm(total=len(stages), desc="Progress", ncols=80, bar_format='{l_bar}{bar}| {n_fmt}/{total_fmt}')

# =======================
#  Register TSV.GZ files as virtual tables
# =======================
# DuckDB can read directly from HTTPS URLs and handle compressed files.
# 'auto_detect=True' helps with schema, but we specify key params.
base_read_csv = "SELECT * FROM read_csv('{}', delim='\\t', nullstr='\\\\N', header=True, compression='gzip', auto_detect=True, parallel=True)"

con.execute(f"CREATE OR REPLACE VIEW name_basics AS {base_read_csv.format(URL_NAME_BASICS)};")
con.execute(f"CREATE OR REPLACE VIEW title_basics AS {base_read_csv.format(URL_TITLE_BASICS)};")
con.execute(f"CREATE OR REPLACE VIEW title_ratings AS {base_read_csv.format(URL_TITLE_RATINGS)};")
con.execute(f"CREATE OR REPLACE VIEW title_crew AS {base_read_csv.format(URL_TITLE_CREW)};")
con.execute(f"CREATE OR REPLACE VIEW title_episode AS {base_read_csv.format(URL_TITLE_EPISODE)};")
con.execute(f"CREATE OR REPLACE VIEW title_akas AS {base_read_csv.format(URL_TITLE_AKAS)};")
con.execute(f"CREATE OR REPLACE VIEW title_principals AS {base_read_csv.format(URL_TITLE_PRINC)};")

progress.update(1)
progress.set_description(stages[1])

# =======================
#  Perform joins step by step
# =======================
# This step-by-step materialization helps manage memory
con.execute("""
CREATE OR REPLACE TABLE merged_core AS
SELECT *
FROM title_basics b
LEFT JOIN title_ratings r USING (tconst)
LEFT JOIN title_crew c USING (tconst)
LEFT JOIN title_episode e USING (tconst);
""")
progress.update(1)
progress.set_description(stages[2])

con.execute("""
CREATE OR REPLACE TABLE merged_with_akas AS
SELECT *
FROM merged_core mc
LEFT JOIN title_akas a ON mc.tconst = a.titleId;
""")
progress.update(1)
progress.set_description(stages[3])

con.execute("""
CREATE OR REPLACE TABLE merged_with_principals AS
SELECT *
FROM merged_with_akas ma
LEFT JOIN title_principals p ON ma.tconst = p.tconst;
""")
progress.update(1)
progress.set_description(stages[4])

con.execute("""
CREATE OR REPLACE TABLE imdb_final AS
SELECT *
FROM merged_with_principals mp
LEFT JOIN name_basics n ON mp.nconst = n.nconst;
""")
progress.update(1)
progress.set_description(stages[5])

# =======================
#  Export to Parquet
# =======================
con.execute("""
COPY (SELECT * FROM imdb_final) 
TO 'imdb_merged_duckdb.parquet' (FORMAT PARQUET, COMPRESSION 'SNAPPY', ROW_GROUP_SIZE 100000);
""")
progress.update(1)
progress.close()

# =======================
#  Clean up
# =======================
con.close()
elapsed = (time.time() - start_time)
print(f"\n✅ Done! Merged dataset saved as imdb_merged_duckdb.parquet (Elapsed: {elapsed:.2f} seconds)")
print("   Reload it fast with: pd.read_parquet('imdb_merged_duckdb.parquet')")

# Brief Summary of Our Whole Process of Merging IMDb Datasets

## 1. GOAL
Merge all datasets into one master table, so that:
- Each row contains all information for a title (or exploded rows for multiple actors or alternate titles).
- Later, We can clean and analyze the data.

## 2. CHALLENGES
- Some files (`title.akas` and `title.principals`) are very large (5–10 GB).
- Merging everything at once may use too much RAM and crash the computer.
- Many rows in `akas` and `principals` may not be relevant to the titles in the main dataset.

## 3. APPROACH (CHUNKED MERGE)
1. Load small datasets into memory first: `title.basics`, `title.ratings`, `title.crew`, `title.episode`, `name.basics`.
2. Merge small datasets on `tconst` → master table with one row per title.
3. Build a set of relevant titles (`tconst_set`) for fast filtering.
4. Process big datasets (`title.akas` and `title.principals`) in chunks (~300k rows), filter by `tconst_set`, then merge.
5. Merge `name.basics` to attach actor/director info using `nconst`.
6. Save final dataset (Parquet recommended, CSV optional).

## 4. HOW MERGING WORKS
- `df_left.merge(df_right, left_on="colA", right_on="colB", how="left")`:
  - Left table = main/master table.
  - Right table = extra info (ratings, actors, alternate titles).
  - `how="left"` keeps all rows from the left table; missing matches → NaN.

## 5. ADVANTAGES OF CHUNKED MERGE
- Memory-efficient – never load huge tables fully.
- Fast filtering – only process rows relevant to `tconst_set`.
- Safe – reduces risk of crashing.
- Complete – preserves all relevant info, including multiple actors and alternate titles.

## 6. NOTES AFTER MERGE
- Multiple rows per title are normal:
  - One row per actor (`title.principals`).
  - One row per alternate title (`title.akas`).
- Later during data cleaning, you can:
  - Deduplicate rows.
  - Aggregate actors or alternate titles into lists.
  - Normalize genres or other fields.

# First steps on Data Exploration

In [None]:
import pandas as pd
import numpy as np
import json
from scipy import stats
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

In [None]:
# Load the dataset you created in the first script
try:
    df_raw = pd.read_parquet('imdb_merged_chunked.parquet')
except FileNotFoundError:
    print("Error: 'imdb_merged_chunked.parquet' not found.")
    print("Please run your 'imdb_chunked_merge_azure.py' script first.")
    # As a fallback, create a dummy dataframe for the script to run
    df_raw = pd.DataFrame({
        'tconst': ['tt000001'], 'primaryTitle': ['Movie Title'], 'startYear': [2000],
        'averageRating': [8.0], 'numVotes': [100], 'genres': ['Action,Drama'],
        'directors': ['nm000001'], 'writers': ['nm000002'], 'parentTconst': [np.nan],
        'seasonNumber': [np.nan], 'episodeNumber': [np.nan], 'titleId': ['tt000001'],
        'ordering_x': [1], 'title': ['movie title'], 'region': ['US'], 'language': ['en'],
        'isOriginalTitle': [0], 'ordering_y': [1], 'nconst': ['nm000003'],
        'category': ['actor'], 'job': [np.nan], 'characters': ['["Lead Role"]'],
        'primaryName': ['Actor Name'], 'birthYear': [1980], 'deathYear': [np.nan]
    })

print("="*30)
print("1. DATA INFO")
print("="*30)
# Use verbose=True to see all columns
df_raw.info(verbose=True, memory_usage='deep')

print("\n" + "="*30)
print("2. MISSING VALUES (Top 20)")
print("="*30)
missing_values = df_raw.isnull().sum()
missing_percent = (missing_values / len(df_raw) * 100).round(2)
missing_df = pd.DataFrame({'count': missing_values, 'percent': missing_percent})
print(missing_df[missing_df['count'] > 0].sort_values(by='count', ascending=False).head(20))

print("\n" + "="*30)
print("3. DUPLICATE ROWS")
print("="*30)
num_dupes = df_raw.duplicated().sum()
print(f"Number of duplicate rows: {num_dupes}")
# Note: Duplicates might be expected if a title has multiple actors/akas

print("\n" + "="*30)
print("4. CATEGORICAL CARDINALITY (Top 20)")
print("="*30)
categorical_cols = df_raw.select_dtypes(include=['object', 'category']).columns
cardinality = {col: df_raw[col].nunique() for col in categorical_cols}
print("Number of unique values in categorical columns:")
# Sort by cardinality (highest first) to find problematic columns
sorted_cardinality = sorted(cardinality.items(), key=lambda x: x[1], reverse=True)
for col, count in sorted_cardinality[:20]:
    print(f"{col}: {count}")

print("\n" + "="*30)
print("5. NUMERIC DISTRIBUTION")
print("="*30)
# This helps spot outliers and skew
numeric_cols = df_raw.select_dtypes(include=np.number).columns
print(df_raw[numeric_cols].describe().T)

# Key transformation
1. Drop Columns (DropColumnTransformer)
- t removes ordering_akas, ordering_principal, attributes, and job
- These are redundant keys or ordering columns from the original files that are not needed after merging on tconst and nconst. They add noise and increase memory usage

2. Standardize Strings (StringCleaner)
- Lowercases and strips whitespace from free-text columns like primaryTitle and primaryName
- Ensures consistency. "The Matrix", " the matrix ", and "the matrix" become identical ("the matrix"), which prevents the model from treating them as different categories.

3. Clean List-Strings (ListStringCleaner)
- Handles comma-separated columns like genres and directors. It turns 'Action, Adventure' into 'action,adventure'
- This is crucial for feature engineering.

4. Parse JSON-Strings (JSONStringParser)
- Specifically targets the characters column, which is often a messy string like '["Walter White", "Heisenberg"]'. It parses this into a clean, comma-separated string: 'walter white,heisenberg'
- This "unpacks" the valuable information (who an actor played) from a format that is otherwise unusable.

5. Impute Missing Values (SimpleImputer)
* Fills in missing data.
- For numeric our strategy is "median", so we fill missing values with median 
- For categorical ones our strategy is "constant", we fill each null value with string "unknown"

!!!CustomOutlierRemoval is included, but separated from the main pipeline.

# Cleaning done with pandas (initial approach)

In [None]:
import pandas as pd
import numpy as np
import json
from scipy import stats
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
import warnings

# Suppress warnings for cleaner output
pd.options.mode.chained_assignment = None
warnings.filterwarnings('ignore', category=FutureWarning)

# ==================================
# 1. DEFINE CUSTOM TRANSFORMERS
# ==================================

class DropColumnTransformer(BaseEstimator, TransformerMixin):
    """Drops specified columns."""
    def __init__(self, columns):
        self.columns = columns
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X.drop(columns=self.columns, axis=1, errors='ignore')

class StringCleaner(BaseEstimator, TransformerMixin):
    """Strips whitespace, lowercases, and converts to string."""
    def __init__(self, columns):
        self.columns = columns
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X_transformed = X.copy()
        for col in self.columns:
            if col in X_transformed.columns:
                X_transformed[col] = X_transformed[col].astype(str).str.strip().str.lower()
        return X_transformed

class ListStringCleaner(BaseEstimator, TransformerMixin):
    """Cleans comma-separated list-strings (e.g., genres)."""
    def __init__(self, columns, separator=','):
        self.columns = columns
        self.separator = separator
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X_transformed = X.copy()
        for col in self.columns:
            if col in X_transformed.columns:
                X_transformed[col] = X_transformed[col].fillna('')
                X_transformed[col] = X_transformed[col].apply(
                    lambda s: self.separator.join(
                        [item.strip().lower() for item in str(s).split(self.separator)]
                    ) if s else ''
                )
        return X_transformed

class JSONStringParser(BaseEstimator, TransformerMixin):
    """Parses columns containing string representations of JSON lists."""
    def __init__(self, columns):
        self.columns = columns
    def fit(self, X, y=None):
        return self
    def _parse(self, item):
        if pd.isna(item):
            return ''  # ⭐️⭐️⭐️ THE FIX IS HERE ⭐️⭐️⭐️ (Was '[]', now '""')
        try:
            parsed_list = json.loads(item)
            if isinstance(parsed_list, list):
                return ','.join([str(i).strip().lower() for i in parsed_list])
            return ''
        except (json.JSONDecodeError, TypeError, SyntaxError):
            # Handle cases where it's not a valid JSON list (e.g., just "ActorName")
            return str(item).strip().lower()
            
    def transform(self, X):
        X_transformed = X.copy()
        for col in self.columns:
            if col in X_transformed.columns:
                X_transformed[col] = X_transformed[col].apply(self._parse)
        return X_transformed

class CustomOutlierRemover(BaseEstimator, TransformerMixin):
    """Removes rows based on Z-score of specified numeric columns."""
    def __init__(self, columns, threshold=3):
        self.threshold = threshold
        self.columns = columns
        self._outliers = None
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X_transformed = X.copy()
        valid_cols = [col for col in self.columns if col in X.columns and pd.api.types.is_numeric_dtype(X[col])]
        if not valid_cols:
            print("Warning: No valid numeric columns found for outlier removal.")
            return X_transformed
        
        # Ensure NaNs are handled before zscore
        z_scores = np.abs(stats.zscore(X_transformed[valid_cols], nan_policy='omit'))
        
        # Fill NaNs in z_scores with 0 (so they aren't considered outliers)
        z_scores_filled = np.nan_to_num(z_scores, nan=0)
        
        mask = (z_scores_filled < self.threshold).all(axis=1)
        self._outliers = X_transformed[~mask]
        return X_transformed[mask]
    @property
    def outliers(self):
        return self._outliers

# ==================================
# 2. DEFINE COLUMN GROUPS
# ==================================

# Columns to remove: Redundant IDs, low-value, or messy
DROP_COLS = [
    'ordering_akas', 
    'ordering_principal',
    'attributes',
    'job'
]

# Simple strings to clean (lowercase, strip)
CLEAN_STRING_COLS = [
    'primaryTitle',
    'originalTitle',
    'title_akas', # Renamed from 'title'
    'primaryName'
]

# Numeric columns to impute (with median)
NUMERIC_IMPUTE_COLS = [
    'startYear',
    'endYear',
    'runtimeMinutes',
    'averageRating',
    'numVotes',
    'seasonNumber',
    'episodeNumber',
    'birthYear',
    'deathYear'
]

# Categorical columns to impute (with 'unknown')
CATEGORICAL_IMPUTE_COLS = [
    'titleType',
    'isAdult',
    'region',
    'language',
    'types',
    'category'
]

# Comma-separated list-strings
LIST_STRING_COLS = [
    'genres',
    'directors',
    'writers',
    'primaryProfession',
    'knownForTitles'
]

# JSON-like list-strings
JSON_STRING_COLS = [
    'characters'
]

# Numeric columns to check for outliers
OUTLIER_COLS = [
    'runtimeMinutes',
    'numVotes',
    'averageRating',
    'startYear'
]

# ==================================
# 3. BUILD THE CLEANING PIPELINE
# ==================================

print("Building cleaning pipeline...")

# Define imputers
numeric_imputer = SimpleImputer(strategy='median')
categorical_imputer = SimpleImputer(strategy='constant', fill_value='unknown')

# Create the main cleaning pipeline
cleaning_pipeline = Pipeline(steps=[
    ('drop_cols', DropColumnTransformer(columns=DROP_COLS)),
    ('clean_strings', StringCleaner(columns=CLEAN_STRING_COLS)),
    ('clean_list_strings', ListStringCleaner(columns=LIST_STRING_COLS)),
    ('parse_json_strings', JSONStringParser(columns=JSON_STRING_COLS)),
    
    ('impute_features', ColumnTransformer(
        transformers=[
            ('num_impute', numeric_imputer, NUMERIC_IMPUTE_COLS),
            ('cat_impute', categorical_imputer, CATEGORICAL_IMPUTE_COLS)
        ],
        remainder='passthrough' # Keep all other columns
    ))
])

# Define the outlier remover separately
outlier_remover = CustomOutlierRemover(columns=OUTLIER_COLS, threshold=3)

# ==================================
# 4. EXECUTE THE PIPELINE
# ==================================

print("\n" + "="*30)
print("1. APPLYING MAIN CLEANING PIPELINE")
print("="*30)

# We assume 'df' is already loaded in memory
print(f"Shape before cleaning: {df.shape}")

# --- Reconstruct the DataFrame after pipeline ---

# Get the list of columns that remain *after* the ColumnTransformer
# 1. Get columns *before* the imputer step
temp_df = cleaning_pipeline.named_steps['parse_json_strings'].transform(
    cleaning_pipeline.named_steps['clean_list_strings'].transform(
        cleaning_pipeline.named_steps['clean_strings'].transform(
            cleaning_pipeline.named_steps['drop_cols'].transform(df)
        )
    )
)

# 2. Get the 'remainder' columns
imputed_cols = NUMERIC_IMPUTE_COLS + CATEGORICAL_IMPUTE_COLS
remainder_cols = [col for col in temp_df.columns if col not in imputed_cols]

# 3. Define the final column order
final_cols = NUMERIC_IMPUTE_COLS + CATEGORICAL_IMPUTE_COLS + remainder_cols

# 4. Apply the pipeline
df_cleaned_data = cleaning_pipeline.fit_transform(df)
df_cleaned = pd.DataFrame(df_cleaned_data, columns=final_cols)

# 5. Convert numeric columns back to numeric types
for col in NUMERIC_IMPUTE_COLS:
    df_cleaned[col] = pd.to_numeric(df_cleaned[col])
# Convert isAdult back to numeric/int
if 'isAdult' in df_cleaned.columns:
    df_cleaned['isAdult'] = pd.to_numeric(df_cleaned['isAdult'])

print(f"Shape after cleaning: {df_cleaned.shape}")

# ==================================
# 5. ⭐️ SAVE THE CLEANED FILE FOR COLLEAGUES ⭐️
# ==================================
OUTPUT_FILE_PARQUET = 'imdb_cleaned_for_colleagues.parquet'
print(f"\nSaving cleaned data to {OUTPUT_FILE_PARQUET}...")
df_cleaned.to_parquet(OUTPUT_FILE_PARQUET, index=False)
print("✅ Save complete.")
# ==================================

print("\nCleaned Data Info:")
df_cleaned.info(memory_usage='deep')

print("\n" + "="*30)
print("2. APPLYING OUTLIER REMOVAL")
print("="*30)

df_no_outliers = outlier_remover.fit_transform(df_cleaned)

print(f"Shape before outlier removal: {df_cleaned.shape}")
print(f"Shape after outlier removal:  {df_no_outliers.shape}")
print(f"Removed {len(outlier_remover.outliers)} outlier rows.")

print("\n✅ Full cleaning pipeline complete! File saved for colleagues.")

# Filtered DataSet

In [None]:
import duckdb
#main part
url = "https://workspace4824871889.blob.core.windows.net/azureml-blobstore-84f516da-0fe5-4f33-8f3c-f18ec8e2b4f7/UI/2025-10-22_105430_UTC/imdb_merged_duckdb.parquet"

con = duckdb.connect()

print("Loading data into pandas DataFrame using DuckDB...")

# ⭐️ This is the line you need to add:
df = con.execute(f"SELECT * FROM read_parquet('{url}')").df()

con.close()

print("✅ Data loaded successfully.")
print(df.head())
print(f"DataFrame shape: {df.shape}")

# Code for the general overview

In [11]:
import duckdb
import pandas as pd

con = duckdb.connect()

# 1️⃣ Register your parquet file as a view
PARQUET_URL = "https://workspace4824871889.blob.core.windows.net/azureml-blobstore-84f516da-0fe5-4f33-8f3c-f18ec8e2b4f7/UI/2025-10-22_105430_UTC/imdb_merged_duckdb.parquet"
TABLE = "imdb_raw"

con.execute(f"CREATE OR REPLACE VIEW {TABLE} AS SELECT * FROM read_parquet('{PARQUET_URL}');")

# 2️⃣ Get all columns & types
cols_info = con.execute(f"DESCRIBE {TABLE};").fetchdf()

stats_list = []

# 3️⃣ Loop through columns and compute stats
for _, row in cols_info.iterrows():
    col = row["column_name"]
    coltype = row["column_type"].lower()

    if any(t in coltype for t in ["int", "double", "decimal", "float", "numeric"]):
        q = f"""
        SELECT
            '{col}' AS column,
            '{coltype}' AS type,
            COUNT(*) AS total,
            SUM(CASE WHEN {col} IS NULL THEN 1 ELSE 0 END) AS nulls,
            ROUND(100.0 * SUM(CASE WHEN {col} IS NULL THEN 1 ELSE 0 END) / COUNT(*), 2) AS null_pct,
            MIN({col}) AS min_val,
            MAX({col}) AS max_val,
            AVG({col}) AS mean_val,
            MEDIAN({col}) AS median_val,
            COUNT(DISTINCT {col}) AS distinct_vals
        FROM {TABLE};
        """
    else:
        q = f"""
        SELECT
            '{col}' AS column,
            '{coltype}' AS type,
            COUNT(*) AS total,
            SUM(CASE WHEN {col} IS NULL THEN 1 ELSE 0 END) AS nulls,
            ROUND(100.0 * SUM(CASE WHEN {col} IS NULL THEN 1 ELSE 0 END) / COUNT(*), 2) AS null_pct,
            NULL AS min_val,
            NULL AS max_val,
            NULL AS mean_val,
            NULL AS median_val,
            COUNT(DISTINCT {col}) AS distinct_vals
        FROM {TABLE};
        """

    stats_list.append(con.execute(q).fetchdf())

# 4️⃣ Concatenate results into a single DataFrame
overview = pd.concat(stats_list, ignore_index=True)

# 5️⃣ Add sample values (a few examples from each column)
sample_values = {}
for col in cols_info["column_name"]:
    try:
        val = con.execute(f"SELECT {col} FROM {TABLE} WHERE {col} IS NOT NULL LIMIT 3;").fetchdf()
        sample_values[col] = ', '.join(map(str, val[col].tolist()))
    except Exception:
        sample_values[col] = "error"

overview["examples"] = overview["column"].map(sample_values)

# 6️⃣ Display neatly
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

print(overview)

# Optional: Save to file for later analysis
overview.to_parquet("imdb_column_overview.parquet", index=False)


               column     type      total        nulls  null_pct  min_val  \
0              tconst  varchar  465811724          0.0      0.00      NaN   
1           titleType  varchar  465811724          0.0      0.00      NaN   
2        primaryTitle  varchar  465811724          0.0      0.00      NaN   
3       originalTitle  varchar  465811724          0.0      0.00      NaN   
4             isAdult   bigint  465811724          0.0      0.00      0.0   
5           startYear   bigint  465811724   65894855.0     14.15   1874.0   
6             endYear  varchar  465811724  458488696.0     98.43      NaN   
7      runtimeMinutes   bigint  465811724  269538145.0     57.86      0.0   
8              genres  varchar  465811724   16246770.0      3.49      NaN   
9       averageRating   double  465811724  367425094.0     78.88      1.0   
10           numVotes   bigint  465811724  367425094.0     78.88      5.0   
11          directors  varchar  465811724  102012828.0     21.90      NaN   

# IMDB merged dataset — column-by-column review & proposed actions


| Column | Current null % | Potential decision | Rationale |
|---|---:|---|---|
| `tconst` | 0.00% | **KEEP (id)** | Primary id — keep. |
| `titleType` | 0.00% | **KEEP (as categorical)** | Useful metadata. Fill `'unknown'` if missing (none). |
| `primaryTitle` / `originalTitle` | 0.00% | **KEEP** | Clean strings (lower/strip). |
| `isAdult` | 0.00% | **KEEP** | convert to 0/1 integer. |
| `startYear` | 14.15% | **KEEP + IMPUTE (median)** | Year missing ~14% — impute median (or mode) if you need no nulls. If time-series needed, consider keeping NULLs. |
| `endYear` | 98.43% | **DROP** ✅ | Vast majority missing — not useful. |
| `runtimeMinutes` | 57.86% | **KEEP + INVESTIGATE** (recommended: median imputation) | Very many missing; check distribution. If you need runtime, impute median; else consider dropping if mostly missing for your use-case. |
| `genres` | 3.49% | **KEEP** | Normalize lists; fill `'unknown'` when missing. |
| `averageRating` | 78.88% | **DROP** (recommended) or **KEEP+IMPUTE(median)** (alternative) | High missingness — dropping is defensible. If rating is core to your analysis, keep and impute median but be aware of bias. |
| `numVotes` | 78.88% | **DROP** (recommended) or **KEEP+IMPUTE(median)** | Highly skewed and mostly missing; if you need it, consider log-transform + median impute. |
| `directors` / `writers` | 21.90% / 24.19% | **KEEP** | Normalize lists, fill `'unknown'`. |
| `parentTconst` | 21.72% | **KEEP (fill 'unknown')** | Might be useful for episodes; fill unknown if you must remove nulls. |
| `seasonNumber` / `episodeNumber` | 29.32% | **KEEP (fill 0 or median)** | For non-episodic rows these are naturally NULL. Fill with 0 if you prefer no nulls. |
| `titleId` / `ordering` / `title` (akas) | very small | **KEEP** | Keep/clean. |
| `region` / `language` / `types` | 20–66% | **KEEP + CLEAN** | For `types` large missingness — check values; fill `'unknown'`. |
| `attributes` | 99.12% | **DROP** ✅ | Nearly entirely empty — drop. |
| `isOriginalTitle` | 0.01% | **KEEP** | Clean/convert. |
| `nconst`, `nconst_1`, `tconst_1`, `ordering_1` | small nulls | **KEEP** | Fill `'unknown'` for ids or 0 for ordering if necessary. |
| `category` | 0.88% | **KEEP** | Fill `'unknown'`. |
| `job` | 81.51% | **DROP** ✅ | Very sparse — drop unless you need it. |
| `characters` | 52.13% | **KEEP** (clean & empty-string default) | Strip JSON-like brackets and quotes; keep as CSV string, fill `''` if missing. |
| `primaryName` / `birthYear` / `deathYear` | birthYear 57.31% / deathYear 87.52% | **birthYear: keep+impute; deathYear: DROP** | deathYear mostly missing — drop. BirthYear could be useful (impute median or keep NULLs). |
| `primaryProfession` | 4.20% | **KEEP** | Fill `'unknown'`. |
| `knownForTitles` | 0.96% | **KEEP** | Fill `'unknown'` if missing. |

---

## My recommended action (DEFAULT)
- **Drop**: `endYear`, `deathYear`, `attributes`, `job`.  
- **Drop or keep w/ caution** (you choose): `averageRating`, `numVotes` — recommended to drop because ~79% missing; if ratings are central to your analysis, keep and **impute median** (not mean).  
- **Impute numeric columns with median** (if you want ZERO NULLS): `startYear`, `runtimeMinutes`, `seasonNumber`, `episodeNumber`, `birthYear` (choose median or domain default).  
- **Impute strings** with `'unknown'`.  
- **characters** → clean JSON-like strings and default to `''` if missing.  
- After you accept/reject these, run the code cell to create a final cleaned parquet and an overview report.

---

If you accept the DEFAULT plan above, run the code cell that follows. If you want different choices for `averageRating` / `numVotes` / `runtimeMinutes` (e.g., keep-and-impute vs drop), edit the code comments before running.


### 🔍 Rethinking the Null-Handling Strategy (Second Review)

After reviewing the column-wise null percentages and my initial cleaning plan, I realized that some of the earlier imputation choices might not make much sense for how the IMDb dataset is structured. Below is a more careful and logical re-evaluation of each case:

---

#### 🎬 `startYear`  
If the dataset is at least somewhat chronologically ordered, taking the **previous non-null value** might actually preserve the temporal continuity between nearby entries — which could make more sense than throwing in a random median year.  
However, this assumption depends on whether the data is sorted by title or ID, so this needs to be tested before implementation.

---

#### ⏱️ `runtimeMinutes`
Here we have around **60% nulls**, so filling them with a median value could seriously distort the data distribution.  
A better approach might be:
- either **set nulls to 0**, indicating “unknown runtime,”  
- or introduce a **new category like `'unknown_runtime'`** if we later decide to treat it as categorical.  
For now, setting it to `0` (meaning *missing or undefined duration*) feels more realistic.

---

#### 📺 `seasonNumber` and `episodeNumber`
Median imputation doesn’t make any sense here because the majority of titles are just standalone movies — not part of any TV show.  
Using median would just create weird pseudo-seasons.  
So the cleanest fix is to **set missing values to `0`**, clearly meaning “not applicable.”

---

#### 👶 `birthYear`
About **57% nulls**, which is quite a lot.  
The median year might not represent anything meaningful (imagine all unknown actors suddenly born in 1975).  
Possible alternatives:
- Keep it but fill missing values with `0` (or a placeholder like `'unknown'` if treated as text),  
- Or drop it entirely if we don’t plan to analyze by actor age.  
I lean toward **keeping it** (with a numeric placeholder like `0`) for now, just to preserve structure.

---

#### 🗓️ `endYear`
Originally, I wanted to drop `endYear` as well because it had **over 98% nulls**,  
but on second thought, it might still serve as an indicator of whether a show or title is still running.  
Instead of removing it completely, I’ll **replace nulls with `0`** to represent *“not ended”* or *“still active”*.  
That way, the column can be treated as a binary-like numeric field (`0` = still ongoing, otherwise actual end year).


# After reconsidering the cleaning, here we will proceed with 2nd cleaning approach using pandas

In [10]:
import pandas as pd
import numpy as np
import json
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from scipy import stats
import tempfile
import sys

# === 1️⃣ Load your parquet ===
FILE_URL = "https://workspace4824871889.blob.core.windows.net/azureml-blobstore-84f516da-0fe5-4f33-8f3c-f18ec8e2b4f7/UI/2025-10-25_160707_UTC/imdb_sample_100k.parquet"
try:
    df = pd.read_parquet(FILE_URL)
    print(f"Loaded dataframe: {df.shape[0]} rows, {df.shape[1]} columns")
except Exception as e:
    print(f"Failed to load parquet file: {e}")
    print("Using a small dummy dataframe to demonstrate the pipeline.")
    data = {
        'tconst': ['tt00001', 'tt00002', 'tt00003', 'tt00004'],
        'primaryTitle': ['Movie 1', 'Movie 2', 'Movie 3', 'Bad Series'],
        'genres': ['Action,Comedy', 'Drama', None, 'Drama'],
        'attributes': ['attr1', 'attr2', 'attr3', 'attr4'],
        'startYear': [2000, 2001, np.nan, 2010],
        'endYear': [2000, 2001, np.nan, 2005], # Added bad data point
        'runtimeMinutes': [120, np.nan, 90, 60],
        'isAdult': [0, 't', np.nan, 0],
        'characters': ['["Hero"]', '["Villain"]', np.nan, '["Anti-Hero"]'],
        'parentTconst': [np.nan, 'tt00001', np.nan, np.nan],
        'titleId': [np.nan, 'tt00002_aka', np.nan, np.nan],
        'ordering': [1, np.nan, 3, 1],
        'title': [np.nan, 'Movie 2 Alt', np.nan, np.nan],
        'isOriginalTitle': [np.nan, 0, np.nan, 1],
        'tconst_1': ['tt00001', np.nan, 'tt00003', 'tt00004'],
        'ordering_1': [1, np.nan, 3, 1],
        'nconst': ['nm0001', np.nan, 'nm0003', 'nm0004'],
        'nconst_1': ['nm0001', np.nan, 'nm0003', 'nm0004'],
        'averageRating': [8.5, 7.2, 9.0, np.nan], # Added null rating
        'numVotes': [1000, 500, 2000, np.nan],      # Added null votes
        'job': [np.nan, 'actor', 'director', np.nan], # Added job
        'deathYear': [np.nan, np.nan, 2020, np.nan] # Added deathYear
    }
    df = pd.DataFrame(data)

# === 2️⃣ Define transformers ===

class DropColumnTransformer(BaseEstimator, TransformerMixin):
    """Drops specified columns."""
    def __init__(self, columns):
        self.columns = columns
    def fit(self, X, y=None): return self
    def transform(self, X):
        return X.drop(columns=self.columns, axis=1, errors='ignore')


class StringCleaner(BaseEstimator, TransformerMixin):
    """Strips whitespace, lowercases, and converts to string."""
    def __init__(self, columns): self.columns = columns
    def fit(self, X, y=None): return self
    def transform(self, X):
        X2 = X.copy()
        for col in self.columns:
            if col in X2.columns:
                X2[col] = X2[col].astype(str).str.strip().str.lower().replace({'\\n':'', '\\t':''}, regex=True)
        return X2


class ListStringCleaner(BaseEstimator, TransformerMixin):
    """Cleans comma-separated list-strings (e.g., genres)."""
    # --- UPDATED: Added fill_value parameter ---
    def __init__(self, columns, separator=',', fill_value=''):
        self.columns = columns
        self.separator = separator
        self.fill_value = fill_value
    def fit(self, X, y=None): return self
    def transform(self, X):
        X2 = X.copy()
        for col in self.columns:
            if col in X2.columns:
                # --- UPDATED: Use self.fill_value instead of hardcoded '' ---
                X2[col] = X2[col].fillna(self.fill_value)
                X2[col] = X2[col].apply(
                    lambda s: self.separator.join(
                        [item.strip().lower() for item in str(s).split(self.separator) if item.strip()]
                    ) if s else self.fill_value if self.fill_value else '' # Handle case where fill_value is used
                )
        return X2


class JSONStringParser(BaseEstimator, TransformerMixin):
    """Parses columns containing string representations of JSON lists."""
    def __init__(self, columns): self.columns = columns
    def fit(self, X, y=None): return self
    def _parse(self, item):
        if pd.isna(item): return ''
        try:
            # Try to load as JSON
            parsed = json.loads(item)
            if isinstance(parsed, list):
                # Join list items into a comma-separated string
                return ','.join(str(i).strip().lower() for i in parsed if i)
        except Exception:
            # If not JSON, just treat as a regular string
            pass
        return str(item).strip().lower()
    def transform(self, X):
        X2 = X.copy()
        for col in self.columns:
            if col in X2.columns:
                X2[col] = X2[col].apply(self._parse)
        return X2


class NumericNoopOrCoerce(BaseEstimator, TransformerMixin):
    """Ensure numeric columns are numeric (coerce bad -> NaN)."""
    def __init__(self, columns): self.columns = columns
    def fit(self, X, y=None): return self
    def transform(self, X):
        X2 = X.copy()
        for col in self.columns:
            if col in X2.columns:
                X2[col] = pd.to_numeric(X2[col], errors='coerce')
        return X2


class SpecialFillTransformer(BaseEstimator, TransformerMixin):
    """Handles specific rules for numeric fills and binary flags."""
    # --- UPDATED: Simplified __init__ ---
    def __init__(self, start_year_col, isadult_col, zero_fill_cols):
        self.start_year_col = start_year_col
        self.zero_fill_cols = zero_fill_cols
        self.isadult_col = isadult_col
        self.start_median_ = None

    def fit(self, X, y=None):
        if self.start_year_col in X.columns:
            # Ensure column is numeric before median
            numeric_start = pd.to_numeric(X[self.start_year_col], errors='coerce')
            self.start_median_ = numeric_start.median(skipna=True)
            if pd.isna(self.start_median_):
                self.start_median_ = 2000 # A reasonable fallback if all are null
        return self

    def transform(self, X):
        X2 = X.copy()
        
        # 1. Fill startYear: ffill + median fallback (as requested)
        if self.start_year_col in X2.columns:
            if self.start_median_ is None: # Should be set by fit, but as a safety
                self.start_median_ = 2000
            X2[self.start_year_col] = X2[self.start_year_col].ffill().fillna(self.start_median_).astype(int)
        
        # --- UPDATED: All special fills now use the zero_fill_cols list ---
        
        # 2. zero-fill all columns specified in the list (now includes endYear, ratings, etc.)
        print(f"Zero-filling columns: {self.zero_fill_cols}")
        for c in self.zero_fill_cols:
            if c in X2.columns:
                X2[c] = X2[c].fillna(0)
        
        # 3. normalize isAdult to 0/1
        if self.isadult_col in X2.columns:
            X2[self.isadult_col] = X2[self.isadult_col].fillna(0)
            X2[self.isadult_col] = X2[self.isadult_col].apply(lambda x: 1 if str(x).strip().lower() in ['1','true','t'] else 0)
        
        # 4. Final check for endYear < startYear (good practice)
        if 'startYear' in X2.columns and 'endYear' in X2.columns:
            # Find rows where endYear is not 0 (not 'active') AND is less than startYear
            mask = (X2['endYear'] != 0) & (X2['endYear'] < X2['startYear'])
            if mask.any():
                print(f"Fixing {mask.sum()} rows where original endYear < startYear.")
                X2.loc[mask, 'endYear'] = X2.loc[mask, 'startYear']
        
        return X2


class CustomOutlierRemover(BaseEstimator, TransformerMixin):
    """Removes rows based on Z-score of numeric columns."""
    def __init__(self, columns, threshold=4):
        self.columns = columns
        self.threshold = threshold
    def fit(self, X, y=None): return self
    def transform(self, X):
        X2 = X.copy()
        valid = [c for c in self.columns if c in X2.columns and pd.api.types.is_numeric_dtype(X2[c])]
        if not valid:
            print("No numeric columns for outlier removal.")
            return X2
        # Ensure data is float for zscore calculation
        z = np.abs(stats.zscore(X2[valid].astype(float), nan_policy='omit'))
        z = np.nan_to_num(z, nan=0) # Replace NaNs in z-scores with 0
        mask = (z < self.threshold).all(axis=1)
        print(f"Outlier removal: retaining {mask.sum()} of {len(X2)} rows.")
        return X2[mask]

# === 3️⃣ NEW TRANSFORMER: LeftoverNullFiller ===
class LeftoverNullFiller(BaseEstimator, TransformerMixin):
    """
    Fills leftover nulls from joins or sparse data.
    Fills specified string columns with 'unknown' (or other value).
    Fills specified numeric columns with 0 (or other value).
    """
    def __init__(self, string_cols, numeric_cols, string_fill='unknown', numeric_fill=0):
        self.string_cols = string_cols
        self.numeric_cols = numeric_cols
        self.string_fill = string_fill
        self.numeric_fill = numeric_fill

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X2 = X.copy()
        print(f"Running LeftoverNullFiller...")
        # Fill string columns
        for col in self.string_cols:
            if col in X2.columns:
                # Fill NaNs and ensure type is string
                X2[col] = X2[col].fillna(self.string_fill).astype(str)
        
        # Fill numeric columns
        for col in self.numeric_cols:
            if col in X2.columns:
                # Fill NaNs and ensure type is int
                X2[col] = X2[col].fillna(self.numeric_fill).astype(int)
        
        return X2

# === 4️⃣ Define column groups and build pipeline ===

DROP_COLS = ['attributes']
CLEAN_STRING_COLS = [
    'primaryTitle','originalTitle','title_akas','primaryName',
    'titleType','region','language','types','category'
]
LIST_STRING_COLS = ['genres','directors','writers','primaryProfession','knownForTitles']
JSON_STRING_COLS = ['characters']

# All numeric cols, including the ones that were null
NUMERIC_COLS = [
    'startYear','endYear','runtimeMinutes','seasonNumber','episodeNumber',
    'birthYear','deathYear','averageRating','numVotes',
    'ordering', 'isOriginalTitle', 'ordering_1' # Add new numeric-like cols here
]

# Define columns for the new filler
LEFTOVER_STRING_COLS = [
    'parentTconst', 'titleId', 'title', 
    'tconst_1', 'nconst', 'nconst_1',
    'job' # <-- ADDED 'job' HERE
]
LEFTOVER_NUMERIC_COLS = [
    'ordering', 'isOriginalTitle', 'ordering_1'
]


cleaning_pipeline = Pipeline(steps=[
    ('drop_cols', DropColumnTransformer(columns=DROP_COLS)),
    ('clean_strings', StringCleaner(columns=CLEAN_STRING_COLS)),
    # --- UPDATED: Pass fill_value='unknown' ---
    ('clean_lists', ListStringCleaner(columns=LIST_STRING_COLS, fill_value='unknown')),
    ('parse_json', JSONStringParser(columns=JSON_STRING_COLS)),
    
    # Coerce *all* potential numeric columns first
    ('coerce_numeric', NumericNoopOrCoerce(columns=NUMERIC_COLS)),
    
    # --- UPDATED: special_fills now uses the new logic ---
    ('special_fills', SpecialFillTransformer(
        start_year_col='startYear',
        isadult_col='isAdult',
        zero_fill_cols=[
            'runtimeMinutes', 'seasonNumber', 'episodeNumber', 'birthYear', # From your rules
            'endYear',               # From your rules
            'averageRating',         # To fix your complaint
            'numVotes',              # To fix yourG complaint
            'deathYear'              # <-- ADDED 'deathYear' HERE
        ]
    )),
    
    # This runs last to catch any remaining nulls in these specific columns
    ('fill_leftovers', LeftoverNullFiller(
        string_cols=LEFTOVER_STRING_COLS,
        numeric_cols=LEFTOVER_NUMERIC_COLS,
        string_fill='unknown',
        numeric_fill=0
    )),

    # Optional outlier removal (uncomment to use)
    # ('remove_outliers', CustomOutlierRemover(columns=['runtimeMinutes','numVotes','averageRating'], threshold=4))
])

# === 5️⃣ Apply cleaning ===
df_cleaned = cleaning_pipeline.fit_transform(df)
print(f"✅ Cleaning done: {df_cleaned.shape[0]} rows, {df_cleaned.shape[1]} columns")

# === 6️⃣ Check results ===
print("\n--- Null count per column ---")
nulls = df_cleaned.isna().sum()
if nulls.sum() == 0:
    print("🎉 All null values have been handled! 🎉")
else:
    print(nulls[nulls > 0])

print("\n--- Sample rows (post-cleaning) ---")
# Use display() if in a notebook, otherwise print()
if 'ipykernel' in sys.modules:
    display(df_cleaned.sample(min(10, len(df_cleaned)), random_state=42))
else:
    print(df_cleaned.sample(min(10, len(df_cleaned)), random_state=42))

# === 7️⃣ Save cleaned data ===
try:
    OUTPUT_FILE = tempfile.gettempdir() + "/sample_imdb_cleaned.parquet"
    df_cleaned.to_parquet(OUTPUT_FILE, index=False, engine='pyarrow')
    print(f"\n💾 Cleaned parquet saved as: {OUTPUT_FILE}")
except Exception as e:
    print(f"\nFailed to save parquet file: {e}")
    print("This can happen if 'pyarrow' is not installed. Try: pip install pyarrow")

Loaded dataframe: 100000 rows, 36 columns
Zero-filling columns: ['runtimeMinutes', 'seasonNumber', 'episodeNumber', 'birthYear', 'endYear', 'averageRating', 'numVotes', 'deathYear']
Running LeftoverNullFiller...
✅ Cleaning done: 100000 rows, 35 columns

--- Null count per column ---
🎉 All null values have been handled! 🎉

--- Sample rows (post-cleaning) ---


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,...,nconst,category,job,characters,nconst_1,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
75721,tt12528796,tvepisode,episode #1.333,episode #1.333,0,1938,0.0,0.0,drama,0.0,...,nm5426054,actor,unknown,,nm5426054,venu arvind,0.0,0.0,"actor,director,writer","tt13439272,tt0242256,tt0318993,tt2923826"
80184,tt10540368,short,jellyfish,jellyfish,0,2019,0.0,13.0,short,9.2,...,nm10792182,actor,unknown,caretaker,nm10792182,trinidad asensio robles,0.0,0.0,actor,tt10540368
19864,tt21104030,tvepisode,episode #2.15,episode #2.15,0,2007,0.0,0.0,"family,reality-tv",0.0,...,nm1101797,self,unknown,self - contestant,nm1101797,duncan james,1978.0,0.0,"actor,soundtrack,archive_footage","tt0112004,tt1470249,tt12870980,tt12115616"
76699,tt5907054,tvepisode,episode #1.3,episode #1.3,0,1979,0.0,0.0,unknown,0.0,...,nm0199858,writer,adaptation,,nm0199858,carmen daniels,0.0,2006.0,writer,"tt0211796,tt0229914,tt5378734,tt0214376"
92991,tt14400866,tvseries,a couple of cuckoos,kakkou no iinazuke,0,2022,2025.0,23.0,"animation,comedy,romance",6.8,...,nm5928101,actor,unknown,additional voices,nm5928101,jacob eiseman,0.0,0.0,actor,"tt3398540,tt10981954,tt9671916,tt13375866"
76434,tt0928280,tvepisode,privilege,privilege,0,2007,0.0,44.0,"crime,drama,mystery",8.1,...,nm0454236,actor,unknown,ernest foley,nm0454236,richard kind,1956.0,0.0,"actor,writer,soundtrack","tt2096673,tt1024648,tt1019452,tt0120623"
84004,tt22643502,tvepisode,episode dated 11 october 2022,episode dated 11 october 2022,0,2022,0.0,0.0,news,0.0,...,nm4834517,self,unknown,self - fox business correspondent,nm4834517,lauren simonetti,0.0,0.0,archive_footage,"tt9130562,tt3230032,tt7483086,tt3776548"
80917,tt0352167,movie,amme bhagavathi,amme bhagavathi,0,1987,0.0,0.0,unknown,6.1,...,nm0530818,composer,unknown,,nm0530818,m.s. viswanathan,1928.0,2015.0,"music_department,composer,actor","tt0154120,tt0432188,tt1441317,tt3400200"
60767,tt26998576,tvepisode,episode #1.288,episode #1.288,0,2023,0.0,0.0,"comedy,drama,romance",0.0,...,nm13810315,actor,unknown,subho,nm13810315,rishav chakraborty,0.0,0.0,actor,"tt35747590,tt36958465,tt25910670,tt26908758"
50074,tt12156036,tvepisode,episode #1.251,episode #1.251,0,2016,0.0,0.0,action,0.0,...,nm0849850,director,unknown,,nm0849850,imam tantowi,1946.0,0.0,"writer,director,art_director","tt3418604,tt0358793,tt1207735,tt0326595"



💾 Cleaned parquet saved as: /var/folders/zc/sdlj288n03ld3s3l_mzb9f0r0000gn/T/sample_imdb_cleaned.parquet


In [13]:
OUTPUT_FILE_CSV = tempfile.gettempdir() + "/sample_imdb_cleaned.csv"
df_cleaned.to_csv(OUTPUT_FILE_CSV, index=False)

In [11]:
df_cleaned.sample(30)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,...,nconst,category,job,characters,nconst_1,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
82847,tt9463208,tvepisode,episode dated 24 november 2018,episode dated 24 november 2018,0,2018,0.0,0.0,talk-show,0.0,...,nm1930056,writer,unknown,,nm1930056,francisco quintanar,1958.0,0.0,"director,writer,miscellaneous","tt0943381,tt0373497,tt1560991,tt0375383"
71587,tt26907800,tvepisode,episode #1.24,episode #1.24,0,1976,0.0,0.0,drama,0.0,...,nm0342533,actress,unknown,,nm0342533,amparo grisales,1956.0,0.0,"actress,producer","tt0450340,tt6809396,tt1585368,tt0257282"
69171,tt0282552,movie,steal,riders,0,2002,0.0,83.0,"action,crime,thriller",5.4,...,nm0565327,actor,unknown,frank,nm0565327,steven mccarthy,0.0,0.0,"actor,writer,producer","tt3230854,tt4571340,tt11525188,tt6236572"
69439,tt16440734,tvepisode,episode #1.257,episode #1.257,0,2022,0.0,0.0,drama,0.0,...,nm3333342,actor,unknown,melusi dlamini,nm3333342,zolisa xaluva,1981.0,0.0,actor,"tt7416536,tt21448346,tt13453828,tt12599346"
62654,tt14737288,tvepisode,episode #1.121,episode #1.121,0,1979,0.0,0.0,"drama,family,romance",0.0,...,nm12559942,editor,unknown,,nm12559942,pandidurai,0.0,0.0,editor,tt13570010
37769,tt33455973,tvepisode,episode #1.408,episode #1.408,0,1970,0.0,30.0,news,0.0,...,nm16528122,self,unknown,self - reporter,nm16528122,ed goetze,0.0,0.0,unknown,"tt33384753,tt33447764"
76330,tt6140782,tvepisode,episode dated 18 october 2016,episode dated 18 october 2016,0,2016,0.0,0.0,news,0.0,...,nm2166842,self,unknown,self - fox news chief legal correspondent,nm2166842,shannon bream,1970.0,0.0,"writer,producer,actress","tt4649466,tt14624264,tt7406432,tt0770614"
67627,tt4831952,short,a motel story,a motel story,0,2017,0.0,13.0,"crime,drama,short",0.0,...,nm1428476,director,unknown,,nm1428476,nick epstein,0.0,0.0,"visual_effects,director,writer","tt0437086,tt1630029,tt0206634,tt8355738"
89842,tt14860046,tvseries,casa zurli,casa zurli,0,2020,0.0,45.0,family,5.1,...,nm2658421,self,unknown,self - guest,nm2658421,marius urzica,0.0,0.0,actor,"tt0464955,tt14860046,tt1027229"
41257,tt8028800,tvepisode,episode #1.75,episode #1.75,0,2018,0.0,0.0,drama,0.0,...,nm0260778,actor,unknown,vítor duque,nm0260778,luís esparteiro,1959.0,0.0,"actor,miscellaneous,archive_footage","tt6183736,tt8960296,tt0135113,tt1980858"


In [None]:
# === 7️⃣ Save cleaned data ===
try:
    OUTPUT_FILE = tempfile.gettempdir() + "/3sample_imdb_cleaned.parquet"
    df_cleaned.to_parquet(OUTPUT_FILE, index=False, engine='pyarrow')
    print(f"\n💾 Cleaned parquet saved as: {OUTPUT_FILE}")
except Exception as e:
    print(f"\nFailed to save parquet file: {e}")
    print("This can happen if 'pyarrow' is not installed. Try: pip install pyarrow")

# Code in duckdb based on the instruction above

In [None]:
import duckdb
import tempfile
import sys

# === 1️⃣ Define file paths ===
FILE_URL = "https://workspace4824871889.blob.core.windows.net/azureml-blobstore-84f516da-0fe5-4f33-8f3c-f18ec8e2b4f7/UI/2025-10-25_165557_UTC/imdb_cleaned_for_colleagues.parquet"
OUTPUT_FILE_CSV = tempfile.gettempdir() + "/sample_imdb_cleaned_duckdb.csv"

# === 2️⃣ Define column groups (for SQL) ===
# These match your Python lists
CLEAN_STRING_COLS = [
    'primaryTitle','originalTitle','title_akas','primaryName',
    'titleType','region','language','types','category'
]
LIST_STRING_COLS = ['genres','directors','writers','primaryProfession','knownForTitles']
JSON_STRING_COLS = ['characters']
ZERO_FILL_COLS = [
    'runtimeMinutes', 'seasonNumber', 'episodeNumber', 'birthYear',
    'endYear', 'averageRating', 'numVotes', 'deathYear'
]
LEFTOVER_STRING_COLS = [
    'parentTconst', 'titleId', 'title', 
    'tconst_1', 'nconst', 'nconst_1', 'job'
]
LEFTOVER_NUMERIC_COLS = [
    'ordering', 'isOriginalTitle', 'ordering_1'
]

# === 3️⃣ Build the cleaning query ===
# We build the query dynamically to make it easier to maintain
def build_cleaning_query(file_url, output_csv):
    
    # --- Helper transformations ---
    
    # `lower(trim(col))`
    def sql_clean_string(col):
        return f"lower(trim({col}::VARCHAR)) AS {col}"

    # `list_aggr(list_transform(string_split(COALESCE(col, 'unknown'), ','), ...), ',')`
    def sql_clean_list(col):
        return f"list_aggr(list_transform(string_split(COALESCE({col}::VARCHAR, 'unknown'), ','), item -> lower(trim(item))), ',') AS {col}"

    # `COALESCE(try_cast(...), 0)`
    def sql_zero_fill(col):
        # averageRating is float, others are int
        cast_type = "DOUBLE" if col == 'averageRating' else "INTEGER"
        return f"COALESCE(try_cast({col} AS {cast_type}), 0) AS {col}"

    # `COALESCE(col, 'unknown')`
    def sql_leftover_string(col):
        return f"COALESCE({col}::VARCHAR, 'unknown') AS {col}"

    # `COALESCE(try_cast(...), 0)`
    def sql_leftover_numeric(col):
        return f"COALESCE(try_cast({col} AS INTEGER), 0) AS {col}"

    # --- Start building the main query ---
    
    query = f"""
    -- Load httpfs to read remote parquet
    INSTALL httpfs;
    LOAD httpfs;

    -- Create a temporary table with a row_id to replicate pandas.ffill()
    CREATE TEMPORARY TABLE base_data AS
    SELECT 
        row_number() OVER () AS _row_id, 
        * FROM read_parquet('{file_url}');

    -- Calculate median startYear (for ffill fallback)
    CREATE TEMPORARY TABLE settings AS
    SELECT 
        median(try_cast(startYear AS INTEGER)) AS median_start_year 
    FROM base_data;

    -- Define the full cleaning pipeline as a series of CTEs
    WITH 
    
    -- Step 2 & 5: Coerce types, clean strings
    type_coerce AS (
        SELECT
            _row_id,
            tconst,

            -- String Cleaning
            {', '.join([sql_clean_string(c) for c in CLEAN_STRING_COLS])},

            -- Raw List/JSON columns (to be cleaned next)
            {', '.join(LIST_STRING_COLS)},
            {', '.join(JSON_STRING_COLS)},

            -- Raw Numeric columns (to be filled next)
            try_cast(startYear AS INTEGER) AS startYear,
            {', '.join(ZERO_FILL_COLS)}, -- Select raw columns
            
            -- isAdult (special case)
            isAdult AS isAdult_raw,

            -- Raw Leftover columns
            {', '.join(LEFTOVER_STRING_COLS)},
            {', '.join(LEFTOVER_NUMERIC_COLS)}
            
        FROM base_data
        -- 'attributes' column is dropped by not being selected
    ),

    -- Step 3 & 4: Clean Lists and JSON
    clean_lists_json AS (
        SELECT
            *,
            -- List cleaning logic
            {', '.join([sql_clean_list(c) for c in LIST_STRING_COLS])},

            -- JSON parsing logic for 'characters'
            COALESCE(
                list_aggr(list_transform(json_extract(try_cast(characters AS JSON), '$[*]'), e -> lower(trim(e::VARCHAR))), ','),
                lower(trim(COALESCE(characters, '')))
            ) AS characters

        -- Exclude raw columns
        FROM type_coerce
        EXCLUDE ({', '.join([c + '_raw' for c in LIST_STRING_COLS])}, characters_raw)
    ),
    
    -- Step 6a: SpecialFillTransformer - ffill for startYear
    ffill_start_year AS (
        SELECT
            *,
            -- Replicate pandas ffill() using window function, ordered by implicit row ID
            LAG(startYear, 1) IGNORE NULLS OVER (ORDER BY _row_id) AS startYear_ffilled
        FROM clean_lists_json
    ),

    -- Step 6b: SpecialFillTransformer - All other fills
    special_fills AS (
        SELECT
            *,
            -- 1. Fill startYear: ffill + median fallback
            COALESCE(startYear, startYear_ffilled, (SELECT median_start_year FROM settings)) AS startYear_filled,

            -- 2. Zero-fill columns
            {', '.join([sql_zero_fill(c) for c in ZERO_FILL_COLS])},

            -- 3. Normalize isAdult
            COALESCE(CASE WHEN lower(trim(isAdult_raw::VARCHAR)) IN ('1', 'true', 't') THEN 1 ELSE 0 END, 0) AS isAdult

        FROM ffill_start_year
        -- Exclude raw/intermediate columns
        EXCLUDE (startYear, startYear_ffilled, isAdult_raw, {', '.join(ZERO_FILL_COLS)})
    ),
    
    -- Step 6c: Fix endYear < startYear
    fix_end_year AS (
        SELECT
            *,
            CASE
                WHEN endYear != 0 AND endYear < startYear_filled
                THEN startYear_filled
                ELSE endYear
            END AS endYear_fixed
        FROM special_fills
        EXCLUDE (endYear) -- drop old endYear
    ),

    -- Step 7: LeftoverNullFiller and final column selection
    final_cleaned AS (
        SELECT
            -- Select all cleaned columns in desired order
            tconst,
            primaryTitle,
            originalTitle,
            genres,
            directors,
            writers,
            characters,
            startYear_filled AS startYear,
            endYear_fixed AS endYear,
            runtimeMinutes,
            isAdult,
            titleType,
            primaryName,
            birthYear,
            deathYear,
            primaryProfession,
            knownForTitles,
            averageRating,
            numVotes,
            seasonNumber,
            episodeNumber,
            region,
            language,
            types,
            category,

            -- Fill leftovers
            {', '.join([sql_leftover_string(c) for c in LEFTOVER_STRING_COLS])},
            {', '.join([sql_leftover_numeric(c) for c in LEFTOVER_NUMERIC_COLS])}

        FROM fix_end_year
    )
    
    -- === 4️⃣ Save to CSV ===
    COPY (SELECT * FROM final_cleaned) TO '{output_csv}' (HEADER, DELIMITER ',');
    """
    
    return query

# === 5️⃣ Execute the pipeline ===
con = None
try:
    # Connect to an in-memory database
    con = duckdb.connect(database=':memory:')
    print("DuckDB connection established.")
    
    # Build the full query
    full_query = build_cleaning_query(FILE_URL, OUTPUT_FILE_CSV)
    
    # print(full_query) # Uncomment to debug the generated SQL
    
    print("Starting cleaning pipeline...")
    con.execute(full_query)
    print("✅ Cleaning pipeline complete.")
    
    print(f"\n💾 Cleaned CSV saved as: {OUTPUT_FILE_CSV}")

    # === 6️⃣ Check results ===
    print("\n--- Sample rows (post-cleaning) ---")
    con.execute("SELECT * FROM final_cleaned LIMIT 10").df().info()
    print(con.execute("SELECT * FROM final_cleaned LIMIT 10").df())

except Exception as e:
    print(f"\nAn error occurred: {e}", file=sys.stderr)
finally:
    if con:
        con.close()
        print("DuckDB connection closed.")


1

In [2]:
import pandas as pd

# This reads the Parquet file and returns a Pandas DataFrame
df = pd.read_parquet('https://workspace4824871889.blob.core.windows.net/azureml-blobstore-84f516da-0fe5-4f33-8f3c-f18ec8e2b4f7/UI/2025-10-25_165557_UTC/imdb_cleaned_for_colleagues.parquet')

# You can specify the engine, though 'auto' (default) usually works fine:
# df = pd.read_parquet('your_file_name.parquet', engine='pyarrow')

# To read only a subset of columns (efficiently!):
# df = pd.read_parquet('your_file_name.parquet', columns=['col1', 'col2'])


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 34 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   startYear          100000 non-null  float64
 1   endYear            100000 non-null  float64
 2   runtimeMinutes     100000 non-null  float64
 3   averageRating      100000 non-null  float64
 4   numVotes           100000 non-null  float64
 5   seasonNumber       100000 non-null  float64
 6   episodeNumber      100000 non-null  float64
 7   birthYear          100000 non-null  float64
 8   deathYear          100000 non-null  float64
 9   titleType          100000 non-null  object 
 10  isAdult            100000 non-null  int64  
 11  region             79036 non-null   object 
 12  language           64190 non-null   object 
 13  types              33852 non-null   object 
 14  category           99111 non-null   object 
 15  tconst             100000 non-null  object 
 16  pri

In [None]:
# duckdb_clean_final_by_rules.py
# Run in your Azure notebook cell. Adjust INPUT_URL and OUTPUT_PQ if needed.

import duckdb
import math
from pprint import pprint

# ---------------- CONFIG ----------------
INPUT_URL = "https://workspace4824871889.blob.core.windows.net/azureml-blobstore-84f516da-0fe5-4f33-8f3c-f18ec8e2b4f7/UI/2025-10-22_105430_UTC/imdb_merged_duckdb.parquet"
OUTPUT_PQ = "imdb_cleaned_final_by_rules.parquet"
TEXT_FILL = "unknown"
# Columns to drop per your decision
DROP_COLS = {"endYear", "deathYear", "attributes", "job", "averageRating", "numVotes"}
# Columns considered list-like
LIST_COLS = {"genres", "directors", "writers", "primaryProfession", "knownForTitles"}
CHAR_COLS = {"characters"}
# Numeric columns to impute with median
NUMERIC_IMPUTE_COLS = ["startYear", "runtimeMinutes", "seasonNumber", "episodeNumber", "birthYear"]
# ----------------------------------------

def q(col: str) -> str:
    # safe quoting for identifiers
    return '"' + col.replace('"', '""') + '"'

con = duckdb.connect()

# optional: limit threads in shared kernel
try:
    con.execute("PRAGMA threads=2;")
except Exception:
    pass

print("Registering parquet as view: imdb_raw")
con.execute(f"CREATE OR REPLACE VIEW imdb_raw AS SELECT * FROM read_parquet('{INPUT_URL}');")

# Inspect schema
cols_info = con.execute("DESCRIBE imdb_raw").fetchall()
print(f"Detected {len(cols_info)} columns.")
pprint(cols_info[:40])

# Compute medians for numeric impute candidates (TRY_CAST to double)
medians = {}
for col in NUMERIC_IMPUTE_COLS:
    try:
        sql = f"""
        SELECT percentile_cont(0.5) WITHIN GROUP (ORDER BY TRY_CAST({q(col)} AS DOUBLE))
        FROM imdb_raw
        WHERE TRY_CAST({q(col)} AS DOUBLE) IS NOT NULL
        """
        res = con.execute(sql).fetchone()
        med = res[0] if res and res[0] is not None else 0.0
        if med is None or (isinstance(med, float) and (math.isnan(med) or math.isinf(med))):
            med = 0.0
    except Exception:
        med = 0.0
    medians[col] = float(med)

print("Medians to use for imputation:")
pprint(medians)

# Build select parts following rules
select_parts = []
for col, coltype in cols_info:
    # skip dropped columns
    if col in DROP_COLS:
        print(f" -> Dropping column: {col}")
        continue

    # characters: clean JSON-like arrays -> empty string if missing
    if col in CHAR_COLS:
        select_parts.append(
            f"CASE WHEN {q(col)} IS NULL OR {q(col)} = '\\\\N' OR trim({q(col)}) = '' THEN '' "
            f"ELSE lower(regexp_replace(regexp_replace(regexp_replace({q(col)}, '\\\\[|\\\\]|\"', '', 'g'), '\\\\s*,\\\\s*', ',', 'g'), '^,+|,+$', '', 'g')) END AS {q(col)}"
        )
        continue

    # list-like columns: normalize commas, lowercase, fill 'unknown' when missing
    if col in LIST_COLS:
        select_parts.append(
            f"CASE WHEN {q(col)} IS NULL OR {q(col)} = '\\\\N' OR trim({q(col)}) = '' THEN '{TEXT_FILL}' "
            f"ELSE lower(regexp_replace(regexp_replace({q(col)}, '\\\\s*,\\\\s*', ',', 'g'), '^,+|,+$', '', 'g')) END AS {q(col)}"
        )
        continue

    # isAdult -> ensure 0/1 integer
    if col == "isAdult":
        # treat 'true'/'false' or numeric; fallback to 0
        select_parts.append(
            f"CASE WHEN {q(col)} IS NULL OR {q(col)} = '\\\\N' THEN 0 "
            f"WHEN lower(trim(CAST({q(col)} AS VARCHAR))) IN ('1','true','t','yes') THEN 1 ELSE 0 END AS {q(col)}"
        )
        continue

    # numeric imputation for chosen numeric columns
    if col in medians:
        med = medians[col]
        # treat year-like and count-like as BIGINT
        if col.lower().endswith("year") or col.lower().endswith("number") or 'season' in col.lower() or 'episode' in col.lower():
            select_parts.append(f"COALESCE(TRY_CAST({q(col)} AS BIGINT), {int(med)}) AS {q(col)}")
        else:
            select_parts.append(f"COALESCE(TRY_CAST({q(col)} AS DOUBLE), {float(med)}) AS {q(col)}")
        continue

    # For any remaining column: treat text as 'unknown' when missing; lowercase/trim
    # If column is numeric-like (declared), try to cast; else fallback to text handling
    declared_type = (coltype or "").lower()
    if any(t in declared_type for t in ["tinyint","smallint","integer","int","bigint","decimal","numeric","float","double","real"]):
        # numeric declared but not in medians list -> coerce to double and coalesce to 0
        select_parts.append(f"COALESCE(TRY_CAST({q(col)} AS DOUBLE), 0) AS {q(col)}")
    else:
        select_parts.append(
            f"CASE WHEN {q(col)} IS NULL OR {q(col)} = '\\\\N' OR trim({q(col)}) = '' THEN '{TEXT_FILL}' ELSE lower(trim({q(col)})) END AS {q(col)}"
        )

# Assemble final SQL
final_select_sql = ",\n    ".join(select_parts)
create_view_sql = f"""
CREATE OR REPLACE VIEW imdb_cleaned_by_rules_view AS
SELECT
    {final_select_sql}
FROM imdb_raw;
"""

print("Creating view imdb_cleaned_by_rules_view ... (this applies all transforms)")
con.execute(create_view_sql)
print("View created.")

# Materialize to table and export parquet
print("Materializing table imdb_cleaned_by_rules ...")
con.execute("CREATE OR REPLACE TABLE imdb_cleaned_by_rules AS SELECT * FROM imdb_cleaned_by_rules_view;")
print("Exporting to parquet:", OUTPUT_PQ)
con.execute(f"COPY (SELECT * FROM imdb_cleaned_by_rules) TO '{OUTPUT_PQ}' (FORMAT PARQUET, COMPRESSION 'SNAPPY');")
print("Export finished:", OUTPUT_PQ)

# Verification: ensure no NULLs remain (print any columns with non-zero nulls)
print("Verifying null counts per column...")
cols_after = [c for c, t in cols_info if c not in DROP_COLS]
null_exprs = ", ".join([f"SUM(CASE WHEN {q(c)} IS NULL THEN 1 ELSE 0 END) AS {c}_nulls" for c in cols_after])
nulls_row = con.execute(f"SELECT {null_exprs} FROM imdb_cleaned_by_rules").fetchone()
nulls_map = {cols_after[i]: nulls_row[i] for i in range(len(cols_after))}
total_rows = con.execute("SELECT COUNT(*) FROM imdb_cleaned_by_rules").fetchone()[0]

non_zero = {k: v for k, v in nulls_map.items() if v and v > 0}
if non_zero:
    print("⚠️ Columns still containing NULLs (unexpected):")
    for k, v in non_zero.items():
        pct = v / total_rows * 100 if total_rows > 0 else 0
        print(f" - {k}: {v} nulls ({pct:.4f}%)")
else:
    print("✅ Success — no NULLs in the materialized table (after applied rules).")

# show a quick sample and some stats for the imputed numeric cols
print("\nSample rows:")
print(con.execute("SELECT * FROM imdb_cleaned_by_rules LIMIT 10").fetchdf())

print("\nNumeric summary for imputed columns:")
print(con.execute("SELECT AVG(startYear) AS avg_startYear, MEDIAN(startYear) AS med_startYear, "
                  "AVG(runtimeMinutes) AS avg_runtime, MEDIAN(runtimeMinutes) AS med_runtime, "
                  "AVG(birthYear) AS avg_birthYear, MEDIAN(birthYear) AS med_birthYear "
                  "FROM imdb_cleaned_by_rules").fetchdf())

con.close()
print("All done. Output file:", OUTPUT_PQ)


# I do it locally, with the usage of the terminal, but unfortunately the cleaning is poor , and we need to improve 

In [6]:
# duckdb_full_clean_no_nulls.py
# Run in your Azure kernel cell. Adjust INPUT_URL and OUTPUT_PATH as needed.

import duckdb
import math
import os
from pprint import pprint

# -------------- CONFIG --------------
INPUT_URL = "https://workspace4824871889.blob.core.windows.net/azureml-blobstore-84f516da-0fe5-4f33-8f3c-f18ec8e2b4f7/UI/2025-10-25_160707_UTC/imdb_sample_100k.parquet"   # change to your sample or the real parquet path/URL
OUTPUT_PARQUET = "imdb_merged_cleaned_duckdb.parquet"
# Columns that you know should be treated as list-like or JSON-like (customize if needed)
LIST_COLS = {'genres', 'directors', 'writers', 'primaryProfession', 'knownForTitles'}
CHAR_COLS = {'characters'}   # JSON-like arrays stored as strings
# Safe text default:
TEXT_FILL = "unknown"
# ------------------------------------

con = duckdb.connect()

# Limit threads if you're inside a constrained Azure shared kernel (optional)
try:
    con.execute("PRAGMA threads=2;")
except Exception:
    pass

print("Registering parquet view:", INPUT_URL)
# Create a view on the parquet file so we can query it without loading into memory
con.execute(f"CREATE OR REPLACE VIEW imdb_raw AS SELECT * FROM read_parquet('{INPUT_URL}');")

# Inspect schema
cols_info = con.execute("DESCRIBE imdb_raw").fetchall()
all_cols = [(r[0], r[1]) for r in cols_info]
print(f"Detected {len(all_cols)} columns. Sample:")
pprint(all_cols[:40])

# Helper to quote identifiers safely
def q(col):
    return '"' + col.replace('"', '""') + '"'

# Basic duckdb type heuristics
def is_numeric_type(duck_type: str):
    t = (duck_type or "").lower()
    return any(x in t for x in ["tinyint","smallint","integer","int","bigint","decimal","numeric","float","double","real"])

def is_string_type(duck_type: str):
    t = (duck_type or "").lower()
    return any(x in t for x in ["varchar","text","string","char"])

# -----------------------
# 1) Compute medians for numeric-ish columns (using TRY_CAST)
# -----------------------
numeric_candidates = [col for col, typ in all_cols if is_numeric_type(typ) or col in LIST_COLS or col in CHAR_COLS or True]
# We'll determine numeric columns more strictly below; safer to check all declared numeric-like
numeric_cols = [col for col, typ in all_cols if is_numeric_type(typ)]
# But sometimes numeric columns are strings in the file (e.g. 'numVotes' typed as varchar) -> detect common-numeric names
possible_numeric_by_name = {'numVotes','averageRating','runtimeMinutes','startYear','endYear','birthYear','deathYear','seasonNumber','episodeNumber'}
for col, typ in all_cols:
    if col in possible_numeric_by_name and col not in numeric_cols:
        numeric_cols.append(col)

numeric_cols = list(dict.fromkeys(numeric_cols))  # unique and keep order
print("Numeric columns to compute medians for:", numeric_cols)

medians = {}
for col in numeric_cols:
    try:
        # Try computing median based on values that successfully cast to double
        sql = f"""
        SELECT percentile_cont(0.5) WITHIN GROUP (ORDER BY TRY_CAST({q(col)} AS DOUBLE))
        FROM imdb_raw
        WHERE TRY_CAST({q(col)} AS DOUBLE) IS NOT NULL
        """
        res = con.execute(sql).fetchone()
        med = res[0] if res and res[0] is not None else 0.0
        if med is None or (isinstance(med, float) and (math.isnan(med) or math.isinf(med))):
            med = 0.0
    except Exception:
        med = 0.0
    medians[col] = float(med)
print("Computed medians (numeric defaults):")
pprint(medians)

# -----------------------
# 2) Build safe SELECT expressions for every column (explicit)
# -----------------------
select_parts = []
for col, coltype in all_cols:
    if col in LIST_COLS:
        # Normalize list strings (lower, trim spaces around commas), replace '\N' or NULL with TEXT_FILL
        # We keep TEXT_FILL (unknown) for lists too per requirement of no nulls
        if is_string_type(coltype):
            expr = f"CASE WHEN {q(col)} IS NULL OR {q(col)} = '\\\\N' OR trim({q(col)}) = '' THEN '{TEXT_FILL}' ELSE lower(regexp_replace(regexp_replace({q(col)}, '\\\\s*,\\\\s*', ',', 'g'), '^,+|,+$', '', 'g')) END AS {q(col)}"
        else:
            # If underlying type is not string, cast to VARCHAR first
            expr = f"CASE WHEN {q(col)} IS NULL OR {q(col)} = '\\\\N' THEN '{TEXT_FILL}' WHEN TRY_CAST({q(col)} AS DOUBLE) IS NOT NULL THEN lower(regexp_replace(regexp_replace(CAST(TRY_CAST({q(col)} AS DOUBLE) AS VARCHAR), '\\\\s*,\\\\s*', ',', 'g'), '^,+|,+$', '', 'g')) ELSE '{TEXT_FILL}' END AS {q(col)}"
        select_parts.append(expr)
        continue

    if col in CHAR_COLS:
        # remove [ ] " characters and normalize commas, fallback to empty string (or TEXT_FILL if you prefer)
        if is_string_type(coltype):
            expr = f"CASE WHEN {q(col)} IS NULL OR {q(col)} = '\\\\N' OR trim({q(col)}) = '' THEN '' ELSE lower(regexp_replace(regexp_replace(regexp_replace({q(col)}, '\\\\[|\\\\]|\"', '', 'g'), '\\\\s*,\\\\s*', ',', 'g'), '^,+|,+$', '', 'g')) END AS {q(col)}"
        else:
            expr = f"CASE WHEN {q(col)} IS NULL OR {q(col)} = '\\\\N' THEN '' WHEN TRY_CAST({q(col)} AS DOUBLE) IS NOT NULL THEN lower(regexp_replace(regexp_replace(regexp_replace(CAST(TRY_CAST({q(col)} AS DOUBLE) AS VARCHAR), '\\\\[|\\\\]|\"', '', 'g'), '\\\\s*,\\\\s*', ',', 'g'), '^,+|,+$', '', 'g')) ELSE '' END AS {q(col)}"
        select_parts.append(expr)
        continue

    # Numeric handling: TRY_CAST to double, COALESCE to median default
    if col in medians:
        med = medians[col]
        # If column looks like integer (year like) keep BIGINT cast else double
        if col.lower().endswith("year") or col.lower().endswith("number") or 'season' in col.lower() or 'episode' in col.lower() or col.lower().endswith('votes'):
            # use BIGINT / integer fallback where possible
            expr = f"COALESCE(TRY_CAST({q(col)} AS BIGINT), {int(med)}) AS {q(col)}"
        else:
            expr = f"COALESCE(TRY_CAST({q(col)} AS DOUBLE), {float(med)}) AS {q(col)}"
        select_parts.append(expr)
        continue

    # For declared string columns -> replace '\N' and null/empty with TEXT_FILL
    if is_string_type(coltype):
        expr = f"CASE WHEN {q(col)} IS NULL OR {q(col)} = '\\\\N' OR trim({q(col)}) = '' THEN '{TEXT_FILL}' ELSE lower(trim({q(col)})) END AS {q(col)}"
        select_parts.append(expr)
        continue

    # Fallback: try casting to double, else treat as text
    expr = (f"CASE WHEN {q(col)} IS NULL OR {q(col)} = '\\\\N' THEN '{TEXT_FILL}' "
            f"WHEN TRY_CAST({q(col)} AS DOUBLE) IS NOT NULL THEN TRY_CAST({q(col)} AS DOUBLE) ELSE '{TEXT_FILL}' END AS {q(col)}")
    select_parts.append(expr)

# Final assembled SQL
final_select_sql = ",\n    ".join(select_parts)
create_view_sql = f"""
CREATE OR REPLACE VIEW imdb_cleaned_full AS
SELECT
    {final_select_sql}
FROM imdb_raw;
"""
# Debug: show first 1200 chars to sanity-check
print("CREATE VIEW imdb_cleaned_full (snippet):")
print(create_view_sql[:1200])

# -----------------------
# 3) Execute view creation
# -----------------------
con.execute(create_view_sql)
print("View imdb_cleaned_full created.")

# -----------------------
# 4) Materialize (create table) and export to Parquet (streamed)
# -----------------------
# Materialize to a table so we can run verification queries quickly
print("Creating table imdb_no_nulls (materialized) ...")
con.execute("CREATE OR REPLACE TABLE imdb_no_nulls AS SELECT * FROM imdb_cleaned_full;")
print("Table imdb_no_nulls created. Now exporting to Parquet:", OUTPUT_PARQUET)
con.execute(f"COPY (SELECT * FROM imdb_no_nulls) TO '{OUTPUT_PARQUET}' (FORMAT PARQUET, COMPRESSION 'SNAPPY');")
print("Export complete:", OUTPUT_PARQUET)

# -----------------------
# 5) VERIFY: ensure ZERO NULLS across all columns
# -----------------------
print("Verifying null counts for every column...")
cols = [c for c, t in all_cols]

# Build null count expressions safely (avoid alias quoting issues)
null_exprs = ",\n".join([
    f"SUM(CASE WHEN {q(col)} IS NULL THEN 1 ELSE 0 END) AS {col}_nulls"
    for col in cols
])
nulls_sql = f"SELECT {null_exprs} FROM imdb_no_nulls;"

nulls_row = con.execute(nulls_sql).fetchone()

# Create mapping col -> null count
nulls_map = {cols[i]: nulls_row[i] for i in range(len(cols))}
non_zero = {k: v for k, v in nulls_map.items() if v and v > 0}

print("Columns with non-zero null counts (should be empty):")
if non_zero:
    for k, v in non_zero.items():
        print(f"❌ {k}: {v}")
else:
    print("✅ Success — absolutely no nulls found in imdb_no_nulls table.")

# Close connection
con.close()


Registering parquet view: https://workspace4824871889.blob.core.windows.net/azureml-blobstore-84f516da-0fe5-4f33-8f3c-f18ec8e2b4f7/UI/2025-10-25_160707_UTC/imdb_sample_100k.parquet
Detected 36 columns. Sample:
[('tconst', 'VARCHAR'),
 ('titleType', 'VARCHAR'),
 ('primaryTitle', 'VARCHAR'),
 ('originalTitle', 'VARCHAR'),
 ('isAdult', 'BIGINT'),
 ('startYear', 'BIGINT'),
 ('endYear', 'VARCHAR'),
 ('runtimeMinutes', 'BIGINT'),
 ('genres', 'VARCHAR'),
 ('averageRating', 'DOUBLE'),
 ('numVotes', 'BIGINT'),
 ('directors', 'VARCHAR'),
 ('writers', 'VARCHAR'),
 ('parentTconst', 'VARCHAR'),
 ('seasonNumber', 'BIGINT'),
 ('episodeNumber', 'BIGINT'),
 ('titleId', 'VARCHAR'),
 ('ordering', 'BIGINT'),
 ('title', 'VARCHAR'),
 ('region', 'VARCHAR'),
 ('language', 'VARCHAR'),
 ('types', 'VARCHAR'),
 ('attributes', 'VARCHAR'),
 ('isOriginalTitle', 'BIGINT'),
 ('tconst_1', 'VARCHAR'),
 ('ordering_1', 'BIGINT'),
 ('nconst', 'VARCHAR'),
 ('category', 'VARCHAR'),
 ('job', 'VARCHAR'),
 ('characters', 'VARCH

# Some random overview

In [8]:
import duckdb
con = duckdb.connect()

# 1) show a quick sample
print("Sample rows (limit 20):")
display(con.execute("SELECT * FROM imdb_merged_cleaned_duckdb.parquet LIMIT 20").fetchdf())

# 2) view schema / types
print("\nDESCRIBE imdb_merged_cleaned_duckdb.parquet:")
print(con.execute("DESCRIBE imdb_merged_cleaned_duckdb.parquet").fetchall())

# 3) row count
print("\nTotal rows in view:")
print(con.execute("SELECT COUNT(*) FROM imdb_merged_cleaned_duckdb.parquet").fetchone()[0])

# 4) null counts per column (fast aggregate)
print("\nNull counts per column (shows columns with >0 nulls):")
# build query dynamically
cols = [row[0] for row in con.execute("DESCRIBE imdb_merged_cleaned_duckdb.parquet").fetchall()]
null_exprs = ",\n".join([f"SUM(CASE WHEN {duck_col} IS NULL THEN 1 ELSE 0 END) AS {duck_col}_nulls"
                         for duck_col in cols])
sql_nulls = f"SELECT {null_exprs} FROM imdb_merged_cleaned_duckdb.parquet;"
null_counts = con.execute(sql_nulls).fetchdf()
# transpose for readability
null_counts_t = null_counts.T
null_counts_t.columns = ['null_count']
display(null_counts_t[null_counts_t['null_count']>0].sort_values('null_count', ascending=False).head(200))

# 5) percent nulls for the top offenders
print("\nTop 30 percent missing (approx):")
total = con.execute("SELECT COUNT(*) FROM imdb_merged_cleaned_duckdb.parquet").fetchone()[0]
pct_rows = []
for c in cols:
    nulls = con.execute(f"SELECT SUM(CASE WHEN {c} IS NULL THEN 1 ELSE 0 END) FROM imdb_merged_cleaned_duckdb.parquet").fetchone()[0]
    pct = (nulls / total * 100) if total>0 else 0
    pct_rows.append((c, nulls, pct))
pct_rows = sorted(pct_rows, key=lambda x: x[1], reverse=True)
for c, nulls, pct in pct_rows[:30]:
    print(f"{c:30s} | nulls={nulls:10d} | pct={pct:6.3f}%")

# 6) Show non-castable examples for numeric columns (common cause of unexpected nulls)
NUMERIC_CHECK = ['averageRating','numVotes','runtimeMinutes','startYear']
for c in NUMERIC_CHECK:
    if c in cols:
        q = f"SELECT {c} FROM imdb_merged_cleaned_duckdb.parquet WHERE {c} IS NOT NULL AND TRY_CAST({c} AS DOUBLE) IS NULL LIMIT 50;"
        sample_bad = con.execute(q).fetchdf()
        print(f"\nNon-castable examples for {c} (up to 50 rows):")
        display(sample_bad)

# 7) Show rows that contain literal '\N' for string columns (common issue)
STRING_CHECK = ['primaryTitle','primaryName','title','region','language','genres','directors','writers']
for c in STRING_CHECK:
    if c in cols:
        q = f"SELECT {c}, COUNT(*) as cnt FROM imdb_merged_cleaned_duckdb.parquet WHERE {c} = '\\\\N' GROUP BY {c} LIMIT 10;"
        res = con.execute(q).fetchall()
        if res:
            print(f"\nLiteral '\\N' occurrences in column {c}:")
            print(res)

con.close()


Sample rows (limit 20):


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,...,nconst,category,job,characters,nconst_1,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,tt17755732,tvepisode,episode #1.210,episode #1.210,0.0,2020,2016,54.0,"drama,fantasy,romance",6.7,...,nm12909343,actress,unknown,[citra marisca],nm12909343,harini sondakh,1961,2007,actress,"tt15368302,tt13984562,tt28590286"
1,tt34507138,tvepisode,episode #1.335,episode #1.335,0.0,2024,2016,45.0,"drama,romance",6.7,...,nm4137808,actor,unknown,,nm4137808,rajeev parameshwar,1961,2007,actor,"tt3605606,tt2796978,tt1754332,tt10189448"
2,tt0207871,tvseries,buccaneer,buccaneer,0.0,1980,2016,50.0,"adventure,drama",6.8,...,nm0204096,actor,unknown,[accountant],nm0204096,geoffrey davion,1940,1996,actor,"tt0087749,tt0072566,tt0090852,tt0065290"
3,tt28210850,short,"howdy, comrade!","howdy, comrade!",0.0,2023,2016,6.0,"comedy,short",6.7,...,nm12859469,producer,producer,,nm12859469,jax maloney,1961,2007,"actor,writer,director","tt28054598,tt20223646,tt32573807,tt15741906"
4,tt7121444,tvepisode,episode #1.149,episode #1.149,0.0,1994,2016,60.0,"drama,romance",6.7,...,nm1018021,producer,producer,,nm1018021,maría josé fuentebuena,1961,2007,"producer,miscellaneous","tt0396300,tt0227896,tt6556846,tt10971476"
5,tt9188364,tvepisode,episode #1.29,episode #1.29,0.0,2015,2016,45.0,fantasy,6.7,...,nm1387873,director,unknown,,nm1387873,darnel villaflor,1961,2007,"director,miscellaneous,producer","tt12275096,tt8528294,tt1836451,tt16154940"
6,tt27803650,tvepisode,episode #5.115,episode #5.115,0.0,2021,2016,180.0,reality-tv,6.7,...,nm14860922,director,unknown,,nm14860922,alessio pollacci,1961,2007,director,"tt6078678,tt0261474,tt36895482,tt31109465"
7,tt29929538,tvepisode,episode #3.16,episode #3.16,0.0,2012,2016,45.0,"animation,family",6.7,...,nm1997137,writer,writer,,nm1997137,franck salomé,1965,2007,"writer,director","tt0878817,tt0804234,tt9636800,tt0414746"
8,tt1418612,tvepisode,episode dated 17 april 2009,episode dated 17 april 2009,0.0,2009,2016,45.0,"news,talk-show",6.7,...,nm0787754,self,unknown,[self - film critic],nm0787754,gene shalit,1926,2007,"actor,writer,archive_footage","tt0165042,tt0057758,tt0080249,tt0108734"
9,tt10603930,tvepisode,episode #1.49,episode #1.49,0.0,2013,2016,45.0,romance,6.7,...,nm10757708,writer,unknown,,nm10757708,krish jagarlamudi,1961,2007,"writer,producer","tt31925699,tt3667404,tt20836266,tt12234738"



DESCRIBE imdb_merged_cleaned_duckdb.parquet:
[('tconst', 'VARCHAR', 'YES', None, None, None), ('titleType', 'VARCHAR', 'YES', None, None, None), ('primaryTitle', 'VARCHAR', 'YES', None, None, None), ('originalTitle', 'VARCHAR', 'YES', None, None, None), ('isAdult', 'DOUBLE', 'YES', None, None, None), ('startYear', 'BIGINT', 'YES', None, None, None), ('endYear', 'BIGINT', 'YES', None, None, None), ('runtimeMinutes', 'DOUBLE', 'YES', None, None, None), ('genres', 'VARCHAR', 'YES', None, None, None), ('averageRating', 'DOUBLE', 'YES', None, None, None), ('numVotes', 'BIGINT', 'YES', None, None, None), ('directors', 'VARCHAR', 'YES', None, None, None), ('writers', 'VARCHAR', 'YES', None, None, None), ('parentTconst', 'VARCHAR', 'YES', None, None, None), ('seasonNumber', 'BIGINT', 'YES', None, None, None), ('episodeNumber', 'BIGINT', 'YES', None, None, None), ('titleId', 'VARCHAR', 'YES', None, None, None), ('ordering', 'DOUBLE', 'YES', None, None, None), ('title', 'VARCHAR', 'YES', None, 

Unnamed: 0,null_count



Top 30 percent missing (approx):
tconst                         | nulls=         0 | pct= 0.000%
titleType                      | nulls=         0 | pct= 0.000%
primaryTitle                   | nulls=         0 | pct= 0.000%
originalTitle                  | nulls=         0 | pct= 0.000%
isAdult                        | nulls=         0 | pct= 0.000%
startYear                      | nulls=         0 | pct= 0.000%
endYear                        | nulls=         0 | pct= 0.000%
runtimeMinutes                 | nulls=         0 | pct= 0.000%
genres                         | nulls=         0 | pct= 0.000%
averageRating                  | nulls=         0 | pct= 0.000%
numVotes                       | nulls=         0 | pct= 0.000%
directors                      | nulls=         0 | pct= 0.000%
writers                        | nulls=         0 | pct= 0.000%
parentTconst                   | nulls=         0 | pct= 0.000%
seasonNumber                   | nulls=         0 | pct= 0.000%
episod

Unnamed: 0,averageRating



Non-castable examples for numVotes (up to 50 rows):


Unnamed: 0,numVotes



Non-castable examples for runtimeMinutes (up to 50 rows):


Unnamed: 0,runtimeMinutes



Non-castable examples for startYear (up to 50 rows):


Unnamed: 0,startYear


In [None]:
import duckdb

# The URL to your cleaned file
url = "https://workspace4824871889.blob.core.windows.net/azureml-blobstore-84f516da-0fe5-4f33-8f3c-f18ec8e2b4f7/UI/2025-10-26_112727_UTC/imdb_merged_cleaned_duckdb.parquet"

# Connect to an in-memory database
con = duckdb.connect(database=':memory:')

# 1. Install and load the httpfs extension
# This is required to read files from URLs
print("Loading httpfs extension...")
con.execute("INSTALL httpfs;")
con.execute("LOAD httpfs;")

# 2. Create a VIEW (this is instant and uses no memory)
# This just tells DuckDB where the file is. It doesn't download it.
print(f"Registering URL as 'imdb_cleaned' view...")
con.execute(f"CREATE OR REPLACE VIEW imdb_cleaned AS SELECT * FROM '{url}'")

print("\n✅ Done! The view 'imdb_cleaned' is ready to be queried.")

# ==========================================================
#  NOW YOU CAN QUERY IT SAFELY:
# ==========================================================

# Example 1: Get 10 rows to see the columns
print("\n--- Example 1: Grabbing 10 rows ---")
df_sample = con.sql("SELECT * FROM imdb_cleaned LIMIT 10").df()
print(df_sample)


# Example 2: Run an aggregation
# DuckDB does the heavy work, you just get the small result.
print("\n--- Example 2: Running a safe aggregation ---")
query = """
SELECT 
    titleType, 
    COUNT(*) as total_rows
FROM imdb_cleaned 
GROUP BY titleType
"""
df_agg = con.sql(query).df()

print(df_agg)

# You can keep using 'con' to run any query you want on the 'imdb_cleaned' view
# con.close() # Close it when you're all done

# Checking how our cleaned data looks like on a sample cleaned data

In [1]:
import pandas as pd

url = "https://workspace4824871889.blob.core.windows.net/azureml-blobstore-84f516da-0fe5-4f33-8f3c-f18ec8e2b4f7/UI/2025-10-26_114928_UTC/imdb_merged_cleaned_duckdb.parquet"
df = pd.read_parquet(url, engine="pyarrow")

print(df.head())
print('\n', df.info())


       tconst  titleType          primaryTitle       originalTitle  isAdult  \
0   tt0207871   tvSeries             buccaneer           Buccaneer      0.0   
1   tt0118694      movie  in the mood for love    Fa yeung nin wah      0.0   
2  tt13586826  tvEpisode    allumer le camping  Allumer le Camping      0.0   
3   tt2226407      movie         the landlords     Padroni di casa      0.0   
4  tt28636869  tvEpisode          episode #2.4        Episode #2.4      0.0   

   startYear endYear  runtimeMinutes                  genres  averageRating  \
0     1980.0    None            50.0         adventure,drama            6.8   
1     2000.0    None            98.0           drama,romance            8.0   
2     2020.0    None            45.0                  comedy            5.4   
3     2012.0    None            90.0                   drama            6.2   
4     2023.0    None             NaN  action,adventure,drama            8.0   

   ...  ordering_1     nconst  category           