In [17]:
import pandas as pd

def _replace_na(df: pd.DataFrame) -> pd.DataFrame:
    return df.replace(r"\N", pd.NA)

def clean_title_basics(df: pd.DataFrame) -> pd.DataFrame:
    """Normalise the column types of a title-basics frame.

    Steps, in order:
      1. ``_replace_na`` produces a new frame with placeholder cells replaced
         by ``pd.NA``.
      2. ``startYear``, ``endYear`` and ``runtimeMinutes`` are parsed as
         numbers; unparseable values become NaN.
      3. ``isAdult`` is cast to the nullable ``Int64`` dtype.
      4. ``genres`` is split on commas, yielding a list per non-missing row.

    Args:
        df: Frame containing at least the five columns named above.

    Returns:
        The cleaned frame produced from ``df``.
    """
    cleaned = _replace_na(df)

    for column in ("startYear", "endYear", "runtimeMinutes"):
        # errors="coerce" maps anything that is not a number to NaN
        # instead of raising.
        cleaned[column] = pd.to_numeric(cleaned[column], errors="coerce")

    # Nullable integer dtype so missing flags can coexist with 0/1 values.
    cleaned["isAdult"] = cleaned["isAdult"].astype("Int64")
    cleaned["genres"] = cleaned["genres"].str.split(",")
    return cleaned

def clean_title_ratings(df: pd.DataFrame) -> pd.DataFrame:
    """Normalise the column types of a title-ratings frame.

    Placeholder cells are first replaced via ``_replace_na``; then
    ``averageRating`` and ``numVotes`` are parsed as numbers, with
    unparseable values becoming NaN.

    Args:
        df: Frame containing at least ``averageRating`` and ``numVotes``.

    Returns:
        The cleaned frame produced from ``df``.
    """
    cleaned = _replace_na(df)
    for column in ("averageRating", "numVotes"):
        # errors="coerce": bad values turn into NaN rather than raising.
        cleaned[column] = pd.to_numeric(cleaned[column], errors="coerce")
    return cleaned

def clean_title_crew(df: pd.DataFrame) -> pd.DataFrame:
    """Split the comma-separated crew columns of a title-crew frame.

    Placeholder cells are first replaced via ``_replace_na``; then
    ``directors`` and ``writers`` are each split on commas, yielding a list
    per non-missing row.

    Args:
        df: Frame containing at least ``directors`` and ``writers``.

    Returns:
        The cleaned frame produced from ``df``.
    """
    cleaned = _replace_na(df)
    for column in ("directors", "writers"):
        cleaned[column] = cleaned[column].str.split(",")
    return cleaned

def clean_name_basics(df: pd.DataFrame) -> pd.DataFrame:
    """Normalise the column types of a name-basics frame.

    Steps, in order:
      1. ``_replace_na`` produces a new frame with placeholder cells replaced
         by ``pd.NA``.
      2. ``birthYear`` and ``deathYear`` are parsed as numbers; unparseable
         values become NaN.
      3. ``primaryProfession`` and ``knownForTitles`` are split on commas,
         yielding a list per non-missing row.

    Args:
        df: Frame containing at least the four columns named above.

    Returns:
        The cleaned frame produced from ``df``.
    """
    cleaned = _replace_na(df)

    for year_column in ("birthYear", "deathYear"):
        # errors="coerce": bad values turn into NaN rather than raising.
        cleaned[year_column] = pd.to_numeric(cleaned[year_column], errors="coerce")

    for list_column in ("primaryProfession", "knownForTitles"):
        cleaned[list_column] = cleaned[list_column].str.split(",")

    return cleaned


In [18]:
from pathlib import Path
from imdb.utils import load_tsv
from imdb.transform import (
    clean_title_basics,
    clean_title_ratings,
    clean_title_crew,
    clean_name_basics
)
import pandas as pd

def clean_and_save(
    filename: str,
    cleaner,
    output_name: str,
    output_dir: str = "data/processed",
) -> pd.DataFrame:
    """Load a TSV file, clean it, and persist the result as Parquet.

    Args:
        filename: Location of the input file; it is wrapped in a ``Path`` and
            handed to ``load_tsv``.
        cleaner: Callable that receives the loaded DataFrame and returns the
            cleaned DataFrame.
        output_name: Stem of the output file; ``.parquet`` is appended.
        output_dir: Directory that receives the Parquet file. Defaults to
            ``data/processed``, the location that used to be hard-coded, so
            existing three-argument calls behave as before.

    Returns:
        The cleaned DataFrame, i.e. the same object that was written to disk.
    """
    raw = load_tsv(Path(filename))
    df_clean = cleaner(raw)

    out_path = Path(output_dir) / f"{output_name}.parquet"
    # Create the target directory up front: on a fresh checkout it may not
    # exist yet, and to_parquet does not create missing parent directories.
    out_path.parent.mkdir(parents=True, exist_ok=True)

    df_clean.to_parquet(out_path, index=False)
    return df_clean

# Clean each raw TSV and write it out as Parquet; the cleaned frames are kept
# in the notebook namespace for later cells.
df_basics = clean_and_save(
    filename="title.basics.tsv.gz",
    cleaner=clean_title_basics,
    output_name="title_basics",
)
df_ratings = clean_and_save(
    filename="title.ratings.tsv.gz",
    cleaner=clean_title_ratings,
    output_name="title_ratings",
)
df_crew = clean_and_save(
    filename="title.crew.tsv.gz",
    cleaner=clean_title_crew,
    output_name="title_crew",
)
df_names = clean_and_save(
    filename="name.basics.tsv.gz",
    cleaner=clean_name_basics,
    output_name="name_basics",
)
