In [2]:
from pathlib import Path
import re
import numpy as np
import pandas as pd
from typing import List, Optional
import io
import time

#Data directory
DATA_DIR = Path("data")
DATA_DIR.mkdir(exist_ok=True)

PROVIDER_FILES = {
    "Netflix": DATA_DIR / "netflix_titles.csv",
    "Amazon Prime": DATA_DIR / "amazon_prime_titles.csv",
    "Disney+": DATA_DIR / "disney_plus_titles.csv",
}
for k, v in PROVIDER_FILES.items():
    if not v.exists():
        raise FileNotFoundError(f"Missing file for {k}: {v}")
print("Provider files:", PROVIDER_FILES)
print("Done")

Provider files: {'Netflix': PosixPath('data/netflix_titles.csv'), 'Amazon Prime': PosixPath('data/amazon_prime_titles.csv'), 'Disney+': PosixPath('data/disney_plus_titles.csv')}
Done


In [3]:
def load_any_df(path: Path) -> pd.DataFrame:
    ext = path.suffix.lower()
    if ext == ".csv":
        return pd.read_csv(path)
    #JSON: try standard then JSON-lines
    try:
        return pd.read_json(path, lines=False)
    except ValueError:
        return pd.read_json(path, lines=True)

def to_snake(name: str) -> str:
    name = re.sub(r"[^\w]+", "_", name.strip())
    name = re.sub(r"([a-z0-9])([A-Z])", r"\1_\2", name)
    return name.lower().strip("_")

def get_col(df: pd.DataFrame, aliases: List[str]) -> Optional[str]:
    #Try snake-case alias match
    snake = {to_snake(c): c for c in df.columns}
    for a in aliases:
        if a in snake:
            return snake[a]
    #Fallback: case-insensitive plain match
    lowers = [al.replace("_", " ") for al in aliases]
    for c in df.columns:
        if c.lower() in lowers:
            return c
    return None

STANDARD_COLS = {
    "title": ["title", "show_title", "name"],
    "type": ["type", "content_type", "show_type"],
    "release_year": ["release_year", "year"],
    "genres": ["genres", "genre", "listed_in"],
    "imdb_rating": ["imdb_rating", "imdb_score", "score"],
    "imdb_votes": ["imdb_votes", "votes"],
    "content_rating": ["rating", "age_certification", "maturity_rating"],
}


def coerce_schema(df: pd.DataFrame, platform: str) -> pd.DataFrame:
    out = pd.DataFrame()
    for std, aliases in STANDARD_COLS.items():
        aliases_snake = [to_snake(a) for a in aliases]
        if std not in aliases_snake:
            aliases_snake.append(std)
        col = get_col(df, aliases_snake)
        out[std] = df[col] if col is not None else np.nan
    out["platform"] = platform    
    return out

In [4]:
dfs = []
for plat, path in PROVIDER_FILES.items():
    raw = load_any_df(path)
    std = coerce_schema(raw, plat)
    dfs.append(std)

print("Standardized shapes:")
for d in dfs:
    print(f"{d['platform'].iloc[0]} -> {d.shape}")
    

Standardized shapes:
Netflix -> (8807, 8)
Amazon Prime -> (9668, 8)
Disney+ -> (1450, 8)


In [5]:
def normalize_strings(df: pd.DataFrame, cols: list[str]) -> pd.DataFrame:
    for c in cols:
        if c in df.columns:
            df[c] = df[c].astype(str).str.strip()
            df.loc[df[c].isin(["nan", "NaN", "None"]), c] = np.nan
    return df

def to_numeric(df: pd.DataFrame, cols: list[str]) -> pd.DataFrame:
    for c in cols:
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], errors="coerce")
    return df

def safe_split(x, seps=(",", "|", ";")):
    if pd.isna(x): return []
    if isinstance(x, list): return [str(i).strip() for i in x if str(i).strip()]
    s = str(x)
    for sep in seps:
        s = s.replace(sep, ",")
    return [i.strip() for i in s.split(",") if i.strip()]

def split_multivalue_cols(df: pd.DataFrame, cols: list[str]) -> pd.DataFrame:
    for c in cols:
        if c in df.columns:
            df[c] = df[c].apply(safe_split)
    return df

cleaned = []
for df in dfs:
    df = df.copy()
    df = normalize_strings(df, ["title","type","country","genres","director","cast","date_added","duration"])
    df = to_numeric(df, ["release_year","imdb_rating","imdb_votes"])
    df = split_multivalue_cols(df, ["country","genres","cast","director"])

    #Ensure genres is a list,default only if  missing
    if "genres" in df.columns:
        df["genres"] = df["genres"].apply(lambda g: g if isinstance(g, list) and len(g) else ["Unknown"])

    cleaned.append(df)

print("Post-clean dtypes:")
cleaned[0].dtypes.head(12)
print (cleaned)


Post-clean dtypes:
[                      title     type  release_year  \
0      Dick Johnson Is Dead    Movie          2020   
1             Blood & Water  TV Show          2021   
2                 Ganglands  TV Show          2021   
3     Jailbirds New Orleans  TV Show          2021   
4              Kota Factory  TV Show          2021   
...                     ...      ...           ...   
8802                 Zodiac    Movie          2007   
8803            Zombie Dumb  TV Show          2018   
8804             Zombieland    Movie          2009   
8805                   Zoom    Movie          2006   
8806                 Zubaan    Movie          2015   

                                                 genres  imdb_rating  \
0                                       [Documentaries]          NaN   
1     [International TV Shows, TV Dramas, TV Mysteries]          NaN   
2     [Crime TV Shows, International TV Shows, TV Ac...          NaN   
3                              [Docuseries,

In [6]:
def _safe_median(s: pd.Series):
    s2 = s.dropna()
    return s2.median() if not s2.empty else np.nan

def impute_values_safely(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()

#Fill numerical values with medians if it exists
    for c in ["imdb_rating", "imdb_votes", "release_year"]:
        if c in df.columns:
            med = _safe_median(df[c])
            if not np.isnan(med):
                df[c] = df[c].fillna(med)

#List categoricals ensuring non-empty lists
    def ensure_list(val):
        if isinstance(val, list):
            return val if len(val) else ["Unknown"]
        if pd.isna(val):
            return ["Unknown"]
        return [str(val)]

    for c in ["country","genres","cast","director"]:
        if c in df.columns:
            df[c] = df[c].apply(ensure_list)

#Fill Scalar strings only when missing
    for c in ["title","type","date_added","duration"]:
        if c in df.columns:
            df[c] = df[c].where(df[c].notna(), "Unknown")

    return df

imputed = [impute_values_safely(df) for df in cleaned]


In [7]:
def _normalize_title_for_dupes(s: pd.Series) -> pd.Series:
    def norm(x: str) -> str:
        if pd.isna(x): return x
        x = str(x).strip().lower()
        x = re.sub(r"\s+", " ", x)
        x = re.sub(r"[^\w\s]", "", x)
        return x
    return s.astype(str).apply(norm)

def drop_dupes_robust(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    df["__norm_title__"] = _normalize_title_for_dupes(df["title"]) if "title" in df.columns else np.nan
    keys = ["__norm_title__"]
    for k in ["platform","type","release_year"]:
        if k in df.columns:
            keys.append(k)
    before = len(df)
    df = df.drop_duplicates(subset=keys).reset_index(drop=True)
    after = len(df)
    print(f"De-duplicated: {before} → {after} (removed {before - after})")
    return df.drop(columns=["__norm_title__"])

deduped = [drop_dupes_robust(df) for df in imputed]


De-duplicated: 8807 → 8802 (removed 5)
De-duplicated: 9668 → 9659 (removed 9)
De-duplicated: 1450 → 1450 (removed 0)


In [8]:
#Grab IMDB data
IMDB_DIR = DATA_DIR / "imdb"
IMDB_DIR.mkdir(parents=True, exist_ok=True)

print("IMDb directory:", IMDB_DIR)
print("Found:", sorted(p.name for p in IMDB_DIR.glob("*.gz")))

IMDb directory: data/imdb
Found: ['title.akas.tsv.gz', 'title.basics.tsv.gz', 'title.ratings.tsv.gz']


In [9]:
#TAke only whats needed

use_basics  = ["tconst","titleType","primaryTitle","originalTitle","startYear"]
use_ratings = ["tconst","averageRating","numVotes"]

basics  = pd.read_csv(IMDB_DIR/"title.basics.tsv.gz",  sep="\t",
                      usecols=use_basics,  na_values="\\N", low_memory=False)
ratings = pd.read_csv(IMDB_DIR/"title.ratings.tsv.gz", sep="\t",
                      usecols=use_ratings, na_values="\\N", low_memory=False)

#Keep only movies & series that have ratings
basics = basics[basics["titleType"].isin(["movie","tvSeries","tvMiniSeries"])].copy()
imdb   = basics.merge(ratings, on="tconst", how="inner")

#dtypes
imdb["startYear"]     = pd.to_numeric(imdb["startYear"], errors="coerce").astype("Int64")
imdb["averageRating"] = pd.to_numeric(imdb["averageRating"], errors="coerce")
imdb["numVotes"]      = pd.to_numeric(imdb["numVotes"], errors="coerce")

print("Done")

Done


In [10]:
#Normalize titles and map types

def norm_title_series(s: pd.Series) -> pd.Series:
    def _n(x):
        if pd.isna(x): return None
        x = str(x).strip().lower()
        x = re.sub(r"\(aka[^)]*\)|\[[^\]]*\]|\([^)]*\)", "", x)  
        x = re.sub(r"[^\w\s]", "", x)                           
        x = re.sub(r"\s+", " ", x).strip()
        x = re.sub(r"^(the|a|an)\s+", "", x)                    
        return x
    return s.apply(_n)

#left: merged dataset
combined_df = pd.concat(deduped, ignore_index=True)
left = combined_df.copy()
left["__norm_title"] = norm_title_series(left["title"])
left["__type_grp"] = (left["type"].astype(str).str.strip().str.lower().map(lambda t: "Movie" if t == "movie" else "TV"))

#right: IMDb
imdb["__norm_title_p"] = norm_title_series(imdb["primaryTitle"])
imdb["__norm_title_o"] = norm_title_series(imdb["originalTitle"])
imdb["__type_grp"]     = imdb["titleType"].map(lambda t: "Movie" if t=="movie" else "TV")

print("Done")

Done


In [11]:
#Start joining
cand = imdb[imdb["startYear"].notna()].copy()

cols = ["tconst","titleType","__type_grp","startYear","averageRating","numVotes"]

cand1 = cand[cols + ["__norm_title_p"]].rename(columns={"__norm_title_p":"__norm_title"})
cand2 = cand[cols + ["__norm_title_o"]].rename(columns={"__norm_title_o":"__norm_title"})

cand_all = (
    pd.concat([cand1, cand2], ignore_index=True)
      .dropna(subset=["__norm_title"])
      .drop_duplicates(subset=["tconst","__norm_title","startYear","__type_grp"])
)
_type_rank = {"tvSeries": 0, "tvMiniSeries": 1, "movie": 0}
cand_all["__type_rank"] = cand_all["titleType"].map(_type_rank).fillna(2)

cand_all = (
    cand_all.sort_values(
        ["__norm_title","startYear","__type_grp","__type_rank","numVotes","averageRating"],
        ascending=[True, True, True, True, False, False]
    )
    .drop_duplicates(subset=["__norm_title","startYear","__type_grp"], keep="first")
)
assert not cand_all.duplicated(subset=["__norm_title","startYear","__type_grp"]).any()
#speed filtering
_needed_years = set(left["release_year"].dropna().astype(int))
_needed_years = _needed_years | {y + 1 for y in _needed_years} | {y - 1 for y in _needed_years}
cand_all = cand_all[cand_all["startYear"].isin(_needed_years)]
cand_all = cand_all[cand_all["numVotes"].fillna(0) >= 50] 

#Minor categorical optimization
cand_all["__type_grp"] = cand_all["__type_grp"].astype("category")
left["__type_grp"] = left["__type_grp"].astype("category")
m1 = left.merge(
    cand_all,
    left_on=["__norm_title","release_year","__type_grp"],
    right_on=["__norm_title","startYear","__type_grp"],
    how="left",
    validate="m:1"
)
print(f"Streaming titles that successfully matched an IMDb record: {m1['averageRating'].notna().mean()*100:.1f}%")

Streaming titles that successfully matched an IMDb record: 50.0%


In [12]:
#PAss 2 with 1 year bakcfill
t2_start = time.perf_counter()
print("Pass-2: starting ±1 year backfill...")

need_mask = m1["averageRating"].isna()

#Rows still missing
need = m1.loc[need_mask, ["__norm_title", "__type_grp", "release_year"]].copy()

#+1 year candidates
j1 = need.assign(release_year_shift=need["release_year"] + 1).merge(
    cand_all,
    left_on=["__norm_title", "__type_grp", "release_year_shift"],
    right_on=["__norm_title", "__type_grp", "startYear"],
    how="left",
    validate="m:1",
    suffixes=("", "_r"),
)

#-1 year candidates
j2 = need.assign(release_year_shift=need["release_year"] - 1).merge(
    cand_all,
    left_on=["__norm_title", "__type_grp", "release_year_shift"],
    right_on=["__norm_title", "__type_grp", "startYear"],
    how="left",
    validate="m:1",
    suffixes=("", "_r"),
)

#Pick best: prefer j1 when present else j2
r_fill = j1["averageRating"].combine_first(j2["averageRating"])
v_fill = j1["numVotes"].combine_first(j2["numVotes"])
t_fill = j1["tconst"].combine_first(j2["tconst"])

m2 = m1.copy()
m2.loc[need_mask, "averageRating"] = r_fill.values
m2.loc[need_mask, "numVotes"]      = v_fill.values
m2.loc[need_mask, "tconst"]        = t_fill.values

print(f"Coverage pass-2 (±1): {m2['averageRating'].notna().mean()*100:.1f}%")
print(f"Pass-2 done in {time.perf_counter() - t2_start:.1f}s")

Pass-2: starting ±1 year backfill...
Coverage pass-2 (±1): 58.7%
Pass-2 done in 0.6s


In [13]:
#Pass-3: AKAs same-year backfill
t3_start = time.perf_counter()
print("Pass-3: starting AKAs backfill...")

use_akas = ["titleId", "title", "region", "isOriginalTitle"]
akas = pd.read_csv(IMDB_DIR / "title.akas.tsv.gz", sep="\t", usecols=use_akas, na_values="\\N", low_memory=False)

akas["__norm_title"] = norm_title_series(akas["title"])
aka_join = (
    akas.dropna(subset=["__norm_title"])
        .merge(
            imdb[["tconst", "__type_grp", "startYear", "averageRating", "numVotes"]],
            left_on="titleId",
            right_on="tconst",
            how="inner",
        )[["__norm_title", "__type_grp", "startYear", "averageRating", "numVotes", "tconst"]]
        .sort_values(
            ["__norm_title", "__type_grp", "startYear", "numVotes", "averageRating"],
            ascending=[True, True, True, False, False]
        )
        .drop_duplicates(subset=["__norm_title", "__type_grp", "startYear"], keep="first")
)
_needed_years = set(m2["release_year"].dropna().astype(int))
aka_join = aka_join[aka_join["startYear"].isin(_needed_years)]


need_mask = m2["averageRating"].isna() & m2["__norm_title"].notna()
need = m2.loc[need_mask, ["__norm_title", "__type_grp", "release_year"]]

j_aka = need.merge(
    aka_join,
    left_on=["__norm_title", "__type_grp", "release_year"],
    right_on=["__norm_title", "__type_grp", "startYear"],
    how="left",
    validate="m:1",
)

m3 = m2.copy()
m3.loc[need_mask, "averageRating"] = j_aka["averageRating"].values
m3.loc[need_mask, "numVotes"]      = j_aka["numVotes"].values
m3.loc[need_mask, "tconst"]        = j_aka["tconst"].values

print(f"Coverage pass-3 (AKAs, same year): {m3['averageRating'].notna().mean()*100:.1f}%")
print(f"Pass-3 done in {time.perf_counter() - t3_start:.1f}s")

Pass-3: starting AKAs backfill...
Coverage pass-3 (AKAs, same year): 65.3%
Pass-3 done in 175.8s


In [14]:
#Use unicode add in coutnry lists
import unicodedata
t_ctry_start = time.perf_counter()
print("Countries: building country_map from AKAs...")

need_cols = {"titleId","title","region","isOriginalTitle"}
if not need_cols.issubset(akas.columns):
    use_akas = ["titleId", "title", "region", "isOriginalTitle"]
    akas = pd.read_csv(IMDB_DIR / "title.akas.tsv.gz", sep="\t",
                       usecols=use_akas, na_values="\\N", low_memory=False)
    
_needed_tconsts = set()
if "m3" in globals() and "tconst" in m3.columns:
    try:
        _needed_tconsts |= set(m3.loc[m3["country"].isna(), "tconst"].dropna().astype(str))
    except Exception:
        _needed_tconsts |= set(m3["tconst"].dropna().astype(str))
elif "m2" in globals() and "tconst" in m2.columns:
    _needed_tconsts |= set(m2["tconst"].dropna().astype(str))
if _needed_tconsts:
    akas = akas[akas["titleId"].isin(_needed_tconsts)]

akas = akas[akas["region"].notna()]
akas = akas[akas["region"].str.len().between(2, 3)]
akas["region"] = akas["region"].str.upper().astype("category")


#Normalizer
def _norm(s: str) -> str:
    if not isinstance(s, str): return ""
    s = unicodedata.normalize("NFKD", s).encode("ascii","ignore").decode("ascii")
    s = s.lower().strip()
    s = re.sub(r"\(aka[^)]*\)|\[[^\]]*\]|\([^)]*\)", " ", s)
    s = re.sub(r"[^\w\s]", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    s = re.sub(r"^(the|a|an)\s+", "", s)
    return s

#Strict original-title region
a = akas[["titleId","title","region","isOriginalTitle"]].copy()
a["isOriginalTitle"] = a["isOriginalTitle"].astype(str).isin(["1","True","true"])
p1 = (
    a.loc[a["isOriginalTitle"] & a["region"].notna(), ["titleId","region"]]
     .drop_duplicates(subset=["titleId"])
     .rename(columns={"titleId":"tconst","region":"country"})
)

#Region of AKA whose title == basics.originalTitle (normalized)
b = imdb[["tconst","originalTitle"]].copy()
b["original_norm"] = b["originalTitle"].astype(str).map(_norm)
ak = akas[["titleId","title","region"]].copy()
ak["title_norm"] = ak["title"].astype(str).map(_norm)

p2 = (
    b.merge(ak, left_on=["tconst","original_norm"], right_on=["titleId","title_norm"], how="left")
     .dropna(subset=["region"])
     .sort_values(["tconst"])  
     .drop_duplicates(subset=["tconst"])
     .rename(columns={"region":"country"})
     [["tconst","country"]]
)

#Modal region
modal = (
    akas.loc[akas["region"].notna(), ["titleId","region"]]
        .groupby(["titleId","region"]).size().reset_index(name="n")
        .sort_values(["titleId","n"], ascending=[True, False])
        .groupby("titleId", as_index=False).first()[["titleId","region"]]
        .rename(columns={"titleId":"tconst","region":"country"})
)

#Combine priorities: P1 > P2 > P3
country_map = modal.set_index("tconst")
# overlay P2 then P1
country_map.update(p2.set_index("tconst"))
country_map.update(p1.set_index("tconst"))
country_map = country_map.reset_index()

#Clean
country_map["country"] = country_map["country"].astype(str).str.upper().str.strip()
country_map = country_map[country_map["country"].str.len() == 2].drop_duplicates("tconst")

print("country_map sizes — p1:", len(p1), "p2:", len(p2), "p3:", len(modal), "final:", len(country_map))

#Optimize merge types for speed/memory
country_map["country"] = country_map["country"].astype("category")

#Merge into the frames
m3 = m3.merge(country_map, on="tconst", how="left")
if "df" in globals() and "tconst" in df.columns:
    df = df.merge(country_map, on="tconst", how="left")
if "combined_df" in globals() and "tconst" in combined_df.columns:
    combined_df = combined_df.merge(country_map, on="tconst", how="left")
if "out" in globals() and "tconst" in out.columns:
    out = out.merge(country_map, on="tconst", how="left")

#Quick counters
def _nn(frame, name):
    try:
        print(f"{name} country non-null:", int(frame['country'].notna().sum()))
    except Exception:
        pass

_nn(m3, "m3")
if "df" in globals(): _nn(df, "df")
if "combined_df" in globals(): _nn(combined_df, "combined_df")
if "out" in globals(): _nn(out, "out")

print("Countries added ")
print(f"Countries step done in {time.perf_counter() - t_ctry_start:.1f}s")

Countries: building country_map from AKAs...


  .groupby(["titleId","region"]).size().reset_index(name="n")


country_map sizes — p1: 0 p2: 12123 p3: 12523 final: 12415
m3 country non-null: 12856
Countries added 
Countries step done in 5.6s


In [15]:
combined_df = pd.concat(deduped, ignore_index=True)

print("Combined shape:", combined_df.shape)
print("\nCounts by platform:")
print(combined_df["platform"].value_counts(dropna=False))

if "type" in combined_df.columns:
    print("\nCounts by type:")
    print(combined_df["type"].value_counts(dropna=False))

missing = (combined_df.isna().mean().sort_values(ascending=False) * 100).round(1)
print("\nTop missingness (%):")
print(missing.head(12))

tmp = combined_df.copy()
tmp["__norm_title__"] = _normalize_title_for_dupes(tmp["title"])
keys = [k for k in ["__norm_title__","platform","type","release_year"] if k in tmp.columns]
dups = tmp.duplicated(subset=keys).sum()
print("\nRemaining duplicates by robust key:", dups)

Combined shape: (19911, 8)

Counts by platform:
platform
Amazon Prime    9659
Netflix         8802
Disney+         1450
Name: count, dtype: int64

Counts by type:
type
Movie      14984
TV Show     4927
Name: count, dtype: int64

Top missingness (%):
imdb_votes        100.0
imdb_rating       100.0
content_rating      1.7
title               0.0
genres              0.0
release_year        0.0
type                0.0
platform            0.0
dtype: float64

Remaining duplicates by robust key: 0


In [19]:
out = m3.rename(columns={"averageRating": "imdb_rating", "numVotes": "imdb_votes"})
if out.columns.duplicated().any():
    out = out.loc[:, ~out.columns.duplicated(keep="last")]
    
#Make genres readable in CSV
if "genres" in out.columns:
    out["genres"] = out["genres"].apply(lambda xs: " | ".join(xs) if isinstance(xs, list) else (xs if isinstance(xs, str) else ""))

#Cast IMDb fields
for c in ("imdb_rating", "imdb_votes"):
    if c in out.columns:
        out[c] = pd.to_numeric(out[c], errors="coerce")

def _is_missing(s: pd.Series) -> pd.Series:
    return s.isna() | s.astype(str).str.fullmatch(r"\s*|unknown|none|nan", case=False)

_out = out.copy()

#Keep only valid country codes (2–3 letters)
if "country" in _out.columns:
    _out = _out[_out["country"].notna()]
    _out = _out[_out["country"].astype(str).str.fullmatch(r"[A-Z]{2,3}")]

req = ["title", "type", "platform", "genres"]
for c in req:
    if c in _out.columns:
        _out = _out[~_is_missing(_out[c])]

#Year must be present and plausible
if "release_year" in _out.columns:
    _out = _out[_out["release_year"].between(1900, 2100, inclusive="both")]

print(f"Filtered incomplete rows: {len(out)} -> {len(_out)} (removed {len(out)-len(_out)})")
out = _out

cols = [
    "title", "type", "release_year", "genres", "platform", "content_rating",
    "imdb_rating", "imdb_votes", "country", "tconst"
]

cols = [c for c in cols if c in out.columns]
out[cols].to_csv("clean_streaming_metadata.csv", index=False)

print("Clean dataset saved:", out.shape)


Filtered incomplete rows: 19911 -> 12855 (removed 7056)
Clean dataset saved: (12855, 15)


In [21]:
#Stats
df = pd.read_csv("clean_streaming_metadata.csv")

#General overview
print("Rows:", len(df))
print("Columns:", df.columns.tolist())

#Release year
print("\nRelease Year Stats:")
print(df['release_year'].describe())

#IMDb rating
print("\nIMDb Rating Stats:")
print(df['imdb_rating'].describe())

#Content type ratio
print("\nType distribution:")
print(df['type'].value_counts(normalize=True) * 100)

#Missing value percentages
print("\nMissing values (%):")
print(df.isnull().mean() * 100)



Rows: 12855
Columns: ['title', 'type', 'release_year', 'genres', 'platform', 'content_rating', 'imdb_rating', 'imdb_votes', 'country', 'tconst']

Release Year Stats:
count    12855.000000
mean      2009.938934
std         16.667641
min       1920.000000
25%       2009.000000
50%       2016.000000
75%       2019.000000
max       2021.000000
Name: release_year, dtype: float64

IMDb Rating Stats:
count    12855.000000
mean         6.223003
std          1.254114
min          1.000000
25%          5.500000
50%          6.400000
75%          7.100000
max          9.600000
Name: imdb_rating, dtype: float64

Type distribution:
type
Movie      78.016336
TV Show    21.983664
Name: proportion, dtype: float64

Missing values (%):
title             0.000000
type              0.000000
release_year      0.000000
genres            0.000000
platform          0.000000
content_rating    1.190198
imdb_rating       0.000000
imdb_votes        0.000000
country           0.000000
tconst            0.000000
dt

In [22]:
df.head(10)

Unnamed: 0,title,type,release_year,genres,platform,content_rating,imdb_rating,imdb_votes,country,tconst
0,Dick Johnson Is Dead,Movie,2020,Documentaries,Netflix,PG-13,7.4,7560.0,AU,tt11394180
1,Blood & Water,TV Show,2021,International TV Shows | TV Dramas | TV Mysteries,Netflix,TV-MA,6.7,4705.0,FR,tt9839146
2,Ganglands,TV Show,2021,Crime TV Shows | International TV Shows | TV A...,Netflix,TV-MA,7.2,4934.0,FR,tt13278100
3,Jailbirds New Orleans,TV Show,2021,Docuseries | Reality TV,Netflix,TV-MA,6.5,338.0,IN,tt15320436
4,Midnight Mass,TV Show,2021,TV Dramas | TV Horror | TV Mysteries,Netflix,TV-MA,7.7,163639.0,IN,tt10574558
5,My Little Pony: A New Generation,Movie,2021,Children & Family Movies,Netflix,PG,6.8,4977.0,GB,tt10101702
6,Sankofa,Movie,1993,Dramas | Independent Movies | International Mo...,Netflix,TV-MA,7.0,884.0,US,tt0108041
7,The Starling,Movie,2021,Comedies | Dramas,Netflix,PG-13,6.4,16631.0,IN,tt5164438
8,"Vendetta: Truth, Lies and The Mafia",TV Show,2021,Crime TV Shows | Docuseries | International TV...,Netflix,TV-MA,6.7,373.0,IT,tt14216574
9,Bangkok Breaking,TV Show,2021,Crime TV Shows | International TV Shows | TV A...,Netflix,TV-MA,5.9,461.0,GB,tt14202282
