In [7]:
import pandas as pd
import numpy as np

INPUT = "netflix_data.csv"
OUTPUT = "netflix_data_cleaned.csv"

try:
    df = pd.read_csv(INPUT)
    print("File loaded with default encoding.")
except:
    df = pd.read_csv(INPUT, encoding="latin1")
    print("File loaded with latin1 encoding.")

print("Initial shape:", df.shape)

df.columns = [c.strip().lower().replace(" ", "_") for c in df.columns]

for col in df.select_dtypes(include=["object"]).columns:
    df[col] = df[col].astype(str).str.strip()
    df[col] = df[col].replace({"": np.nan, "nan": np.nan, "none": np.nan, "na": np.nan})
    df[col] = df[col].where(df[col].isnull(), df[col].str.lower())

if "country" in df.columns:
    def clean_country(x):
        if pd.isna(x): 
            return x
        x = x.strip().lower()
        if x in ("usa", "us", "u.s.", "u.s.a"):
            return "united states"
        if x in ("uk", "england"):
            return "united kingdom"
        return x
    df["country"] = df["country"].apply(clean_country)

date_cols = [c for c in df.columns if "date" in c or "added" in c]
for c in date_cols:
    df[c] = pd.to_datetime(df[c], dayfirst=True, errors="coerce")
    print("Parsed date:", c)

for c in df.columns:
    if pd.api.types.is_datetime64_any_dtype(df[c]):
        df[c + "_clean"] = df[c].dt.strftime("%d-%m-%Y")
        df[c + "_clean"] = df[c + "_clean"].replace("NaT", np.nan)
        print("Created date_clean column:", c + "_clean")

print("\nMissing before filling:")
print(df.isnull().sum().sort_values(ascending=False).head(20))

for c in df.columns:
    if df[c].isnull().sum() > 0:
        if pd.api.types.is_numeric_dtype(df[c]):
            df[c] = df[c].fillna(df[c].median())
            print("Filled numeric:", c)
        elif pd.api.types.is_datetime64_any_dtype(df[c]):
            mode_vals = df[c].mode(dropna=True)
            if not mode_vals.empty:
                df[c] = df[c].fillna(mode_vals.iloc[0])
                print("Filled date:", c)
        else:
            mode_vals = df[c].mode(dropna=True)
            if not mode_vals.empty:
                df[c] = df[c].fillna(mode_vals.iloc[0])
                print("Filled text:", c)
            else:
                df[c] = df[c].fillna("unknown")
                print("Filled text with unknown:", c)

before = len(df)
df = df.drop_duplicates()
print("Removed duplicates:", before - len(df))

for c in df.columns:
    if df[c].dtype == "object":
        s = df[c].str.replace(",", "", regex=False)
        converted = pd.to_numeric(s, errors="coerce")
        if converted.notnull().sum() > len(df) * 0.5:
            df[c] = converted
            print("Converted to numeric:", c)

if "duration" in df.columns:
    ex = df["duration"].astype(str).str.extract(r'(?P<value>\d+)\s*(?P<unit>\w+)?', expand=True)
    df["duration_value"] = pd.to_numeric(ex["value"], errors="coerce").astype("Int64")
    df["duration_unit"] = ex["unit"].fillna("unknown").str.lower()
    print("Extracted duration fields.")

print("\nFinal shape:", df.shape)
df.to_csv(OUTPUT, index=False)
print("Saved cleaned file to:", OUTPUT)


File loaded with default encoding.
Initial shape: (7787, 11)


  df[c] = pd.to_datetime(df[c], dayfirst=True, errors="coerce")


Parsed date: date_added
Created date_clean column: date_added_clean

Missing before filling:
director            2389
cast                 718
country              507
date_added            10
date_added_clean      10
show_id                0
type                   0
title                  0
release_year           0
duration               0
description            0
genre                  0
dtype: int64
Filled text: director
Filled text: cast
Filled text: country
Filled date: date_added
Filled text: date_added_clean
Removed duplicates: 0
Extracted duration fields.

Final shape: (7787, 14)
Saved cleaned file to: netflix_data_cleaned.csv


In [2]:
import pandas as pd


df = pd.read_csv("netflix_data_cleaned.csv")

print("=== DATASET FEATURES ===\n")


print("Shape of dataset (rows, columns):", df.shape, "\n")


print("Column Names:")
print(list(df.columns), "\n")

print("Data Types:")
print(df.dtypes, "\n")


print("Missing Values in Each Column:")
print(df.isnull().sum(), "\n")

print("Basic Summary Statistics:")
print(df.describe(include="all"), "\n")

# preview first 5 rows
print("First 5 Rows of Dataset:")
print(df.head())


=== DATASET FEATURES ===

Shape of dataset (rows, columns): (7787, 14) 

Column Names:
['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added', 'release_year', 'duration', 'description', 'genre', 'date_added_clean', 'duration_value', 'duration_unit'] 

Data Types:
show_id             object
type                object
title               object
director            object
cast                object
country             object
date_added          object
release_year         int64
duration             int64
description         object
genre               object
date_added_clean    object
duration_value       int64
duration_unit       object
dtype: object 

Missing Values in Each Column:
show_id             0
type                0
title               0
director            0
cast                0
country             0
date_added          0
release_year        0
duration            0
description         0
genre               0
date_added_clean    0
duration_value      0
duratio