In [1]:
import pandas as pd

# Load the dataset
df = pd.read_csv('netflix_titles.csv')

# 1️⃣ Check missing values BEFORE cleaning
print("\n🔍 Missing values before cleaning:\n", df.isnull().sum())




🔍 Missing values before cleaning:
 show_id            0
type               0
title              0
director        2634
cast             825
country          831
date_added        10
release_year       0
rating             4
duration           3
listed_in          0
description        0
dtype: int64


In [3]:
# Drop rows with missing 'title'
df.dropna(subset=['title'], inplace=True)


In [4]:
# 2️⃣ Fill missing values
df['director'] = df['director'].fillna('Unknown')
df['cast'] = df['cast'].fillna('Not Available')
df['country'] = df['country'].fillna('Unknown')
df['rating'] = df['rating'].fillna(df['rating'].mode()[0])
df['date_added'] = df['date_added'].ffill()

In [5]:
# 3️⃣ Remove duplicate rows
df.drop_duplicates(inplace=True)

In [6]:
# 4️⃣ Standardize text values
df['type'] = df['type'].str.strip().str.lower()
df['country'] = df['country'].str.strip().str.title()

In [7]:
# 5️⃣ Convert date formats
df['date_added'] = pd.to_datetime(df['date_added'], errors='coerce')

In [8]:
# 6️⃣ Rename columns (clean headers)
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')

In [9]:
# 7️⃣ Check and fix data types
print("\n🧠 Data types before conversion:\n", df.dtypes)
df['release_year'] = df['release_year'].astype(int)


🧠 Data types before conversion:
 show_id                 object
type                    object
title                   object
director                object
cast                    object
country                 object
date_added      datetime64[ns]
release_year             int64
rating                  object
duration                object
listed_in               object
description             object
dtype: object


In [10]:
# Example: convert duration to number of minutes or seasons
# (Optional, but useful if analyzing durations)

# 8️⃣ Create new features
df['year_added'] = df['date_added'].dt.year
df['month_added'] = df['date_added'].dt.month
df['actor_count'] = df['cast'].apply(lambda x: 0 if x == 'Not Available' else len(x.split(',')))

In [11]:
# ✅ Print final results

print("\n✅ Missing values after cleaning:\n", df.isnull().sum())
print("\n📊 Sample cleaned data:\n", df.head(3))
print("\n🔠 Column headers:\n", df.columns.tolist())
print("\n🧾 Data types after conversion:\n", df.dtypes)


✅ Missing values after cleaning:
 show_id          0
type             0
title            0
director         0
cast             0
country          0
date_added      88
release_year     0
rating           0
duration         3
listed_in        0
description      0
year_added      88
month_added     88
actor_count      0
dtype: int64

📊 Sample cleaned data:
   show_id     type                 title         director  \
0      s1    movie  Dick Johnson Is Dead  Kirsten Johnson   
1      s2  tv show         Blood & Water          Unknown   
2      s3  tv show             Ganglands  Julien Leclercq   

                                                cast        country  \
0                                      Not Available  United States   
1  Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...   South Africa   
2  Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...        Unknown   

  date_added  release_year rating   duration  \
0 2021-09-25          2020  PG-13     90 min   
1 2021-09-24    