In [3]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [4]:
df = pd.read_csv('netflix_titles.csv') 
df.head()     

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [None]:
#Describing the Data
df.describe()


Unnamed: 0,release_year
count,8807.0
mean,2014.180198
std,8.819312
min,1925.0
25%,2013.0
50%,2017.0
75%,2019.0
max,2021.0


In [10]:
#Missing Values:
df.isna().sum()


show_id            0
type               0
title              0
director        2634
cast             825
country          831
date_added        10
release_year       0
rating             4
duration           3
listed_in          0
description        0
dtype: int64

In [11]:
df.notna().sum()


show_id         8807
type            8807
title           8807
director        6173
cast            7982
country         7976
date_added      8797
release_year    8807
rating          8803
duration        8804
listed_in       8807
description     8807
dtype: int64

In [12]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB


In [6]:
# 1. Remove duplicate rows
df = df.drop_duplicates()
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [13]:
# 2. Remove rows missing essential fields
df = df.dropna(subset=['title', 'type', 'release_year'])

In [14]:
# 3. Clean categorical columns (strip whitespace, title case, blank to NaN)
cat_cols = ['type', 'rating', 'country', 'director', 'cast', 'listed_in']
for col in cat_cols:
    if col in df.columns:
        df[col] = df[col].astype(str).replace(r'^\s*$', np.nan, regex=True)
        df[col] = df[col].apply(lambda x: x.strip().title() if isinstance(x, str) else x)

In [15]:
# 4. Convert release_year to numeric, and date_added to datetime
df['release_year'] = pd.to_numeric(df['release_year'], errors='coerce')
df['date_added'] = pd.to_datetime(df['date_added'], errors='coerce')
df['year_added'] = df['date_added'].dt.year


In [16]:
# 5. Extract duration (minutes) for movies
if 'duration' in df.columns:
    movie_mask = df['type'].str.lower() == 'movie'
    df.loc[movie_mask, 'duration_mins'] = df.loc[movie_mask, 'duration'].str.extract(r'(\d+)').astype(float)

In [17]:
# 6. Fill missing values
df['country'] = df['country'].fillna('Unknown')
df['rating'] = df['rating'].fillna('Not Rated')
df['listed_in'] = df['listed_in'].fillna('Unknown Genre')

In [18]:
# 7. Replace any remaining blank strings with NaN
df = df.replace(r'^\s*$', np.nan, regex=True)

In [19]:
# 8. Remove unrealistic release years
current_year = pd.Timestamp.now().year
df = df[(df['release_year'] >= 1900) & (df['release_year'] <= current_year)]

In [20]:
# 9. Save cleaned data
df.to_csv('netflix_titles_cleaned.csv', index=False)
print("✅ Cleaned file saved as netflix_titles_cleaned.csv")


✅ Cleaned file saved as netflix_titles_cleaned.csv


In [21]:
# 10. Quick summary
print(f"Total records: {len(df)}")
print(f"Columns: {list(df.columns)}")
print(f"Missing values per column:\n{df.isnull().sum()}")
print("\nSummary statistics:")
print(df.describe(include='all'))

Total records: 8807
Columns: ['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added', 'release_year', 'rating', 'duration', 'listed_in', 'description', 'year_added', 'duration_mins']
Missing values per column:
show_id             0
type                0
title               0
director            0
cast                0
country             0
date_added         98
release_year        0
rating              0
duration            3
listed_in           0
description         0
year_added         98
duration_mins    8807
dtype: int64

Summary statistics:
       show_id   type                 title director  cast        country  \
count     8807   8807                  8807     8807  8807           8807   
unique    8807      2                  8807     4527  7693            749   
top         s1  Movie  Dick Johnson Is Dead      Nan   Nan  United States   
freq         1   6131                     1     2634   825           2818   
mean       NaN    NaN                   NaN   