In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Silence every warning for the rest of the session.
# NOTE(review): this also hides any deprecation warnings raised by later
# cells — consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

%matplotlib inline

# Understanding the Dataset

In [2]:
# Read the raw Netflix titles table from the local data directory and
# preview the first two rows.
df = pd.read_csv('./data/netflix_titles.csv')
df.head(2)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."


In [3]:
# Column names, non-null counts and dtypes of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB


In [4]:
# Summary statistics; by default describe() reports numeric columns only.
df.describe()

Unnamed: 0,release_year
count,8807.0
mean,2014.180198
std,8.819312
min,1925.0
25%,2013.0
50%,2017.0
75%,2019.0
max,2021.0


In [5]:
# Number of missing values in each column of the raw frame.
df.isnull().sum()

show_id            0
type               0
title              0
director        2634
cast             825
country          831
date_added        10
release_year       0
rating             4
duration           3
listed_in          0
description        0
dtype: int64

Columns with missing values: director, cast, country, date_added, rating, duration.

# Step 2: Data Cleaning

In [6]:
# NOTE: plain assignment does not copy the frame — `df_copy` and `df` are two
# names for the same DataFrame, so every write through `df_copy` also changes
# `df` (no untouched raw frame is kept).
# NOTE(review): switch to `df.copy()` only after checking that no later cell
# writes through one name and reads through the other.
df_copy = df

In [7]:
# Replace missing values in the descriptive text columns with the
# 'Unknown' placeholder.
for column in ('director', 'cast', 'country'):
    df_copy[column] = df_copy[column].fillna('Unknown')

In [8]:
# Re-check the per-column missing-value counts after filling the text columns.
df_copy.isnull().sum()

show_id          0
type             0
title            0
director         0
cast             0
country          0
date_added      10
release_year     0
rating           4
duration         3
listed_in        0
description      0
dtype: int64

In [9]:
# List the distinct rating labels to spot malformed entries.
df_copy['rating'].unique()

array(['PG-13', 'TV-MA', 'PG', 'TV-14', 'TV-PG', 'TV-Y', 'TV-Y7', 'R',
       'TV-G', 'G', 'NC-17', '74 min', '84 min', '66 min', 'NR', nan,
       'TV-Y7-FV', 'UR'], dtype=object)

In [10]:
# Rating labels accepted as valid.
valid_categories = {
    'PG-13', 'TV-MA', 'PG', 'TV-14', 'TV-PG', 'TV-Y', 'TV-Y7', 'R',
    'TV-G', 'G', 'NC-17', 'NR', 'TV-Y7-FV', 'UR'
}

# Blank out (set to None) anything that is not a recognised rating, such as
# the stray '74 min' / '84 min' / '66 min' strings, so it is treated as
# missing. The write goes through `df_copy` — the same name that is read on
# the next line and cleaned everywhere else — rather than through `df`.
df_copy['rating'] = df_copy['rating'].apply(lambda x: x if x in valid_categories else None)
df_copy['rating'].unique()

array(['PG-13', 'TV-MA', 'PG', 'TV-14', 'TV-PG', 'TV-Y', 'TV-Y7', 'R',
       'TV-G', 'G', 'NC-17', None, 'NR', 'TV-Y7-FV', 'UR'], dtype=object)

In [11]:
# Impute the missing ratings with the most frequent rating label.
mode_value = df_copy['rating'].dropna().mode().iloc[0]
df_copy['rating'] = df_copy['rating'].fillna(mode_value)
df_copy['rating'].isna().sum()  # expected to be 0 once every gap is filled

0

In [12]:
# Missing-value counts after the rating imputation, read through `df_copy`
# like the other cleaning steps.
df_copy.isnull().sum()

show_id          0
type             0
title            0
director         0
cast             0
country          0
date_added      10
release_year     0
rating           0
duration         3
listed_in        0
description      0
dtype: int64

# Handling Data

In [13]:
# Step 3: Extract month, day-of-month and year from `date_added`
# (values look like "September 25, 2021").
#
# The part before the comma is "<month name> <day>"; the part after it is the
# year. `.str.split()` with no argument splits on any run of whitespace and
# ignores leading/trailing blanks, so a padded value still yields the month
# name as the first token. Splitting on a single ' ' would instead put an
# empty string first and push the month name into the day position.
month_day = df_copy['date_added'].str.split(',').str[0].str.split()
year_text = df_copy['date_added'].str.split(',').str[1].str.strip()

# Step 4: Convert month names to numbers & clean the numeric parts
month_map = {
    'January': 1, 'February': 2, 'March': 3, 'April': 4,
    'May': 5, 'June': 6, 'July': 7, 'August': 8,
    'September': 9, 'October': 10, 'November': 11, 'December': 12
}

# Rows with a missing or unrecognised month name are left as NaN here.
df_copy['month'] = month_day.str[0].map(month_map)
# Missing or non-numeric day / year values become the placeholder 0.
# Columns are assigned directly: `df_copy[col].fillna(..., inplace=True)`
# acts on a temporary and is not guaranteed to update the frame.
df_copy['date'] = pd.to_numeric(month_day.str[1], errors='coerce').fillna(0).astype(int)
df_copy['year'] = pd.to_numeric(year_text, errors='coerce').fillna(0).astype(int)
df_copy.head(2)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,month,date,year
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,Unknown,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm...",9.0,25,2021
1,s2,TV Show,Blood & Water,Unknown,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t...",9.0,24,2021


In [14]:
# Use 0 as the placeholder for rows whose month is still missing.
df_copy['month'] = df_copy['month'].fillna(0)

In [15]:
# Missing-value counts now that month / date / year have been added.
df_copy.isnull().sum()

show_id          0
type             0
title            0
director         0
cast             0
country          0
date_added      10
release_year     0
rating           0
duration         3
listed_in        0
description      0
month            0
date             0
year             0
dtype: int64

In [16]:
# `date_added` is no longer needed now that month / date / year exist.
df_copy.drop(columns=['date_added'], inplace=True)

## Movie Duration

In [17]:
# Show the rows whose duration is missing.
df_copy.loc[df_copy['duration'].isna()]

Unnamed: 0,show_id,type,title,director,cast,country,release_year,rating,duration,listed_in,description,month,date,year
5541,s5542,Movie,Louis C.K. 2017,Louis C.K.,Louis C.K.,United States,2017,TV-MA,,Movies,"Louis C.K. muses on religion, eternal love, gi...",4.0,4,2017
5794,s5795,Movie,Louis C.K.: Hilarious,Louis C.K.,Louis C.K.,United States,2010,TV-MA,,Movies,Emmy-winning comedy writer Louis C.K. brings h...,9.0,16,2016
5813,s5814,Movie,Louis C.K.: Live at the Comedy Store,Louis C.K.,Louis C.K.,United States,2015,TV-MA,,Movies,The comic puts his trademark hilarious/thought...,8.0,15,2016


In [18]:
# Manually supplied runtimes for three rows, keyed by row label.
# NOTE(review): these strings match the stray 'NN min' values that showed up
# among the rating labels; which value belongs to which row cannot be
# verified from this notebook — confirm against the raw file.
manual_durations = {5541: '74 min', 5794: '84 min', 5813: '66 min'}
for row_label, runtime in manual_durations.items():
    df_copy.loc[row_label, 'duration'] = runtime

In [19]:
# Split `duration` into two columns by unit: values whose text contains 'min'
# go to movie_duration, values whose text contains 'Season' go to
# season_duration; a row is NaN in the column it does not belong to.
duration_text = df_copy['duration'].astype(str)
is_minutes = duration_text.str.contains('min', regex=False)
is_seasons = duration_text.str.contains('Season', regex=False)
df_copy['movie_duration'] = df_copy['duration'].where(is_minutes)
df_copy['season_duration'] = df_copy['duration'].where(is_seasons)
df_copy.head(2)

Unnamed: 0,show_id,type,title,director,cast,country,release_year,rating,duration,listed_in,description,month,date,year,movie_duration,season_duration
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,Unknown,United States,2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm...",9.0,25,2021,90 min,
1,s2,TV Show,Blood & Water,Unknown,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t...",9.0,24,2021,,2 Seasons


In [20]:
# Convert both duration columns to integers by keeping only the leading
# number of each string ("90 min" -> 90, "2 Seasons" -> 2, "1 Season" -> 1).
# Extracting the digits covers the singular "Season" as well; stripping the
# literal ' Seasons' leaves "1 Season" non-numeric, which is then coerced to
# NaN and ends up as 0.
# Rows that are NaN in a column (the title is of the other kind) become 0.
for duration_col in ('movie_duration', 'season_duration'):
    digits = df_copy[duration_col].astype(str).str.extract(r'(\d+)', expand=False)
    df_copy[duration_col] = pd.to_numeric(digits, errors='coerce').fillna(0).astype(int)

# Check the DataFrame null sum
df_copy.isnull().sum()


show_id            0
type               0
title              0
director           0
cast               0
country            0
release_year       0
rating             0
duration           0
listed_in          0
description        0
month              0
date               0
year               0
movie_duration     0
season_duration    0
dtype: int64

In [21]:
# Remove `release_year` and the raw `duration` text (the latter is superseded
# by movie_duration / season_duration).
df_copy.drop(columns=['release_year', 'duration'], inplace=True)

In [22]:
# Final schema check: column names, non-null counts and dtypes.
df_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   show_id          8807 non-null   object 
 1   type             8807 non-null   object 
 2   title            8807 non-null   object 
 3   director         8807 non-null   object 
 4   cast             8807 non-null   object 
 5   country          8807 non-null   object 
 6   rating           8807 non-null   object 
 7   listed_in        8807 non-null   object 
 8   description      8807 non-null   object 
 9   month            8807 non-null   float64
 10  date             8807 non-null   int32  
 11  year             8807 non-null   int32  
 12  movie_duration   8807 non-null   int32  
 13  season_duration  8807 non-null   int32  
dtypes: float64(1), int32(4), object(9)
memory usage: 825.8+ KB


In [23]:
# Final check of the per-column missing-value counts before exporting.
df_copy.isnull().sum()

show_id            0
type               0
title              0
director           0
cast               0
country            0
rating             0
listed_in          0
description        0
month              0
date               0
year               0
movie_duration     0
season_duration    0
dtype: int64

In [24]:
# Write the cleaned frame to 'cleaned_data.csv' without the index column.
df_copy.to_csv('cleaned_data.csv', index=False)