In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Silence every Python warning for the rest of the session.
# Note that this also hides deprecation notices raised by the libraries used below.
warnings.filterwarnings("ignore")

# Render matplotlib figures inline in the cell output.
%matplotlib inline

In [None]:
# Read the Netflix titles CSV from the Kaggle input directory and preview two rows.
NETFLIX_CSV_PATH = "/kaggle/input/netflix-movies-and-tv-shows/netflix_titles.csv"
df = pd.read_csv(NETFLIX_CSV_PATH)
df.head(2)

In [None]:
# Summarize the raw frame: column names, dtypes and non-null counts.
df.info()

In [None]:
# Descriptive statistics for the raw frame (pandas' default column selection).
df.describe()

In [None]:
# Number of missing (NaN) values in each column of the raw frame.
df.isnull().sum()

In [None]:
# NOTE: plain assignment does NOT copy the data. `df_copy` and `df` are two
# names for the same DataFrame, so every change made through `df_copy` below
# is also visible through `df`. Later cells rely on this (they read columns
# such as `movie_duration` via `df`).
df_copy = df

In [None]:
# Preview the first two rows of the working frame.
df_copy.head(2)

In [None]:
# Missing directors, cast lists and countries cannot be recovered from the
# data itself, so those gaps are labelled with the placeholder 'Unknown'.
for text_column in ('director', 'cast', 'country'):
    df_copy[text_column] = df_copy[text_column].fillna('Unknown')

# Remaining missing values per column.
df_copy.isnull().sum()

In [None]:
# Inspect the distinct values present in the `rating` column.
df_copy['rating'].unique()

In [None]:
# Ratings accepted as valid; any other value in the column is treated as missing.
valid_categories = [
    'PG-13', 'TV-MA', 'PG', 'TV-14', 'TV-PG', 'TV-Y',
    'TV-Y7', 'R', 'TV-G', 'G', 'NC-17', 'NR', 'TV-Y7-FV', 'UR'
]

# Blank out every rating that is not in the whitelist.
# The write goes through `df_copy`, the name used by the other cleaning cells,
# instead of `df`. Rejected (and already-missing) entries become NaN, which
# the later notna()/fillna() steps treat as missing.
is_valid_rating = df_copy['rating'].isin(valid_categories)
df_copy['rating'] = df_copy['rating'].where(is_valid_rating)
df_copy['rating'].unique()

In [None]:
# Replace the missing ratings with the most frequent rating.
# Series.mode() ignores missing values by default, so no explicit
# notna() filter is needed before taking the first mode.
mode_value = df_copy['rating'].mode().iloc[0]
df_copy['rating'] = df_copy['rating'].fillna(mode_value)


In [None]:
# Count the ratings that are still missing after the fill.
df_copy['rating'].isnull().sum()

In [None]:
# Missing values per column at this point of the cleaning.
df_copy.isnull().sum()

In [None]:
# Preview the first two rows of the working frame.
df_copy.head(2)

In [None]:
# Split `date_added` (text shaped like "<Month> <day>, <year>") into three
# columns: month name, day and year.
#
# Surrounding whitespace is stripped first. With a leading space, splitting on
# ' ' yields an empty first token, so `month` would be empty and the month
# name would end up in the `date` column.
date_added_clean = df_copy['date_added'].str.strip()
date_tokens = date_added_clean.str.split(' ')

df_copy['month'] = date_tokens.str[0]
# The day token still carries its trailing comma (e.g. "25,"); remove it.
# Missing values stay missing.
df_copy['date'] = date_tokens.str[1].str.replace(',', '', regex=False)
# Everything after the first comma is the year (still text at this point).
df_copy['year'] = date_added_clean.str.split(',').str[1]

df_copy.head(2)


In [None]:
# Replace missing month/date/year parts with 0.
# The result is assigned back instead of calling fillna(inplace=True) on a
# selected column: the in-place form acts on an intermediate object and is not
# guaranteed to update df_copy (it never does under pandas Copy-on-Write).
date_part_columns = ['month', 'date', 'year']
df_copy[date_part_columns] = df_copy[date_part_columns].fillna(0)


Here we have separated `date_added` into `month`, `date` and `year`.
After the split, some rows ended up with a month name in the `date` column, so those cells are replaced with 0 and the column is converted with `pd.to_numeric`. Why?
Because `pd.to_numeric(..., errors='coerce')` is more forgiving than `astype()`: instead of raising an error on a value it cannot parse, it turns that value into NaN, which can then be filled. That makes it more flexible and robust for this kind of data-cleaning task.

In [None]:
# Lookup table from English month name to month number (January -> 1 ... December -> 12).
month_names = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December',
]
month_map = {name: number for number, name in enumerate(month_names, start=1)}

# Month names become numbers; anything that is not a known month name becomes NaN.
df_copy['month'] = df_copy['month'].map(month_map)

# A `date` cell that contains a month name is not a day number: set it to '0'.
contains_month_name = df_copy['date'].astype(str).str.contains('|'.join(month_map))
df_copy['date'] = df_copy['date'].mask(contains_month_name, '0')

# Convert day and year to integers; values that cannot be parsed become 0.
for part_column in ('date', 'year'):
    parsed = pd.to_numeric(df_copy[part_column], errors='coerce')
    df_copy[part_column] = parsed.fillna(0).astype(int)

df_copy.head(2)


In [None]:
# Months that could not be mapped to a number are NaN at this point; store
# them as 0 and keep the column integer-typed, like `date` and `year`.
df_copy['month'] = df_copy['month'].fillna(0).astype(int)

In [None]:
# Missing values per column after the date parts have been filled.
df_copy.isnull().sum()

In [None]:
# `date_added` is no longer needed now that month, date and year are separate columns.
df_copy.drop(columns=['date_added'], inplace=True)

In [None]:
# Preview the first two rows of the working frame.
df_copy.head(2)

In [None]:
# Show the rows that have no `duration` value.
duration_is_missing = df_copy['duration'].isnull()
df_copy.loc[duration_is_missing]

In [None]:
# Durations entered by hand, keyed by row label.
# NOTE(review): presumably these are the rows listed as missing a duration in
# the previous cell — confirm the labels against the loaded dataset.
manual_durations = {
    5541: '74 min',
    5794: '84 min',
    5813: '66 min',
}
for row_label, duration_text in manual_durations.items():
    df_copy.loc[row_label, 'duration'] = duration_text

In [None]:
# Split `duration` by unit: values containing 'min' go to `movie_duration`,
# values containing 'Season' go to `season_duration`; every other cell is NaN.
duration_as_text = df_copy['duration'].astype(str)
is_minutes = duration_as_text.str.contains('min', regex=False)
is_seasons = duration_as_text.str.contains('Season', regex=False)

df_copy['movie_duration'] = df_copy['duration'].where(is_minutes)
df_copy['season_duration'] = df_copy['duration'].where(is_seasons)
df_copy.head(2)

In [None]:
# movie_duration: drop the ' min' suffix, convert to a number, and use 0 for
# rows that have no minute-based duration.
movie_minutes = df_copy['movie_duration'].astype(str).str.replace(' min', '', regex=False)
df_copy['movie_duration'] = pd.to_numeric(movie_minutes, errors='coerce').fillna(0).astype(int)

# season_duration: take the number out of "N Season" / "N Seasons" and use 0
# for rows that have no season-based duration.
# Extracting the digits handles both spellings. Removing only the literal
# ' Seasons' would leave the singular "1 Season" unparsed, so it would be
# coerced to NaN and stored as 0.
season_counts = df_copy['season_duration'].astype(str).str.extract(r'(\d+)', expand=False)
df_copy['season_duration'] = pd.to_numeric(season_counts, errors='coerce').fillna(0).astype(int)

# Missing values per column after the conversion.
df_copy.isnull().sum()


In [None]:
# Remove the `release_year` column from the working frame.
df_copy.drop(columns=['release_year'], inplace=True)

In [None]:
# Remove the original `duration` column; `movie_duration` and `season_duration` replace it.
df_copy.drop(columns=['duration'], inplace=True)

In [None]:
# Column dtypes and non-null counts of the cleaned frame.
df_copy.info()

In [None]:
# Final check of missing values per column before exporting.
df_copy.isnull().sum()

In [None]:
# Write the cleaned frame to 'cleaned_data.csv' (relative to the working
# directory), leaving out the index column.
df_copy.to_csv('cleaned_data.csv', index=False)

# EDA

In [None]:
# Partition the columns by dtype: object ('O') columns are treated as
# categorical, every other column as numerical.
numeric_features = []
categorical_features = []
for feature in df_copy.columns:
    if df_copy[feature].dtype == 'O':
        categorical_features.append(feature)
    else:
        numeric_features.append(feature)

# Report both groups.
print(f'We have {len(numeric_features)} numerical features : {numeric_features}')
print(f'\nWe have {len(categorical_features)} categorical features : {categorical_features}')

In [None]:
## Percentage share of each distinct value, for every categorical column.
# Read from df_copy, the frame `categorical_features` was derived from,
# rather than from `df`.
for col in categorical_features:
    print(df_copy[col].value_counts(normalize=True) * 100)
    print('---------------------------')

In [None]:
## Kernel density estimate for every numerical column, one panel per column.
plt.figure(figsize=(15, 15))
plt.suptitle('Univariate Analysis of Numerical Features', fontsize=20, fontweight='bold', alpha=0.8, y=1.)

for panel_index, feature in enumerate(numeric_features, start=1):
    plt.subplot(5, 3, panel_index)
    # `fill=True` is the current spelling of the deprecated `shade=True`.
    sns.kdeplot(x=df_copy[feature], fill=True, color='r')
    plt.xlabel(feature)

# Lay the figure out once, after all panels exist, instead of on every iteration.
plt.tight_layout()
plt.show()

In [None]:
# Number of titles per type (Movie vs TV Show), read from the cleaned frame
# `df_copy` like the rest of the analysis.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=df_copy, x='type', palette='viridis', ax=ax)
ax.set_title('Count of Movies vs TV Shows', fontsize=16)
ax.set_xlabel('Type', fontsize=12)
ax.set_ylabel('Count', fontsize=12)
plt.show()

In [None]:
# Ten most frequent genres. `listed_in` holds a comma-separated list per
# title, so it is split and exploded to one genre per row before counting.
# Data is read from the cleaned frame `df_copy`.
genres = df_copy['listed_in'].str.split(', ').explode()
top_genres = genres.value_counts().head(10)

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=top_genres.values, y=top_genres.index, palette='coolwarm', ax=ax)
ax.set_title('Top 10 Genres', fontsize=16)
ax.set_xlabel('Count', fontsize=12)
ax.set_ylabel('Genre', fontsize=12)
plt.show()

In [None]:
# Histogram of running time for movies only.
# `movie_duration` is created on `df_copy`, so it is read from `df_copy`
# instead of depending on `df` being the same object. A single .loc selection
# replaces the chained df[...][...] indexing.
movie_minutes = df_copy.loc[df_copy['type'] == 'Movie', 'movie_duration']

fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(movie_minutes, bins=30, kde=True, color='blue', ax=ax)
ax.set_title('Distribution of Movie Durations', fontsize=16)
ax.set_xlabel('Duration (minutes)', fontsize=12)
ax.set_ylabel('Frequency', fontsize=12)
plt.show()

In [None]:
# Histogram of the number of seasons for TV shows only.
# `season_duration` is created on `df_copy`, so it is read from `df_copy`
# instead of depending on `df` being the same object. A single .loc selection
# replaces the chained df[...][...] indexing.
show_seasons = df_copy.loc[df_copy['type'] == 'TV Show', 'season_duration']

fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(show_seasons, bins=30, kde=True, color='green', ax=ax)
ax.set_title('Distribution of TV Show Seasons', fontsize=16)
ax.set_xlabel('Number of Seasons', fontsize=12)
ax.set_ylabel('Frequency', fontsize=12)
plt.show()

In [None]:
# Number of titles per rating, most frequent first (value_counts order).
# The cleaned `rating` column is read from `df_copy`, the frame it was cleaned on.
rating_counts = df_copy['rating'].value_counts()

fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=rating_counts.index, y=rating_counts.values, palette='magma', ax=ax)
ax.set_title('Ratings Distribution', fontsize=16)
ax.set_xlabel('Rating', fontsize=12)
ax.set_ylabel('Count', fontsize=12)
# Tilt the rating labels on the x axis.
ax.tick_params(axis='x', labelrotation=45)
plt.show()