# import pandas library as pd

In [1]:
import pandas as pd


# Load Dataset

In [2]:
df = pd.read_csv("/content/netflix_titles.csv")

FileNotFoundError: [Errno 2] No such file or directory: '/content/netflix_titles.csv'

In [None]:
df.head()

Unnamed: 0,Show_Id,Type,Title,Director,Cast,Country,Date_Added,Release_Year,Rating,Duration,Listed_In,Description,duration_num,duration_unit
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,Unknown,United States,2021-09-25,2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm...",90.0,minutes
1,s2,TV Show,Blood & Water,Unknown,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,2021-09-24,2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t...",2.0,seasons
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",Unknown,2021-09-24,2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...,1.0,seasons
3,s4,TV Show,Jailbirds New Orleans,Unknown,Unknown,Unknown,2021-09-24,2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo...",1.0,seasons
4,s5,TV Show,Kota Factory,Unknown,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,2021-09-24,2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...,2.0,seasons


# Clean column names
makes columns uniform (no spaces, all lowercase) so later code is reliable.

In [None]:
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_', regex=False)


# Inspect missing values
to understand which columns need attention.

In [None]:
print(df.isnull().sum())


show_id          0
type             0
title            0
director         0
cast             0
country          0
date_added       0
release_year     0
rating           0
duration         0
listed_in        0
description      0
duration_num     3
duration_unit    0
dtype: int64


#  Trim whitespace for string/object columns
leading/trailing whitespace causes apparent duplicates/inconsistent text.

# Change every first letter of the column headers to capital

In [None]:

df.columns = df.columns.str.replace("_", " ").str.title().str.replace(" ", "_")

print(df.columns)


Index(['Show_Id', 'Type', 'Title', 'Director', 'Cast', 'Country', 'Date_Added',
       'Release_Year', 'Rating', 'Duration', 'Listed_In', 'Description',
       'Duration_Num', 'Duration_Unit'],
      dtype='object')


# Handle missing values (simple, conservative choices)
For textual fields it's common to fill missing with "Unknown"

In [None]:
for col in df.select_dtypes(include=['object', 'string']).columns:
    df[col] = df[col].fillna("Unknown")

# Parse dates and fill missing date_added
For rows where date_added is missing, approximate with Jan 1 of release_year:

In [None]:
df['Date_Added'] = pd.to_datetime(df['Date_Added'], errors='coerce')

In [None]:
missing_dates = df['Date_Added'].isna()
df.loc[missing_dates, 'Date_Added'] = pd.to_datetime(df.loc[missing_dates, 'Release_Year'].astype(str) + '-01-01', errors='coerce')

# Split duration into numeric value + unit
makes it easy to filter numeric durations (e.g., movies by runtime) and TV shows by seasons.

In [None]:
df['Duration'] = df['Duration'].astype('string').str.strip()
df['duration_num'] = df['Duration'].str.extract(r'(\d+)')
df['duration_num'] = pd.to_numeric(df['duration_num'], errors='coerce').astype('Int64')
df['duration_unit'] = df['Duration'].str.extract(r'([A-Za-z]+)')[0].str.lower()
unit_map = {
    'min': 'minutes', 'mins': 'minutes', 'minute': 'minutes', 'minutes': 'minutes',
    'season': 'seasons', 'seasons': 'seasons'
}
df['duration_unit'] = df['duration_unit'].map(unit_map).fillna(df['duration_unit']).fillna('unknown')

# Remove duplicates

In [None]:
df = df.drop_duplicates(subset=['Title', 'Type', 'Release_Year'], keep='first')

# Ensure good data types

In [None]:
df['Release_Year'] = df['Release_Year'].astype('Int64')

# Download cleaned dataset

In [None]:
df.to_csv("/content/netflix_titles.csv", index=False)


In [None]:
from google.colab import files


df.to_csv("/content/netflix_titles.csv", index=False)

files.download("/content/netflix_titles.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>