In [1]:
import pandas as pd
import numpy as np

# Load dataset
df = pd.read_csv("netflix1.csv")

# Preview raw data
df.head()

Unnamed: 0,show_id,type,title,director,country,date_added,release_year,rating,duration,listed_in
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,United States,9/25/2021,2020,PG-13,90 min,Documentaries
1,s3,TV Show,Ganglands,Julien Leclercq,France,9/24/2021,2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act..."
2,s6,TV Show,Midnight Mass,Mike Flanagan,United States,9/24/2021,2021,TV-MA,1 Season,"TV Dramas, TV Horror, TV Mysteries"
3,s14,Movie,Confessions of an Invisible Girl,Bruno Garotti,Brazil,9/22/2021,2021,TV-PG,91 min,"Children & Family Movies, Comedies"
4,s8,Movie,Sankofa,Haile Gerima,United States,9/24/2021,1993,TV-MA,125 min,"Dramas, Independent Movies, International Movies"


In [16]:
df.describe()

  df.describe()


Unnamed: 0,show_id,type,title,director,country,date_added,release_year,rating,duration,listed_in
count,8790,8790,8790,8790,8790,8790,8790,8790,8790,8790
unique,8790,2,8781,4526,86,1713,74,14,220,513
top,s1,movie,esperando la carroza,not given,united states,2020-01-01,1970-01-01 00:00:00.000002018,tv-ma,1 season,dramas|international movies
freq,1,6126,2,2588,3240,110,1146,3205,1791,362
first,,,,,,,1970-01-01 00:00:00.000001925,,,
last,,,,,,,1970-01-01 00:00:00.000002021,,,


In [3]:
# Dataset info
df.info()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8790 entries, 0 to 8789
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8790 non-null   object
 1   type          8790 non-null   object
 2   title         8790 non-null   object
 3   director      8790 non-null   object
 4   country       8790 non-null   object
 5   date_added    8790 non-null   object
 6   release_year  8790 non-null   int64 
 7   rating        8790 non-null   object
 8   duration      8790 non-null   object
 9   listed_in     8790 non-null   object
dtypes: int64(1), object(9)
memory usage: 686.8+ KB


In [4]:
# Check missing values
df.isnull().sum()


show_id         0
type            0
title           0
director        0
country         0
date_added      0
release_year    0
rating          0
duration        0
listed_in       0
dtype: int64

In [5]:
# Remove duplicates
df.drop_duplicates(inplace=True)

print("Duplicates removed.")


Duplicates removed.


In [6]:
# Categorical columns (text)
categorical_cols = df.select_dtypes(include=['object']).columns

# Fill missing text values with "Unknown"
for col in categorical_cols:
    df[col].fillna("Unknown", inplace=True)


# Numerical columns (numbers)
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns

# Fill missing numeric values with median to avoid skew
for col in numeric_cols:
    df[col].fillna(df[col].median(), inplace=True)

In [8]:
# Identify possible date columns (change names as per dataset)
date_cols = ["date_added", "release_year"]

# Convert them into proper datetime format
# errors='coerce' converts bad dates into NaT instead of crashing
for col in date_cols:
    if col in df.columns:
        df[col] = pd.to_datetime(df[col], errors='coerce')



In [11]:
# Convert text to lowercase and remove leading/trailing spaces
for col in categorical_cols:
    df[col] = df[col].astype(str).str.strip().str.lower()


In [12]:
multi_cols = ["cast", "country", "listed_in"]  # update as needed

for col in multi_cols:
    if col in df.columns:
        df[col] = df[col].str.replace(", ", "|")

In [13]:
# Standardize the "type" column if present (movie / tv show)
if "type" in df.columns:
    df["type"] = df["type"].replace({
        "Movie": "movie",
        "TV Show": "tv show"
    })

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8790 entries, 0 to 8789
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   show_id       8790 non-null   object        
 1   type          8790 non-null   object        
 2   title         8790 non-null   object        
 3   director      8790 non-null   object        
 4   country       8790 non-null   object        
 5   date_added    8790 non-null   object        
 6   release_year  8790 non-null   datetime64[ns]
 7   rating        8790 non-null   object        
 8   duration      8790 non-null   object        
 9   listed_in     8790 non-null   object        
dtypes: datetime64[ns](1), object(9)
memory usage: 755.4+ KB


In [15]:
df.head()

Unnamed: 0,show_id,type,title,director,country,date_added,release_year,rating,duration,listed_in
0,s1,movie,dick johnson is dead,kirsten johnson,united states,2021-09-25,1970-01-01 00:00:00.000002020,pg-13,90 min,documentaries
1,s3,tv show,ganglands,julien leclercq,france,2021-09-24,1970-01-01 00:00:00.000002021,tv-ma,1 season,crime tv shows|international tv shows|tv actio...
2,s6,tv show,midnight mass,mike flanagan,united states,2021-09-24,1970-01-01 00:00:00.000002021,tv-ma,1 season,tv dramas|tv horror|tv mysteries
3,s14,movie,confessions of an invisible girl,bruno garotti,brazil,2021-09-22,1970-01-01 00:00:00.000002021,tv-pg,91 min,children & family movies|comedies
4,s8,movie,sankofa,haile gerima,united states,2021-09-24,1970-01-01 00:00:00.000001993,tv-ma,125 min,dramas|independent movies|international movies


In [17]:
# Export cleaned file for further analysis or ML modeling
df.to_csv("netflix_cleaned.csv", index=False)

print("Cleaned dataset saved as 'netflix_cleaned.csv'.")

Cleaned dataset saved as 'netflix_cleaned.csv'.
