In [175]:
import pandas as pd
import numpy as np


In [177]:
# Load the titles CSV into a DataFrame; the relative path is resolved against the current working directory.
df=pd.read_csv('netflix_titles.csv')

In [178]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
release_year,8807.0,2014.180198,8.819312,1925.0,2013.0,2017.0,2019.0,2021.0


In [179]:
# Preview 10 random rows; the fixed random_state makes the same rows appear on every re-run.
df.sample(10, random_state=42)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
1249,s1250,Movie,Blanche Gardin: The All-Nighter,"Xavier Maingon, Marc-Antoine Hélard",Blanche Gardin,,"March 1, 2021",2021,TV-MA,96 min,Stand-Up Comedy,"With self-deprecating humor, French comic Blan..."
1892,s1893,Movie,A Go! Go! Cory Carson Halloween,"Alex Woo, Stanley Moore","Alan C. Lim, Smith Foreman, Maisie Benson, Ann...",,"October 2, 2020",2020,TV-Y,22 min,Children & Family Movies,"Cory, Chrissy and Freddie are on the hunt for ..."
1718,s1719,TV Show,Trial 4,,,United States,"November 11, 2020",2020,TV-MA,1 Season,"Crime TV Shows, Docuseries",Charged as a teen in the 1993 killing of a Bos...
4966,s4967,Movie,Trailer Park Boys: The Movie,Mike Clattenburg,"Robb Wells, John Paul Tremblay, Mike Smith, Lu...",Canada,"March 30, 2018",2006,R,95 min,"Comedies, Cult Movies","In this feature-length adventure, Ricky, Julia..."
5966,s5967,Movie,15-Aug,Swapnaneel Jayakar,"Rahul Pethe, Mrunmayee Deshpande, Adinath Koth...",India,"March 29, 2019",2019,TV-14,124 min,"Comedies, Dramas, Independent Movies","On India's Independence Day, a zany mishap in ..."
4701,s4702,Movie,For Here or to Go?,Rucha Humnabadkar,"Ali Fazal, Melanie Chandra, Rajit Kapoor, Amit...","United States, India","August 15, 2018",2015,TV-MA,105 min,"Comedies, Dramas, Independent Movies",A software engineer must decide if he'd rather...
5732,s5733,TV Show,Love Now,,"Annie Chen, George Hu, Bobby Dou, Vivi Lee, Ha...",Taiwan,"November 1, 2016",2012,TV-14,1 Season,"International TV Shows, Romantic TV Shows, TV ...",When her loved ones cook up a morbid plan to m...
3113,s3114,Movie,Falz Experience,Falz,"Falz, Adunni Ade, Bisola Aiyeola, Frank Donga",,"December 20, 2019",2018,TV-MA,82 min,"International Movies, Music & Musicals","Playing multiple characters, from lawyer to pr..."
7450,s7451,Movie,Mi Shivajiraje Bhosale Boltoy,Santosh Manjrekar,"Sachin Khedekar, Mahesh Manjrekar, Makarand An...",India,"January 1, 2018",2009,TV-14,148 min,"Comedies, International Movies, Sci-Fi & Fantasy",After a disgruntled man blames his Marathi her...
1990,s1991,TV Show,Challenger,,,United States,"September 16, 2020",2020,TV-14,1 Season,"Docuseries, Science & Nature TV","Engineers, officials and the crew members' fam..."


<h1 style="color:#39FF14;font-size:40px;"> Drop the duplicates</h1>

In [180]:
# Drop rows that are duplicated across every column, keeping the first occurrence (pandas defaults).
df=df.drop_duplicates()

<h1 style="color:#39FF14;font-size:40px;"> Check for NULL values</h1>

In [181]:
df.isna().sum()

show_id            0
type               0
title              0
director        2634
cast             825
country          831
date_added        10
release_year       0
rating             4
duration           3
listed_in          0
description        0
dtype: int64

<h2 style= "color:#87CEEB; "> We can encode the missing values in the 'director', 'country' and 'cast' columns either with the most frequent value or with a separate class called 'Unknown'; here we use 'Unknown'.</h2>

In [182]:
def replace_na_with_unknown(x):
    """Return ``x`` as-is, or the placeholder ``'Unknown'`` when ``x`` is missing.

    A value counts as missing when ``pd.isna`` reports it as such
    (for example ``None`` or ``NaN``). Intended for scalar cell values.
    """
    return 'Unknown' if pd.isna(x) else x

In [183]:
# Give the three free-text columns an explicit 'Unknown' category instead of NaN.
# A single vectorised fillna replaces three row-by-row .apply calls and only touches the listed columns.
df = df.fillna({'director': 'Unknown', 'country': 'Unknown', 'cast': 'Unknown'})

<h2 style= "color:#87CEEB; "> Replace NaN in 'date_added', 'duration' and 'rating' with the mode (the most frequent value)</h2>

In [184]:
def rep_mode(df,col):
    """Fill missing values in ``df[col]`` with that column's most frequent value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to update. The column is reassigned on this same object, so the
        caller's frame is modified.
    col : hashable
        Label of the column to impute.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenient reassignment.
        If the column has no observed (non-missing) values there is no mode
        to impute with, and the frame is returned unchanged.
    """
    modes=df[col].mode()
    # mode() ignores missing values, so an all-NaN column yields an empty
    # result; indexing it would raise, so there is simply nothing to fill.
    if modes.empty:
        return df
    # On ties mode() returns every winner in sorted order; taking the first
    # by position keeps the choice deterministic.
    df[col]=df[col].fillna(modes.iloc[0])
    return df

In [185]:
# Impute the remaining gaps one column at a time, in the same order as before:
# date_added, then rating, then duration.
for column_name in ('date_added', 'rating', 'duration'):
    df = rep_mode(df, column_name)

In [186]:
df.isna().sum()

show_id         0
type            0
title           0
director        0
cast            0
country         0
date_added      0
release_year    0
rating          0
duration        0
listed_in       0
description     0
dtype: int64

<h1 style="color:#39FF14;font-size:40px;"> Fixing Data Types</h1>

In [187]:
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,Unknown,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,Unknown,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",Unknown,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,Unknown,Unknown,Unknown,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,Unknown,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


<h2 style= "color:#87CEEB; "> Date column to datetime dtype</h2>

In [188]:
# Trim stray leading/trailing whitespace from the date strings before parsing them.
# The vectorised .str accessor replaces a per-row Python lambda.
df['date_added']=df['date_added'].str.strip()

In [192]:
# Parse the cleaned date strings into pandas datetime values.
df = df.assign(date_added=pd.to_datetime(df['date_added']))

In [193]:
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,Unknown,United States,2021-09-25,2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,Unknown,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,2021-09-24,2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",Unknown,2021-09-24,2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,Unknown,Unknown,Unknown,2021-09-24,2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,Unknown,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,2021-09-24,2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


<h2 style= "color:#87CEEB; "> Convert the columns that can hold several comma-separated names ('cast', 'country', 'listed_in' and 'director') into lists of strings</h2>

In [195]:
def con_mstring_list(x):
    """Split a comma-separated string into a list of clean individual names.

    Each piece is stripped of surrounding whitespace, so ``"A, B"`` becomes
    ``['A', 'B']`` rather than ``['A', ' B']``. Otherwise the same name would
    appear as two distinct values depending on its position in the string.
    Pieces that are empty after stripping (from a leading, trailing or
    doubled comma) are dropped.

    Parameters
    ----------
    x : str
        Comma-separated text, e.g. ``"United States, India"``.

    Returns
    -------
    list of str
        The individual names, in their original order.
    """
    return [part.strip() for part in x.split(',') if part.strip()]
    

In [196]:
# Turn each multi-valued text column into a list of strings, one column at a time.
for list_column in ('cast', 'country', 'listed_in', 'director'):
    df[list_column] = df[list_column].apply(con_mstring_list)

In [197]:
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,[Unknown],[United States],2021-09-25,2020,PG-13,90 min,[Documentaries],"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,Unknown,"[Ama Qamata, Khosi Ngema, Gail Mabalane, Th...",[South Africa],2021-09-24,2021,TV-MA,2 Seasons,"[International TV Shows, TV Dramas, TV Myste...","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"[Sami Bouajila, Tracy Gotoas, Samuel Jouy, ...",[Unknown],2021-09-24,2021,TV-MA,1 Season,"[Crime TV Shows, International TV Shows, TV ...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,Unknown,[Unknown],[Unknown],2021-09-24,2021,TV-MA,1 Season,"[Docuseries, Reality TV]","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,Unknown,"[Mayur More, Jitendra Kumar, Ranjan Raj, Al...",[India],2021-09-24,2021,TV-MA,2 Seasons,"[International TV Shows, Romantic TV Shows, ...",In a city of coaching centers known to train I...


<h2 style= "color:#87CEEB; "> Create a new column for seasons and a separate one for duration in minutes</h2>

In [205]:
# Build the helper frame in this cell so the notebook runs top to bottom on a fresh kernel.
# reset_index() keeps the old row index as an 'index' column beside 'duration'.
durations = df['duration'].reset_index()
durations

Unnamed: 0,index,duration
0,0,90 min
1,1,2 Seasons
2,2,1 Season
3,3,1 Season
4,4,2 Seasons
...,...,...
8802,8802,158 min
8803,8803,2 Seasons
8804,8804,88 min
8805,8805,88 min


<h3> Show the unique categories:</h3>

In [220]:
# The second whitespace-separated token of each duration is its unit ("min", "Season", "Seasons").
# Reading straight from df avoids relying on a helper variable defined in a later cell.
df['duration'].str.split().str[1].unique()

array(['min', 'Seasons', 'Season'], dtype=object)

In [223]:
# Select the column by name: a positional index (column 9) silently picks a
# different column if the layout changes or this cell is re-run after
# 'duration' has been dropped, whereas a name fails loudly with a KeyError.
durations = df['duration'].reset_index()
mins = []
seasons = []
for text in durations['duration']:
    # Every value has the form "<number> <unit>", e.g. "90 min" or "2 Seasons";
    # split once and reuse both parts.
    amount, unit = text.split()[:2]
    if unit in ('Season', 'Seasons'):
        seasons.append(int(amount))
        # -1 marks "not applicable": a show measured in seasons has no minutes.
        mins.append(-1)
    else:
        mins.append(int(amount))
        seasons.append(-1)
# NOTE(review): the -1 placeholders are ordinary integers, so they take part in
# numeric summaries such as describe(); filter them out before aggregating.

    
 

In [228]:
# Append the two numeric columns (minutes first, then seasons) and retire the
# original text column in one chained expression.
df = (
    df
    .assign(minutes=mins, seasons=seasons)
    .drop(columns=['duration'])
)

In [229]:
df

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,listed_in,description,minutes,seasons
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,[Unknown],[United States],2021-09-25,2020,PG-13,[Documentaries],"As her father nears the end of his life, filmm...",90,-1
1,s2,TV Show,Blood & Water,Unknown,"[Ama Qamata, Khosi Ngema, Gail Mabalane, Th...",[South Africa],2021-09-24,2021,TV-MA,"[International TV Shows, TV Dramas, TV Myste...","After crossing paths at a party, a Cape Town t...",-1,2
2,s3,TV Show,Ganglands,Julien Leclercq,"[Sami Bouajila, Tracy Gotoas, Samuel Jouy, ...",[Unknown],2021-09-24,2021,TV-MA,"[Crime TV Shows, International TV Shows, TV ...",To protect his family from a powerful drug lor...,-1,1
3,s4,TV Show,Jailbirds New Orleans,Unknown,[Unknown],[Unknown],2021-09-24,2021,TV-MA,"[Docuseries, Reality TV]","Feuds, flirtations and toilet talk go down amo...",-1,1
4,s5,TV Show,Kota Factory,Unknown,"[Mayur More, Jitendra Kumar, Ranjan Raj, Al...",[India],2021-09-24,2021,TV-MA,"[International TV Shows, Romantic TV Shows, ...",In a city of coaching centers known to train I...,-1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8802,s8803,Movie,Zodiac,David Fincher,"[Mark Ruffalo, Jake Gyllenhaal, Robert Downe...",[United States],2019-11-20,2007,R,"[Cult Movies, Dramas, Thrillers]","A political cartoonist, a crime reporter and a...",158,-1
8803,s8804,TV Show,Zombie Dumb,Unknown,[Unknown],[Unknown],2019-07-01,2018,TV-Y7,"[Kids' TV, Korean TV Shows, TV Comedies]","While living alone in a spooky town, a young g...",-1,2
8804,s8805,Movie,Zombieland,Ruben Fleischer,"[Jesse Eisenberg, Woody Harrelson, Emma Ston...",[United States],2019-11-01,2009,R,"[Comedies, Horror Movies]",Looking to survive in a world taken over by zo...,88,-1
8805,s8806,Movie,Zoom,Peter Hewitt,"[Tim Allen, Courteney Cox, Chevy Chase, Kat...",[United States],2020-01-11,2006,PG,"[Children & Family Movies, Comedies]","Dragged from civilian life, a former superhero...",88,-1


In [235]:
# Make sure release_year is stored as a 64-bit integer; astype converts the whole
# column at once instead of calling int() on every row.
df['release_year']=df['release_year'].astype('int64')

In [237]:
df

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,listed_in,description,minutes,seasons
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,[Unknown],[United States],2021-09-25,2020,PG-13,[Documentaries],"As her father nears the end of his life, filmm...",90,-1
1,s2,TV Show,Blood & Water,Unknown,"[Ama Qamata, Khosi Ngema, Gail Mabalane, Th...",[South Africa],2021-09-24,2021,TV-MA,"[International TV Shows, TV Dramas, TV Myste...","After crossing paths at a party, a Cape Town t...",-1,2
2,s3,TV Show,Ganglands,Julien Leclercq,"[Sami Bouajila, Tracy Gotoas, Samuel Jouy, ...",[Unknown],2021-09-24,2021,TV-MA,"[Crime TV Shows, International TV Shows, TV ...",To protect his family from a powerful drug lor...,-1,1
3,s4,TV Show,Jailbirds New Orleans,Unknown,[Unknown],[Unknown],2021-09-24,2021,TV-MA,"[Docuseries, Reality TV]","Feuds, flirtations and toilet talk go down amo...",-1,1
4,s5,TV Show,Kota Factory,Unknown,"[Mayur More, Jitendra Kumar, Ranjan Raj, Al...",[India],2021-09-24,2021,TV-MA,"[International TV Shows, Romantic TV Shows, ...",In a city of coaching centers known to train I...,-1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8802,s8803,Movie,Zodiac,David Fincher,"[Mark Ruffalo, Jake Gyllenhaal, Robert Downe...",[United States],2019-11-20,2007,R,"[Cult Movies, Dramas, Thrillers]","A political cartoonist, a crime reporter and a...",158,-1
8803,s8804,TV Show,Zombie Dumb,Unknown,[Unknown],[Unknown],2019-07-01,2018,TV-Y7,"[Kids' TV, Korean TV Shows, TV Comedies]","While living alone in a spooky town, a young g...",-1,2
8804,s8805,Movie,Zombieland,Ruben Fleischer,"[Jesse Eisenberg, Woody Harrelson, Emma Ston...",[United States],2019-11-01,2009,R,"[Comedies, Horror Movies]",Looking to survive in a world taken over by zo...,88,-1
8805,s8806,Movie,Zoom,Peter Hewitt,"[Tim Allen, Courteney Cox, Chevy Chase, Kat...",[United States],2020-01-11,2006,PG,"[Children & Family Movies, Comedies]","Dragged from civilian life, a former superhero...",88,-1


In [238]:
df.dtypes

show_id                 object
type                    object
title                   object
director                object
cast                    object
country                 object
date_added      datetime64[ns]
release_year             int64
rating                  object
listed_in               object
description             object
minutes                  int64
seasons                  int64
dtype: object

<h1 style="color:#39FF14;font-size:40px;"> Lastly, remove leading and trailing whitespace from string columns</h1>

In [249]:
def rem_ws(x):
    """Return ``x`` with leading and trailing whitespace removed.

    ``x`` must provide a ``strip()`` method (e.g. a ``str``).
    """
    trimmed = x.strip()
    return trimmed

In [252]:
# Trim surrounding whitespace in the plain-text columns, one column at a time.
for text_column in ('title', 'description'):
    df[text_column] = df[text_column].apply(rem_ws)

<h1 style="color:#39FF14;font-size:40px;"> Final Result:-</h1>

In [265]:
df.dtypes

show_id                 object
type                    object
title                   object
director                object
cast                    object
country                 object
date_added      datetime64[ns]
release_year             int64
rating                  object
listed_in               object
description             object
minutes                  int64
seasons                  int64
dtype: object

In [267]:
df.describe().T

Unnamed: 0,count,mean,min,25%,50%,75%,max,std
date_added,8807.0,2019-05-17 12:13:09.735437568,2008-01-01 00:00:00,2018-04-06 00:00:00,2019-07-04 00:00:00,2020-08-18 00:00:00,2021-09-25 00:00:00,
release_year,8807.0,2014.180198,1925.0,2013.0,2017.0,2019.0,2021.0,8.819312
minutes,8807.0,68.982627,-1.0,-1.0,88.0,106.0,312.0,51.944202
seasons,8807.0,-0.159192,-1.0,-1.0,-1.0,1.0,17.0,1.542229


In [268]:
df.isna().sum()

show_id         0
type            0
title           0
director        0
cast            0
country         0
date_added      0
release_year    0
rating          0
listed_in       0
description     0
minutes         0
seasons         0
dtype: int64

In [269]:
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,listed_in,description,minutes,seasons
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,[Unknown],[United States],2021-09-25,2020,PG-13,[Documentaries],"As her father nears the end of his life, filmm...",90,-1
1,s2,TV Show,Blood & Water,Unknown,"[Ama Qamata, Khosi Ngema, Gail Mabalane, Th...",[South Africa],2021-09-24,2021,TV-MA,"[International TV Shows, TV Dramas, TV Myste...","After crossing paths at a party, a Cape Town t...",-1,2
2,s3,TV Show,Ganglands,Julien Leclercq,"[Sami Bouajila, Tracy Gotoas, Samuel Jouy, ...",[Unknown],2021-09-24,2021,TV-MA,"[Crime TV Shows, International TV Shows, TV ...",To protect his family from a powerful drug lor...,-1,1
3,s4,TV Show,Jailbirds New Orleans,Unknown,[Unknown],[Unknown],2021-09-24,2021,TV-MA,"[Docuseries, Reality TV]","Feuds, flirtations and toilet talk go down amo...",-1,1
4,s5,TV Show,Kota Factory,Unknown,"[Mayur More, Jitendra Kumar, Ranjan Raj, Al...",[India],2021-09-24,2021,TV-MA,"[International TV Shows, Romantic TV Shows, ...",In a city of coaching centers known to train I...,-1,2


In [270]:
# Write the cleaned frame to a CSV at a relative path; index=False leaves the row index out of the file.
# NOTE(review): list-valued columns are presumably written as their string representation and
# would need parsing when the file is read back — confirm before consuming this file downstream.
df.to_csv('final_result.csv',index=False)