# Netflix EDA Project

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns

pd.set_option("display.max_columns", None)

In [2]:
df = pd.read_csv("netflix_titles.csv")
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [3]:
len(df)

8807

In [4]:
df.isna().sum()

show_id            0
type               0
title              0
director        2634
cast             825
country          831
date_added        10
release_year       0
rating             4
duration           3
listed_in          0
description        0
dtype: int64

In [5]:
df.isna().sum() / len(df) * 100

show_id          0.000000
type             0.000000
title            0.000000
director        29.908028
cast             9.367549
country          9.435676
date_added       0.113546
release_year     0.000000
rating           0.045418
duration         0.034064
listed_in        0.000000
description      0.000000
dtype: float64

In [6]:
directors_countries = (
    df.groupby('director')['country']
      .unique()
      .reset_index()
      .dropna(subset=['country'])
)

directors_countries = directors_countries.explode('country')

directors_countries['country'] = directors_countries['country'].str.strip()

directors_countries


Unnamed: 0,director,country
0,A. L. Vijay,India
1,A. Raajdheep,India
2,A. Salaam,India
3,A.R. Murugadoss,
3,A.R. Murugadoss,India
...,...,...
4523,Çagan Irmak,Turkey
4524,Ísold Uggadóttir,"Iceland, Sweden, Belgium"
4525,Óskar Thór Axelsson,Iceland
4526,Ömer Faruk Sorak,Turkey


In [7]:
df["cast"].fillna("No Cast Listed", inplace=True)

director_country_map = (
    directors_countries.groupby('director')['country']
    .apply(lambda x: ', '.join([str(c) for c in x.unique() if pd.notna(c)]))
    .to_dict()
)

df['country'] = df.apply(
    lambda row: director_country_map.get(row['director'], row['country'])
    if pd.isna(row['country']) else row['country'],
    axis=1
)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["cast"].fillna("No Cast Listed", inplace=True)


In [8]:
df.isna().sum()

show_id            0
type               0
title              0
director        2634
cast               0
country          409
date_added        10
release_year       0
rating             4
duration           3
listed_in          0
description        0
dtype: int64

In [9]:
df.fillna({'country': 'Unknown'}, inplace=True)
df.isna().sum()

show_id            0
type               0
title              0
director        2634
cast               0
country            0
date_added        10
release_year       0
rating             4
duration           3
listed_in          0
description        0
dtype: int64

In [10]:
df.drop(columns=['director'], inplace=True)
df.isna().sum()

show_id          0
type             0
title            0
cast             0
country          0
date_added      10
release_year     0
rating           4
duration         3
listed_in        0
description      0
dtype: int64

In [11]:
df[df['date_added'].isna()]

Unnamed: 0,show_id,type,title,cast,country,date_added,release_year,rating,duration,listed_in,description
6066,s6067,TV Show,A Young Doctor's Notebook and Other Stories,"Daniel Radcliffe, Jon Hamm, Adam Godley, Chris...",United Kingdom,,2013,TV-MA,2 Seasons,"British TV Shows, TV Comedies, TV Dramas","Set during the Russian Revolution, this comic ..."
6174,s6175,TV Show,Anthony Bourdain: Parts Unknown,Anthony Bourdain,United States,,2018,TV-PG,5 Seasons,Docuseries,This CNN original series has chef Anthony Bour...
6795,s6796,TV Show,Frasier,"Kelsey Grammer, Jane Leeves, David Hyde Pierce...",United States,,2003,TV-PG,11 Seasons,"Classic & Cult TV, TV Comedies",Frasier Crane is a snooty but lovable Seattle ...
6806,s6807,TV Show,Friends,"Jennifer Aniston, Courteney Cox, Lisa Kudrow, ...",United States,,2003,TV-14,10 Seasons,"Classic & Cult TV, TV Comedies",This hit sitcom follows the merry misadventure...
6901,s6902,TV Show,Gunslinger Girl,"Yuuka Nanri, Kanako Mitsuhashi, Eri Sendai, Am...",Japan,,2008,TV-14,2 Seasons,"Anime Series, Crime TV Shows","On the surface, the Social Welfare Agency appe..."
7196,s7197,TV Show,Kikoriki,Igor Dmitriev,Unknown,,2010,TV-Y,2 Seasons,Kids' TV,A wacky rabbit and his gang of animal pals hav...
7254,s7255,TV Show,La Familia P. Luche,"Eugenio Derbez, Consuelo Duval, Luis Manuel Áv...",United States,,2012,TV-14,3 Seasons,"International TV Shows, Spanish-Language TV Sh...","This irreverent sitcom featues Ludovico, Feder..."
7406,s7407,TV Show,Maron,"Marc Maron, Judd Hirsch, Josh Brener, Nora Zeh...",United States,,2016,TV-MA,4 Seasons,TV Comedies,"Marc Maron stars as Marc Maron, who interviews..."
7847,s7848,TV Show,Red vs. Blue,"Burnie Burns, Jason Saldaña, Gustavo Sorola, G...",United States,,2015,NR,13 Seasons,"TV Action & Adventure, TV Comedies, TV Sci-Fi ...","This parody of first-person shooter games, mil..."
8182,s8183,TV Show,The Adventures of Figaro Pho,"Luke Jurevicius, Craig Behenna, Charlotte Haml...",Australia,,2015,TV-Y7,2 Seasons,"Kids' TV, TV Comedies","Imagine your worst fears, then multiply them: ..."


In [12]:
date_added_map = {
    "A Young Doctor's Notebook and Other Stories": "December 9, 2016",
    "Anthony Bourdain: Parts Unknown": "December 1, 2014",
    "Frasier": "December 30, 2020",
    "Friends": "January 1, 2015",
    "Gunslinger Girl": "March 8, 2013",
    "Kikoriki": "November 1, 2015",
    "La Familia P. Luche": "July 19, 2016",
    "Maron": "August 14, 2013",
    "Red vs. Blue": "April 1, 2014",
    "The Adventures of Figaro Pho": "February 15, 2015"
}

df['date_added'] = df.apply(
    lambda row: date_added_map.get(row['title'], row['date_added']),
    axis=1
)

df.isna().sum()

show_id         0
type            0
title           0
cast            0
country         0
date_added      0
release_year    0
rating          4
duration        3
listed_in       0
description     0
dtype: int64

In [13]:
df[df[['rating', 'duration']].isna().any(axis=1)]

Unnamed: 0,show_id,type,title,cast,country,date_added,release_year,rating,duration,listed_in,description
5541,s5542,Movie,Louis C.K. 2017,Louis C.K.,United States,"April 4, 2017",2017,74 min,,Movies,"Louis C.K. muses on religion, eternal love, gi..."
5794,s5795,Movie,Louis C.K.: Hilarious,Louis C.K.,United States,"September 16, 2016",2010,84 min,,Movies,Emmy-winning comedy writer Louis C.K. brings h...
5813,s5814,Movie,Louis C.K.: Live at the Comedy Store,Louis C.K.,United States,"August 15, 2016",2015,66 min,,Movies,The comic puts his trademark hilarious/thought...
5989,s5990,Movie,13TH: A Conversation with Oprah Winfrey & Ava ...,"Oprah Winfrey, Ava DuVernay",Unknown,"January 26, 2017",2017,,37 min,Movies,Oprah Winfrey sits down with director Ava DuVe...
6827,s6828,TV Show,Gargantia on the Verdurous Planet,"Kaito Ishikawa, Hisako Kanemoto, Ai Kayano, Ka...",Japan,"December 1, 2016",2013,,1 Season,"Anime Series, International TV Shows","After falling through a wormhole, a space-dwel..."
7312,s7313,TV Show,Little Lunch,"Flynn Curry, Olivia Deeble, Madison Lu, Oisín ...",Australia,"February 1, 2018",2015,,1 Season,"Kids' TV, TV Comedies","Adopting a child's perspective, this show take..."
7537,s7538,Movie,My Honor Was Loyalty,"Leone Frisa, Paolo Vaccarino, Francesco Miglio...",Italy,"March 1, 2017",2015,,115 min,Dramas,"Amid the chaos and horror of World War II, a c..."


In [14]:
fill_values = {
    "Louis C.K. 2017": {"duration": "74 min", "rating": "TV-MA"},
    "Louis C.K.: Hilarious": {"duration": "84 min", "rating": "TV-MA"},
    "Louis C.K.: Live at the Comedy Store": {"duration": "66 min", "rating": "TV-MA"},
    "13TH: A Conversation with Oprah Winfrey & Ava DuVernay": {"duration": "37 min", "rating": "TV-14"},
    "Gargantia on the Verdurous Planet": {"duration": "1 Season", "rating": "TV-14"},
    "Little Lunch": {"duration": "1 Season", "rating": "TV-Y7"},
    "My Honor Was Loyalty": {"duration": "115 min", "rating": "TV-14"}
}

for title, vals in fill_values.items():
    for col, val in vals.items():
        df.loc[df['title'] == title, col] = df.loc[df['title'] == title, col].fillna(val)

df.isna().sum()

show_id         0
type            0
title           0
cast            0
country         0
date_added      0
release_year    0
rating          0
duration        0
listed_in       0
description     0
dtype: int64

In [15]:
df['rating'].unique()

array(['PG-13', 'TV-MA', 'PG', 'TV-14', 'TV-PG', 'TV-Y', 'TV-Y7', 'R',
       'TV-G', 'G', 'NC-17', '74 min', '84 min', '66 min', 'NR',
       'TV-Y7-FV', 'UR'], dtype=object)

In [16]:
df[df['rating'].isin(['74 min', '84 min', '66 min'])]

Unnamed: 0,show_id,type,title,cast,country,date_added,release_year,rating,duration,listed_in,description
5541,s5542,Movie,Louis C.K. 2017,Louis C.K.,United States,"April 4, 2017",2017,74 min,74 min,Movies,"Louis C.K. muses on religion, eternal love, gi..."
5794,s5795,Movie,Louis C.K.: Hilarious,Louis C.K.,United States,"September 16, 2016",2010,84 min,84 min,Movies,Emmy-winning comedy writer Louis C.K. brings h...
5813,s5814,Movie,Louis C.K.: Live at the Comedy Store,Louis C.K.,United States,"August 15, 2016",2015,66 min,66 min,Movies,The comic puts his trademark hilarious/thought...


In [17]:
df.loc[df['rating'].isin(['74 min', '84 min', '66 min']), 'rating'] = 'TV-MA'
df['rating'].unique()

array(['PG-13', 'TV-MA', 'PG', 'TV-14', 'TV-PG', 'TV-Y', 'TV-Y7', 'R',
       'TV-G', 'G', 'NC-17', 'NR', 'TV-Y7-FV', 'UR'], dtype=object)

In [18]:
df['type'].unique()

array(['Movie', 'TV Show'], dtype=object)

In [19]:
df.to_csv("netflix_titles_cleaned.csv", index=False)