# Netflix

In [1]:
#imports
import pandas as pd
import numpy as np
import plotly.express as px

In [2]:
df = pd.read_csv(r"C:\Users\karim\Desktop\CDSP\Datasets\netflix_titles.csv")
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB


In [4]:
df.describe()

Unnamed: 0,release_year
count,8807.0
mean,2014.180198
std,8.819312
min,1925.0
25%,2013.0
50%,2017.0
75%,2019.0
max,2021.0


# EDA and Data cleaning

In [5]:
# Checking for null vals
df.isnull().sum()

show_id            0
type               0
title              0
director        2634
cast             825
country          831
date_added        10
release_year       0
rating             4
duration           3
listed_in          0
description        0
dtype: int64

* A total of 2634 null values for director column - Needs to be fixed/filled with unknown - can not be removed
* 825 Null values for cast column - Fill with null - can not be removed
* 831 Null values for country - Needs to be fixed/filled with unknown - can not be removed
* Rating (4) - Duration (3) - Duration will be dropped - Rating will be replaced with mean rating value
* Date added 10 but can be subbed by the release year

In [6]:
# show_id should be unique ID for every show
# Checking if duplicate IDs exist
df['show_id'].duplicated().value_counts()

False    8807
Name: show_id, dtype: int64

In [7]:
# Checking if duplicate titles exist
df['title'].duplicated().value_counts()

False    8807
Name: title, dtype: int64

In [8]:
# Let's start by cleaning the null values for director column
# First let's check if director column has any relation with cast column by using a data without any null values
cleanedDf = df.dropna()

In [9]:
cleanedDf['director'].isin(cleanedDf['cast']).value_counts()

False    5267
True       65
Name: director, dtype: int64

* Very weak relation between director and cast cols thus can't assume a value

In [10]:
# Fill null vals with unknown in director&cast
df['cast'].replace(np.nan, 'unknown', inplace = True)
df['director'].replace(np.nan, 'unknown', inplace = True)

In [11]:
#Fixing country col by replacing all nan values with the most frequent country
df['country'].value_counts()

United States                             2818
India                                      972
United Kingdom                             419
Japan                                      245
South Korea                                199
                                          ... 
Romania, Bulgaria, Hungary                   1
Uruguay, Guatemala                           1
France, Senegal, Belgium                     1
Mexico, United States, Spain, Colombia       1
United Arab Emirates, Jordan                 1
Name: country, Length: 748, dtype: int64

In [12]:
df['country'] = df['country'].fillna(df['country'].mode()[0])

In [13]:
df['country'].value_counts()

United States                             3649
India                                      972
United Kingdom                             419
Japan                                      245
South Korea                                199
                                          ... 
Romania, Bulgaria, Hungary                   1
Uruguay, Guatemala                           1
France, Senegal, Belgium                     1
Mexico, United States, Spain, Colombia       1
United Arab Emirates, Jordan                 1
Name: country, Length: 748, dtype: int64

In [14]:
# Replacing rating with most occurent rating
df['rating'].value_counts()

TV-MA       3207
TV-14       2160
TV-PG        863
R            799
PG-13        490
TV-Y7        334
TV-Y         307
PG           287
TV-G         220
NR            80
G             41
TV-Y7-FV       6
NC-17          3
UR             3
74 min         1
84 min         1
66 min         1
Name: rating, dtype: int64

In [15]:
df['rating'] = df['rating'].fillna(df['rating'].mode()[0])

In [16]:
#Dropping NaN vals in the rest of the data
df.dropna(inplace=True)

In [17]:
# Transforming date_added column to date time format and creating seperate columns for year and month
df['date_added'] = pd.to_datetime(df['date_added'])
df['year_added'] = df['date_added'].dt.year
df['month_added'] = df['date_added'].dt.month


In [18]:
# Final look at our data after cleaning
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8794 entries, 0 to 8806
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   show_id       8794 non-null   object        
 1   type          8794 non-null   object        
 2   title         8794 non-null   object        
 3   director      8794 non-null   object        
 4   cast          8794 non-null   object        
 5   country       8794 non-null   object        
 6   date_added    8794 non-null   datetime64[ns]
 7   release_year  8794 non-null   int64         
 8   rating        8794 non-null   object        
 9   duration      8794 non-null   object        
 10  listed_in     8794 non-null   object        
 11  description   8794 non-null   object        
 12  year_added    8794 non-null   int64         
 13  month_added   8794 non-null   int64         
dtypes: datetime64[ns](1), int64(3), object(10)
memory usage: 1.0+ MB


# Data visualization

# Pie chart between the number of Movies and TV Shows

In [80]:
# Ratio between TV Shows and Movies on netflix
plot = px.pie(df, values=df['type'].value_counts().values, names=df['type'].value_counts().index, 
color_discrete_sequence=['#db0000', '#000000'], title='Number of TV Shows vs Movies on Netflix'
             )
plot.update_layout(title_font_color='#000000')
plot.show()

# Top 10 contributing countries

In [182]:
# Filtering our data by getting the top 10 contributing countries and putting it in a separte df
top10 = df['country'].value_counts().nlargest(10).index
top10df = df[df['country'].isin(top10)]
plot = px.histogram(top10df, y='country', color='type', orientation='h', 
color_discrete_map= {"Movie" : 'red', "TV Show" : "black"},
title="Top 10 contributing Countries ranked", barmode='group')
plot.update_layout(title_font_color='#000000', xaxis_title='Number of content')
plot.update_yaxes(categoryorder="total ascending", title='Country')
plot.show()

# Line chart for content added by year

In [347]:
# Filtering our data by creating a new dataframe that holds the value count for each type for each year
# Creating the new data frame
s = df.groupby('year_added')['type'].value_counts()
newdf = pd.DataFrame(data=s.values, index=s.index)
newdf = newdf.reset_index()
newdf.rename(columns={0:'Count'}, inplace=True)
# Visualizing the data
plot = px.line(newdf, x='year_added', y='Count', color='type', markers=True, 
               color_discrete_map = {"Movie" : "#db0000", "TV Show" : "#000000"})
plot.update_layout(title='Line chart for content added among the years',
    xaxis_title='Year',
    yaxis_title='Number of content added', hovermode='x unified')
plot.show()

# Most actors that appeared on Netflix platform

In [348]:
# Creating new dataframe without the unknown cast rows
castdf = df[df['cast'] != 'unknown']

In [292]:
# Fucntion to each row and split the cast string into a list, and then creating a dictionary with the names
# Everytime the name appears the value is incremented by 1 
# This function returns a dictionary for the name of the actor as key and the value is the number of appearance
def get_actor_appearance(x, cdic):
    templist = x.split(', ')
    for name in templist:
        if name in cdic:
            cdic[name] = cdic[name] + 1
        else:
            cdic[name] = 1
    
    return cdic


In [302]:
# Applying the function and switching the dictionary into a pandas series
castdict = {}
castdf['cast'].apply(lambda x: get_actor_appearance(x, castdict))
castdict = pd.Series(castdict)
castdict.describe()

count    36403.000000
mean         1.759580
std          1.885462
min          1.000000
25%          1.000000
50%          1.000000
75%          2.000000
max         43.000000
dtype: float64

In [343]:
# Taking the top 25 actor appearances and plotting the graph
castdictsample = castdict.nlargest(25)
plot = px.bar(castdictsample, x=castdictsample.index, y=castdictsample.values)
plot.update_layout(title = 'Top 25 actors by appearance', xaxis_title= 'Actor name', yaxis=dict(title='Number of appearances'))
plot.update_traces(marker_color='red')
plot.show()

# Pie chart for the movie & TV shows rating

In [359]:
plot = px.pie(df, values=df['rating'].value_counts().values, names=df['rating'].value_counts().index)
plot.update_traces(text=df['rating'].value_counts().index)
plot.show()