<a href="https://colab.research.google.com/github/Mrraven922/Netflix_Data_Analysis/blob/main/Netflix_Data_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#Import Libaries and data

import numpy as np # linear algebra operations
import pandas as pd # used for data preparation
import plotly.express as px #used for data visualization
from textblob import TextBlob #used for sentiment analysis

df = pd.read_csv('netflix_titles.csv')


In [2]:
#Checking number of rows and columns in data

df.shape

(14958, 12)

In [3]:
#Checking content available in Dataset

df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [4]:
#Checking number of rows and columns in data

df.columns

Index(['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added',
       'release_year', 'rating', 'duration', 'listed_in', 'description'],
      dtype='object')

In [5]:
#Taking the count of ratings available

x = df.groupby(['rating']).size().reset_index(name='counts')
print(x)

      rating  counts
0   1 Season       1
1     66 min       2
2     74 min       2
3     84 min       2
4     92 min       1
5          G      78
6      NC-17       6
7         NR     160
8         PG     479
9      PG-13     798
10         R    1346
11     TV-14    3682
12      TV-G     346
13     TV-MA    5477
14     TV-PG    1516
15      TV-Y     489
16     TV-Y7     547
17  TV-Y7-FV      12
18        UR       6


In [6]:
#Creating the Piechart based on Content rating


pieChart = px.pie(x, values='counts', names='rating', title='Distribution of content ratings on Netflix')
pieChart.show()


In [7]:
#Analyzing the top 5 Directors on Netflix


df['director']=df['director'].fillna('Director not specified')
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,Director not specified,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,Director not specified,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,Director not specified,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [8]:
#Creating an empty DataFrame to store Netflix directors data


directors_list = pd.DataFrame()
print(directors_list)

Empty DataFrame
Columns: []
Index: []


In [9]:
#Splitting multiple directors listed in a single row

directors_list = df['director'].str.split(',', expand=True).stack()
print(directors_list)


0      0           Kirsten Johnson
1      0    Director not specified
2      0           Julien Leclercq
3      0    Director not specified
4      0    Director not specified
                     ...          
14953  0             David Fincher
14954  0    Director not specified
14955  0           Ruben Fleischer
14956  0              Peter Hewitt
14957  0               Mozez Singh
Length: 16359, dtype: object


In [10]:
#Transforming the list of directors into a structured DataFrame

directors_list = directors_list.to_frame()
print(directors_list)

                              0
0     0         Kirsten Johnson
1     0  Director not specified
2     0         Julien Leclercq
3     0  Director not specified
4     0  Director not specified
...                         ...
14953 0           David Fincher
14954 0  Director not specified
14955 0         Ruben Fleischer
14956 0            Peter Hewitt
14957 0             Mozez Singh

[16359 rows x 1 columns]


In [11]:
# Renaming the column to 'Director' for clarity

directors_list.columns = ['Director']
print(directors_list)

                       Director
0     0         Kirsten Johnson
1     0  Director not specified
2     0         Julien Leclercq
3     0  Director not specified
4     0  Director not specified
...                         ...
14953 0           David Fincher
14954 0  Director not specified
14955 0         Ruben Fleischer
14956 0            Peter Hewitt
14957 0             Mozez Singh

[16359 rows x 1 columns]


In [12]:
#Calculating how often each director appears

directors = directors_list.groupby(['Director']).size().reset_index(name='Total Count')
print(directors)

                       Director  Total Count
0                 Aaron Marsden            1
1                Aaron Moorhead            3
2                   Aaron Woolf            2
3      Abbas Alibhai Burmawalla            2
4              Abdullah Al Noor            2
...                         ...          ...
5120                Çagan Irmak            2
5121           Ísold Uggadóttir            2
5122        Óskar Thór Axelsson            2
5123           Ömer Faruk Sorak            3
5124               Şenol Sönmez            3

[5125 rows x 2 columns]


In [13]:
#Filtering out 'Director not specified' entries

directors = directors[directors.Director != 'Director not specified']
print(directors)

                       Director  Total Count
0                 Aaron Marsden            1
1                Aaron Moorhead            3
2                   Aaron Woolf            2
3      Abbas Alibhai Burmawalla            2
4              Abdullah Al Noor            2
...                         ...          ...
5120                Çagan Irmak            2
5121           Ísold Uggadóttir            2
5122        Óskar Thór Axelsson            2
5123           Ömer Faruk Sorak            3
5124               Şenol Sönmez            3

[5124 rows x 2 columns]


In [14]:
#Sorted the directors by frequency

directors = directors.sort_values(by=['Total Count'], ascending = False)
print(directors)


              Director  Total Count
4072       Raúl Campos           36
262          Jan Suter           36
3240      Marcus Raboy           31
2455         Jay Karas           30
4025     Rajiv Chilaka           27
...                ...          ...
3903   Peter Thorwarth            1
3899     Peter Sollett            1
3920  Philip Barantini            1
3918     Phil Sgriccia            1
3914         Phil Lord            1

[5124 rows x 2 columns]


In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [15]:
# Extracting the top 5 directors from the sorted datase

top5Directors = directors.head()
print(top5Directors)

           Director  Total Count
4072    Raúl Campos           36
262       Jan Suter           36
3240   Marcus Raboy           31
2455      Jay Karas           30
4025  Rajiv Chilaka           27


In [17]:
#Analyzing the top 5 Actors on Netflix


df['cast']=df['cast'].fillna('No cast specified')
cast_df = pd.DataFrame()
cast_df = df['cast'].str.split(',',expand=True).stack()
cast_df = cast_df.to_frame()
cast_df.columns = ['Actor']
actors = cast_df.groupby(['Actor']).size().reset_index(name = 'Total Count')
actors = actors[actors.Actor != 'No cast specified']
actors = actors.sort_values(by=['Total Count'], ascending=False)
top5Actors = actors.head()
top5Actors = top5Actors.sort_values(by=['Total Count'])
barChart2 = px.bar(top5Actors, x='Total Count', y='Actor', title='Top 5 Actors on Netflix')
barChart2.show()


In [22]:
#Analyzing the content produced on netflix based on years


df3 = df[['release_year', 'description']]
df3 = df3.rename(columns = {'release_year':'Release Year', 'description':'Description'})

# Convert 'Release Year' to numeric, coercing errors to NaN
df3['Release Year'] = pd.to_numeric(df3['Release Year'], errors='coerce')

# Drop rows where 'Release Year' is NaN
df3 = df3.dropna(subset=['Release Year'])

for index, row in df3.iterrows():
  d=row['Description']
  if isinstance(d, str): # Check if the description is a string
    testimonial = TextBlob(d)
    p = testimonial.sentiment.polarity
    if p==0:
      sent = 'Neutral'
    elif p>0:
      sent = 'Positive'
    else:
      sent = 'Negative'
  else:
    sent = 'Neutral' # Assign 'Neutral' sentiment for non-string values

  df3.loc[[index, 2], 'Sentiment']=sent

df3 = df3.groupby(['Release Year', 'Sentiment']).size().reset_index(name = 'Total Count')

df3 = df3[df3['Release Year']>2005]
barGraph = px.bar(df3, x="Release Year", y="Total Count", color = "Sentiment", title = "Sentiment Analysis of Content on Netflix")
barGraph.show()