In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly as py
import plotly.express as px
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot, plot
import warnings
# Silence every Python warning for the rest of the session. Note that this
# also hides deprecation notices raised by the libraries imported above.
warnings.filterwarnings('ignore')
# IPython magic: render matplotlib figures inline in the cell output.
%matplotlib inline

# Do not truncate columns when a DataFrame is displayed.
pd.set_option('display.max_columns', None)


In [None]:
#Load Data and Basic Analysis

In [None]:
# Read the catalogue from the CSV next to the notebook, preview three
# randomly chosen rows and report the (rows, columns) shape.
data = pd.read_csv(filepath_or_buffer='netflix_titles.csv')
preview_rows = data.sample(n=3)
display(preview_rows)
print('Data Shape: ', data.shape)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
3474,s3475,Movie,Kurt & Courtney,Nick Broomfield,"Kurt Cobain, Courtney Love",United Kingdom,"January 8, 2019",1998,R,95 min,"Documentaries, Music & Musicals",This film charts Nirvana's Kurt Cobain's rise ...
4799,s4800,Movie,Paul Blart: Mall Cop,Steve Carr,"Kevin James, Keir O'Donnell, Jayma Mays, Raini...",United States,"November 1, 2020",2009,PG,91 min,"Action & Adventure, Comedies",An overzealous security guard finds himself in...
28,s29,Movie,#AnneFrank - Parallel Stories,"Sabina Fedeli, Anna Migotto","Helen Mirren, Gengher Gatti",Italy,"July 1, 2020",2019,TV-14,95 min,"Documentaries, International Movies","Through her diary, Anne Frank's story is retol..."


Data Shape:  (7787, 12)


In [None]:
# Print the column names, non-null counts, dtypes and memory usage of the frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7787 entries, 0 to 7786
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       7787 non-null   object
 1   type          7787 non-null   object
 2   title         7787 non-null   object
 3   director      5398 non-null   object
 4   cast          7069 non-null   object
 5   country       7280 non-null   object
 6   date_added    7777 non-null   object
 7   release_year  7787 non-null   int64 
 8   rating        7780 non-null   object
 9   duration      7787 non-null   object
 10  listed_in     7787 non-null   object
 11  description   7787 non-null   object
dtypes: int64(1), object(11)
memory usage: 730.2+ KB


In [None]:
# Number of missing values in each column (isna is the same check as isnull).
data.isna().sum()

show_id            0
type               0
title              0
director        2389
cast             718
country          507
date_added        10
release_year       0
rating             7
duration           0
listed_in          0
description        0
dtype: int64

In [None]:
# Derive the year and month a title was added from the free-text
# 'date_added' column (values look like "January 8, 2019").
#
# Missing dates get the placeholder 'NaN Data' so the string operations below
# work on every row; those rows end up with year 'Data' and month 'NaN'.
# Surrounding whitespace is stripped first: a leading space would otherwise
# make the first space-separated token (the month) an empty string, and a
# trailing space would shift the last-four-characters slice used for the year.
data['date_added'] = data['date_added'].fillna('NaN Data').str.strip()
data['year'] = data['date_added'].str[-4:]               # last four characters, kept as text
data['month'] = data['date_added'].str.split(' ').str[0]  # first space-separated token

display(data.sample(3))

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,year,month
3493,s3494,TV Show,La Femme,,"Zoe Tay, Ann Kok, Tiffany Leong, Tay Ping Hui,...",,"October 16, 2017",2016,TV-14,1 Season,"International TV Shows, TV Dramas",Personal desires guide the lives of a marriage...,2017,October
6599,s6600,Movie,The Little Hours,Jeff Baena,"Alison Brie, Dave Franco, Kate Micucci, Aubrey...","Canada, United States","December 23, 2018",2017,R,89 min,"Comedies, Independent Movies",Life at a convent takes an unruly turn when th...,2018,December
4256,s4257,Movie,Mr. Romantic,Ahmed Al-Badry,"Mohamed Emam, Hassan Hosny, Lebleba, Medhat Te...",,"June 2, 2020",2009,TV-14,97 min,"Comedies, International Movies, Romantic Movies",When a carefree man is left with his uncle's f...,2020,June


In [None]:
#Netflix has only two major content types: TV Shows and Movies

In [None]:
# Bar chart of how many titles fall under each content type.
type_counts = data['type'].value_counts()
val = type_counts.index
cnt = type_counts.values

bars = go.Bar(x=val, y=cnt, marker_color='darkturquoise')
fig = go.Figure([bars])
fig.update_layout(title_text='Netflix Sources Distribution', title_x=0.5)
fig.show()

In [None]:
#Now let's look at the trend using a bar plot.

In [None]:
# Kept even though this cell no longer needs it: later cells use
# `defaultdict` without importing it themselves.
from collections import defaultdict

# Number of titles for every (type, year-added) pair, as a plain mapping
# {(type, year): count}. The 'year' column holds text, so keys are strings.
# (The previous version stored this in a variable named `dict`, which
# rebinds the builtin for the rest of the session.)
titles_by_type_year = data.groupby(['type', 'year']).size().to_dict()

# Years shown on the x axis; pairs with no titles are plotted as 0.
x = list(np.arange(2008, 2022, 1))

y1 = [titles_by_type_year.get(('Movie', str(year)), 0) for year in x]
y2 = [titles_by_type_year.get(('TV Show', str(year)), 0) for year in x]

fig = go.Figure(data = [
    go.Bar(name='Movie', x=x, y=y1, marker_color='mediumpurple'),
    go.Bar(name='TV Show', x=x, y=y2, marker_color='lightcoral')
])
fig.update_layout(title_text='Trend Movies vs TV Shows in recent years', title_x=0.5)
fig.show()

In [None]:
# Number of titles for every (type, month-added) pair, as a plain mapping
# {(type, month): count}. Built with groupby().size() instead of a variable
# named `dict`, which would rebind the builtin for the rest of the session.
titles_by_type_month = data.groupby(['type', 'month']).size().to_dict()

# Calendar order for the x axis; pairs with no titles are plotted as 0.
x = ['January', 'February', 'March', 'April', 'May', 'June', 'July',
     'August', 'September', 'October', 'November', 'December']

y1 = [titles_by_type_month.get(('Movie', month), 0) for month in x]
y2 = [titles_by_type_month.get(('TV Show', month), 0) for month in x]

fig = go.Figure(data = [
    go.Bar(name='Movie', x=x, y=y1, marker_color='mediumpurple'),
    go.Bar(name='TV Show', x=x, y=y2, marker_color='lightcoral')
])
fig.update_layout(title_text='Trend Movies vs TV Shows during Months', title_x=0.5)
fig.show()

In [None]:
# Titles per release year, one frame per content type. count() tallies the
# non-null cells of every column; the 'show_id' column is used as the
# per-year title count in the plot below.
data_movie = data.loc[data['type'] == 'Movie'].groupby('release_year').count().reset_index()
data_tv = data.loc[data['type'] == 'TV Show'].groupby('release_year').count().reset_index()

# One line per content type on a shared figure.
fig = go.Figure()
for frame, label, colour in (
    (data_movie, 'Movies', 'mediumpurple'),
    (data_tv, 'TV Shows', 'lightcoral'),
):
    line = go.Scatter(
        x=frame['release_year'],
        y=frame['show_id'],
        mode='lines',
        name=label,
        marker_color=colour,
    )
    fig.add_trace(line)
fig.update_layout(title_text='Trend Movies vs TV Shows in recent years', title_x=0.5)
fig.show()

In [None]:
# Kept: later cells use `collections` without importing it themselves.
import collections
import string

TOP_N = 20  # number of bars shown per chart

# Missing countries become a blank string so every row can be split; the
# blank token is discarded after stripping. The column is updated in place
# because a later cell calls string methods on it.
data['country'] = data['country'].fillna(' ')

# One counter per content type. As before, every row that is not a 'Movie'
# is counted as a TV show. Iterating the two columns together replaces the
# per-row data['type'][i] lookups and the duplicated if/else bodies.
movie_counts = collections.Counter()
tv_counts = collections.Counter()
for title_type, countries in zip(data['type'], data['country']):
    target = movie_counts if title_type == 'Movie' else tv_counts
    for raw_name in countries.split(','):
        name = raw_name.lower().strip()
        if name != '':
            target[name] += 1

# most_common(n) returns the n largest counts, highest first.
top_movie = movie_counts.most_common(TOP_N)
top_tv = tv_counts.most_common(TOP_N)

x1 = [name for name, count in top_movie]
y1 = [count for name, count in top_movie]
x2 = [name for name, count in top_tv]
y2 = [count for name, count in top_tv]

fig = go.Figure([go.Bar(x=x1, y=y1, marker_color='mediumpurple')])
fig.update_layout(title_text='Top Countries where Movies are released', title_x=0.5)
fig.show()

fig = go.Figure([go.Bar(x=x2, y=y2, marker_color='lightcoral')])
fig.update_layout(title_text='Top Countries where TV Shows are released', title_x=0.5)
fig.show()

In [None]:
TOP_N = 20  # number of bars shown per chart

# Missing cast lists become a blank string so every row can be split; the
# blank token is discarded after stripping.
data['cast'] = data['cast'].fillna(' ')

# One counter per content type. As before, every row that is not a 'Movie'
# is counted as a TV show. Iterating the two columns together replaces the
# per-row data['type'][i] lookups and the duplicated if/else bodies.
movie_counts = collections.Counter()
tv_counts = collections.Counter()
for title_type, cast_list in zip(data['type'], data['cast']):
    target = movie_counts if title_type == 'Movie' else tv_counts
    for raw_name in cast_list.split(','):
        name = raw_name.lower().strip()
        if name != '':
            target[name] += 1

# most_common(n) returns the n largest counts, highest first.
top_movie = movie_counts.most_common(TOP_N)
top_tv = tv_counts.most_common(TOP_N)

x1 = [name for name, count in top_movie]
y1 = [count for name, count in top_movie]
x2 = [name for name, count in top_tv]
y2 = [count for name, count in top_tv]

fig = go.Figure([go.Bar(x=x1, y=y1, marker_color='mediumpurple')])
fig.update_layout(title_text='Most appeared Cast Globally in Movies', title_x=0.5)
fig.show()

fig = go.Figure([go.Bar(x=x2, y=y2, marker_color='lightcoral')])
fig.update_layout(title_text='Most appeared Cast Globally in TV Shows', title_x=0.5)
fig.show()

In [None]:
TOP_N = 20  # number of bars shown per chart

# Blank out any missing genre lists so every row can be split; the blank
# token is discarded after stripping.
data['listed_in'] = data['listed_in'].fillna(' ')

# One counter per content type. As before, every row that is not a 'Movie'
# is counted as a TV show. Iterating the two columns together replaces the
# per-row data['type'][i] lookups and the duplicated if/else bodies.
movie_counts = collections.Counter()
tv_counts = collections.Counter()
for title_type, genres in zip(data['type'], data['listed_in']):
    target = movie_counts if title_type == 'Movie' else tv_counts
    for raw_name in genres.split(','):
        name = raw_name.lower().strip()
        if name != '':
            target[name] += 1

# most_common(n) returns the n largest counts, highest first.
top_movie = movie_counts.most_common(TOP_N)
top_tv = tv_counts.most_common(TOP_N)

x1 = [name for name, count in top_movie]
y1 = [count for name, count in top_movie]
x2 = [name for name, count in top_tv]
y2 = [count for name, count in top_tv]

fig = go.Figure([go.Bar(x=x1, y=y1, marker_color='mediumpurple')])
fig.update_layout(title_text='Highest occurring genres Globally in Movies', title_x=0.5)
fig.show()

fig = go.Figure([go.Bar(x=x2, y=y2, marker_color='lightcoral')])
fig.update_layout(title_text='Highest occurring genres Globally in TV Shows', title_x=0.5)
fig.show()

In [None]:
# Top genres for a few selected countries.
#
# Every title contributes one count to each (country, genre) pair it lists.
# Both names are lower-cased AND stripped before counting: the comma-separated
# cells contain a space after each comma, so without stripping "dramas" and
# " dramas" (or "india" and " india") are tallied as different keys and the
# same pair shows up more than once in the table.
# The source frame is only read here; nothing is written back to `data`.
COUNTRIES_OF_INTEREST = ('india', 'united states', 'united kingdom')
TOP_PER_COUNTRY = 5

pair_counts = collections.Counter()
for countries, genres in zip(data['country'].fillna(''), data['listed_in'].fillna('')):
    country_names = [name.strip().lower() for name in countries.split(',')]
    genre_names = [name.strip().lower() for name in genres.split(',')]
    for country in country_names:
        if country == '':
            continue  # blank placeholder for a missing country
        for genre in genre_names:
            if genre != '':
                pair_counts[(country, genre)] += 1

# Walk the pairs from most to least frequent and keep the first
# TOP_PER_COUNTRY hits for each selected country, so the table stays
# ordered by overall count.
taken = {country: 0 for country in COUNTRIES_OF_INTEREST}
rows = []
for (country, genre), count in pair_counts.most_common():
    if country in taken and taken[country] < TOP_PER_COUNTRY:
        rows.append([country, genre, count])
        taken[country] += 1

# Build the frame once from the collected rows instead of growing it row by row.
df1 = pd.DataFrame(rows, columns=['Country', 'Genre', 'Count'])

df1

Unnamed: 0,Country,Genre,Count
0,india,international movies,776
1,united states,documentaries,397
2,india,dramas,382
3,united states,dramas,377
4,united states,comedies,328
5,united states,independent movies,297
6,united states,children & family movies,290
7,india,comedies,254
8,india,dramas,229
9,united kingdom,british tv shows,206


In [None]:
# Hand-assigned ratings for seven specific rows, addressed by row position.
# NOTE(review): presumably these are the rows whose rating is missing - confirm
# against the raw file. The column is looked up by name rather than by a
# hard-coded position, so adding or reordering columns cannot redirect the
# writes to a different column.
rating_col = data.columns.get_loc('rating')
manual_ratings = {
    67: 'R',
    2359: 'TV-14',
    3660: 'TV-PG',
    3736: 'R',
    3737: 'R',
    3738: 'R',
    4323: 'PG-13',
}
for row_pos, rating in manual_ratings.items():
    data.iloc[row_pos, rating_col] = rating

# Maturity rating -> audience bucket. Ratings absent from this mapping
# become NaN in 'age_group' and are left out of the counts below.
MR_age = {'TV-MA': 'Adults',
          'R': 'Adults',
          'PG-13': 'Teens',
          'TV-14': 'Young Adults',
          'TV-PG': 'Older Kids',
          'NR': 'Adults',
          'TV-G': 'Kids',
          'TV-Y': 'Kids',
          'TV-Y7': 'Older Kids',
          'PG': 'Older Kids',
          'G': 'Kids',
          'NC-17': 'Adults',
          'TV-Y7-FV': 'Older Kids',
          'UR': 'Adults'}
data['age_group'] = data['rating'].map(MR_age)

# Count once and reuse for both axes.
age_group_counts = data['age_group'].value_counts()
val = age_group_counts.index
cnt = age_group_counts.values

fig = go.Figure([go.Bar(x=val, y=cnt, marker_color='darkturquoise')])
fig.update_layout(title_text='Age Group Distribution', title_x=0.5)
fig.show()

In [None]:
# 3D view of typical durations per release year.
#
# The previous version passed three unrelated series to one trace: movie
# durations (x), TV-show durations (y) and the release year of *all* titles
# (z). They have different lengths and do not describe the same rows, and
# 'duration' is text (e.g. '95 min', '1 Season'), so the points had no
# meaning. Here the leading number of each duration is parsed and averaged
# per release year, giving three aligned numeric series with one point per year.
data_movie = data[data['type'] == 'Movie']
data_tv = data[data['type'] == 'TV Show']

# Leading integer of the duration text; unparseable values become NaN.
movie_length = pd.to_numeric(
    data_movie['duration'].str.extract(r'(\d+)', expand=False), errors='coerce')
tv_length = pd.to_numeric(
    data_tv['duration'].str.extract(r'(\d+)', expand=False), errors='coerce')

# Mean duration per release year, joined on the year; only years that have
# both movies and TV shows are kept so every point has all three coordinates.
per_year = pd.concat(
    [
        movie_length.groupby(data_movie['release_year']).mean().rename('movie_duration'),
        tv_length.groupby(data_tv['release_year']).mean().rename('tv_duration'),
    ],
    axis=1,
).dropna().sort_index()

# create trace 1 that is 3d scatter
trace1 = go.Scatter3d(
    x=per_year['movie_duration'],
    y=per_year['tv_duration'],
    z=per_year.index,
    mode='markers',
    marker_color='darkturquoise'
)

data2 = [trace1]
layout = go.Layout(
)
fig = go.Figure(data=data2, layout=layout)
fig.update_layout(title_text='Distribution of Duration across Movies and TV Show in the past years', title_x=0.5)
# Label the axes so the figure can be read on its own.
fig.update_layout(scene={
    'xaxis': {'title': {'text': 'Mean movie duration'}},
    'yaxis': {'title': {'text': 'Mean TV show duration'}},
    'zaxis': {'title': {'text': 'Release year'}},
})
iplot(fig)

In [None]:
# Box plots of duration for each content type.
#
# 'duration' is stored as text (e.g. '95 min' for a movie, '1 Season' for a
# TV show), so plotting the raw column draws the boxes over strings. The
# leading number is extracted first; values that cannot be parsed become NaN.
# The two traces use different units, which the trace names now spell out.
data_movie = data[data['type'] == 'Movie']
data_tv = data[data['type'] == 'TV Show']

movie_minutes = pd.to_numeric(
    data_movie['duration'].str.extract(r'(\d+)', expand=False), errors='coerce')
tv_seasons = pd.to_numeric(
    data_tv['duration'].str.extract(r'(\d+)', expand=False), errors='coerce')

trace0 = go.Box(
    y = movie_minutes,
    name = "Duration of Movies (minutes)",
    marker_color='mediumpurple'
)

trace1 = go.Box(
    y = tv_seasons,
    name = "Duration of TV Shows (seasons)",
    marker_color='lightcoral'
)

data2 = [trace0,trace1]
iplot(data2)
