In [4]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
%matplotlib inline 
import seaborn as sns 
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

In [5]:
# Load the Netflix titles catalogue from the local CSV and preview the first rows.
movies = pd.read_csv('netflix_titles-2.csv')
movies.head(n=5)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,TV Show,3%,,"João Miguel, Bianca Comparato, Michel Gomes, R...",Brazil,"August 14, 2020",2020,TV-MA,4 Seasons,"International TV Shows, TV Dramas, TV Sci-Fi &...",In a future where the elite inhabit an island ...
1,s2,Movie,7:19,Jorge Michel Grau,"Demián Bichir, Héctor Bonilla, Oscar Serrano, ...",Mexico,"December 23, 2016",2016,TV-MA,93 min,"Dramas, International Movies",After a devastating earthquake hits Mexico Cit...
2,s3,Movie,23:59,Gilbert Chan,"Tedd Chan, Stella Chung, Henley Hii, Lawrence ...",Singapore,"December 20, 2018",2011,R,78 min,"Horror Movies, International Movies","When an army recruit is found dead, his fellow..."
3,s4,Movie,9,Shane Acker,"Elijah Wood, John C. Reilly, Jennifer Connelly...",United States,"November 16, 2017",2009,PG-13,80 min,"Action & Adventure, Independent Movies, Sci-Fi...","In a postapocalyptic world, rag-doll robots hi..."
4,s5,Movie,21,Robert Luketic,"Jim Sturgess, Kevin Spacey, Kate Bosworth, Aar...",United States,"January 1, 2020",2008,PG-13,123 min,Dramas,A brilliant group of students become card-coun...


In [6]:
# Number of rows (titles) in the raw frame.
movies.shape[0]

7787

In [41]:
# Drop rows that are identical across every column. Rebind the name rather than
# using inplace=True, so the frame is not mutated in place between cells.
movies = movies.drop_duplicates()

In [42]:
# Row count is unchanged after drop_duplicates(), so there are no fully identical
# rows (the comparison spans every column, show_id included).
movies.shape[0]

7787

### Total missing values from every column

In [10]:
# Count of missing values in each column.
movies.isna().sum()

show_id            0
type               0
title              0
director        2389
cast             718
country          507
date_added        10
release_year       0
rating             7
duration           0
listed_in          0
description        0
dtype: int64

#### What kind of data is really missing? 

- Titles from the US and the UK have the most rows with a missing director.
- Missing directors occur for both TV shows and movies, but far more TV shows than movies have no director listed.
        - However, raw counts can mislead if one type is simply more common in the data, so compare the share of missing directors within each type.
        - Result: 2226 of 2410 TV shows (about 92%) have no director listed, versus 163 of 5377 movies (about 3%), so TV shows really are missing director data far more often than movies.

In [22]:
# Titles with no director listed, counted per raw `country` value.
df = movies[movies['director'].isna()].pivot_table(index='country', values='show_id', aggfunc='count')

In [23]:
# Countries with the most missing-director titles first.
df.sort_values("show_id", ascending=False)

Unnamed: 0_level_0,show_id
country,Unnamed: 1_level_1
United States,717
United Kingdom,196
Japan,148
South Korea,139
India,75
Taiwan,63
Canada,62
Australia,45
France,42
Spain,37


In [26]:
# Same missing-director count, now split into one column per title type.
df = movies[movies['director'].isna()].pivot_table(
    index='country',
    columns='type',
    values='show_id',
    aggfunc='count',
)

In [28]:
# Display the pivot table currently bound to `df`.
df

type,Movie,TV Show
country,Unnamed: 1_level_1,Unnamed: 2_level_1
Argentina,,14.0
"Argentina, Spain",1.0,1.0
"Argentina, United States, Mexico",,1.0
Australia,,45.0
"Australia, Canada",,1.0
"Australia, New Zealand",,1.0
"Australia, New Zealand, United States",,1.0
"Australia, United States",,3.0
"Austria, Germany",,1.0
Belarus,,1.0


In [30]:
# Rank countries by how many TV shows lack a director.
df.sort_values("TV Show", ascending=False)

type,Movie,TV Show
country,Unnamed: 1_level_1,Unnamed: 2_level_1
United States,54.0,663.0
United Kingdom,13.0,183.0
Japan,1.0,147.0
South Korea,2.0,137.0
India,12.0,63.0
Taiwan,1.0,62.0
Canada,6.0,56.0
Australia,,45.0
France,,42.0
Spain,2.0,35.0


In [31]:
# Only rows where the director is missing: number of titles per type.
movies[movies['director'].isna()].pivot_table(columns='type', values='show_id', aggfunc='count')

type,Movie,TV Show
show_id,163,2226


In [33]:
# Only rows where the director is present: number of titles per type.
movies[movies['director'].notna()].pivot_table(columns='type', values='show_id', aggfunc='count')

type,Movie,TV Show
show_id,5214,184


In [24]:
# Titles with no cast listed, counted per raw `country` value.
df = movies[movies['cast'].isna()].pivot_table(index='country', values='show_id', aggfunc='count')

In [25]:
# Countries with the most missing-cast titles first.
df.sort_values("show_id", ascending=False)

Unnamed: 0_level_0,show_id
country,Unnamed: 1_level_1
United States,304
United Kingdom,65
India,29
Canada,18
Spain,17
France,16
Australia,9
Brazil,9
Mexico,8
Germany,6


In [37]:
# Release years from oldest to newest (ascending order is the sort_values default).
movies['release_year'].sort_values()

4867    1925
6117    1942
4960    1942
7679    1943
7342    1943
7616    1943
7268    1944
6699    1944
6657    1944
5371    1945
3425    1945
4436    1945
3608    1946
4866    1946
7072    1947
7595    1954
6141    1954
6868    1955
5425    1955
5122    1955
1620    1956
2233    1956
1241    1958
2410    1958
1169    1958
7302    1959
3187    1960
4563    1960
5624    1960
1152    1960
4300    1962
3707    1962
4981    1962
5351    1963
6953    1963
5376    1964
6221    1965
1793    1965
480     1966
7538    1967
6082    1967
1041    1967
1497    1967
7609    1967
1344    1968
5285    1968
6793    1968
4635    1968
5785    1968
4970    1969
7242    1969
6255    1970
6544    1970
1970    1971
7628    1971
165     1971
3525    1971
2152    1971
770     1972
5907    1972
4216    1972
3442    1972
4008    1973
7737    1973
6850    1973
1283    1973
1414    1973
1016    1973
3176    1973
3131    1973
2013    1973
3358    1973
6613    1974
4217    1974
392     1974
2953    1974
3915    1974

### What does release year look like?

- Release years in the dataset range from 1925 to 2021.

In [38]:
# Earliest release year in the data.
movies.loc[:, 'release_year'].min()

1925

In [39]:
# Latest release year in the data.
movies.loc[:, 'release_year'].max()

2021

In [40]:
# Print the frame summary: column names, non-null counts and dtypes.
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7787 entries, 0 to 7786
Data columns (total 12 columns):
show_id         7787 non-null object
type            7787 non-null object
title           7787 non-null object
director        5398 non-null object
cast            7069 non-null object
country         7280 non-null object
date_added      7777 non-null object
release_year    7787 non-null int64
rating          7780 non-null object
duration        7787 non-null object
listed_in       7787 non-null object
description     7787 non-null object
dtypes: int64(1), object(11)
memory usage: 730.1+ KB


### What numerical data do we have in this dataset?

In [44]:
movies.describe() # release_year is the only int64 column, so it is the only one summarised

Unnamed: 0,release_year
count,7787.0
mean,2013.93258
std,8.757395
min,1925.0
25%,2013.0
50%,2017.0
75%,2018.0
max,2021.0


In [45]:
# Peek at the first three rows again before deriving new columns.
movies.head(n=3)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,TV Show,3%,,"João Miguel, Bianca Comparato, Michel Gomes, R...",Brazil,"August 14, 2020",2020,TV-MA,4 Seasons,"International TV Shows, TV Dramas, TV Sci-Fi &...",In a future where the elite inhabit an island ...
1,s2,Movie,7:19,Jorge Michel Grau,"Demián Bichir, Héctor Bonilla, Oscar Serrano, ...",Mexico,"December 23, 2016",2016,TV-MA,93 min,"Dramas, International Movies",After a devastating earthquake hits Mexico Cit...
2,s3,Movie,23:59,Gilbert Chan,"Tedd Chan, Stella Chung, Henley Hii, Lawrence ...",Singapore,"December 20, 2018",2011,R,78 min,"Horror Movies, International Movies","When an army recruit is found dead, his fellow..."


- Next step: derive some new columns from the categorical fields in this data.

In [47]:
# Distinct genre strings found in `listed_in`.
movies.loc[:, 'listed_in'].unique()

array(['International TV Shows, TV Dramas, TV Sci-Fi & Fantasy',
       'Dramas, International Movies',
       'Horror Movies, International Movies',
       'Action & Adventure, Independent Movies, Sci-Fi & Fantasy',
       'Dramas', 'International TV Shows, TV Dramas, TV Mysteries',
       'Horror Movies, International Movies, Thrillers',
       'Dramas, Thrillers',
       'Crime TV Shows, International TV Shows, TV Dramas',
       'Crime TV Shows, Docuseries, International TV Shows',
       'Documentaries, International Movies, Sports Movies',
       'Independent Movies, Sci-Fi & Fantasy, Thrillers',
       'Dramas, International Movies, Thrillers',
       'International TV Shows, TV Dramas',
       'Comedies, Dramas, Independent Movies', 'Sports Movies',
       'Dramas, Independent Movies, International Movies',
       'Action & Adventure, Dramas, International Movies',
       'Anime Series, International TV Shows', 'Documentaries',
       'Reality TV', 'Documentaries, International

In [48]:
# Distinct description texts.
movies.loc[:, 'description'].unique()

array([ 'In a future where the elite inhabit an island paradise far from the crowded slums, you get one chance to join the 3% saved from squalor.',
       'After a devastating earthquake hits Mexico City, trapped survivors from all walks of life wait to be rescued while trying desperately to stay alive.',
       "When an army recruit is found dead, his fellow soldiers are forced to confront a terrifying secret that's haunting their jungle island training camp.",
       ...,
       "In this documentary, South African rapper Nasty C hits the stage and streets of Tokyo, introducing himself to the city's sights, sounds and culture.",
       'Dessert wizard Adriano Zumbo looks for the next “Willy Wonka” in this tense competition that finds skilled amateurs competing for a $100,000 prize.',
       'This documentary delves into the mystique behind the blues-rock trio and explores how the enigmatic band created their iconic look and sound.'], dtype=object)

In [49]:
# Distinct duration strings; the output mixes "<n> min" and "<n> Season(s)" values.
movies.loc[:, 'duration'].unique()

array(['4 Seasons', '93 min', '78 min', '80 min', '123 min', '1 Season',
       '95 min', '119 min', '118 min', '143 min', '103 min', '89 min',
       '91 min', '149 min', '144 min', '124 min', '87 min', '110 min',
       '128 min', '117 min', '100 min', '2 Seasons', '84 min', '99 min',
       '90 min', '102 min', '104 min', '105 min', '56 min', '125 min',
       '81 min', '97 min', '106 min', '107 min', '109 min', '44 min',
       '75 min', '101 min', '3 Seasons', '37 min', '113 min', '114 min',
       '130 min', '94 min', '140 min', '135 min', '82 min', '70 min',
       '121 min', '92 min', '164 min', '53 min', '83 min', '116 min',
       '86 min', '120 min', '96 min', '126 min', '129 min', '77 min',
       '137 min', '148 min', '28 min', '122 min', '176 min', '85 min',
       '22 min', '68 min', '111 min', '29 min', '142 min', '168 min',
       '21 min', '59 min', '20 min', '98 min', '108 min', '76 min',
       '26 min', '156 min', '30 min', '57 min', '150 min', '133 min',
       '1