# Project 3 : Binging Netflix Data

## Part 1 --- ETL (Extract, Transform, Load)

In [1]:
# Dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import requests
import json

### Wrangling Data
- Base dataset from kaggle, some entries missing IMDb score;
- Merging with two datasets from IMDb, one contains title+id, one contains score+id, merge on title;

In [2]:
# Read the 'Netflix_Engagement_Plus.csv'data file as engagement_df
engagement_df = pd.read_csv("data/Netflix_Engagement_Plus.csv")
engagement_df.head()

Unnamed: 0,Title,Available Globally?,Release Date,Hours Viewed,Number of Ratings,Rating,Genre,Key Words,Description
0,The Night Agent: Season 1,Yes,2023-03-23,812100000,7696.0,6.0,"['Biography', 'Drama', 'History']","persian empire,empire,5th century b.c.,achaeme...",
1,Ginny & Georgia: Season 2,Yes,2023-01-05,665100000,5216.0,5.7,"['Comedy', 'Drama', 'Romance']","producer,three word title,headstrong,arranged ...",The film follows headstrong Ginny who meets Su...
2,The Glory: Season 1 // 더 글로리: 시즌 1,Yes,2022-12-30,622800000,11869.0,8.4,['Short'],,
3,Wednesday: Season 1,Yes,2022-11-23,507700000,,,['Talk-Show'],youtube video,MsMojo counts down the top 10 Wednesday (2022)...
4,Queen Charlotte: A Bridgerton Story,Yes,2023-05-04,503000000,50077.0,7.4,"['Drama', 'History', 'Romance']","prequel,queen,historical,england,queen charlot...","Betrothed against her will to King George, you..."


In [3]:
# Read the 'data_title.tsv'data file as title_df
title_df = pd.read_csv("data/data_title.tsv",sep = '\t') 
title_df.head()

  title_df = pd.read_csv("data/data_title.tsv",sep = '\t')


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [4]:
# Read the 'data.tsv'data file as score_df
score_df = pd.read_csv("data/data.tsv",sep = '\t') 
score_df.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,2008
1,tt0000002,5.7,270
2,tt0000003,6.5,1926
3,tt0000004,5.4,178
4,tt0000005,6.2,2701


In [6]:
# Filtering out the df with missing ratings part, name it 'missing_rating_engagement_df'
missing_rating_engagement_df = engagement_df[engagement_df['Rating'].isnull()]
missing_rating_engagement_df.head(20)

Unnamed: 0,Title,Available Globally?,Release Date,Hours Viewed,Number of Ratings,Rating,Genre,Key Words,Description
3,Wednesday: Season 1,Yes,2022-11-23,507700000,,,['Talk-Show'],youtube video,MsMojo counts down the top 10 Wednesday (2022)...
7,Outer Banks: Season 3,Yes,2023-02-23,402500000,,,,,
10,Manifest: Season 4,Yes,2022-11-04,262600000,,,"['Documentary', 'Short']",,
11,Kaleidoscope: Limited Series,Yes,2023-01-01,252500000,,,,,
12,Firefly Lane: Season 2,Yes,2022-12-02,251500000,,,,,
14,Physical: 100: Season 1 // 피지컬: 100: 시즌 1,Yes,2023-01-24,235000000,,,,,
16,Love Is Blind: Season 4,Yes,2023-03-24,229700000,,,,,
20,Fake Profile: Season 1 // Perfil falso: Tempor...,No,2023-05-31,206500000,,,,,
23,"XO, Kitty: Season 1",Yes,2023-05-18,200700000,,,['Short'],,
24,Doctor Cha: Limited Series // 닥터 차정숙: 리미티드 시리즈,Yes,2023-04-15,194700000,,,,,


In [7]:
# Merge title_df and score_df datasets to create score_titile_df with both title and score
score_titile_df = pd.merge(title_df,score_df,on="tconst",how="outer")
score_titile_df.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short",5.7,2008.0
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short",5.7,270.0
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance",6.5,1926.0
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short",5.4,178.0
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short",6.2,2701.0


In [8]:
# Merge score_titile_df with missing_rating_engagement_df
rating_engagement_df = pd.merge(missing_rating_engagement_df,score_titile_df,left_on="Title",right_on="originalTitle",how="inner")
rating_engagement_df.head(20)

Unnamed: 0,Title,Available Globally?,Release Date,Hours Viewed,Number of Ratings,Rating,Genre,Key Words,Description,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes
0,AKA,Yes,2023-04-28,120000000,,,,,,tt0317052,movie,AKA,AKA,0,2002,\N,123,"Drama,Romance",6.3,1239.0
1,AKA,Yes,2023-04-28,120000000,,,,,,tt10730418,short,AKA,AKA,0,2018,\N,17,"Adventure,Family,Fantasy",,
2,AKA,Yes,2023-04-28,120000000,,,,,,tt1117442,tvEpisode,AKA,AKA,0,2007,\N,43,"Action,Crime,Drama",8.1,36.0
3,AKA,Yes,2023-04-28,120000000,,,,,,tt13444004,short,AKA,AKA,0,2020,\N,15,Short,,
4,AKA,Yes,2023-04-28,120000000,,,,,,tt13815300,tvEpisode,AKA,AKA,0,2015,\N,\N,"Music,Talk-Show",,
5,AKA,Yes,2023-04-28,120000000,,,,,,tt20200510,tvEpisode,AKA,AKA,0,2019,\N,\N,Reality-TV,,
6,AKA,Yes,2023-04-28,120000000,,,,,,tt27197387,movie,AKA,AKA,0,2023,\N,122,"Action,Crime,Thriller",6.6,12119.0
7,AKA,Yes,2023-04-28,120000000,,,,,,tt4456480,tvEpisode,AKA,AKA,0,2014,\N,\N,Music,,
8,AKA,Yes,2023-04-28,120000000,,,,,,tt6301042,short,AKA,AKA,0,2016,\N,6,"Mystery,Short",,
9,AKA,Yes,2023-04-28,120000000,,,,,,tt7955832,movie,AKA,AKA,0,\N,\N,\N,Thriller,,


In [9]:
# Filtering out the entries with ratings in order to narrow down the wanted matching title
rating_engagement_df = rating_engagement_df[rating_engagement_df['averageRating'].notnull()]
rating_engagement_df.head(20)

Unnamed: 0,Title,Available Globally?,Release Date,Hours Viewed,Number of Ratings,Rating,Genre,Key Words,Description,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes
0,AKA,Yes,2023-04-28,120000000,,,,,,tt0317052,movie,AKA,AKA,0,2002,\N,123,"Drama,Romance",6.3,1239.0
2,AKA,Yes,2023-04-28,120000000,,,,,,tt1117442,tvEpisode,AKA,AKA,0,2007,\N,43,"Action,Crime,Drama",8.1,36.0
6,AKA,Yes,2023-04-28,120000000,,,,,,tt27197387,movie,AKA,AKA,0,2023,\N,122,"Action,Crime,Thriller",6.6,12119.0
10,Stranger Things 3,Yes,2019-07-04,67000000,,,"['Short', 'Comedy']",,,tt8046346,tvEpisode,Stranger Things 3,Stranger Things 3,0,2017,\N,\N,"Action,Adventure,Comedy",9.4,16.0
11,The Gray Man,Yes,2022-07-22,58300000,,,,,,tt0360615,movie,The Gray Man,The Gray Man,0,2002,\N,87,"Action,Crime,Drama",7.9,42.0
12,The Gray Man,Yes,2022-07-22,58300000,,,,,,tt0478329,movie,The Gray Man,The Gray Man,0,2007,\N,97,"Biography,Crime,Thriller",5.9,1822.0
14,The Gray Man,Yes,2022-07-22,58300000,,,,,,tt1649418,movie,The Gray Man,The Gray Man,0,2022,\N,122,"Action,Thriller",6.5,229717.0
16,The Gray Man,Yes,2022-07-22,58300000,,,,,,tt21400610,tvEpisode,The Gray Man,The Gray Man,0,2022,\N,\N,Comedy,7.9,14.0
17,The Gray Man,Yes,2022-07-22,58300000,,,,,,tt21433764,tvEpisode,The Gray Man,The Gray Man,0,2022,\N,\N,Comedy,6.9,17.0
19,Blood & Gold,Yes,2023-05-26,51600000,,,,,,tt18073328,movie,Blood & Gold,Blood & Gold,0,2023,\N,98,"Action,Drama,War",6.5,15493.0


In [10]:
# Checking the data types of all columns
rating_engagement_df.dtypes

Title                   object
Available Globally?     object
Release Date            object
Hours Viewed             int64
Number of Ratings      float64
Rating                 float64
Genre                   object
Key Words               object
Description             object
tconst                  object
titleType               object
primaryTitle            object
originalTitle           object
isAdult                 object
startYear               object
endYear                 object
runtimeMinutes          object
genres                  object
averageRating          float64
numVotes               float64
dtype: object

In [11]:
# Converting the data type of 'Release Date' to datetime in order to extract the year
rating_engagement_df['Release Date'] = pd.to_datetime(rating_engagement_df['Release Date'])

In [12]:
# Extracting the year from 'Release Date' to new column called 'release_year'
rating_engagement_df['release_year'] = rating_engagement_df['Release Date'].dt.strftime('%Y')
rating_engagement_df.head(20)

Unnamed: 0,Title,Available Globally?,Release Date,Hours Viewed,Number of Ratings,Rating,Genre,Key Words,Description,tconst,...,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes,release_year
0,AKA,Yes,2023-04-28,120000000,,,,,,tt0317052,...,AKA,AKA,0,2002,\N,123,"Drama,Romance",6.3,1239.0,2023.0
2,AKA,Yes,2023-04-28,120000000,,,,,,tt1117442,...,AKA,AKA,0,2007,\N,43,"Action,Crime,Drama",8.1,36.0,2023.0
6,AKA,Yes,2023-04-28,120000000,,,,,,tt27197387,...,AKA,AKA,0,2023,\N,122,"Action,Crime,Thriller",6.6,12119.0,2023.0
10,Stranger Things 3,Yes,2019-07-04,67000000,,,"['Short', 'Comedy']",,,tt8046346,...,Stranger Things 3,Stranger Things 3,0,2017,\N,\N,"Action,Adventure,Comedy",9.4,16.0,2019.0
11,The Gray Man,Yes,2022-07-22,58300000,,,,,,tt0360615,...,The Gray Man,The Gray Man,0,2002,\N,87,"Action,Crime,Drama",7.9,42.0,2022.0
12,The Gray Man,Yes,2022-07-22,58300000,,,,,,tt0478329,...,The Gray Man,The Gray Man,0,2007,\N,97,"Biography,Crime,Thriller",5.9,1822.0,2022.0
14,The Gray Man,Yes,2022-07-22,58300000,,,,,,tt1649418,...,The Gray Man,The Gray Man,0,2022,\N,122,"Action,Thriller",6.5,229717.0,2022.0
16,The Gray Man,Yes,2022-07-22,58300000,,,,,,tt21400610,...,The Gray Man,The Gray Man,0,2022,\N,\N,Comedy,7.9,14.0,2022.0
17,The Gray Man,Yes,2022-07-22,58300000,,,,,,tt21433764,...,The Gray Man,The Gray Man,0,2022,\N,\N,Comedy,6.9,17.0,2022.0
19,Blood & Gold,Yes,2023-05-26,51600000,,,,,,tt18073328,...,Blood & Gold,Blood & Gold,0,2023,\N,98,"Action,Drama,War",6.5,15493.0,2023.0


In [13]:
# Keep the scores that has the same 'startYear' and 'release_year'
manual_rating_engagement_df = rating_engagement_df[rating_engagement_df['startYear']==rating_engagement_df['release_year']]
manual_rating_engagement_df.head(20)

Unnamed: 0,Title,Available Globally?,Release Date,Hours Viewed,Number of Ratings,Rating,Genre,Key Words,Description,tconst,...,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes,release_year
6,AKA,Yes,2023-04-28,120000000,,,,,,tt27197387,...,AKA,AKA,0,2023,\N,122,"Action,Crime,Thriller",6.6,12119.0,2023
14,The Gray Man,Yes,2022-07-22,58300000,,,,,,tt1649418,...,The Gray Man,The Gray Man,0,2022,\N,122,"Action,Thriller",6.5,229717.0,2022
16,The Gray Man,Yes,2022-07-22,58300000,,,,,,tt21400610,...,The Gray Man,The Gray Man,0,2022,\N,\N,Comedy,7.9,14.0,2022
17,The Gray Man,Yes,2022-07-22,58300000,,,,,,tt21433764,...,The Gray Man,The Gray Man,0,2022,\N,\N,Comedy,6.9,17.0,2022
19,Blood & Gold,Yes,2023-05-26,51600000,,,,,,tt18073328,...,Blood & Gold,Blood & Gold,0,2023,\N,98,"Action,Drama,War",6.5,15493.0,2023
23,Tin & Tina,Yes,2023-05-26,49700000,,,,,,tt7354440,...,Tin & Tina,Tin & Tina,0,2023,\N,119,"Horror,Mystery,Thriller",4.7,6598.0,2023
28,Wish Dragon,Yes,2021-06-11,35400000,,,,,,tt5562070,...,Wish Dragon,Wish Dragon,0,2021,\N,98,"Adventure,Animation,Comedy",7.2,32235.0,2021
41,The Haunting of Hill House,Yes,2018-10-12,22000000,,,"['Short', 'Horror']",,,tt6763664,...,The Haunting of Hill House,The Haunting of Hill House,0,2018,2018,572,"Drama,Horror,Mystery",8.6,279517.0,2018
74,Look Both Ways,Yes,2022-08-17,16100000,,,,,,tt14298328,...,Look Both Ways,Look Both Ways,0,2022,\N,110,"Comedy,Drama,Romance",6.3,23246.0,2022
109,Mighty Morphin Power Rangers: Once & Always,No,2023-04-19,12200000,,,,,,tt23219684,...,Mighty Morphin Power Rangers: Once & Always,Mighty Morphin Power Rangers: Once & Always,0,2023,\N,55,"Action,Adventure,Family",5.7,6214.0,2023
