In [26]:
# Dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import requests
import json

In [27]:
# Load the Netflix engagement report (Excel workbook) into Netflix_df.
# header=5 uses the row at zero-based position 5 as the column header;
# the 'Unnamed: 0' column produced by the read is then dropped.
Netflix_df = (
    pd.read_excel("Resources/Netflix_Engagement_Report.xlsx", header=5)
    .drop(columns=['Unnamed: 0'])
)

In [28]:
# Display the loaded report for a quick visual check of columns and values.
Netflix_df

Unnamed: 0,Title,Available Globally?,Release Date,Hours Viewed
0,The Night Agent: Season 1,Yes,2023-03-23,812100000
1,Ginny & Georgia: Season 2,Yes,2023-01-05,665100000
2,The Glory: Season 1 // 더 글로리: 시즌 1,Yes,2022-12-30,622800000
3,Wednesday: Season 1,Yes,2022-11-23,507700000
4,Queen Charlotte: A Bridgerton Story,Yes,2023-05-04,503000000
...,...,...,...,...
18209,راس السنة,No,NaT,100000
18210,心が叫びたがってるんだ。,No,NaT,100000
18211,두근두근 내 인생,No,NaT,100000
18212,라디오 스타,No,NaT,100000


In [29]:
# Split the raw 'Title' into a clean title plus two helper columns:
#   originalTitle - the text that follows a '//' separator in the raw title
#   Year_Country  - the text found inside a parenthesised suffix
#
# Work on a copy: a plain assignment would only alias Netflix_df, so the raw
# titles would be overwritten and re-running this cell would extract from the
# already-cleaned titles (losing originalTitle / Year_Country).
Netflix2_df = Netflix_df.copy()

# Patterns are raw strings so the regex escapes are not parsed as (invalid)
# Python string escapes.
# Capture everything after '//' first; the next step removes it from 'Title'.
Netflix2_df['originalTitle'] = Netflix2_df['Title'].str.extract(r'(?:\/\/)(.{1,})')
Netflix2_df['Title'] = Netflix2_df['Title'].str.replace(r'\/\/.{1,}', '', regex=True)

# Remove the ': Season <n>' marker so all seasons of a show share one title.
Netflix2_df['Title'] = Netflix2_df['Title'].str.replace(r'\:\sSeason\s\d+', '', regex=True)

# Capture the text between '(' and the last ')', then cut the title at the first '('.
Netflix2_df['Year_Country'] = Netflix2_df['Title'].str.extract(r'(?:\()(.+(?=\)))')
Netflix2_df['Title'] = Netflix2_df['Title'].str.replace(r'\(.+', '', regex=True)

# Trim the whitespace left behind by the removals above.
Netflix2_df['Title'] = Netflix2_df['Title'].str.strip()
Netflix2_df.head()

Unnamed: 0,Title,Available Globally?,Release Date,Hours Viewed,originalTitle,Year_Country
0,The Night Agent,Yes,2023-03-23,812100000,,
1,Ginny & Georgia,Yes,2023-01-05,665100000,,
2,The Glory,Yes,2022-12-30,622800000,더 글로리: 시즌 1,
3,Wednesday,Yes,2022-11-23,507700000,,
4,Queen Charlotte: A Bridgerton Story,Yes,2023-05-04,503000000,,


In [31]:
# Collapse the Netflix data to one row per cleaned title:
#   1. sum 'Hours Viewed' over every row that shares a Title (e.g. all seasons),
#   2. join the remaining columns back on from Netflix2_df,
#   3. sort by 'Release Date' and keep the first row per Title, i.e. the
#      earliest dated entry (missing dates sort last),
#   4. keep the summed hours under the name 'Hours Viewed' and drop the
#      per-row hours,
#   5. lower-case the Title.
df = (
    Netflix2_df[['Title', 'Hours Viewed']]
    .groupby(by='Title')
    .sum()
    .merge(Netflix2_df, on='Title', how='left')
    .sort_values(by=['Release Date'])
    .drop_duplicates(subset=['Title'])
    .drop(columns=['Hours Viewed_y'])
    .rename(columns={'Hours Viewed_x': 'Hours Viewed'})
    .assign(Title=lambda frame: frame['Title'].str.lower())
)
df

Unnamed: 0,Title,Hours Viewed,Available Globally?,Release Date,originalTitle,Year_Country
1164,arrested development,41300000,No,2010-04-01,,
16676,trailer park boys,78100000,Yes,2010-09-22,,
7904,la reina del sur,616800000,No,2011-09-05,,
8253,lilyhammer,7800000,No,2012-02-06,,
6400,house of cards,68900000,No,2013-02-01,,
...,...,...,...,...,...,...
18209,레드슈즈,200000,No,NaT,,
18210,비상선언,18600000,No,NaT,,
18211,선생 김봉두,100000,No,NaT,,
18212,침묵,400000,No,NaT,,


In [32]:
# Load the tab-separated title data file "Resources/data_title.tsv" into title_df.
# NOTE(review): the saved output of this cell echoes the read_csv line, which
# looks like the tail of a pandas warning raised by this read (possibly a
# mixed-dtype warning) - confirm, and consider passing explicit dtype= values.
title_df = pd.read_csv("Resources/data_title.tsv", delimiter='\t')
title_df.head()

  title_df = pd.read_csv("Resources/data_title.tsv",sep = '\t')


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [34]:
# Clean up the IMDb title information to remove unnecessary records:
#   * exclude the 'videoGame', 'tvPilot' and 'tvEpisode' title types,
#   * rename 'primaryTitle' to 'Title' so it shares a column name with the
#     Netflix data,
#   * lower-case 'Title', as was done for the Netflix titles.
Filtered_title_df = (
    title_df.loc[~title_df['titleType'].isin(['videoGame', 'tvPilot', 'tvEpisode'])]
    .rename(columns={'primaryTitle': 'Title'})
    .assign(Title=lambda frame: frame['Title'].str.lower())
)
Filtered_title_df

Unnamed: 0,tconst,titleType,Title,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,pauvre pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,blacksmith scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"
...,...,...,...,...,...,...,...,...,...
10415838,tt9916730,movie,6 gunn,6 Gunn,0,2017,\N,116,Drama
10415848,tt9916754,movie,chico albuquerque - revelações,Chico Albuquerque - Revelações,0,2013,\N,49,Documentary
10415849,tt9916756,short,pretty pretty black girl,Pretty Pretty Black Girl,0,2019,\N,\N,Short
10415853,tt9916764,short,38,38,0,2018,\N,\N,Short


In [35]:
# Left-join the IMDb records onto the Netflix titles using the lower-cased
# 'Title'. Every Netflix title is kept: a title appears once per IMDb record
# with the same title, and titles without a match get nulls in the IMDb
# columns. The column both frames share ('originalTitle') is suffixed
# _x (Netflix side) and _y (IMDb side).
Merged_df = df.merge(Filtered_title_df, how='left', on='Title')
Merged_df

Unnamed: 0,Title,Hours Viewed,Available Globally?,Release Date,originalTitle_x,Year_Country,tconst,titleType,originalTitle_y,isAdult,startYear,endYear,runtimeMinutes,genres
0,arrested development,41300000,No,2010-04-01,,,tt0367279,tvSeries,Arrested Development,0,2003,2019,22,Comedy
1,arrested development,41300000,No,2010-04-01,,,tt0376489,short,Arrested Development,0,2003,\N,6,Short
2,arrested development,41300000,No,2010-04-01,,,tt0901469,movie,Arrested Development,0,\N,\N,\N,Comedy
3,trailer park boys,78100000,Yes,2010-09-22,,,tt0290988,tvSeries,Trailer Park Boys,0,2001,2018,30,"Comedy,Crime"
4,trailer park boys,78100000,Yes,2010-09-22,,,tt0383678,movie,Trailer Park Boys,0,1999,\N,67,Comedy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81286,레드슈즈,200000,No,NaT,,,,,,,,,,
81287,비상선언,18600000,No,NaT,,,,,,,,,,
81288,선생 김봉두,100000,No,NaT,,,,,,,,,,
81289,침묵,400000,No,NaT,,,,,,,,,,


In [36]:
# Test dataframe: the rows of Merged_df that found no IMDb record, i.e. whose
# 'tconst' is null after the left join. Used to see how many Netflix titles
# are still unmatched.
# Built directly from Merged_df (not from a pre-existing test_df) so the cell
# runs on a fresh kernel and gives the same result when re-run.
test_df = Merged_df[Merged_df['tconst'].isnull()]
test_df

Unnamed: 0,Title,Hours Viewed,Available Globally?,Release Date,originalTitle_x,Year_Country,tconst,titleType,originalTitle_y,isAdult,startYear,endYear,runtimeMinutes,genres
123,mako mermaids: an h2o adventure,50600000,No,2013-07-26,,,,,,,,,,
182,ever after high: welcome to ever after high,4600000,Yes,2014-01-31,,,,,,,,,,
183,trailer park boys live in f**kin' dublin,100000,Yes,2014-05-31,,,,,,,,,,
187,jim jefferies : bare,500000,No,2014-08-29,,,,,,,,,,
195,chelsea handler: uganda be kidding me live,200000,No,2014-10-10,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81286,레드슈즈,200000,No,NaT,,,,,,,,,,
81287,비상선언,18600000,No,NaT,,,,,,,,,,
81288,선생 김봉두,100000,No,NaT,,,,,,,,,,
81289,침묵,400000,No,NaT,,,,,,,,,,
