# This notebook is used to explore the datasets

In [None]:
import pandas as pd
import plotly.express as px
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [None]:
# Load data: one title catalogue CSV per streaming service (paths are
# relative to the notebook's working directory).
netflix = pd.read_csv('Data/netflix_titles.csv')
prime = pd.read_csv('Data/amazon_prime_titles.csv')
disney = pd.read_csv('Data/disney_plus_titles.csv')
hulu = pd.read_csv('Data/hulu_titles.csv')
# Separate table read from MoviesOnStreamingPlatforms.csv; in the cells
# visible here it is only displayed, never joined with the catalogues.
platforms = pd.read_csv('Data/MoviesOnStreamingPlatforms.csv')

In [None]:
# Overview of an individual dataframe: first row of the Netflix catalogue
netflix.head(1)

In [None]:
# Overview of an individual dataframe: first row of the Disney+ catalogue
disney.head(1)

In [None]:
# Overview of an individual dataframe: (rows, columns) of the Hulu catalogue
hulu.shape

In [None]:
# Overview of an individual dataframe: ten random rows of the Prime catalogue
# (no random_state is passed, so the rows differ between runs)
prime.sample(10)

In [None]:
# Report the number of rows (titles) held by each per-service catalogue.
print('The amount of titles per platform')
for platform_name, catalogue in (('netflix', netflix), ('prime', prime),
                                 ('disney', disney), ('hulu', hulu)):
    print(platform_name + ':', len(catalogue))

In [None]:
# Overview of the platform dataframe (the bare name displays the frame itself,
# not just a head/sample)
platforms

In [None]:
def top_in_attr(dataframe, attribute, count=10):
    """Return the `count` most frequent values of `attribute`.

    Rows are grouped by `attribute` and the non-null `show_id` entries of each
    group are counted. The result is a single-column DataFrame (column
    ``count``) indexed by the attribute values, ordered from the most to the
    least frequent and cut off after `count` rows.
    """
    per_value = dataframe.groupby(attribute)['show_id'].count().to_frame('count')
    ranked = per_value.sort_values('count', ascending=False)
    return ranked.head(count)

In [None]:
# Top 10 Netflix directors by number of titles (10 is top_in_attr's default).
# The `director` cell is grouped as one raw string; it is not split here.
top_in_attr(netflix,'director')

In [None]:
# Number of Netflix titles per `type` value (absolute counts, not a share)
top_in_attr(netflix,'type')

In [None]:
# Ten most frequent `country` values on Netflix
top_in_attr(netflix,'country')

In [None]:
# Ten release years with the most Netflix titles
top_in_attr(netflix,'release_year')

In [None]:
# Title counts per release year for every service.
# NOTE(review): top_in_attr is called with its default count=10, so each line
# connects only that service's ten most frequent release years, not all years.
a = top_in_attr(netflix,'release_year')
b = top_in_attr(hulu,'release_year')
c = top_in_attr(disney,'release_year')
d = top_in_attr(prime,'release_year')

plt.figure(figsize=(15, 7))
# Draw one line per service on the same axes; `release_years` ends up holding
# the Axes returned by the last call.
for platform_label, yearly_counts in (('Netflix', a),
                                      ('Hulu', b),
                                      ('Disney+', c),
                                      ('Amazon Prime', d)):
    release_years = sns.lineplot(x="release_year",
                                 y="count",
                                 data=yearly_counts,
                                 label=platform_label,
                                 legend="brief")
plt.savefig("Releases_per_year.png")

In [None]:
# Ten most frequent `rating` values on Netflix
top_in_attr(netflix,'rating')

In [None]:
# Stack the four service catalogues into one frame. The concat keys form the
# outer index level; reset_index() turns it into the `level_0` column (service
# name) and the original row index into `level_1`.
stream_frames = [netflix, prime, disney, hulu]
stream_labels = ['Netflix', 'Prime', 'Disney', 'Hulu']
all_streams = pd.concat(stream_frames, keys=stream_labels).reset_index()
all_streams.sample(10)

In [None]:
# Count titles per (service, director): split the comma-separated director
# string into one row per person, trim whitespace, then count the pairs.
director = all_streams[["level_0","director"]].rename(columns={"level_0":"service"})
director["director"] = director["director"].str.split(",")
director = director.explode("director")
director["director"] = director["director"].str.strip()
# Name the counts column explicitly. Relying on the default name and renaming
# column 0 afterwards silently stops working on pandas >= 2.0, where the
# value_counts() result is named "count" instead of being unnamed.
cnt = director.value_counts().reset_index(name="cnt")
# NOTE(review): imdb_director_hulu_count (IMDb-derived Hulu director counts) is
# created in a cell further down the notebook, so this cell fails on a fresh
# top-to-bottom run until that cell has been executed.
cnt = pd.concat([cnt,imdb_director_hulu_count],ignore_index=True)
cnt = cnt.sort_values("cnt",ascending=False)
cnt.to_csv("Data/director_per_platform.csv")
cnt.sample(40)

In [None]:
# Count titles per (service, cast member): split the comma-separated cast
# string into one row per person, trim whitespace, then count the pairs.
cast = all_streams[["level_0","cast"]].rename(columns={"level_0":"service"})
cast["cast"] = cast["cast"].str.split(",")
cast = cast.explode("cast")
cast["cast"] = cast["cast"].str.strip()
# Name the counts column explicitly. Relying on the default name and renaming
# column 0 afterwards silently stops working on pandas >= 2.0, where the
# value_counts() result is named "count" instead of being unnamed.
cnt = cast.value_counts().reset_index(name="cnt")
# NOTE(review): imdb_cast_hulu_count (IMDb-derived Hulu cast counts) is created
# in a cell further down the notebook, so this cell fails on a fresh
# top-to-bottom run until that cell has been executed.
cnt = pd.concat([cnt,imdb_cast_hulu_count],ignore_index=True)
cnt = cnt.sort_values("cnt",ascending=False)
cnt.to_csv("Data/cast_per_platform.csv")
cnt.sample(40)

In [None]:
# Build a "<service> - <type>" label per title (level_0 is the service key
# added by the concat) and plot how many titles carry each label.
df = all_streams["level_0"] + " - " + all_streams["type"]
print(df)
sns.countplot(x=df.array)
# Tilt the tick labels so the long "<service> - <type>" strings do not overlap
plt.xticks(rotation=-45, ha="left")
# NOTE(review): the count plot in the IMDb section saves to this same file
# name, so whichever of the two cells runs last overwrites the other's image.
plt.savefig("Amount_of_content.png",dpi=600, bbox_inches = "tight")

### Adding information from IMDb

In [None]:
# Load the streaming titles joined with IMDb ratings.
# NOTE(review): this CSV is written by the `all_rated.to_csv(...)` cell further
# down, so unless the file already exists on disk this cell cannot run before
# that one.
all_rated = pd.read_csv("Data/all_IMDB_rated.csv")
all_rated.sample(2)

In [None]:
# Same "<service> - <type>" count plot as before, but built from all_rated.
df = all_rated["level_0"] + " - " + all_rated["type"]
print(df)
sns.countplot(x=df.array)
# Tilt the tick labels so the long "<service> - <type>" strings do not overlap
plt.xticks(rotation=-45, ha="left")
# NOTE(review): same output file name as the all_streams count plot above;
# this save replaces that image.
plt.savefig("Amount_of_content.png",dpi=600, bbox_inches = "tight")

In [None]:
# IMDb title table (tab-separated). Later cells use its tconst, originalTitle,
# startYear and titleType columns.
imdb = pd.read_csv("Data/title.basics.tsv", sep="\t",low_memory=False)
imdb.shape

In [None]:
# One random IMDb title row, to inspect the available columns
imdb.sample(1)

In [None]:
# Pair every Hulu title with the IMDb records whose originalTitle is the exact
# same string. Left join: each Hulu row is kept, possibly several times when
# more than one IMDb record shares the title.
hulu_imdb = hulu.merge(imdb, how='left', left_on="title", right_on="originalTitle")
hulu_imdb.shape

In [None]:
# Keep only the title matches whose IMDb startYear equals Hulu's release_year.
# Both sides are cast to strings so differing column dtypes compare textually.
imdb_year = hulu_imdb["startYear"].astype("string")
hulu_year = hulu_imdb["release_year"].astype("string")
hulu_imdb = hulu_imdb[imdb_year == hulu_year]
hulu_imdb.shape

In [None]:
# A match is kept when the Hulu type agrees with the IMDb titleType
# (Movie/movie or TV Show/tvSeries), or when the title has more than three
# spaces, in which case the type check is skipped.
hulu_kind = hulu_imdb["type"].astype("string")
imdb_kind = hulu_imdb["titleType"].astype("string")
has_long_title = hulu_imdb["title"].str.count(" ") > 3
is_movie_pair = (hulu_kind == "Movie") & (imdb_kind == "movie")
is_series_pair = (hulu_kind == "TV Show") & (imdb_kind == "tvSeries")
hulu_imdb2 = hulu_imdb[has_long_title | is_movie_pair | is_series_pair]
hulu_imdb2.shape

In [None]:
# Spot-check five random Hulu/IMDb matches that survived the filters
hulu_imdb2.sample(5)

In [None]:
# IMDb principals table (tab-separated): rows link a title (tconst) to a person
# (nconst) under a `category`; the cells below filter on "actor" and "director".
imdb_cast = pd.read_csv("Data/title.principals.tsv", sep="\t",low_memory=False)


In [None]:
# First rows of the principals table
imdb_cast.head(3)

In [None]:
# (rows, columns) of the principals table
imdb_cast.shape

In [None]:
# Attach IMDb performers to every matched Hulu title (left join on the IMDb
# title id, so titles without any performer row are kept with missing values).
# IMDb files performers under two separate categories, "actor" and "actress";
# filtering on "actor" alone drops every actress from the rebuilt cast.
imdb_performers = imdb_cast[imdb_cast["category"].astype("string").isin(["actor", "actress"])]
imdb_cast_hulu = pd.merge(hulu_imdb2, imdb_performers, on="tconst", how='left')
imdb_cast_hulu.head(20)

In [None]:
# Attach IMDb director rows to every matched Hulu title (left join on the IMDb
# title id, so titles without a director row are kept with missing values).
imdb_directors = imdb_cast[imdb_cast["category"].astype("string") == "director"]
imdb_director_hulu = hulu_imdb2.merge(imdb_directors, on="tconst", how='left')
imdb_director_hulu.head(2)

In [None]:
# IMDb people table (tab-separated), joined on nconst below to obtain
# primaryName for each person id.
imdb_name = pd.read_csv("Data/name.basics.tsv", sep="\t",low_memory=False)
imdb_name.shape

In [None]:
# Resolve each cast row's person id (nconst) to the IMDb person record, which
# carries the readable primaryName.
imdb_cast_hulu_names = imdb_cast_hulu.merge(imdb_name, on="nconst", how="left")
imdb_cast_hulu_names.sample(10)

In [138]:
# Original Hulu columns; used in the next cell as the de-duplication key.
columns_group = list(hulu.columns)
# The same columns plus the person name resolved from IMDb.
all_columns = [*columns_group, "primaryName"]
print("all_collumns: ",all_columns)
print("columns_group: ",columns_group)

# Cast rows reduced to the Hulu columns + primaryName.
t_c = imdb_cast_hulu_names[all_columns]
t_c.head(5)

# Director rows reduced to the same columns.
# NOTE(review): imdb_director_hulu_names is created in a cell placed further
# down the notebook; that cell has to be run before this one.
t_d = imdb_director_hulu_names[all_columns]
t_d.head(5)


all_collumns:  ['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added', 'release_year', 'rating', 'duration', 'listed_in', 'description', 'primaryName']
columns_group:  ['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added', 'release_year', 'rating', 'duration', 'listed_in', 'description']


Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,primaryName
0,s2,Movie,Silent Night,,,,"October 23, 2021",2020,,94 min,"Crime, Drama, Thriller","Mark, a low end South London hitman recently r...",Will Thorne
1,s3,Movie,The Marksman,,,,"October 23, 2021",2021,PG-13,108 min,"Action, Thriller",A hardened Arizona rancher tries to protect an...,Robert Lorenz
2,s4,Movie,Gaia,,,,"October 22, 2021",2021,R,97 min,Horror,A forest ranger and two survivalists with a cu...,Jaco Bouwer
3,s5,Movie,Settlers,,,,"October 22, 2021",2021,,104 min,"Science Fiction, Thriller",Mankind's earliest settlers on the Martian fro...,Wyatt Rockefeller
4,s8,TV Show,The Next Thing You Eat,,,,"October 21, 2021",2021,,1 Season,"Cooking & Food, Documentaries, Lifestyle & Cul...",With the unique insights and experience of Ugl...,


In [158]:

# Rebuild Hulu's `cast` and `director` columns from the IMDb matches selected
# in the previous cell (t_c = cast rows, t_d = director rows).

# Collect all matched cast names of a title into one list per show_id, then
# overwrite the `cast` column with that list.
df_c = t_c.groupby("show_id")["primaryName"].apply(list).reset_index()
hulu_new = pd.merge(hulu,df_c,on="show_id",how="left")
hulu_new["cast"]=hulu_new["primaryName"]
hulu_new = hulu_new.drop(columns="primaryName")

# Keep a single director row per title (rows are de-duplicated on the original
# Hulu columns) and overwrite the `director` column with that name.
df_d = t_d.drop_duplicates(columns_group)[["show_id","primaryName"]]
hulu_new = pd.merge(hulu_new,df_d,on="show_id",how="left")
hulu_new["director"]=hulu_new["primaryName"]
hulu_new = hulu_new.drop(columns="primaryName")

# The former `replace([np.nan], np.nan)` step was dropped: it swapped NaN for
# NaN and was the only use of `np`, which this notebook never imports, so the
# cell raised NameError on a fresh kernel.

# Flatten each list of cast names into one comma-separated string, matching
# the format of the other catalogues, then persist the enriched table.
hulu_new["cast"] = hulu_new["cast"].str.join(", ")
hulu_new.to_csv("Data/hulu_title_joines_with_IMDB.csv")

In [142]:
# Resolve each director row's person id (nconst) to the IMDb person record,
# which carries the readable primaryName.
imdb_director_hulu_names = imdb_director_hulu.merge(imdb_name, on="nconst", how="left")
print(imdb_director_hulu_names.shape)
imdb_director_hulu_names.sample(10)


(1971, 31)


Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,...,ordering,nconst,category,job,characters,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
977,s1385,TV Show,If Loving You Is Wrong,,,United States,"September 23, 2020",2014,TV-MA,5 Seasons,...,,,,,,,,,,
1047,s1466,Movie,Elena Undone,,,United States,"August 1, 2020",2010,111 min,,...,5.0,nm0174903,director,\N,\N,Nicole Conn,1959,\N,"director,writer,editor","tt7326320,tt1575539,tt0103977,tt0430303"
1515,s2217,Movie,Don't Go,,,Ireland,"April 24, 2019",2018,92 min,,...,5.0,nm1216554,director,\N,\N,David Gleeson,1966,\N,"writer,director,producer","tt0377701,tt3339680,tt3361792,tt0488121"
1323,s1899,TV Show,The Curse of Oak Island,,,United States,"January 1, 2020",2014,TV-PG,6 Seasons,...,,,,,,,,,,
1939,s3004,TV Show,A Day in the Life,,,United States,"March 12, 2012",2011,TV-14,2 Seasons,...,3.0,nm2202756,director,\N,\N,Eon Song,\N,\N,"editorial_department,actress,producer","tt10529908,tt6432466,tt8494916"
568,s809,Movie,A Christmas Hero,,,,"April 8, 2021",2020,PG-13,86 min,...,5.0,nm0943445,director,\N,\N,Phil Wurtzel,\N,\N,"producer,director,writer","tt3268790,tt6613412,tt7235038,tt10399114"
282,s402,Movie,Kingpin,,,,"August 1, 2021",1996,TV-14,114 min,...,5.0,nm0125803,director,\N,\N,Bobby Farrelly,1958,\N,"producer,director,writer","tt0183505,tt0129387,tt0181739,tt0256380"
134,s182,Movie,Gemini,,,United States,"September 25, 2021",2017,R,93 min,...,5.0,nm1369800,director,\N,\N,Aaron Katz,1981,\N,"director,writer,editor","tt3283556,tt5795086,tt1497874,tt0914382"
106,s141,Movie,The Hunger Games: Catching Fire,,,,"October 1, 2021",2013,TV-14,182 min,...,,,,,,,,,,
956,s1355,TV Show,Halloween Baking Championship,,,,"October 1, 2020",2015,TV-PG,4 Seasons,...,,,,,,,,,,


In [None]:
# Ten random resolved cast names, as a sanity check of the name join
imdb_cast_hulu_names["primaryName"].sample(10)

In [None]:
# Number of matched Hulu rows per IMDb cast member, shaped like the
# per-platform count table (columns: cast, cnt, service).
# rename_axis/reset_index(name=...) set both column names explicitly: the
# default names produced by value_counts().reset_index() changed in pandas 2.0
# ("index"/"primaryName" became "primaryName"/"count"), which made the former
# rename() label the name column "cnt" and leave the counts as "count".
imdb_cast_hulu_count = (
    imdb_cast_hulu_names["primaryName"]
    .value_counts()
    .rename_axis("cast")
    .reset_index(name="cnt")
)
imdb_cast_hulu_count["service"]="Hulu"
imdb_cast_hulu_count.sample(10)

In [None]:
# Number of matched Hulu rows per IMDb director, shaped like the per-platform
# count table (columns: director, cnt, service).
# rename_axis/reset_index(name=...) set both column names explicitly: the
# default names produced by value_counts().reset_index() changed in pandas 2.0
# ("index"/"primaryName" became "primaryName"/"count"), which made the former
# rename() label the name column "cnt" and leave the counts as "count".
imdb_director_hulu_count = (
    imdb_director_hulu_names["primaryName"]
    .value_counts()
    .rename_axis("director")
    .reset_index(name="cnt")
)
imdb_director_hulu_count["service"]="Hulu"
imdb_director_hulu_count.sample(10)

In [None]:
# IMDb ratings table (tab-separated); joined to the streaming titles on tconst
# in the next cell.
rating = pd.read_csv('Data/title.ratings.tsv', sep="\t")

In [None]:
# Left-join the IMDb ratings onto the streaming titles by IMDb id (tconst).
# NOTE(review): `all_streams_imdb` is not assigned in any cell visible in this
# notebook; confirm where it is built, otherwise this raises NameError.
all_rated=pd.merge(all_streams_imdb,rating,on="tconst",how="left")

In [None]:
# Persist the rated table without the row index; an earlier cell reads this
# same file back with pd.read_csv.
all_rated.to_csv("Data/all_IMDB_rated.csv",index=False)

### Using the merged dataset

In [None]:
# Histogram of the averageRating column in 0.5-wide bins, coloured by
# streaming service (level_0).
ratings = all_rated[["level_0","averageRating"]]

plt.figure(figsize=(15, 7))
sns.histplot(data=ratings,binwidth=0.5,x="averageRating",hue="level_0",)
plt.title('Average IMDB Ratings')
plt.xlabel('Rating')
plt.ylabel('Frequency')
plt.xticks(rotation=90)
plt.show()

In [None]:
# The unique categories of netflix: `listed_in` holds a ", "-separated list per
# title, so split it, flatten to one category per row and de-duplicate.
# Series.explode() takes no column argument (that is DataFrame.explode); the
# former 'listed_in' argument was being read as a truthy `ignore_index`.
netflix_categories = netflix['listed_in'].str.split(', ').explode().unique()
netflix_categories

In [None]:
# categories grouped
def get_cat(df, number=20):
    """Return the `number` most frequent categories of a catalogue.

    `df['listed_in']` is expected to hold ", "-separated category lists. Each
    list is split and flattened to one category per row, the occurrences of
    every category are counted (missing values are ignored by the groupby),
    and the result is returned sorted from most to least frequent.

    Returns a DataFrame indexed by category with two columns: ``count`` and
    ``category`` (a copy of the index, convenient for plotting).
    """
    # Series.explode() accepts no column name; passing 'listed_in' only worked
    # because it was interpreted as a truthy `ignore_index`.
    categories = df['listed_in'].str.split(', ').explode()
    counts = categories.groupby(categories).count().to_frame('count')
    counts['category'] = counts.index
    return counts.sort_values('count', ascending=False)[:number]

In [None]:
# Bar chart of the most frequent Netflix categories; get_cat's default
# number=20 is what makes this the "20 largest".
plt.figure(figsize=(15, 7))
sns.barplot(x='category', y='count', data=get_cat(netflix))
plt.title('20 Largest Netflix Categories')
plt.xlabel('Category')
plt.ylabel('Frequency')
plt.xticks(rotation=90)
# Save before show() so the written file contains the rendered figure
plt.savefig('20_Largest_Netflix_Categories.png')
plt.show()

In [None]:
# Thirty most frequent categories of each service
net_a = get_cat(netflix, 30)
pr_a = get_cat(prime, 30)
hul_a = get_cat(hulu, 30)
dis_a = get_cat(disney, 30)

In [None]:
# Inspect the Netflix category table
net_a

In [None]:
# Tag every category table with its provider name. This adds a column to the
# frames created above, in place.
net_a['provider'] = 'Netflix'
pr_a['provider'] = 'Prime'
hul_a['provider'] = 'Hulu'
dis_a['provider'] = 'Disney'

In [None]:
# Stack the four tagged category tables into one long frame for the treemap
big_cat = pd.concat((net_a, pr_a, hul_a, dis_a))

In [None]:
# Treemap of category counts nested under their provider: tile area follows
# `count`, the hierarchy is provider -> category.
# (plotly.express is imported as `px` in the notebook's import cell.)
fig = px.treemap(big_cat, path=['provider', 'category'], values='count')
fig.show()