# Exploring the streaming-platform title datasets (Netflix, Amazon Prime, Disney+ and Hulu)

In [None]:
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [None]:
# Load data: the five CSV files are read from the local Data/ folder, in the
# same order as the names they are unpacked into.
csv_paths = [
    'Data/netflix_titles.csv',
    'Data/amazon_prime_titles.csv',
    'Data/disney_plus_titles.csv',
    'Data/hulu_titles.csv',
    'Data/MoviesOnStreamingPlatforms.csv',
]
netflix, prime, disney, hulu, platforms = (pd.read_csv(path) for path in csv_paths)

In [None]:
# Show the first Netflix row to get an overview of the dataframe's columns
netflix.head(n=1)

In [None]:
# Show the first Disney+ row to get an overview of the dataframe's columns
disney.head(n=1)

In [None]:
# Show the first Hulu row to get an overview of the dataframe's columns
hulu.head(n=1)

In [None]:
# Overview of the Amazon Prime dataframe: ten randomly drawn rows
# (no seed is set, so the rows differ between runs)
prime.sample(n=10)

In [None]:
# Report the number of rows (one row per title) in each platform dataframe
print('The amount of titles per platform')
for label, frame in (
    ('netflix:', netflix),
    ('prime:', prime),
    ('disney:', disney),
    ('hulu:', hulu),
):
    print(label, frame.shape[0])

In [None]:
# Overview of the platform dataframe (loaded from
# Data/MoviesOnStreamingPlatforms.csv); the bare expression makes the
# notebook render the frame as the cell output.
platforms

In [None]:
def top_in_attr(dataframe, attribute, count=10):
    """Return the most frequent values of ``attribute`` with their row counts.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to summarise.
    attribute : str
        Column to group by. Rows whose value in this column is missing are
        ignored (default ``groupby`` behaviour).
    count : int or None, default 10
        Number of top entries to return. ``None`` returns every distinct
        value.

    Returns
    -------
    pandas.DataFrame
        A single ``'count'`` column indexed by the distinct values of
        ``attribute``, sorted from most to least frequent.
    """
    # size() counts the rows of each group directly, so the function no longer
    # depends on a 'show_id' column being present (and also works when
    # grouping by 'show_id' itself).
    counts = dataframe.groupby(attribute).size().to_frame('count')
    ranked = counts.sort_values('count', ascending=False)
    if count is None:
        return ranked
    return ranked.head(count)

In [None]:
# The ten Netflix directors with the most titles (10 is top_in_attr's default)
top_in_attr(netflix, attribute='director')

In [None]:
# Number of Netflix titles per content type (Movies vs TV shows)
top_in_attr(netflix, attribute='type')

In [None]:
# The ten most frequent values of the Netflix `country` column
top_in_attr(netflix, attribute='country')

In [None]:
# The ten release years with the most Netflix titles
top_in_attr(netflix, attribute='release_year')

In [None]:
# Number of titles per release year, one line per platform.
platform_frames = {
    'Netflix': netflix,
    'Hulu': hulu,
    'Disney+': disney,
    'Amazon Prime': prime,
}

fig, ax = plt.subplots(figsize=(15, 7))
for platform_label, frame in platform_frames.items():
    # Ask for every distinct year: top_in_attr's default (count=10) would keep
    # only the ten busiest years and the line would skip all the others.
    yearly_counts = top_in_attr(
        frame,
        'release_year',
        count=frame['release_year'].nunique(),
    )
    sns.lineplot(
        x="release_year",
        y="count",
        # release_year is the index of the result; expose it as a column
        data=yearly_counts.reset_index(),
        label=platform_label,
        legend="brief",
        ax=ax,
    )
ax.set_title('Titles per release year')
ax.set_xlabel('Release year')
ax.set_ylabel('Number of titles')
# NOTE(review): the file name mentions only Netflix although all four
# platforms are drawn; kept unchanged in case the image is referenced elsewhere.
fig.savefig("Netflix_releases_per_year.png")

In [None]:
# The ten most frequent values of the Netflix `rating` column
top_in_attr(netflix, attribute='rating')

In [None]:
# The unique categories.
# `listed_in` is split on commas into one entry per category. split(',') keeps
# any whitespace that follows a comma, so each piece is stripped; otherwise
# ' Dramas' and 'Dramas' would be reported as two different categories.
# Series.explode only accepts `ignore_index`; it is passed explicitly here
# (the former positional 'listed_in' was interpreted as that flag).
netflix_categories = (
    netflix['listed_in']
    .str.split(',')
    .explode(ignore_index=True)
    .str.strip()
    .unique()
)
netflix_categories

In [None]:
# Netflix categories grouped: number of titles listed under each category.
# Each piece is stripped because split(',') keeps the whitespace after the
# comma, which would otherwise split one category into two groups
# (e.g. ' Dramas' and 'Dramas').
listed_categories = (
    netflix['listed_in']
    .str.split(',')
    .explode(ignore_index=True)
    .str.strip()
)
category_counts = listed_categories.groupby(listed_categories).count().to_frame('count')
# Keep the category as a regular column too, so it can be used as a plot axis
category_counts['category'] = category_counts.index
# `c` holds the 20 largest categories and is what the following cell plots
c = category_counts.sort_values('count', ascending=False).head(20)

In [None]:
# Bar chart of the category counts held in `c` (built in the previous cell)
fig, ax = plt.subplots(figsize=(15, 7))
sns.barplot(data=c, x='category', y='count', ax=ax)
ax.set_title('20 Largest Netflix Categories')
ax.set_xlabel('Category')
ax.set_ylabel('Frequency')
# Vertical tick labels so the category names do not overlap
plt.xticks(rotation=90)
plt.show()

In [None]:
# Stack the Netflix, Prime, Disney+ and Hulu dataframes into one frame.
# The keys label each row with its source platform; reset_index() turns that
# label into the `level_0` column (and the original row number into `level_1`).
platform_keys = ['netflix', 'prime', 'disney', 'hulu']
platform_tables = [netflix, prime, disney, hulu]
all_streams = pd.concat(platform_tables, keys=platform_keys).reset_index()
all_streams.sample(n=5)

In [None]:
# Count titles per "<platform> - <type>" combination across all platforms
platform_and_type = all_streams["level_0"] + " - " + all_streams["type"]
print(platform_and_type)
sns.countplot(x=platform_and_type.array)
plt.xticks(rotation=-45, ha="left")
plt.savefig(
    "Amount_of_content.png",
    dpi=600,
    bbox_inches="tight",
)

### Adding information from IMDb

In [None]:
# Load the stored result of the IMDb merge.
# NOTE: this CSV is written by the `all_rated.to_csv(...)` cell further down,
# so on a fresh checkout the merge cells below have to be run once before this
# cell can succeed.
try:
    all_rated = pd.read_csv("Data/all_IMDB_rated.csv")
except FileNotFoundError as err:
    raise FileNotFoundError(
        "Data/all_IMDB_rated.csv not found - run the IMDb merge cells below "
        "(ending with all_rated.to_csv) once to create it."
    ) from err
all_rated.sample(2)

In [None]:
# Count titles per "<platform> - <type>" combination, restricted to the rows
# of the IMDb-merged dataset (all_rated).
rated_platform_and_type = all_rated["level_0"] + " - " + all_rated["type"]
print(rated_platform_and_type)
sns.countplot(x=rated_platform_and_type.array)
plt.xticks(rotation=-45, ha="left")
# Use a distinct file name: the cell that plots all_streams already saves to
# "Amount_of_content.png", and reusing that name silently overwrote its figure.
plt.savefig("Amount_of_rated_content.png", dpi=600, bbox_inches="tight")

In [None]:
# Read the tab-separated IMDb basics table.
# low_memory=False makes pandas parse the file in one pass instead of in
# chunks, which avoids mixed-dtype guesses per column.
imdb = pd.read_csv(
    "Data/title.basics.tsv",
    delimiter="\t",
    low_memory=False,
)
imdb.shape

In [None]:
# Attach IMDb rows to every platform title by matching the platform `title`
# against IMDb's `originalTitle`. The left join keeps unmatched platform
# titles, and a title that matches several IMDb rows is repeated once per match.
all_streams_imdb = all_streams.merge(
    imdb,
    how='left',
    left_on="title",
    right_on="originalTitle",
)
all_streams_imdb.shape

In [None]:
# Keep only the merged rows where IMDb's start year equals the platform's
# release year. Both sides are compared as strings
# (presumably because the two columns are stored with different dtypes - TODO confirm).
imdb_year = all_streams_imdb["startYear"].astype("string")
platform_year = all_streams_imdb["release_year"].astype("string")
all_streams_imdb = all_streams_imdb[imdb_year == platform_year]
all_streams_imdb.shape

In [None]:
# Inspect five random rows of the year-filtered merge result
all_streams_imdb.sample(n=5)

In [None]:
# Read the tab-separated IMDb ratings table
rating = pd.read_csv(
    'Data/title.ratings.tsv',
    delimiter="\t",
)

In [None]:
# Add the IMDb rating columns to the matched titles via the shared `tconst`
# key; the left join keeps titles that have no rating row.
all_rated = all_streams_imdb.merge(rating, how="left", on="tconst")

In [None]:
# Store the merged dataset (without the row index) so it can be reloaded
# with pd.read_csv("Data/all_IMDB_rated.csv")
all_rated.to_csv(
    path_or_buf="Data/all_IMDB_rated.csv",
    index=False,
)

### Using the merged dataset

In [None]:
# Distribution of IMDb average ratings, coloured by platform (`level_0`)
ratings = all_rated[["level_0", "averageRating"]]

fig, ax = plt.subplots(figsize=(15, 7))
sns.histplot(
    data=ratings,
    x="averageRating",
    hue="level_0",
    binwidth=0.5,
    ax=ax,
)
ax.set_title('Average IMDB Ratings')
ax.set_xlabel('Rating')
ax.set_ylabel('Frequency')
plt.xticks(rotation=90)
plt.show()