# Data Exploration

In [None]:
# Change the working directory to the parent folder. The CSV paths used below
# are relative ("cumulative-data/...", "data-for-charts/..."), so they are
# presumably meant to resolve from there — confirm against the repo layout.
%cd ..

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns; sns.set_theme(style = "dark")
%matplotlib inline
import matplotlib.pyplot as plt

### Loading data sets

In [None]:
# Read the three CSV inputs used throughout the notebook, in this order:
# the IMDb dataset, the top 1000 episodes on IMDb, and the IMDb top 250 TV shows.
csv_paths = (
    "cumulative-data/tv_dataset.csv",
    "cumulative-data/top1000_episodes_imdb.csv",
    "cumulative-data/IMDb_top_250.csv",
)
imdb_data, top_episodes, top_250 = (pd.read_csv(path) for path in csv_paths)

In [None]:
imdb_data.head()

In [None]:
len(imdb_data)

In [None]:
imdb_data = imdb_data[imdb_data["rating"] > 0]

In [None]:
len(imdb_data)

In [None]:
top_episodes.head()

In [None]:
top_250.head()

### Top 10 shows by ratings and votes

In [None]:
# First ten rows of top_250 in file order (no sorting is applied here).
top_10_ratings = top_250[["title", "rating"]].head(10)

In [None]:
# Horizontal bar chart: one bar per title, bar length = rating.
fig, ax = plt.subplots(figsize=(20, 12))
sns.barplot(data=top_10_ratings, x="rating", y="title", color="blue", ax=ax)
ax.set_title("Top 10 rated TV shows on IMDb", fontsize=18)
ax.set_xlabel("Rating", fontsize=14)
ax.set_ylabel("", fontsize=14)
# Restrict the x-range to 8.8-9.6 so the bars are not drawn from zero.
ax.set_xlim(8.8, 9.6)
plt.show()

In [None]:
# Order the top-250 table by vote count, largest first, and renumber rows from 0.
votes_descending = top_250.sort_values(by="total_votes", ascending=False)
top_votes = votes_descending.reset_index(drop=True)
top_10_votes = top_votes[["title", "total_votes"]].head(10)

In [None]:
# Horizontal bar chart: one bar per title, bar length = total votes.
fig, ax = plt.subplots(figsize=(20, 12))
sns.barplot(data=top_10_votes, x="total_votes", y="title", color="blue", ax=ax)
ax.set_title("Top 10 TV shows with the most votes on IMDb", fontsize=18)
ax.set_xlabel("Total Votes", fontsize=14)
ax.set_ylabel("", fontsize=14)
plt.show()

### Top 10 shows by ratings and votes - Aggregation by episode

In [None]:
mean_ratings = imdb_data.groupby(["tv", "encoded_title"], as_index = False)["rating"].mean()

In [None]:
mean_ratings_sorted = mean_ratings.sort_values("rating", ascending = False).reset_index(drop =True)

In [None]:
top10_mean_ratings = mean_ratings_sorted.loc[:9, ["tv", "rating"]]

In [None]:
# Horizontal bar chart: one bar per show, bar length = mean rating.
fig, ax = plt.subplots(figsize=(20, 12))
sns.barplot(data=top10_mean_ratings, x="rating", y="tv", color="red", ax=ax)
ax.set_title("Top 10 rated TV shows on IMDb (average episode ratings)", fontsize=18)
ax.set_xlabel("Average Rating Per Episode", fontsize=14)
ax.set_ylabel("", fontsize=14)
# Restrict the x-range to 8-10 so the bars are not drawn from zero.
ax.set_xlim(8, 10)
plt.show()

In [None]:
# Mean vote count of the rows belonging to each (tv, encoded_title) pair,
# ordered from most to fewest votes and renumbered from 0.
show_groups = imdb_data.groupby(["tv", "encoded_title"], as_index=False)
mean_votes = show_groups["total_votes"].mean()
mean_votes_sorted = (
    mean_votes
    .sort_values(by="total_votes", ascending=False)
    .reset_index(drop=True)
)
top10_total_votes = mean_votes_sorted[["tv", "total_votes"]].head(10)

In [None]:
# Horizontal bar chart: one bar per show, bar length = mean votes.
fig, ax = plt.subplots(figsize=(20, 12))
sns.barplot(data=top10_total_votes, x="total_votes", y="tv", color="red", ax=ax)
ax.set_title("Top 10 TV shows with the most votes on IMDb per episode", fontsize=18)
ax.set_xlabel("Average Votes Per Episode", fontsize=14)
ax.set_ylabel("", fontsize=14)
plt.show()

### Top 10 seasons by ratings and votes

In [None]:
season_ratings = imdb_data.groupby(["tv", "encoded_title", "season"], as_index = False)["rating"].mean()

In [None]:
season_ratings["season"] = season_ratings["season"].astype(str)

In [None]:
season_ratings["tv_season"] = season_ratings["tv"] + " " + season_ratings["season"]

In [None]:
season_ratings = season_ratings.sort_values("rating", ascending = False).reset_index()
season_ratings_top10 = season_ratings.loc[:9, ["tv_season", "rating"]]

In [None]:
season_ratings_top10.to_csv("data-for-charts/season_ratings_top10.csv", index = False)

In [None]:
# Horizontal bar chart: one bar per show/season label, bar length = mean rating.
fig, ax = plt.subplots(figsize=(20, 12))
sns.barplot(data=season_ratings_top10, x="rating", y="tv_season", color="blue", ax=ax)
ax.set_title("Top 10 rated TV seasons on IMDb", fontsize=18)
ax.set_xlabel("Average Rating Per Episode", fontsize=14)
ax.set_ylabel("", fontsize=14)
# Restrict the x-range to 9-10 so the bars are not drawn from zero.
ax.set_xlim(9, 10)
plt.show()

In [None]:
season_votes = imdb_data.groupby(["tv", "encoded_title", "season"], as_index = False)["total_votes"].mean()

In [None]:
season_votes["season"] = season_votes["season"].astype(str)

In [None]:
season_votes["tv_season"] = season_votes["tv"] + " " + season_votes["season"]

In [None]:
season_votes = season_votes.sort_values("total_votes", ascending = False).reset_index()
season_votes_top10 = season_votes.loc[:9, ["tv_season", "total_votes"]]

In [None]:
# Horizontal bar chart: show/season labels on the y-axis, mean votes on the x-axis.
plt.figure(figsize=(20, 12))
sns.barplot(y="tv_season", x="total_votes", data=season_votes_top10, color="red")
plt.title("Top 10 TV seasons with the most votes on IMDb", fontsize=18)
# Label each axis with the variable it actually carries: the vote averages run
# along x, the show/season names along y.
plt.xlabel("Average Votes Per Episode", fontsize=14)
plt.ylabel("TV show", fontsize=14)
plt.show();

### Comparison between IMDb show ratings and average episode ratings

In [None]:
# Join each top-250 show with its mean episode rating. Both inputs have a
# "rating" column, so the merge suffixes them: rating_x comes from top_250
# (left) and rating_y from mean_ratings (right).
comparison_columns = ["title", "rating_x", "rating_y", "total_votes", "encoded_title"]
top_250_new = (
    top_250
    .merge(mean_ratings, on="encoded_title")
    .drop(columns="tv")
    .loc[:, comparison_columns]
)

In [None]:
# Scatter plot: rating from the top-250 table (x) against the mean rating
# computed from imdb_data (y), one point per merged show.
fig, ax = plt.subplots(figsize=(20, 12))
sns.scatterplot(data=top_250_new, x="rating_x", y="rating_y", color="blue", ax=ax)
ax.set_title("Average Ratings by Episodes vs IMDb Ratings", fontsize=22)
ax.set_xlabel("IMDb Ratings", fontsize=16)
ax.set_ylabel("Average Ratings by Episodes", fontsize=16)
plt.show()

### Which TV show produces the best episodes?

In [None]:
best_episodes = top_episodes.groupby(["tv_show", "encoded_title"], as_index = False)["rating"].count()

In [None]:
best_episodes = best_episodes.rename(columns={"rating":"rating_count"})

In [None]:
best_episodes = best_episodes.sort_values("rating_count", ascending = False).reset_index(drop = True)
best_episodes_top10 = best_episodes.loc[:9,:]

In [None]:
best_episodes_top10.to_csv("data-for-charts/best_episodes_top10.csv", index = False)

In [None]:
# Horizontal bar chart: one bar per show, bar length = number of its rows in top_episodes.
fig, ax = plt.subplots(figsize=(20, 12))
sns.barplot(data=best_episodes_top10, x="rating_count", y="tv_show", color="blue", ax=ax)
ax.set_title("Top 10 shows with the most episodes among the best episodes", fontsize=18)
ax.set_xlabel("Frequency", fontsize=14)
ax.set_ylabel("", fontsize=14)
plt.show()

In [None]:
imdb_episodes_count = imdb_data.groupby(["tv", "encoded_title"], as_index = False)["rating"].count()

In [None]:
imdb_episodes_count = imdb_episodes_count.rename(columns={"rating":"episode_count"})

In [None]:
best_episodes_prop = imdb_episodes_count.merge(best_episodes, on = "encoded_title")

In [None]:
best_episodes_prop

In [None]:
best_episodes_prop["rating_prop"] = best_episodes_prop["rating_count"]/best_episodes_prop["episode_count"]
best_episodes_prop = best_episodes_prop.sort_values("rating_prop", ascending = False).reset_index(drop = True)
best_episodes_prop = best_episodes_prop[best_episodes_prop["episode_count"] >= 30].reset_index(drop = True)
best_episodes_prop_top10 = best_episodes_prop.loc[:9,:]

In [None]:
best_episodes_prop_top10

In [None]:
best_episodes_prop_top10.to_csv("data-for-charts/best_episodes_prop_top10.csv", index = False)

In [None]:
# Horizontal bar chart: one bar per show, bar length = rating_prop.
fig, ax = plt.subplots(figsize=(20, 12))
sns.barplot(data=best_episodes_prop_top10, x="rating_prop", y="tv", color="red", ax=ax)
ax.set_title(
    "Top 10 shows with the highest proportions of their episodes among the best episode (episodes >= 30)",
    fontsize=18,
)
ax.set_xlabel("Proportion", fontsize=14)
ax.set_ylabel("", fontsize=14)
plt.show()

### The Big Shows

In [None]:
imdb_data_big4 = imdb_data[imdb_data["tv"].isin(["Game of Thrones", "Breaking Bad", "The Wire", "The Sopranos"])]

In [None]:
imdb_data_big4["tv"].unique()

In [None]:
imdb_data_big4

In [None]:
imdb_data_big4.to_csv("data-for-charts/imdb_data_big4.csv", index = False)

In [None]:
# Box plot of the rating distribution, one box per show.
fig, ax = plt.subplots(figsize=(18, 10))
sns.boxplot(data=imdb_data_big4, x="tv", y="rating", ax=ax)
ax.set_title("Distribution of ratings across the big shows", fontsize=18)
ax.set_xlabel("TV show", fontsize=14)
ax.set_ylabel("Rating", fontsize=14)
plt.show()

### Genre analysis of the top 1000 episodes

In [None]:
top_episodes_collect = top_episodes[["episode", "rating", "genre1", "genre2", "genre3"]]

In [None]:
top_episodes_renew = top_episodes_collect.melt(id_vars = ["episode", "rating"], value_vars = ["genre1", "genre2", "genre3"], var_name = "genres", value_name = "genre")

In [None]:
top_episodes_renew = top_episodes_renew.drop("genres", axis = 1)

In [None]:
top_episodes_renew = top_episodes_renew.dropna()

In [None]:
top_episodes_renew.info()

In [None]:
top_episodes_renew["genre"].value_counts()

In [None]:
top_episodes_votes = top_episodes[["tv_show", "episode", "rating", "year", "total_votes"]].sort_values("total_votes", ascending = False).reset_index(drop = True)

In [None]:
top_episodes_votes.loc[:9,]

In [None]:
imdb_data

In [None]:
imdb_data_episodes_votes = imdb_data[["tv", "season", "episode_number", "title", "episode_desc", "rating", "total_votes"]].sort_values("total_votes", ascending = False).reset_index(drop = True)

In [None]:
imdb_data_episodes_votes.loc[:9,:]