# Data Exploration

In [None]:
# Step up one directory so the relative "cumulative-data/..." paths read below
# resolve (presumably to the project root - confirm where the notebook lives).
# NOTE(review): re-running this cell moves up one more level each time.
%cd ..

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns; sns.set_theme(style = "dark")
%matplotlib inline
import matplotlib.pyplot as plt

### Loading data sets

In [None]:
# Read the three CSV inputs in one pass: the per-episode table, the
# top-episodes table and the top-250 shows table, in that order.
imdb_data, top_episodes, top_250 = map(
    pd.read_csv,
    (
        "cumulative-data/tv_dataset.csv",
        "cumulative-data/top1000_episodes_imdb.csv",
        "cumulative-data/IMDb_top_250.csv",
    ),
)

In [None]:
# Peek at the first five rows of the per-episode table.
imdb_data.head(n=5)

In [None]:
# Peek at the first five rows of the top-episodes table.
top_episodes.head(n=5)

In [None]:
# Peek at the first five rows of the top-250 shows table.
top_250.head(n=5)

### Top 20 shows by ratings and votes

In [None]:
# Keep the first 20 shows in file order (assumes the CSV is already ranked by
# rating - TODO confirm). `.head(20)` is positional and yields exactly 20 rows;
# the previous `.loc[:20]` label slice included label 20 and returned 21.
top_20_ratings = top_250[["title", "rating"]].head(20)

In [None]:
# Horizontal bar chart of the selected top-rated shows.
plt.figure(figsize=(20, 12))
sns.barplot(y="title", x="rating", data=top_20_ratings, color="blue")
plt.title("Top 20 rated TV shows on IMDb", fontsize=22)
# Bars are horizontal: the rating runs along x and the show titles along y.
plt.xlabel("Rating", fontsize=16)
plt.ylabel("TV show", fontsize=16)
# Show only the 8.8-9.6 window of the rating axis (the axis does not start at 0).
plt.xlim(8.8, 9.6)
plt.show()

In [None]:
# Rank the top-250 shows by vote count, most-voted first, with a fresh 0..n-1 index.
top_votes = top_250.sort_values("total_votes", ascending=False).reset_index(drop=True)
# `.head(20)` takes exactly 20 rows; the previous `.loc[:20]` label slice is
# end-inclusive and returned 21.
top_20_votes = top_votes[["title", "total_votes"]].head(20)

In [None]:
# Horizontal bar chart of the most-voted shows.
plt.figure(figsize=(20, 12))
sns.barplot(y="title", x="total_votes", data=top_20_votes, color="red")
plt.title("Top 20 TV shows with the most votes on IMDb", fontsize=22)
# Bars are horizontal: vote totals run along x and the show titles along y.
plt.xlabel("Total Votes", fontsize=16)
plt.ylabel("TV show", fontsize=16)
plt.show()

### Top 20 shows by ratings and votes - Aggregation by episode

In [None]:
# Mean of the "rating" column per (tv, encoded_title) pair; the grouping keys
# are kept as ordinary columns rather than becoming the index.
mean_ratings = (
    imdb_data
    .groupby(["tv", "encoded_title"], as_index=False)["rating"]
    .mean()
)

In [None]:
# Order shows from highest to lowest mean rating and renumber the rows from 0.
mean_ratings_sorted = (
    mean_ratings
    .sort_values(by="rating", ascending=False)
    .reset_index(drop=True)
)

In [None]:
# First 20 rows of the sorted table. `.head(20)` is positional and returns
# exactly 20 rows; the previous `.loc[:20]` label slice is end-inclusive and
# returned 21.
top20_mean_ratings = mean_ratings_sorted[["tv", "rating"]].head(20)

In [None]:
# Horizontal bar chart of the shows with the highest mean episode rating.
plt.figure(figsize=(20, 12))
sns.barplot(y="tv", x="rating", data=top20_mean_ratings, color="blue")
plt.title("Top 20 rated TV shows on IMDb (average episode ratings)", fontsize=22)
# Bars are horizontal: the mean rating runs along x and the show names along y.
plt.xlabel("Average Rating Per Episode", fontsize=16)
plt.ylabel("TV show", fontsize=16)
# Show only the 8-10 window of the rating axis (the axis does not start at 0).
plt.xlim(8, 10)
plt.show()

In [None]:
# Mean of "total_votes" per (tv, encoded_title) pair, keys kept as columns.
mean_votes = (
    imdb_data
    .groupby(["tv", "encoded_title"], as_index=False)["total_votes"]
    .mean()
)
# Most-voted first, rows renumbered from 0.
mean_votes_sorted = mean_votes.sort_values("total_votes", ascending=False).reset_index(drop=True)
# `.head(20)` yields exactly 20 rows; the previous `.loc[:20]` label slice is
# end-inclusive and returned 21.
top20_total_votes = mean_votes_sorted[["tv", "total_votes"]].head(20)

In [None]:
# Horizontal bar chart of the shows with the highest mean votes per episode.
plt.figure(figsize=(20, 12))
sns.barplot(y="tv", x="total_votes", data=top20_total_votes, color="red")
plt.title("Top 20 TV shows with the most votes on IMDb per episode", fontsize=22)
# Bars are horizontal: mean votes run along x and the show names along y.
plt.xlabel("Average Votes Per Episode", fontsize=16)
plt.ylabel("TV show", fontsize=16)
plt.show()

### Top 20 seasons by ratings and votes

In [None]:
# Mean of "rating" per (tv, encoded_title, season) triple, keys kept as columns.
season_ratings = (
    imdb_data
    .groupby(["tv", "encoded_title", "season"], as_index=False)["rating"]
    .mean()
)

In [None]:
# Cast the season column to text so it can be concatenated with the show name.
season_ratings = season_ratings.assign(season=season_ratings["season"].astype(str))

In [None]:
# Build a "<show>,<season>" label per row; "season" must already be text here.
season_ratings["tv_season"] = season_ratings["tv"].str.cat(season_ratings["season"], sep=",")

In [None]:
# Best-rated seasons first. `drop=True` discards the old row labels instead of
# keeping them as a stray "index" column, matching the other ranking cells.
season_ratings = season_ratings.sort_values("rating", ascending=False).reset_index(drop=True)
# `.head(20)` yields exactly 20 rows; the previous `.loc[:20]` label slice is
# end-inclusive and returned 21.
season_ratings_top20 = season_ratings[["tv_season", "rating"]].head(20)

In [None]:
# Horizontal bar chart of the seasons with the highest mean episode rating.
plt.figure(figsize=(20, 12))
sns.barplot(y="tv_season", x="rating", data=season_ratings_top20, color="blue")
plt.title("Top 20 rated TV seasons on IMDb", fontsize=22)
# Bars are horizontal: the mean rating runs along x, "<show>,<season>" labels along y.
plt.xlabel("Average Rating Per Episode", fontsize=16)
plt.ylabel("TV show, season", fontsize=16)
# Show only the 9-10 window of the rating axis (the axis does not start at 0).
plt.xlim(9, 10)
plt.show()

In [None]:
# Mean of "total_votes" per (tv, encoded_title, season) triple, keys kept as columns.
season_votes = (
    imdb_data
    .groupby(["tv", "encoded_title", "season"], as_index=False)["total_votes"]
    .mean()
)

In [None]:
# Cast the season column to text so it can be concatenated with the show name.
season_votes = season_votes.assign(season=season_votes["season"].astype(str))

In [None]:
# Build a "<show>,<season>" label per row; "season" must already be text here.
season_votes["tv_season"] = season_votes["tv"].str.cat(season_votes["season"], sep=",")

In [None]:
# Most-voted seasons first. `drop=True` discards the old row labels instead of
# keeping them as a stray "index" column, matching the other ranking cells.
season_votes = season_votes.sort_values("total_votes", ascending=False).reset_index(drop=True)
# `.head(20)` yields exactly 20 rows; the previous `.loc[:20]` label slice is
# end-inclusive and returned 21.
season_votes_top20 = season_votes[["tv_season", "total_votes"]].head(20)

In [None]:
# Horizontal bar chart of the seasons with the highest mean votes per episode.
plt.figure(figsize=(20, 12))
sns.barplot(y="tv_season", x="total_votes", data=season_votes_top20, color="red")
plt.title("Top 20 TV seasons with the most votes on IMDb", fontsize=22)
# Bars are horizontal: mean votes run along x, "<show>,<season>" labels along y.
plt.xlabel("Average Votes Per Episode", fontsize=16)
plt.ylabel("TV show, season", fontsize=16)
plt.show()

### Comparison between IMDb ratings and IMDb ratings by episode

In [None]:
# Inner-join the top-250 table with the per-show mean episode ratings on
# "encoded_title". Both sides carry a "rating" column, so the result has
# "rating_x" (from top_250) and "rating_y" (from mean_ratings).
top_250_new = top_250.merge(
    mean_ratings,
    how="inner",
    on="encoded_title",
)

In [None]:
# Remove the "tv" column brought in by the merge.
top_250_new = top_250_new.drop(columns="tv")

In [None]:
# Keep only the columns needed for the comparison, in this order.
top_250_new = top_250_new.loc[
    :,
    ["title", "rating_x", "rating_y", "total_votes", "encoded_title"],
]

In [None]:
# Write the comparison table to "df.csv" in the current working directory.
top_250_new.to_csv(path_or_buf="df.csv")

In [None]:
# Scatter of the top-250 rating (rating_x) against the mean episode rating
# (rating_y), one point per merged row.
plt.figure(figsize=(20, 12))
sns.scatterplot(
    data=top_250_new,
    x="rating_x",
    y="rating_y",
    color="blue",
)
plt.title("Average Ratings by Episodes vs IMDb Ratings", fontsize=22)
plt.xlabel("IMDb Ratings", fontsize=16)
plt.ylabel("Average Ratings by Episodes", fontsize=16)
plt.show()

### Which TV show produces the best episodes?

In [None]:
# Count the non-null "rating" entries per (tv_show, encoded_title) pair in the
# top-episodes table; the grouping keys stay as ordinary columns.
best_episodes = (
    top_episodes
    .groupby(["tv_show", "encoded_title"], as_index=False)["rating"]
    .count()
)

In [None]:
# The "rating" column now holds a count, so name it accordingly.
best_episodes = best_episodes.rename({"rating": "rating_count"}, axis="columns")

In [None]:
# Shows with the most entries in the top-episodes table first. The index is
# renumbered so row labels match the sorted order.
best_episodes = best_episodes.sort_values("rating_count", ascending=False).reset_index(drop=True)
# Take exactly the first 10 rows by position. The previous `.loc[:10, :]` ran on
# the shuffled post-sort labels, so it cut at wherever label 10 happened to land
# (an arbitrary number of rows, or a KeyError if that label was absent).
best_episodes_top10 = best_episodes.head(10)

In [None]:
# Display the selection that the following bar chart plots.
best_episodes_top10

In [None]:
# Horizontal bar chart: how many top-episode entries each show has.
plt.figure(figsize=(20, 12))
sns.barplot(y="tv_show", x="rating_count", data=best_episodes_top10, color="blue")
plt.title("Top 10 shows with the most episodes among the best episodes", fontsize=22)
# Bars are horizontal: the counts run along x and the show names along y.
plt.xlabel("Frequency", fontsize=16)
plt.ylabel("TV show", fontsize=16)
plt.show()

In [None]:
# Count the non-null "rating" entries per (tv, encoded_title) pair in the
# per-episode table; the grouping keys stay as ordinary columns.
imdb_episodes_count = (
    imdb_data
    .groupby(["tv", "encoded_title"], as_index=False)["rating"]
    .count()
)

In [None]:
# The "rating" column now holds a count of rated episodes, so name it accordingly.
imdb_episodes_count = imdb_episodes_count.rename({"rating": "episode_count"}, axis="columns")

In [None]:
# Inner-join the per-show episode counts with the per-show top-episode counts
# on "encoded_title"; shows present in only one table are dropped.
best_episodes_prop = imdb_episodes_count.merge(
    best_episodes,
    how="inner",
    on="encoded_title",
)

In [None]:
# Display the merged table (one row per show, with episode_count and rating_count).
best_episodes_prop

In [None]:
# Share of each show's counted episodes that also appear in the top-episodes table.
best_episodes_prop["rating_prop"] = (
    best_episodes_prop["rating_count"] / best_episodes_prop["episode_count"]
)
# Highest proportion first, rows renumbered from 0.
best_episodes_prop = best_episodes_prop.sort_values("rating_prop", ascending=False).reset_index(drop=True)
# `.head(10)` yields exactly 10 rows; the previous `.loc[:10, :]` label slice is
# end-inclusive and returned 11.
best_episodes_prop_top10 = best_episodes_prop.head(10)

In [None]:
# Display the selection that the following bar chart plots.
best_episodes_prop_top10

In [None]:
# Horizontal bar chart: proportion of each show's episodes found among the top episodes.
plt.figure(figsize=(20, 12))
sns.barplot(y="tv", x="rating_prop", data=best_episodes_prop_top10, color="blue")
plt.title("Top 10 shows with the highest proportions of their episodes among the best episodes", fontsize=22)
# Bars are horizontal: the proportion runs along x and the show names along y.
plt.xlabel("Proportion", fontsize=16)
plt.ylabel("TV show", fontsize=16)
plt.show()