# __Merged movie dataset visualization__

##### Initial setup

In [None]:
%load_ext watermark
%load_ext autoreload

In [None]:
%autoreload 2
%watermark -v -n -m -p numpy,scipy,sklearn,pandas,seaborn,recmetrics

In [None]:
import logging

logging.basicConfig(level=logging.INFO)

In [None]:
import os
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import recmetrics
import seaborn as sns
from matplotlib.ticker import FuncFormatter

sys.path.append('../../../../')
from src.settings import DATA_DIR, PROJECT_DIR

# Absolute path of the parent of the current working directory
# (joining a single component returns it unchanged, so the join is dropped).
CURRENT_PATH = os.path.abspath(os.pardir)
print(CURRENT_PATH)
print(DATA_DIR)

Import functions

In [None]:
from src.data_processing.dataframe_utils import (start_pipeline, remove_nan, drop_unnecessary_cols, reset_index, rename_cols,
                                                     sort_values, unpivot_dataframe, expand_column)
from src.data_processing.visualization.plot_utils import plot_counts, change_bars_width, plot_bar, make_wordcloud

##### Load cleaned data

In [None]:
MERGED_DATA_PATH = os.path.join(DATA_DIR, 'processed', 'merged_ml25m_kaggle')
MERGED_DATA_PATH

In [None]:
# Read the four merged CSV files, in the same order as the names they are bound to.
movies, metadata, ratings, tags = (
    pd.read_csv(os.path.join(MERGED_DATA_PATH, csv_name))
    for csv_name in (
        'movies_merged.csv',
        'movies_metadata_merged.csv',
        'ratings_merged.csv',
        'tags_merged.csv',
    )
)

## __Data summary__

#### Movies

In [None]:
movies.head(2)

In [None]:
movies.info()
movies.describe()

#### Metadata

In [None]:
metadata.head(2)

In [None]:
metadata.info()
metadata.describe()

#### Ratings

In [None]:
ratings.head()

In [None]:
ratings.info()
ratings.describe()

#### Tags

In [None]:
tags.head()

In [None]:
tags.info()
tags.describe()

## __Visualizations__

In [None]:
# Output directory for every figure saved by this notebook; created if missing.
VISUALIZATION_DIR = os.path.join(PROJECT_DIR, 'reports/figures/sec2_data')
Path(VISUALIZATION_DIR).mkdir(parents=True, exist_ok=True)

# Shared seaborn plotting context for all charts below.
sns.set_context("notebook", font_scale=1.2, rc={"lines.linewidth": 2.5})

VISUALIZATION_DIR

#### __Movie count by years__

Movies with release year specified

In [None]:
# Keep only movies with a release date and take the first four characters of it as the year.
has_release_date = movies['release_date'].notna()
movies_with_year = movies[has_release_date].copy(deep=True)
movies_with_year['year'] = movies_with_year['release_date'].map(str).str[:4]

# Number of movies per release year.
year_summary = (
    movies_with_year.groupby(['year'])['movieId']
    .count()
    .reset_index(name='movie_amount')
)
year_summary.tail()

In [None]:
# Line chart of the number of movies per release year.
plt.figure(figsize=(14,8))
ax = movies_with_year.groupby('year')['movieId'].count().plot()
ax.set_title('Amount of movie releases by year')
# tight_layout only adjusts axes that already exist, so it runs after plotting
# (calling it on the still-empty figure had no effect).
plt.tight_layout()
ax

In [None]:
# Write the current chart to the figures directory.
year_plot_path = os.path.join(VISUALIZATION_DIR, 'movie_amount_by_year.png')
fig = ax.get_figure()
fig.savefig(year_plot_path, bbox_inches="tight")

#### __Rate visualization__

##### Rate amount by scale

In [None]:
# Rating counts per scale value, drawn by the project helper `plot_counts`.
ax = plot_counts(ratings, count_col='rating', title='Rate counts by scale', palette='coolwarm', annotate=False)
# Express the y axis in millions with a formatter instead of fixed label strings:
# fixed labels are only safe after the tick positions have been pinned, otherwise
# they can end up next to the wrong ticks when matplotlib recomputes them.
ax.yaxis.set_major_formatter(FuncFormatter(lambda value, _pos: f'{value / 1_000_000} mln'))
plt.show()

In [None]:
# Write the current chart to the figures directory.
rate_scale_plot_path = os.path.join(VISUALIZATION_DIR, 'rate_count_scales.png')
fig = ax.get_figure()
fig.savefig(rate_scale_plot_path, bbox_inches="tight")

##### Movie-ratings

In [None]:
# Attach every rating row to its movie row (default inner join on movieId).
movie_ratings = movies.merge(ratings, on='movieId')
movie_ratings.head(2)

In [None]:
movie_ratings.info()

##### Most frequently rated movies

In [None]:
# Ten titles with the most rating rows. The counts are computed once and reused for
# both axes instead of running value_counts() twice over the merged frame.
top_title_counts = movie_ratings['title'].value_counts()[:10]
ax = plot_bar(movie_ratings, x=top_title_counts, y=top_title_counts.index, figsize=(10,8),
              title='The most frequently rated films', palette='Purples_d')
# Express the x axis in thousands with a formatter instead of fixed label strings,
# so labels stay attached to the right ticks if matplotlib recomputes tick positions.
ax.xaxis.set_major_formatter(FuncFormatter(lambda value, _pos: f'{value / 1_000} k'))
plt.show()

In [None]:
# Write the current chart to the figures directory.
most_rated_plot_path = os.path.join(VISUALIZATION_DIR, 'most_freq_rated_films.png')
fig = ax.get_figure()
fig.savefig(most_rated_plot_path, bbox_inches="tight")

##### Most highly rated movies

Bayesian weighted estimate similar to IMDb top 250 list method. The Bayesian estimate BE is given as

$BE=\frac{nR+mC}{n+m}$,
where

R - the mean rating from everyone who has seen a particular movie, \
n - the number of times a particular movie has been rated, \
m - the minimum number of votes required to be in the top list, \
C - the mean rating of all films in the entire movie list.

A movie must have at least m ratings to be considered for that list. Let us assume $m = 1000$.

In [None]:
# C: mean of all ratings; m: vote threshold used as the prior weight.
total_avg_rating = ratings['rating'].mean()
m = 1000

# Group once per (movieId, title) and derive both the mean R and the count n from it.
rating_by_movie = movie_ratings.groupby(['movieId', 'title'])['rating']
top_list = pd.DataFrame({
    'mean_rating': rating_by_movie.mean(),
    'num_ratings': rating_by_movie.count(),
})

# BE = (n*R + m*C) / (n + m)
weighted_sum = top_list['num_ratings'] * top_list['mean_rating'] + m * total_avg_rating
top_list['BE'] = weighted_sum / (top_list['num_ratings'] + m)

top_list = top_list.sort_values(by='BE', ascending=False).reset_index()
top_list.head(10)

In [None]:
top_list.info()
top_list.describe()

In [None]:
# Bar chart of the ten movies with the highest Bayesian estimate.
best_ten = top_list.head(10)
ax = plot_bar(top_list, x=best_ten['BE'], y=best_ten['title'],
              title='The most highly rated films', figsize=(10,8), palette='Oranges_d')
# Show the full 0-5 rating scale on the x axis.
ax.set_xlim(0, 5)
ax

In [None]:
# Write the current chart to the figures directory.
best_rated_plot_path = os.path.join(VISUALIZATION_DIR, 'most_highlyBE_rated_films.png')
fig = ax.get_figure()
fig.savefig(best_rated_plot_path, bbox_inches="tight")

##### Rate amount distribution

In [None]:
# Long tail plot of ratings per movie, drawn by recmetrics on a new 12x6 figure.
fig = plt.figure(figsize=(12, 6))
long_tail_options = {
    'item_id_column': "movieId",
    'interaction_type': "movie ratings",
    'percentage': 0.6,
    'x_labels': False,
}
recmetrics.long_tail_plot(df=ratings, **long_tail_options)

In [None]:
# Write the long tail figure created in the previous cell to the figures directory.
long_tail_plot_path = os.path.join(VISUALIZATION_DIR, 'rating_long_tail_plot.png')
fig.savefig(long_tail_plot_path, bbox_inches="tight")

Prepare a table of movies that were rated at least once

In [None]:
# Number of ratings per movie, as a two-column frame (movieId, ratings_amount).
movie_fq_rate = (
    movie_ratings.groupby('movieId')['rating']
    .count()
    .reset_index(name='ratings_amount')
)
movie_fq_rate.head()

In [None]:
movie_fq_rate.info()
movie_fq_rate.describe()

In [None]:
# Density curve of how many ratings each movie received.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases (kdeplot is the
# suggested replacement for hist=False) — confirm the installed version before switching.
plt.figure(figsize=(12,6))
ax = sns.distplot(movie_fq_rate['ratings_amount'], hist=False)
# Express the x axis in thousands with a formatter instead of fixed label strings,
# so labels stay attached to the right ticks if matplotlib recomputes tick positions.
ax.xaxis.set_major_formatter(FuncFormatter(lambda value, _pos: '%.0f k' % (value / 1_000)))
ax.set_title('Distribution of rate amount per movie')
plt.show()

In [None]:
# Write the current chart to the figures directory.
rate_dist_plot_path = os.path.join(VISUALIZATION_DIR, 'rate_amount_distribution.png')
fig = ax.get_figure()
fig.savefig(rate_dist_plot_path, bbox_inches="tight")

##### Ratings per genre

In [None]:
# Build a long-format movie/genre table by applying the project dataframe helpers in
# sequence (each helper takes the frame as its first argument, exactly as `.pipe` passes it).
rating_cols = ('movieId', 'rate_amount', 'rate_average')

movies_genres = start_pipeline(movies[[*rating_cols, 'genres']])
movies_genres = expand_column(movies_genres, keep_cols=list(rating_cols), expand_col='genres')
movies_genres = unpivot_dataframe(movies_genres, keep_cols=list(rating_cols))
movies_genres = remove_nan(movies_genres, columns=['value'])
movies_genres = drop_unnecessary_cols(movies_genres, columns=['variable'])
movies_genres = rename_cols(movies_genres, colmap_dict={'value': 'genre'})
movies_genres = sort_values(movies_genres, sort_subset=['movieId'])
movies_genres = reset_index(movies_genres)
movies_genres.head()

In [None]:
movies_genres.info()

In [None]:
# Total rate amount per genre, largest first.
amount_by_genre = movies_genres.groupby(by=['genre'])['rate_amount'].sum()
genre_rates = amount_by_genre.sort_values(ascending=False).reset_index()
genre_rates.head()

In [None]:
# Bar chart of the total rate amount per genre.
ax = plot_bar(genre_rates, x='genre', y='rate_amount', title='Rate amounts by movies genre', rotate=True, figsize=(13,6), palette='twilight_d')
# Express the y axis in millions with a formatter instead of fixed label strings,
# so labels stay attached to the right ticks if matplotlib recomputes tick positions.
ax.yaxis.set_major_formatter(FuncFormatter(lambda value, _pos: f'{value / 1_000_000} mln'))
# Write each bar's value (in millions, two decimals) just above the bar.
for p in ax.patches:
    ax.annotate("{:.2f}".format(p.get_height() / 1_000_000),
                xy=(p.get_x() + p.get_width() / 2., p.get_height()),
                ha='center', va='center', xytext=(0, 5), textcoords='offset points')
plt.show()

In [None]:
# Write the current chart to the figures directory.
genre_amount_plot_path = os.path.join(VISUALIZATION_DIR, 'rate_amount_by_genre.png')
fig = ax.get_figure()
fig.savefig(genre_amount_plot_path, bbox_inches="tight")

In [None]:
# Mean of the per-movie rate averages within each genre, highest first.
average_by_genre = movies_genres.groupby(by=['genre'])['rate_average'].mean()
genre_rate_avg = average_by_genre.sort_values(ascending=False).reset_index()
genre_rate_avg.head()

In [None]:
# Bar chart of the average rate per genre, annotated with two-decimal values.
avg_plot_options = dict(
    title='Rate average by movies genre',
    rotate=True,
    annotate=True,
    ann_format="{:.2f}",
    palette='twilight_shifted_d',
    figsize=(13,6),
)
ax = plot_bar(genre_rate_avg, x='genre', y='rate_average', **avg_plot_options)
# Show the full 0-5 rating scale on the y axis.
ax.set_ylim(0, 5)

In [None]:
# Write the current chart to the figures directory.
genre_avg_plot_path = os.path.join(VISUALIZATION_DIR, 'rate_avg_by_genre.png')
fig = ax.get_figure()
fig.savefig(genre_avg_plot_path, bbox_inches="tight")

#### __Tags visualization__

In [None]:
# Tag-related columns of the movies table, keyed by movieId.
tag_columns = [
    'movieId',
    'unique_tag_list',
    'unique_tag_amount',
    'unique_tag_occurrences',
    'users_tags_list',
    'users_tags_amount',
]
movies_tags = movies[tag_columns]
movies_tags

##### User tags amount distribution per movie

In [None]:
# Density curve of the number of user tags per movie.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases (kdeplot is the
# suggested replacement for hist=False) — confirm the installed version before switching.
plt.figure(figsize=(16, 8))
ax = sns.distplot(movies_tags['users_tags_amount'], hist=False)
# Express the x axis in thousands with a formatter instead of fixed label strings,
# so labels stay attached to the right ticks if matplotlib recomputes tick positions.
ax.xaxis.set_major_formatter(FuncFormatter(lambda value, _pos: '%.0f k' % (value / 1_000)))
ax.set_title('Distribution of user tags amount per movie')
ax

In [None]:
# Write the current chart to the figures directory.
tags_dist_plot_path = os.path.join(VISUALIZATION_DIR, 'user_tags_amount_distribution.png')
fig = ax.get_figure()
fig.savefig(tags_dist_plot_path, bbox_inches="tight")

##### Frequent tags wordcloud

In [None]:
tags['tag'].unique().shape

In [None]:
# Number of tag rows per tag text, most frequent first, with the count column named 'count'.
tags_occur = (
    tags.groupby(by=['tag'])['movieId']
    .count()
    .sort_values(ascending=False)
    .reset_index(name='count')
)
tags_occur.head()

In [None]:
# Map each tag to its occurrence count (word weights for the word cloud).
tags_dict = {
    tag: occurrences
    for tag, occurrences in zip(tags_occur['tag'], tags_occur['count'])
}
len(tags_dict)

In [None]:
# Word cloud of the most frequent tags, built by the project helper with an image mask.
frequent_mask_path = os.path.join(VISUALIZATION_DIR, 'movie_mask_smaller.png')
wc = make_wordcloud(tags_dict, max_words=500, mask_img_path=frequent_mask_path)

# Render the cloud as an image without axes.
plt.figure(figsize=(16, 5))
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
# Write the word cloud image to the figures directory.
frequent_cloud_path = os.path.join(VISUALIZATION_DIR, "movie_tags_wordcloud.png")
wc.to_file(frequent_cloud_path)

##### Tags by rate

In [None]:
# Left-join each tag row with the rating the same user gave the same movie;
# both timestamp columns are dropped before joining.
tags_without_ts = tags.drop(columns=['timestamp'])
ratings_without_ts = ratings.drop(columns=['timestamp'])
tag_ratings = tags_without_ts.merge(ratings_without_ts, how='left', on=['userId', 'movieId'])

In [None]:
tag_ratings.head()

In [None]:
tag_ratings.info()

Tags without rate

In [None]:
# Tag rows for which the left join found no matching rating.
rating_missing = tag_ratings['rating'].isna()
tag_ratings_nan = tag_ratings[rating_missing]
print('Amount of tags without rate specified: {}'.format(tag_ratings_nan.shape[0]))
tag_ratings_nan.head()

In [None]:
tag_ratings_nan.info()

Tags with rates

In [None]:
# Tag rows that do have a rating from the same user for the same movie.
rating_present = tag_ratings['rating'].notna()
movie_tags_with_rate = tag_ratings[rating_present]
movie_tags_with_rate.head()

In [None]:
movie_tags_with_rate.info()

In [None]:
# Overall mean rating, used as the prior mean in the tag-level Bayesian estimate.
# NOTE(review): this averages every row of `ratings`, not only the ratings attached to
# tags (`movie_tags_with_rate`), despite the variable name — confirm which mean is intended.
all_tags_avg_rate = ratings['rating'].mean()
all_tags_avg_rate

In [None]:
# Prior weight (vote threshold) for the tag-level Bayesian estimate.
m = 500

# The rated-tag frame is named `movie_tags_with_rate`; `tags_with_rate` is not defined
# anywhere and raised NameError. Group once and reuse it for both aggregates.
rating_by_tag = movie_tags_with_rate.groupby('tag')['rating']

tag_rates_toplist = pd.DataFrame()
tag_rates_toplist['mean_rate'] = rating_by_tag.mean()
tag_rates_toplist['rate_count'] = rating_by_tag.count()
# BE = (n*R + m*C) / (n + m), with n = rate_count, R = mean_rate, C = all_tags_avg_rate.
tag_rates_toplist['BE'] = (tag_rates_toplist['rate_count']*tag_rates_toplist['mean_rate']+m*all_tags_avg_rate)/(tag_rates_toplist['rate_count']+m)
tag_rates_toplist = tag_rates_toplist.sort_values(by='BE', ascending=False).reset_index()

In [None]:
tag_rates_toplist.info()

In [None]:
tag_rates_toplist.head()

##### Positive tags wordcloud

In [None]:
# Map each tag to its Bayesian estimate (word weights for the positive word cloud).
positive_tags_dict = {
    tag: estimate
    for tag, estimate in zip(tag_rates_toplist['tag'], tag_rates_toplist['BE'])
}
len(positive_tags_dict)

In [None]:
# Word cloud of the best rated tags, built by the project helper with an image mask.
positive_mask_path = os.path.join(VISUALIZATION_DIR, 'happy_face_mask.png')
wc = make_wordcloud(positive_tags_dict, max_words=100, mask_img_path=positive_mask_path)

# Render the cloud as an image without axes.
plt.figure(figsize=(16, 5))
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
# Write the word cloud image to the figures directory.
positive_cloud_path = os.path.join(VISUALIZATION_DIR, "top_rated_tags_wordcloud.png")
wc.to_file(positive_cloud_path)

##### Top lowest rated tags

Tags with rate amount above $r=100$

In [None]:
# Tags with more than 100 ratings, on an independent copy.
rated_often = tag_rates_toplist['rate_count'] > 100
tags_above_100 = tag_rates_toplist.loc[rated_often].copy(deep=True)
# Replace mean_rate with (5 - mean_rate) so the lowest rated tags get the largest value.
tags_above_100['mean_rate'] = tags_above_100['mean_rate'].rsub(5)
tags_above_100 = tags_above_100.sort_values(by=['mean_rate'], ascending=False)

In [None]:
# Structure and summary statistics of the filtered tag table.
# `tags_above_500` is not defined anywhere and raised NameError; the frame is `tags_above_100`.
tags_above_100.info()
tags_above_100.describe()

In [None]:
tags_above_100.head(10)

In [None]:
# Map each tag to its inverted mean rate (word weights for the negative word cloud).
negative_tags_dict = {
    tag: inverted_rate
    for tag, inverted_rate in zip(tags_above_100['tag'], tags_above_100['mean_rate'])
}
len(negative_tags_dict)

In [None]:
# Word cloud of the lowest rated tags, built by the project helper with an image mask.
negative_mask_path = os.path.join(VISUALIZATION_DIR, 'sad_face_mask.png')
wc = make_wordcloud(negative_tags_dict, max_words=100, mask_img_path=negative_mask_path)

# Render the cloud as an image without axes.
plt.figure(figsize=(16, 5))
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
# Write the word cloud image to the figures directory.
negative_cloud_path = os.path.join(VISUALIZATION_DIR, "lowest_rated_tags_wordcloud.png")
wc.to_file(negative_cloud_path)

#### __Movies amount per original language__

In [None]:
# Movie counts per original language, keeping languages above the threshold.
# NOTE(review): the filter is strict (> min_movies) while the title says "min" — confirm.
min_movies = 30

language_counts = metadata['original_language'].value_counts()
lang = language_counts.where(language_counts > min_movies).dropna().to_dict()
lang_keys = list(lang)
lang_values = list(lang.values())

plt.figure(figsize=(16, 7))
ax = sns.barplot(x=lang_keys, y=lang_values, palette='viridis')

# Write each bar's height just above the bar.
for bar in ax.patches:
    bar_top = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    ax.annotate('{:.0f}'.format(bar_top),
                xy=(bar_center, bar_top),
                ha='center', va='center', xytext=(0, 6), textcoords='offset points')

ax.set_title(f'Amount of movies per original language (min {min_movies})')
# Log scale on the y axis.
ax.set_yscale('log')
ax.set_ylabel('movies amount')
ax.set_xlabel('original language')

In [None]:
# Write the current chart to the figures directory.
language_plot_path = os.path.join(VISUALIZATION_DIR, 'movie_amount_by_origin_lang.png')
fig = ax.get_figure()
fig.savefig(language_plot_path, bbox_inches="tight")

#### __Movie runtime distribution__

In [None]:
# Histogram of movie runtimes below 240; longer and missing runtimes are dropped.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases — confirm the
# installed version before switching to histplot.
runtime = metadata['runtime']
runtime_below_limit = runtime.where(runtime < 240).dropna()

plt.figure(figsize=(14, 7))
ax = sns.distplot(runtime_below_limit, kde=False)
ax.set_title('Distribution of movies runtime (max 240 min.)')
ax.set_ylabel('movies amount')
ax.set_xlabel('runtime')

In [None]:
# Write the current chart to the figures directory.
runtime_plot_path = os.path.join(VISUALIZATION_DIR, 'movie_runtime_distribution.png')
fig = ax.get_figure()
fig.savefig(runtime_plot_path, bbox_inches="tight")