# __Merged movielens and kaggle data analysis and cleaning__

##### Initial setup

In [None]:
%load_ext watermark
%load_ext autoreload

In [None]:
%autoreload 2
%watermark -v -n -m -p numpy,pandas

In [None]:
import logging

logging.basicConfig(level=logging.INFO)

In [None]:
import pandas as pd
import numpy as np
import sys
import os
sys.path.append('../../../../')
from src.settings import DATA_DIR

# Absolute path of the directory one level above the current working directory.
CURRENT_PATH = os.path.abspath(os.pardir)
print(CURRENT_PATH)
print(DATA_DIR)

Import functions

In [None]:
from src.data_processing.dataframe_utils import (drop_unnecessary_cols, rename_cols, show_uniq_vals,
                                                reset_index, sort_values, start_pipeline, 
                                                string_to_lowercase, prepare_summary_table,
                                                merge_tables, rearrange_cols, remove_nan,
                                                filter_greater_than_numeric, drop_unnecessary_cols,
                                                prepare_unique_val_count_table, filter_in_list, filter_greater_than_numeric)

from src.data_processing.preprocess.datasets_merge_utils import (prepare_movie_tags_table, prepare_movie_tag_occurrences_table)

##### Load cleaned datasets

Movielens data

In [None]:
# All cleaned Movielens tables are read from the same directory.
ml25m_dir = os.path.join(DATA_DIR, 'processed', 'ml25m_cleaned')

movies = pd.read_csv(os.path.join(ml25m_dir, 'movies.csv'))
movies_genres = pd.read_csv(os.path.join(ml25m_dir, 'movies_genres.csv'))
ratings = pd.read_csv(os.path.join(ml25m_dir, 'ratings.csv'))
links = pd.read_csv(os.path.join(ml25m_dir, 'links.csv'))
tags = pd.read_csv(os.path.join(ml25m_dir, 'tags.csv'))
tags_genome = pd.read_csv(os.path.join(ml25m_dir, 'tags_genome.csv'))
tags_genome_scores = pd.read_csv(os.path.join(ml25m_dir, 'tags_genome_scores.csv'))

Kaggle data

In [None]:
# All cleaned Kaggle tables are read from the same directory.
kaggle_dir = os.path.join(DATA_DIR, 'processed', 'kaggle_movies_cleaned')

movies_kaggle = pd.read_csv(os.path.join(kaggle_dir, 'movies_kaggle.csv'))
movies_metadata = pd.read_csv(os.path.join(kaggle_dir, 'movies_metadata_kaggle.csv'))
credits = pd.read_csv(os.path.join(kaggle_dir, 'credits_kaggle.csv'))
links_kaggle = pd.read_csv(os.path.join(kaggle_dir, 'links_kaggle.csv'))
movie_keywords = pd.read_csv(os.path.join(kaggle_dir, 'movie_keywords_kaggle.csv'))
keywords_genome = pd.read_csv(os.path.join(kaggle_dir, 'keywords_genome_kaggle.csv'))

### __Movielens and kaggle comparison__

#### __Movies__

Movielens movie table

In [None]:
movies.head()

In [None]:
movies.info()

In [None]:
# Keep only the Movielens movies whose movieId is in `selected_movie_ids`.
# NOTE(review): `selected_movie_ids` is first assigned much further down in
# this notebook (in the section that collects the selected movie, imdb and
# user ids), so this cell raises NameError on a clean top-to-bottom run.
# Execute it after that section, or move the cell there.
movies_filtered = movies[movies.movieId.isin(selected_movie_ids)]

In [None]:
movies_filtered.info()

Kaggle movie tables

In [None]:
movies_kaggle.head(3)

In [None]:
movies_kaggle.info()

In [None]:
movies_metadata.head(3)

In [None]:
movies_metadata.info()

#### __Ratings__

Movielens ratings

In [None]:
ratings.head()

In [None]:
ratings.info()

#### __Links__

Movielens link table

In [None]:
links.head()

In [None]:
links.info()

In [None]:
links_kaggle.head()

Kaggle link table

In [None]:
links_kaggle.info()

#### __Tags__

Movielens tag tables

In [None]:
tags.head()

In [None]:
tags.info()

In [None]:
tags.sort_values(by=['movieId']).head(10)

In [None]:
tags.sort_values(by=['movieId']).head(10)

In [None]:
tags.info()

Unique tags

In [None]:
tags.tag.unique().shape

Number of movies with tags

In [None]:
tags.movieId.unique().shape

Number of users active in tagging

In [None]:
tags.userId.unique().shape

In [None]:
tags_genome.head()

In [None]:
tags_genome.info()

In [None]:
tags_genome_scores.head()

In [None]:
tags_genome_scores.info()

Number of movies having a tag genome score

In [None]:
tags_genome_scores.movieId.unique().shape

#### __Plot keywords__

Kaggle keywords tables

In [None]:
movie_keywords.head()

In [None]:
movie_keywords.info()

In [None]:
# Show rows that have no keywords. `movie_keywords` is loaded with
# `pd.read_csv`, which by default parses empty fields as NaN, so comparing
# against "" alone would not find them; check for missing values as well as
# literal empty strings.
movie_keywords[movie_keywords.keywords.isna() | (movie_keywords.keywords == "")]

In [None]:
keywords_genome.head()

In [None]:
keywords_genome.info()

#### __Credits__

Kaggle credits table

In [None]:
credits.head()

In [None]:
credits.info()

### __Merging datasets__

Collaborative filtering models require user-rating data, so only movies with more than 10 ratings are considered.

In [None]:
# Per-movie summary of the `rating` column, grouped by movieId.
# The output column names suggest a rating count and a rating mean --
# TODO confirm against `prepare_summary_table`.
movie_rating_summary = start_pipeline(ratings)
movie_rating_summary = prepare_summary_table(
    movie_rating_summary,
    group_cols=['movieId'],
    aggr_col='rating',
    col_1='rate_amount',
    col_2='rate_average',
)
movie_rating_summary = reset_index(movie_rating_summary)

In [None]:
movie_rating_summary.head()

In [None]:
movie_rating_summary.info()
movie_rating_summary.describe()

##### __Filter movieIds to those having rate amount above 10 and merge with imdbId links value__

In [None]:
# Filter the per-movie summary on `rate_amount` with threshold 10, then
# attach the Movielens link columns by joining on movieId.
movie_rating_above_10 = start_pipeline(movie_rating_summary)
movie_rating_above_10 = filter_greater_than_numeric(
    movie_rating_above_10,
    numeric_col='rate_amount',
    filter_val=10,
)
movie_rating_above_10 = merge_tables(
    movie_rating_above_10,
    links,
    left_on='movieId',
    right_on='movieId',
)

In [None]:
movie_rating_above_10.head()

In [None]:
movie_rating_above_10.info()
movie_rating_above_10.describe()

Number of Movielens movies with rate amount above 10

In [None]:
movie_rating_above_10.movieId.unique().shape

In [None]:
movie_rating_above_10.imdbId.unique().shape

##### __Merge movies with rate amount above 10 with movies from kaggle__

In [None]:
# Join the filtered Movielens movies with the Kaggle movie table on the IMDb
# id (`imdbId` on the left, `imdb_id` on the right), then drop the redundant
# join key and the `writers` / `tagline` columns.
movies_merged = start_pipeline(movie_rating_above_10)
movies_merged = merge_tables(
    movies_merged,
    movies_kaggle,
    left_on='imdbId',
    right_on='imdb_id',
)
movies_merged = drop_unnecessary_cols(
    movies_merged,
    columns=['imdb_id', 'writers', 'tagline'],
)
movies_merged = reset_index(movies_merged)

In [None]:
movies_merged.head(3)

In [None]:
movies_merged.info()

##### __Movie tags summary table__

In [None]:
# Count table over `tag` grouped by movieId; the count series is named
# `tag_count`.
movie_tags_count = start_pipeline(tags)
movie_tags_count = prepare_unique_val_count_table(
    movie_tags_count,
    group_col='movieId',
    aggr_col='tag',
    series_name='tag_count',
)

In [None]:
movie_tags_count.head()

In [None]:
movie_tags_count.info()

In [None]:
# Per-movie tag occurrences built from the `tag` / `tag_count` columns and
# stored under `unique_tag_occurrences`.
# Presumably a tag -> count mapping per movie -- TODO confirm against
# `prepare_movie_tag_occurrences_table`.
movie_tag_occurs = start_pipeline(movie_tags_count)
movie_tag_occurs = prepare_movie_tag_occurrences_table(
    movie_tag_occurs,
    group_col='movieId',
    dict_cols=['tag', 'tag_count'],
    reset_idx_name='unique_tag_occurrences',
)
movie_tag_occurs = reset_index(movie_tag_occurs)

In [None]:
movie_tag_occurs.head()

In [None]:
# Per-movie summary of unique tags (`unique_tag_amount`, `unique_tag_list`),
# joined with the tag-occurrence table on movieId.
movie_unique_tags_summary = start_pipeline(movie_tags_count)
movie_unique_tags_summary = drop_unnecessary_cols(
    movie_unique_tags_summary,
    columns=['tag_count'],
)
movie_unique_tags_summary = prepare_movie_tags_table(
    movie_unique_tags_summary,
    group_cols=['movieId'],
    aggr_col='tag',
    count_col='unique_tag_amount',
    list_col='unique_tag_list',
)
movie_unique_tags_summary = merge_tables(
    movie_unique_tags_summary,
    movie_tag_occurs,
    left_on='movieId',
    right_on='movieId',
)
movie_unique_tags_summary = reset_index(movie_unique_tags_summary)

In [None]:
movie_unique_tags_summary.head()

In [None]:
# Per-movie summary of the tags as given by users (`users_tags_amount`,
# `users_tags_list`), built from the raw tags table without its timestamp.
movie_users_tags_summary = start_pipeline(tags)
movie_users_tags_summary = drop_unnecessary_cols(
    movie_users_tags_summary,
    columns=['timestamp'],
)
movie_users_tags_summary = prepare_movie_tags_table(
    movie_users_tags_summary,
    group_cols=['movieId'],
    aggr_col='tag',
    count_col='users_tags_amount',
    list_col='users_tags_list',
)
movie_users_tags_summary = reset_index(movie_users_tags_summary)

In [None]:
movie_users_tags_summary.head()

__Movie tag info summary table__

In [None]:
# Combine the unique-tag summary and the user-tag summary on movieId.
movie_tags_summary = start_pipeline(movie_unique_tags_summary)
movie_tags_summary = merge_tables(
    movie_tags_summary,
    movie_users_tags_summary,
    left_on='movieId',
    right_on='movieId',
)
movie_tags_summary = reset_index(movie_tags_summary)

In [None]:
movie_tags_summary.head()

In [None]:
movie_tags_summary.info()
movie_tags_summary.describe()

##### __Merge movies with tags summary__

In [None]:
# Attach the tag summary to the merged movie table on movieId and apply
# `remove_nan` to the `storyline` and `title` columns.
movies_merged_tags = start_pipeline(movies_merged)
movies_merged_tags = merge_tables(
    movies_merged_tags,
    movie_tags_summary,
    left_on='movieId',
    right_on='movieId',
)
movies_merged_tags = remove_nan(
    movies_merged_tags,
    columns=['storyline', 'title'],
)
movies_merged_tags = reset_index(movies_merged_tags)

In [None]:
movies_merged_tags.head(2)

In [None]:
movies_merged_tags.info()

In [None]:
# Report unique values for every column except `unique_tag_occurrences`.
show_uniq_vals(movies_merged_tags.drop(columns=['unique_tag_occurrences']))

Some movies have different release dates but the same title

In [None]:
# Mark every row whose title occurs more than once (all occurrences kept).
title_is_repeated = movies_merged_tags.duplicated(subset=['title'], keep=False)

# Rows sharing a title, sorted by title; the previous index is kept as a column.
duplicated_title = (movies_merged_tags[title_is_repeated]
                    .sort_values(by=['title'])
                    .reset_index())
duplicated_title.head(4)

In [None]:
duplicated_title.info()

##### __Filter users with rate amount above 150__

In [None]:
# Per-user summary of the `rating` column, then filtered on `rate_amount`
# with threshold 150.
user_rating_summary = start_pipeline(ratings)
user_rating_summary = prepare_summary_table(
    user_rating_summary,
    group_cols=['userId'],
    aggr_col='rating',
    col_1='rate_amount',
    col_2='rate_average',
)
user_rating_summary = filter_greater_than_numeric(
    user_rating_summary,
    numeric_col='rate_amount',
    filter_val=150,
)
user_rating_summary = reset_index(user_rating_summary)

In [None]:
user_rating_summary.head()

In [None]:
user_rating_summary.info()
user_rating_summary.describe()

##### __Get selected movie ids, imdb ids and user ids to filter tables__

In [None]:
# Array of user ids that passed the rating-amount filter.
selected_user_ids = user_rating_summary['userId'].values

In [None]:
# Array of Movielens movie ids present in the merged movie/tag table.
selected_movie_ids = movies_merged_tags['movieId'].values

In [None]:
selected_movie_ids

In [None]:
# Array of IMDb ids present in the merged movie/tag table.
selected_imdb_ids = movies_merged_tags['imdbId'].values

In [None]:
selected_imdb_ids

##### __Filter ratings table by selected user ids and movie ids__

In [None]:
# Restrict ratings to the selected users first, then to the selected movies.
filtered_ratings = start_pipeline(ratings)
filtered_ratings = filter_in_list(
    filtered_ratings,
    col='userId',
    filter_list=selected_user_ids,
)
filtered_ratings = filter_in_list(
    filtered_ratings,
    col='movieId',
    filter_list=selected_movie_ids,
)
filtered_ratings = reset_index(filtered_ratings)

In [None]:
filtered_ratings.head()

In [None]:
filtered_ratings.info()

Number of movies having ratings after filtering

In [None]:
filtered_ratings.movieId.unique().shape

In [None]:
filtered_ratings.userId.unique().shape

In [None]:
show_uniq_vals(filtered_ratings)

##### __Filter links table by selected movie ids__

In [None]:
# Restrict the Movielens links table to the selected movies.
filtered_links = start_pipeline(links)
filtered_links = filter_in_list(
    filtered_links,
    col='movieId',
    filter_list=selected_movie_ids,
)
filtered_links = reset_index(filtered_links)

In [None]:
filtered_links.head()

In [None]:
filtered_links.info()

##### __Filter tags table by selected movie ids__

In [None]:
# Restrict the tags table to the selected movies.
filtered_tags = start_pipeline(tags)
filtered_tags = filter_in_list(
    filtered_tags,
    col='movieId',
    filter_list=selected_movie_ids,
)
filtered_tags = reset_index(filtered_tags)

In [None]:
filtered_tags.head()

In [None]:
filtered_tags.info()

In [None]:
filtered_tags.userId.unique().shape

In [None]:
show_uniq_vals(filtered_tags)

In [None]:
# Per-user summary of the `tag` column, stored under `tag_amount`.
# NOTE(review): this is built from the unfiltered `tags` table, not from
# `filtered_tags` -- confirm that is intended.
user_tags_summary = start_pipeline(tags)
user_tags_summary = prepare_summary_table(
    user_tags_summary,
    group_cols=['userId'],
    aggr_col='tag',
    col_1='tag_amount',
)
user_tags_summary = reset_index(user_tags_summary)

In [None]:
user_tags_summary.head()

In [None]:
user_tags_summary.info()
user_tags_summary.describe()

Check whether the users who gave tags are a subset of the users who gave ratings

In [None]:
# Array of ids of the users who gave at least one tag.
users_tags = user_tags_summary['userId'].values

In [None]:
# Distinct ids of the users present in the filtered ratings table.
# `filtered_ratings` is the ratings frame restricted to the selected users
# and movies; no `filtered_ratings_2` is defined anywhere in this notebook,
# so referencing it would raise NameError.
user_ratings = filtered_ratings.userId.unique()

In [None]:
# True when every tagging user also appears among the rating users.
set(users_tags) <= set(user_ratings)

Movies having tags after filtering

In [None]:
filtered_tags.movieId.unique().shape

##### __Filter tag genome scores table by selected movie ids__

In [None]:
# Restrict the tag genome scores table to the selected movies.
filtered_tags_genome_scores = start_pipeline(tags_genome_scores)
filtered_tags_genome_scores = filter_in_list(
    filtered_tags_genome_scores,
    col='movieId',
    filter_list=selected_movie_ids,
)
filtered_tags_genome_scores = reset_index(filtered_tags_genome_scores)

In [None]:
filtered_tags_genome_scores.head()

In [None]:
filtered_tags_genome_scores.info()

Number of movies in tag genome scores after filtering

In [None]:
filtered_tags_genome_scores.movieId.unique().shape

##### __Filter movies metadata table by selected imdb ids__

In [None]:
# Restrict the Kaggle movie metadata table to the selected IMDb ids.
filtered_metadata = start_pipeline(movies_metadata)
filtered_metadata = filter_in_list(
    filtered_metadata,
    col='imdb_id',
    filter_list=selected_imdb_ids,
)
filtered_metadata = reset_index(filtered_metadata)

In [None]:
filtered_metadata.head(2)

In [None]:
filtered_metadata.info()