# __Movielens 25m data analysis and cleaning__

#### Initial setup

In [None]:
%load_ext watermark
%load_ext autoreload

In [None]:
%autoreload 2
%watermark -v -n -m -p numpy,pandas,sklego

In [None]:
import pandas as pd
import numpy as np
import sys
import os
from pandas import option_context
from sklego.pandas_utils import log_step
from collections import Counter
sys.path.append('../../../../')
from src.settings import DATA_DIR
CURRENT_PATH = os.path.abspath(os.path.join(os.pardir))
print(CURRENT_PATH)
print(DATA_DIR)

In [None]:
import logging

logging.basicConfig(level=logging.INFO)

Import functions

In [None]:
from src.data_processing.dataframe_utils import (col_to_onehot, start_pipeline, drop_unnecessary_cols, remove_nan, reset_index)

#### Load dataset

In [None]:
# Every raw table is read from the same 'raw/ml-25m' directory under DATA_DIR.
_ml25m_dir = os.path.join(DATA_DIR, 'raw', 'ml-25m')

ratings = pd.read_csv(os.path.join(_ml25m_dir, 'ratings.csv'))
tags = pd.read_csv(os.path.join(_ml25m_dir, 'tags.csv'))
movies = pd.read_csv(os.path.join(_ml25m_dir, 'movies.csv'))
links = pd.read_csv(os.path.join(_ml25m_dir, 'links.csv'))
genome_scores = pd.read_csv(os.path.join(_ml25m_dir, 'genome-scores.csv'))
genome_tags = pd.read_csv(os.path.join(_ml25m_dir, 'genome-tags.csv'))

### __Movielens data__

In [None]:
def show_uniq_vals(df):
    """Print one line per column with the shape of its unique non-null values.

    Args:
        df: DataFrame whose columns are inspected.

    The shape is printed as a one-element tuple, e.g. ``(42,)``, because it
    is taken directly from the array returned by ``np.unique``.
    """
    for col in df:
        # Nulls are dropped first so they are not counted as a distinct value.
        non_null = df[col].dropna().values
        uniq_shape = np.unique(non_null).shape
        print(f'{col} - unique values: {uniq_shape}')

#### __Movies__

In [None]:
movies.head()

In [None]:
movies[movies.genres.str.contains("IMAX")].head()

In [None]:
movies.info()

In [None]:
movies.isnull().values.any()

One-hot table of movie genres

In [None]:
onehot_col = 'genres'

# Pass the movies table through the pipeline entry step, then hand the
# result to col_to_onehot together with the name of the column to expand.
movie_genres_onehot = col_to_onehot(start_pipeline(movies), onehot_col)

In [None]:
 with option_context('display.max_column', None):
    display(movie_genres_onehot.head())

Change the '(no genres listed)' value to NaN and remove 'IMAX' from the genres list

In [None]:
from typing import Union

@log_step
def insert_nan(df: pd.DataFrame, substitute_val: str) -> pd.DataFrame:
    """Return a copy of ``df`` in which cells equal to ``substitute_val`` are NaN.

    Args:
        df: Input DataFrame; it is not modified.
        substitute_val: Placeholder value to be replaced by NaN.

    Returns:
        The DataFrame produced by ``df.replace``.
    """
    # ``np.nan`` is the canonical spelling; the ``np.NaN`` alias was removed
    # in NumPy 2.0 and raises AttributeError there.
    return df.replace(substitute_val, np.nan)

def filter_values(string: object, drop_val: str, sep="|") -> Union[str, float]:
    """Remove every occurrence of ``drop_val`` from a ``sep``-delimited string.

    Args:
        string: Delimited value to filter. Missing scalars (``None``, NaN,
            ``pd.NA``) and the literal text ``'nan'`` are treated as missing.
        drop_val: Item to remove; compared for exact equality with each item.
        sep: Delimiter used both to split the input and to join the result.

    Returns:
        The remaining items joined with ``sep``, or ``np.nan`` when the input
        is missing or when no item is left after filtering.
    """
    # Catch real missing values before ``str()`` turns them into text such
    # as 'None' or '<NA>' that would otherwise be returned as data.
    if pd.api.types.is_scalar(string) and pd.isna(string):
        return np.nan

    text = str(string)
    # Callers may pass ``str(value)``, which renders NaN as the text 'nan'.
    if text == 'nan':
        return np.nan

    kept = [val for val in text.split(sep) if val != drop_val]
    # If every item was dropped, report the value as missing instead of
    # returning an empty string.
    return sep.join(kept) if kept else np.nan

@log_step
def remove_val_from_concat_string(df: pd.DataFrame, colname: str, value: str, sep="|") -> pd.DataFrame:
    """Strip ``value`` from the ``sep``-delimited strings stored in ``df[colname]``.

    Args:
        df: DataFrame to update; the column is overwritten in place.
        colname: Name of the column holding the delimited strings.
        value: Item to remove from each delimited string.
        sep: Delimiter separating the items.

    Returns:
        The same ``df`` object, with ``colname`` replaced by the filtered values.
    """
    def _strip_value(cell):
        # Each cell is stringified before filtering, so a NaN cell reaches
        # filter_values as the text 'nan'.
        return filter_values(str(cell), value, sep=sep)

    df[colname] = df[colname].apply(_strip_value)
    return df

In [None]:
substitute_val = '(no genres listed)'
column = 'genres'
drop_value = 'IMAX'

# Apply the cleaning steps one after another: pipeline entry step, placeholder
# -> NaN replacement, then removal of the unwanted entry from the genres column.
movies_cleaned = start_pipeline(movies)
movies_cleaned = insert_nan(movies_cleaned, substitute_val)
movies_cleaned = remove_val_from_concat_string(movies_cleaned, column, drop_value)

In [None]:
movies_cleaned.head()

In [None]:
movies_cleaned[movies_cleaned['title'] == 'Wings of Courage (1995)']

In [None]:
movies_cleaned[movies_cleaned.genres.str.contains("IMAX").fillna(False)].head()

In [None]:
movies_cleaned.info()

In [None]:
show_uniq_vals(movies_cleaned)

In [None]:
movies_cleaned[movies_cleaned.genres.isna()].head()

#### __Ratings__

In [None]:
ratings.head()

In [None]:
# Overwrite the raw timestamp column with datetimes, interpreting the stored
# numbers as seconds (unit='s'), then preview the converted table.
ratings['timestamp'] = pd.to_datetime(ratings['timestamp'], unit='s')
ratings.head()

In [None]:
ratings.info()
ratings.describe()

In [None]:
show_uniq_vals(ratings)

In [None]:
ratings.isnull().values.any()

In [None]:
ratings[ratings.duplicated(subset=['userId', 'movieId'], keep=False)]

In [None]:
ratings.movieId.unique().shape

#### __Links__

In [None]:
links.head()

In [None]:
links.info()

In [None]:
show_uniq_vals(links)

In [None]:
# Pipeline entry step, then drop the 'tmdbId' column, then reset the index.
links_cleaned = start_pipeline(links)
links_cleaned = drop_unnecessary_cols(links_cleaned, columns=['tmdbId'])
links_cleaned = reset_index(links_cleaned)

In [None]:
links_cleaned.head()

In [None]:
links_cleaned.info()

In [None]:
links_cleaned.isnull().any()

#### __Tags__

In [None]:
tags.head()

In [None]:
tags.info()

In [None]:
tags.isnull().any()

In [None]:
tags[tags['tag'].isnull()].head()

In [None]:
# Pipeline entry step, then apply remove_nan to the 'tag' column, then reset
# the index.
tags_cleaned = start_pipeline(tags)
tags_cleaned = remove_nan(tags_cleaned, columns=['tag'])
tags_cleaned = reset_index(tags_cleaned)

In [None]:
tags_cleaned.info()

Number of tags describing movies

In [None]:
show_uniq_vals(tags_cleaned)

In [None]:
tags_cleaned.tag.unique().shape

#### __Tags genome__

In [None]:
genome_tags.head()

In [None]:
genome_tags.info()

In [None]:
show_uniq_vals(genome_tags)

In [None]:
genome_tags.isnull().any()

In [None]:
genome_tags[genome_tags.duplicated(subset=['tag'], keep=False)]

#### __Tags genome scores__

In [None]:
genome_scores.head()

In [None]:
genome_scores.info()

Movies having genome scores

In [None]:
show_uniq_vals(genome_scores)

In [None]:
genome_scores.movieId.unique().shape

Tags included in genome scores

In [None]:
genome_scores.tagId.unique().shape

In [None]:
genome_scores[genome_scores.duplicated(subset=['movieId', 'tagId'])]

In [None]:
genome_scores.isnull().any()