Preprocessing

- Subset the dataset to movies/users appearing at least n/m times, respectively
- Compactify movie ids (re-encode them as contiguous integers)
- Do a train/test split?

Output = new versions of...

rating.csv. As before except
- no timestamp column
- use compactified movieIds
- add val/train flag
- add 'y' col (centred)
- add 'yscaled' col

movie.csv. As before except
- new compactified movieIds
- parse out base title and year into separate cols (keeping original as well - maybe as 'key' column)
- nratings col
- avg_rating col

Also, possibly a separate file mapping between old and new movie ids (just in case that's useful later?).
Or maybe just store that mapping in movie.csv.

In [None]:
import random
from functools import lru_cache
import os
import math

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import tensorflow as tf
from tensorflow import keras

import sklearn.preprocessing
from sklearn.preprocessing import LabelEncoder

In [None]:
# Sanity check: list what is available under ../input (the notebook displays the result).
os.listdir('../input')

In [None]:
# Hack for running on kernels and locally: pick input/output locations based on whether
# the KAGGLE_WORKING_DIR environment variable is set.
RUNNING_ON_KERNELS = 'KAGGLE_WORKING_DIR' in os.environ
input_dir = '../input' if RUNNING_ON_KERNELS else '../input/movies'
out_dir = '.' if RUNNING_ON_KERNELS else '../input/movielens_preprocessed'
# The local output directory may not exist yet (e.g. on a fresh checkout). Create it up
# front so the to_csv() calls don't fail only after all the expensive work is done.
os.makedirs(out_dir, exist_ok=True)

# Only these three columns are read, so the timestamp column is dropped at load time.
rating_path = os.path.join(input_dir, 'rating.csv')
df = pd.read_csv(rating_path, usecols=['userId', 'movieId', 'rating'])
# Shuffle (reproducibly), then renumber the rows 0..n-1 so that positional slices of the
# index below correspond to the shuffled order.
df = df.sample(frac=1, random_state=1).reset_index(drop=True)

# Partitioning train/val according to behaviour of keras.Model.fit() when called with
# validation_split kwarg (which is to take validation data from the end as a contiguous
# chunk)
val_split = .05
n_ratings = len(df)
n_train = math.floor(n_ratings * (1-val_split))
itrain = df.index[:n_train]  # index labels of the training rows (the first n_train)
ival = df.index[n_train:]    # index labels of the validation rows (the remainder)

# Compactify movie ids (LabelEncoder maps the distinct ids onto 0..n_classes-1).
movie_id_encoder = LabelEncoder()
# XXX: Just fitting globally for simplicity. See movie_helpers.py for more 'principled'
# approach. I don't think there's any realistically useful data leakage here though.
#orig_movieIds = df['movieId']
df['movieId'] = movie_id_encoder.fit_transform(df['movieId'])

# Add centred target variable. The mean is taken over the training rows only, then
# subtracted from every row (train and val alike).
df['y'] = df['rating'] - df.loc[itrain, 'rating'].mean()

# Optional extra target column; disabled by default.
SCALE = 0
if SCALE:
    # Add version of target variable scaled to [0, 1] (scaler fitted on training rows only)
    yscaler = sklearn.preprocessing.MinMaxScaler()
    yscaler.fit(df.loc[itrain, 'rating'].values.reshape(-1, 1))
    df['y_unit_scaled'] = yscaler.transform(df['rating'].values.reshape(-1, 1))

# Write the processed ratings without the (meaningless, post-shuffle) row index.
path = os.path.join(out_dir, 'rating.csv')
df.to_csv(path, index=False)

In [None]:
# Save a 10% sample of ratings for exercises (with re-compactified movieIds, and mapping back to canonical movie ids)
# NOTE(review): "10%" presumes the full ratings table has ~20M rows — confirm.
n_mini = 2 * 10**6
# Take the first n_mini rows after ordering by userId (fewer if the table is smaller).
df_mini = df.sort_values(by='userId').head(n_mini).copy()
# Keep the movieId as it stands in df when this cell runs, before re-encoding it below.
df_mini['movieId_orig'] = df.loc[df_mini.index, 'movieId']
mini_encoder = LabelEncoder()
df_mini['movieId'] = mini_encoder.fit_transform(df_mini['movieId'])
# Shuffle
df_mini = df_mini.sample(frac=1, random_state=1)

# Recalculate y (just to be totally on the level. Very little opportunity for contamination here.)
# Actually, the mean here turns out to be noticeably different (.05 higher). Probably because we're
# taking the front 10% after sorting by userId, and presumably userId ordering is actually informative
# in some way (new users vs. old users?). More principled approach would be to take a subset of 10% of
# userids. But then we'd have to compactify those... This is fine for now.
# Size the train portion from the rows actually sampled rather than from the requested n_mini,
# so the split fraction stays correct if the source table has fewer than n_mini rows.
n_mini_train = math.floor(len(df_mini) * (1-val_split))
mini_train_rating_mean = df_mini.iloc[:n_mini_train]['rating'].mean()
df_mini['y'] = df_mini['rating'] - mini_train_rating_mean

path = os.path.join(out_dir, 'mini_rating.csv')
df_mini.to_csv(path, index=False)

In [None]:
def munge_title(title):
    """Return a display-friendly version of a raw movie title.

    Everything from the last occurrence of ' (' onwards is dropped (if there
    is one). Then, for each of the articles 'The', 'A' and 'An' in that order,
    a trailing ', <article>' is moved to the front of the title, e.g.
    'Matrix, The (1999)' becomes 'The Matrix'.
    """
    head, opener, _ = title.rpartition(' (')
    if opener:
        # A ' (' was found: keep only the text before its last occurrence.
        title = head
    # Every article is checked in turn (no early exit), so more than one move
    # can happen on unusual titles.
    for article in ('The', 'A', 'An'):
        tail = ', ' + article
        if title.endswith(tail):
            stem = title[:len(title) - len(tail)]
            title = '{} {}'.format(article, stem)
    return title

def get_year(title):
    """Return the year found right after the last '(' in *title*, or 0.

    The (up to) four characters following the last opening parenthesis are
    parsed as an integer, e.g. 'Toy Story (1995)' gives 1995.

    If the title has no '(' at all, or those characters are not an integer,
    the title is printed (tab-terminated, so failures can be eyeballed) and
    0 is returned.
    """
    paren = title.rfind('(')
    # Without this guard a missing '(' would make the slice start at index 0,
    # so a year-less title beginning with digits (e.g. '2001') would be
    # mistaken for a release year.
    if paren != -1:
        start = paren + 1
        try:
            return int(title[start:start + 4])
        except ValueError:
            pass
    print(title, end='\t')
    return 0

# Build the processed movie table. NOTE: `mdf` is an alias of `movie_df` (not a copy), so
# the in-place title edit below changes both until `mdf` is rebound by the filter step.
movie_path = os.path.join(input_dir, 'movie.csv')
movie_df = pd.read_csv(movie_path)
mdf = movie_df

# XXX: hack
# Appends an 'x' to the title of movieId 64997, after asserting the title is the one we expect.
# NOTE(review): presumably another row carries the identical title and this keeps the 'key'
# column below unique — confirm against the data.
assert mdf.loc[
    mdf.movieId==64997,
    'title'].iloc[0] == 'War of the Worlds (2005)'
mdf.loc[
    mdf.movieId==64997,
    'title'
] = 'War of the Worlds (2005)x'

#mdf['movieId_orig'] = mdf['movieId']
n_orig = len(mdf)

# There are some movies listed in movie.csv which have no ratings. Drop them.
# (The encoder's classes_ are exactly the original movie ids seen in the ratings.)
whitelist = set(movie_id_encoder.classes_)
mdf = mdf[mdf['movieId'].isin(whitelist)].copy()
print("Went from {} movies to {} after filtering out movies with no ratings".format(
    n_orig, len(mdf)
))

# New, compact movie Ids (same encoder as used for the ratings, so the ids agree)
mdf['movieId'] = movie_id_encoder.transform(mdf['movieId'].values)

# Order rows by compact id and renumber the index from 0.
# NOTE(review): assuming movie.csv lists each rated movie exactly once, the row index now
# equals the compact movieId; the index-aligned assignments further down rely on that.
mdf = mdf.sort_values(by='movieId').reset_index(drop=True)

# By default use original title field (which includes year of release) as unique key
mdf['key'] = mdf['title']

# Extract the year from the raw title first, then overwrite 'title' with the munged version.
mdf['year'] = mdf['title'].map(get_year)
mdf['title'] = mdf['title'].map(munge_title)

# For movies whose munged title are unique, use it as their key
title_counts = mdf.groupby('title').size()
unique_titles = title_counts.index[title_counts == 1]
unique_ids = mdf.index[mdf.title.isin(unique_titles)]
mdf.loc[unique_ids, 'key'] = mdf.loc[unique_ids, 'title']

# The right-hand side is a Series indexed by movieId; pandas aligns it on mdf's row index.
# Counted over all ratings (train and val).
mdf['n_ratings'] = df.groupby('movieId').size()
# NB: Calculated only over training set. (Though maybe this should be consistent with n_ratings)
# Movies with no rating among the training rows get no value here (NaN after alignment).
mean_ratings = df.loc[itrain].groupby('movieId')['rating'].mean()
mdf['mean_rating'] = mean_ratings

# Unlike the ratings files, this is written without index=False, so the row index is
# saved as the first (unnamed) column.
path = os.path.join(out_dir, 'movie.csv')
mdf.to_csv(path)

In [None]:
# Nvm, I don't think this is necessary.
# Disabled (`if 0`): would write a ratings file restricted to movies having at least
# `popularity_thresh` ratings.
if 0:
    popularity_thresh = 1000

    popular = mdf[mdf['n_ratings'] >= popularity_thresh]

    # Filters on popular.index rather than popular['movieId'].
    # NOTE(review): this is only right if mdf's row index coincides with the compact
    # movieId — confirm before re-enabling.
    pop_ratings = df[df['movieId'].isin(popular.index)].copy()

    print("Went from {:,} ratings to {:,} after applying threshold of {} ratings per movie".format(
        len(df), len(pop_ratings), popularity_thresh
    ))

    path = os.path.join(out_dir, 'mainstream_rating.csv')
    pop_ratings.to_csv(path, index=False)

In [None]:
# Preview the first few rows of the processed movie table.
mdf.head()

In [None]:
# Preview the first few rows of the processed ratings table.
df.head()