# Basic recommender for MovieLens data

## Imports

In [None]:
import pandas as pd
import numpy as np
import re
import sklearn

from matplotlib import pyplot as plt


## Read data

In [None]:
# Path template for the MovieLens CSV files; the placeholder is the table name.
data_path = 'data/movielens_latest_small/{}.csv'

# Load the four tables in a fixed order and bind each to its own frame.
ratings_data, movies_data, tags_data, links_data = (
    pd.read_csv(data_path.format(table_name))
    for table_name in ('ratings', 'movies', 'tags', 'links')
)

## Data preprocessing

In [None]:
def get_movie_year(title):
    """Extract the four-digit year from a title such as ``"Toy Story (1995)"``.

    The year must be followed by ``)`` and preceded by whitespace, a comma or
    a hyphen (with an optional ``(`` in between). Because the leading part of
    the pattern is greedy, the last such year in the string wins.

    Returns the year as a string, or ``None`` when the title has no match.
    """
    match = re.search(r'.+[\s,-]\(?(\d\d\d\d)\)', title)
    return match.group(1) if match else None
    
def get_movie_title(full_title):
    """Return the part of *full_title* that precedes its trailing year.

    The year suffix is a four-digit number followed by ``)`` and preceded by
    whitespace, a comma or a hyphen (with an optional ``(``). The separator
    and the year are not part of the returned text.

    Returns ``None`` when *full_title* carries no such year suffix.
    """
    match = re.search(r'(.+)[\s,-]\(?(\d\d\d\d)\)', full_title)
    return match.group(1) if match else None
    
# Split the raw "Title (YYYY)" strings into a numeric year and a clean title.
# Both are derived from the same untouched column, so the order of the two
# assignments does not matter.
raw_titles = movies_data["title"]
# Titles without a recognisable year fall back to 1990.
movies_data["year"] = raw_titles.apply(get_movie_year).fillna("1990").astype(int)
# Keep the raw title when no year suffix is found instead of replacing it
# with a missing value.
movies_data["title"] = raw_titles.apply(get_movie_title).fillna(raw_titles)

# One indicator column (0/1) per genre, set when the genre name occurs in the
# movie's "genres" string.
# NOTE(review): some MovieLens releases appear to spell the genre "Children"
# rather than "Children's"; if so that column is all zeros - confirm against
# movies.csv.
movie_genres = ["Action", "Adventure", "Animation", "Children's", "Comedy", "Crime", "Documentary", "Drama", "Fantasy",
                "Film-Noir", "Horror", "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"]
for genre in movie_genres:
    # Bind the current genre as a default so the lambda never sees a later value.
    movies_data[genre] = movies_data["genres"].apply(lambda x, genre=genre: int(genre in x))

# The raw genre string is fully encoded by the indicator columns above.
# `axis` is passed by keyword: newer pandas rejects it as a positional argument.
movies_data = movies_data.drop("genres", axis=1)

In [None]:
# Convert the epoch-second timestamps in one vectorised call rather than
# invoking pd.to_datetime once per row.
ratings_data["time"] = pd.to_datetime(ratings_data["timestamp"], unit='s')

# Calendar features derived from the rating time via the .dt accessor.
ratings_data["year"] = ratings_data["time"].dt.year
ratings_data["month"] = ratings_data["time"].dt.month
ratings_data["day"] = ratings_data["time"].dt.day
ratings_data["hour"] = ratings_data["time"].dt.hour

# NOTE(review): astype(int) truncates any fractional ratings (e.g. 3.5 -> 3);
# confirm that losing half-star precision is intended.
ratings_data["rating"] = ratings_data["rating"].astype(int)

# Disabled: drop the raw time columns once the derived features exist.
# (The previous commented-out version assigned from movies_data by mistake.)
# ratings_data = ratings_data.drop(["timestamp", "time"], axis=1)


In [None]:
# Sanity check: show the first rows of the preprocessed ratings and movies.
for frame in (ratings_data, movies_data):
    print(frame.head())

In [None]:
# User x movie matrix of ratings (rows: userId, columns: movieId).
ratings_table = ratings_data.pivot_table(columns=["movieId"], index=['userId'], values='rating').astype(float)

# Remember which cells hold real ratings before the gaps are filled in.
rated_items = ratings_table.notnull()

# Fill every missing rating with that movie's median rating. Passing the
# per-column medians to fillna does this in one call instead of looping over
# every movie column in Python.
ratings_table = ratings_table.fillna(ratings_table.median())


## Collaborative filtering

In [None]:
# KFold lives in sklearn.model_selection in current scikit-learn; the old
# sklearn.cross_validation module is kept only as a fallback for very old
# installations.
try:
    from sklearn.model_selection import KFold
except ImportError:
    from sklearn.cross_validation import KFold
from sklearn.neighbors import KNeighborsRegressor
import math


def kfolds(data, n_folds=3):
    """Split *data* into contiguous folds and pair each with its complement.

    The rows are cut, in order, into ``n_folds`` consecutive slices. When the
    row count is not divisible by ``n_folds`` the first ``len(data) % n_folds``
    folds receive one extra row, so every row lands in exactly one fold.

    Parameters:
        data: a pandas object supporting ``len()`` and ``.iloc`` slicing.
        n_folds: number of folds; must be at least 2.

    Returns:
        A list of ``(test_fold, train_rows)`` tuples, one per fold, where
        ``train_rows`` is the concatenation of all other folds.

    Raises:
        ValueError: if ``n_folds`` is smaller than 2 (there would be no
            training rows to concatenate).
    """
    if n_folds < 2:
        raise ValueError("n_folds must be at least 2, got {}".format(n_folds))

    base_size, remainder = divmod(len(data), n_folds)

    folds = []
    start = 0
    for i in range(n_folds):
        # Carry a running offset so that enlarging an early fold shifts all
        # later folds; otherwise folds overlap and the final rows are lost.
        stop = start + base_size + (1 if i < remainder else 0)
        folds.append(data.iloc[start:stop])
        start = stop

    return [
        (test_fold, pd.concat([fold for j, fold in enumerate(folds) if j != i]))
        for i, test_fold in enumerate(folds)
    ]

def predict_values(train_data, train_target, test_data, model=None):
    """Fit *model* on the training data and return its predictions for *test_data*.

    Parameters:
        train_data: predictors passed to ``model.fit``.
        train_target: target values passed to ``model.fit``.
        test_data: predictors passed to ``model.predict``.
        model: any object exposing ``fit(X, y)`` and ``predict(X)``. When
            omitted, a new ``KNeighborsRegressor()`` is created for this call.

    Returns:
        Whatever ``model.predict(test_data)`` returns.
    """
    # Build the default estimator per call; an instance used as the default
    # argument would be created once and shared by every call.
    if model is None:
        model = KNeighborsRegressor()
    model.fit(train_data, train_target)
    return model.predict(test_data)
    
def cross_validation(data, movie):
    """Predict the *movie* column of *data* out-of-fold.

    For every ``(test, train)`` pair produced by ``kfolds(data)``, a model is
    fitted on the training rows with all other columns as predictors and the
    *movie* column as target, then used to predict the test rows.

    Parameters:
        data: frame whose columns include *movie*.
        movie: label of the column to predict.

    Returns:
        A single array with the per-fold predictions concatenated in fold order.
    """
    cv_pred = []
    for test_set, train_set in kfolds(data):
        # Drop the target using the `movie` argument rather than a
        # module-level variable, so the function works for any caller.
        train_predictors = train_set.drop(movie, axis=1)
        train_target = train_set[movie]
        test_predictors = test_set.drop(movie, axis=1)
        cv_pred.append(predict_values(train_predictors, train_target, test_predictors))
    return np.concatenate(cv_pred, axis=0)


## Error measure

In [None]:
from sklearn.metrics import mean_squared_error
from random import sample

def rmse(actual, predictions):
    """Return the root mean squared error between *actual* and *predictions*."""
    mse = mean_squared_error(actual, predictions)
    return np.sqrt(mse)

# Cross-validate movie IDs 1 to 4 and score the predictions only on the
# cells that held a real rating (not a median-filled gap).
predicted_parts = []
actual_parts = []

for movie_id in range(1, 5):
    movie_predictions = cross_validation(ratings_table, movie_id)
    # Row labels of the users who actually rated this movie.
    rated_movie_ixs = ratings_table.index[rated_items[movie_id].values]
    predicted_series = pd.Series(movie_predictions, index=ratings_table.index)
    predicted_parts.append(predicted_series.loc[rated_movie_ixs])
    actual_parts.append(ratings_table.loc[rated_movie_ixs, movie_id])

predictions = pd.concat(predicted_parts)
actual = pd.concat(actual_parts)

error = rmse(actual, predictions)
