# Basic recommender for MovieLens data

## Imports

In [101]:
import pandas as pd
import numpy as np
import re
import sklearn

from matplotlib import pyplot as plt


## Read data

In [102]:
data_path = 'data/movielens_latest_small/{}.csv'

ratings_data = pd.read_csv(data_path.format('ratings'))
movies_data = pd.read_csv(data_path.format('movies'))
tags_data = pd.read_csv(data_path.format('tags'))
links_data = pd.read_csv(data_path.format('links'))

## Data preprocessing

In [103]:
def get_movie_year(title):
    title_re = re.compile(r'.+[\s,-]\(?(\d\d\d\d)\)')
    year = title_re.search(title)
    if year:
        return year.group(1)
    
def get_movie_title(full_title):
    title_re = re.compile(r'(.+)[\s,-]\(?(\d\d\d\d)\)')
    title = title_re.search(full_title)
    if title:
        return title.group(1)
    
movies_data["movie_year"] = movies_data["title"].apply(get_movie_year).fillna("1990").astype(int)
movies_data["title"] = movies_data["title"].apply(get_movie_title)

movie_genres = ["Action", "Adventure", "Animation", "Children's", "Comedy", "Crime", "Documentary", "Drama", "Fantasy", 
                "Film-Noir", "Horror", "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"]
for genre in movie_genres:
    movies_data[genre] = movies_data["genres"].apply(lambda x: int(genre in x))

movies_data = movies_data.drop("genres", 1)

In [104]:
ratings_data["time"] = ratings_data["timestamp"].apply(pd.to_datetime, unit='s')
ratings_data["year"] = ratings_data["time"].apply(lambda x: x.year)
ratings_data["month"] = ratings_data["time"].apply(lambda x: x.month)
ratings_data["day"] = ratings_data["time"].apply(lambda x: x.day)
ratings_data["hour"] = ratings_data["time"].apply(lambda x: x.hour)
ratings_data["rating"] = ratings_data["rating"].astype(int)

ratings_data = ratings_data.drop("timestamp", 1)
ratings_data = ratings_data.drop("time", 1)


In [105]:
ratings_data.head()


Unnamed: 0,userId,movieId,rating,year,month,day,hour
0,1,1,5,1996,11,4,14
1,1,2,3,1996,11,10,16
2,1,10,3,1996,11,10,16
3,1,32,4,1996,11,10,16
4,1,34,4,1996,11,10,16


In [106]:
ratings_table = ratings_data.pivot_table(columns=["movieId"], index=['userId'], values='rating').astype(float)
rated_items = ratings_table.notnull()

ratings_table.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,132796,133419,133545,133897,134170,134368,134393,134783,134853,135887
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,,,,,,,,3.0,...,,,,,,,,,,
2,3.0,3.0,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,3.0,4.0,,,,,...,,,,,,,,,,


##Base Recommeder

In [143]:
class BaseRecommender(object):
    def get_predictions(self, ratings):
        raise NotImplementedError()
    
    def test_predictions(self, train_data, test_data):
        raise NotImplementedError()
    
    def get_top_items(self, ratings, n=5):
        raise NotImplementedError()

## Non-personalized recommender

In [147]:

# filtered_ratings = ratings_data.copy()
# filtered_ratings['rating'] = filtered_ratings.groupby('movieId')['rating'].filter(lambda x:x.count() > 10)

class NonPersonalizedRecommender(BaseRecommender):
    
    def get_predictions(self, ratings):
        grouped_ratings = ratings.groupby('movieId')['rating']
        return grouped_ratings.agg([np.mean])

    def test_predictions(self, train_data, test_data, column='mean'):
        test_predictions = self.get_predictions(train_data)
        merged_predictions = test_predictions.merge(test_data, left_index=True, right_on='movieId', how='right')
        merged_predictions[column] = merged_predictions[column].fillna(0)
        return merged_predictions[column]
    
    def get_top_items(self, ratings, n=5):
        predictions_table = self.get_predictions(ratings)
        return predictions_table.sort_values(by='mean', ascending=False)[:n]
    
    def __positive_ratings(self, x):
        return x[x >= 4].count() / x.count()

    def __popularity(self, x):
        return x.count()


## Cross-validation

In [149]:
from sklearn.metrics import mean_squared_error
from sklearn.cross_validation import KFold
from random import sample

def rmse(actual, predictions):
    return np.sqrt(mean_squared_error(actual, predictions))

def evaluate_predictions(ratings, recommender, metrics=rmse):
    kf = KFold(n=ratings.shape[0], n_folds=10, shuffle=True)

    predictions = []
    actual = []

    for train, test in kf:
        train_data = ratings_data.iloc[train]
        test_data = ratings_data.iloc[test]
        
        actual.extend(test_data['rating'])
        predictions.extend(recommender.test_predictions(train_data, test_data))
    
    return metrics(actual, predictions)

print(evaluate_predictions(ratings_data, NonPersonalizedRecommender()))

1.18254618697


## Collaborative filtering

In [10]:
from sklearn.cross_validation import KFold
from sklearn.neighbors import KNeighborsRegressor
import math


def kfolds(data, n_folds=3):
    folds = []
    sets = []
    k = math.floor(len(data) / n_folds)
    rest = len(data) % n_folds
    for i in range(n_folds):
        first_ind = i * k
        last_ind = (i + 1) * k 
        if rest > 0:
            last_ind  += 1
            rest -= 1
        folds.append(data.iloc[first_ind : last_ind])
    for i in range(n_folds):
        sets.append((folds[i], pd.concat([folds[j] for j in range(n_folds) if j != i])))
        
    return sets

def predict_values(train_data, train_target, test_data, model=KNeighborsRegressor()):
    model.fit(train_data, train_target)
    return model.predict(test_data)
    
def cross_validation(data, movie):
    cv_pred = []
    cv_actual = []
    for test_set, train_set in kfolds(data):
        train_predictors = train_set.drop(movie, axis=1)
        train_target = train_set[movie]
        test_predictors = test_set.drop(movie, axis=1)
        cv_pred.append(predict_values(train_predictors, train_target, test_predictors))
    return np.concatenate(cv_pred, axis=0)

def predict_for_user(r_data, movie, user):
    cv_pred = []
    cv_actual = []
    train_predictors = r_data.drop(user).drop(movie, axis=1)
    train_target = r_data.drop(user)[movie] 
    test_predictors = r_data.drop(movie, axis=1).iloc[user]
    return predict_values(train_predictors, train_target, test_predictors)




## Error measure

In [13]:
from sklearn.metrics import mean_squared_error
from random import sample

def rmse(actual, predictions):
    return np.sqrt(mean_squared_error(actual, predictions))

predictions = []
actual = []

for user_id in range(1, 5): # sample(ratings_table.columns.values.tolist(), 3):
    rated_by_user = rated_items.iloc[user_id]
    rated_indexes = ratings_table.iloc[user_id][rated_by_user].index
    for movie in rated_indexes:
        predictions.append(predict_for_user(ratings_table[rated_indexes], movie, user_id))
    actual.append(ratings_table.iloc[user_id][rated_indexes])
predictions = np.concatenate(predictions)
actual = pd.concat(actual)
error = rmse(actual, predictions)
print(error)

movieId
1       3
2       3
11      3
17      5
19      4
21      3
34      5
39      3
47      2
95      3
150     4
153     3
160     3
161     4
165     2
173     2
185     3
186     3
196     2
208     3
225     3
231     3
236     4
252     3
253     4
266     4
282     4
288     1
293     3
300     4
       ..
5650    2
5689    4
5693    2
5696    4
5705    2
5707    4
5732    4
5745    4
5747    3
5772    5
5777    3
5780    3
5801    4
5802    3
5816    4
5826    3
5836    4
5847    4
5853    4
5862    3
5868    2
5899    4
5902    5
5911    5
5933    4
5938    4
5940    4
5983    4
6001    4
6184    3
dtype: float64
0.661330974054
