# Basic recommender for MovieLens data

## Imports

In [1]:
import pandas as pd
import numpy as np
import re
import sklearn

from matplotlib import pyplot as plt


## Read data

In [2]:
data_path = 'data/movielens_latest_small/{}.csv'

ratings_data = pd.read_csv(data_path.format('ratings'))
movies_data = pd.read_csv(data_path.format('movies'), index_col='movieId')
tags_data = pd.read_csv(data_path.format('tags'))
links_data = pd.read_csv(data_path.format('links'))

## Data preprocessing

In [3]:
def get_movie_year(title):
    title_re = re.compile(r'.+[\s,-]\(?(\d\d\d\d)\)')
    year = title_re.search(title)
    if year:
        return year.group(1)
    
def get_movie_title(full_title):
    title_re = re.compile(r'(.+)[\s,-]\(?(\d\d\d\d)\)')
    title = title_re.search(full_title)
    if title:
        return title.group(1)
    
movies_data["movie_year"] = movies_data["title"].apply(get_movie_year).fillna("1990").astype(int)
movies_data["title"] = movies_data["title"].apply(get_movie_title)

movie_genres = ["Action", "Adventure", "Animation", "Children's", "Comedy", "Crime", "Documentary", "Drama", "Fantasy", 
                "Film-Noir", "Horror", "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"]
for genre in movie_genres:
    movies_data[genre] = movies_data["genres"].apply(lambda x: int(genre in x))

movies_data = movies_data.drop("genres", 1)
movies_data = movies_data.fillna(0)

In [4]:
tags = pd.Series(tags_data['tag'].unique())
movies_tags = tags_data.pivot_table(columns=["tag"], index=['movieId'], values='userId')
movies_tags[movies_tags.notnull()] = 1
movies_tags = movies_tags.fillna(0)

movies_data = movies_data.merge(movies_tags, left_index=True, right_index=True, how='left').fillna(0)
movies_data


Unnamed: 0_level_0,title,movie_year,Action_x,Adventure_x,Animation,Children's,Comedy_x,Crime,Documentary,Drama,...,wine,witty dialogue,women,workplace,writing,wrongful imprisonment,younger men,zither,zombie,zombies
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Toy Story,1995,0,1,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Jumanji,1995,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Grumpier Old Men,1995,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Waiting to Exhale,1995,0,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
5,Father of the Bride Part II,1995,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,Heat,1995,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
7,Sabrina,1995,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,Tom and Huck,1995,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,Sudden Death,1995,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10,GoldenEye,1995,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
ratings_data["time"] = ratings_data["timestamp"].apply(pd.to_datetime, unit='s')
ratings_data["year"] = ratings_data["time"].apply(lambda x: x.year)
ratings_data["month"] = ratings_data["time"].apply(lambda x: x.month)
ratings_data["day"] = ratings_data["time"].apply(lambda x: x.day)
ratings_data["hour"] = ratings_data["time"].apply(lambda x: x.hour)
ratings_data["rating"] = ratings_data["rating"].astype(int)

ratings_data = ratings_data.drop("timestamp", 1)
ratings_data = ratings_data.drop("time", 1)


In [6]:
movies_data.head()


Unnamed: 0_level_0,title,movie_year,Action_x,Adventure_x,Animation,Children's,Comedy_x,Crime,Documentary,Drama,...,wine,witty dialogue,women,workplace,writing,wrongful imprisonment,younger men,zither,zombie,zombies
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Toy Story,1995,0,1,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Jumanji,1995,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Grumpier Old Men,1995,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Waiting to Exhale,1995,0,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
5,Father of the Bride Part II,1995,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
ratings_table = ratings_data.pivot_table(columns=["movieId"], index=['userId'], values='rating').astype(float).fillna(0)
rated_items = ratings_table.notnull()
ratings_table.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,132796,133419,133545,133897,134170,134368,134393,134783,134853,135887
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5,3,0,0,0,0,0,0,0,3,...,0,0,0,0,0,0,0,0,0,0
2,3,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,4,0,0,0,3,4,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


##Base Recommeder

In [8]:
class BaseRecommender(object):
    def get_predictions(self, ratings):
        raise NotImplementedError()
    
    def test_predictions(self, train_data, test_data):
        raise NotImplementedError()
    
    def get_top_items(self, ratings, n=5):
        raise NotImplementedError()

## Non-personalized recommender

In [9]:

# filtered_ratings = ratings_data.copy()
# filtered_ratings['rating'] = filtered_ratings.groupby('movieId')['rating'].filter(lambda x:x.count() > 10)

class NonPersonalizedRecommender(BaseRecommender):
    
    def get_predictions(self, ratings):
        grouped_ratings = ratings.groupby('movieId')['rating']
        return grouped_ratings.agg([np.mean])

    def test_predictions(self, train_data, test_data, column='mean'):
        test_predictions = self.get_predictions(train_data)
        merged_predictions = test_predictions.merge(test_data, left_index=True, right_on='movieId', how='right')
        merged_predictions[column] = merged_predictions[column].fillna(0)
        return merged_predictions[column]
    
    def get_top_items(self, ratings, n=5):
        predictions_table = self.get_predictions(ratings)
        return predictions_table.sort_values(by='mean', ascending=False)[:n]
    
    def __positive_ratings(self, x):
        return x[x >= 4].count() / x.count()

    def __popularity(self, x):
        return x.count()


##Content-based recommender

In [10]:
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import GaussianNB


class ContentBasedRecommender(BaseRecommender):
    
    def get_predictions(self, ratings):
        profiles = self.__get_profiles(ratings)
        attrs, target = self.__get_attrs(ratings)
        classifier = GaussianNB()
        classifier.fit(attrs, target)
        return classifier
        
    def test_predictions(self, train_data, test_data):
        print('Learning profiles...')
        profiles = self.__get_profiles(train_data)
        print('Predicting...')
        test_attrs, test_target, test_user = self.__get_attrs_target_user(test_data)
        data = pd.concat([test_user, test_attrs], 1).astype(int)
        predictions = data.apply(lambda x: self.__predict_test(x, profiles), 1)
        predictions[predictions < 0] = 0
        predictions[predictions > 5] = 5
        return predictions
    
    
    def get_top_items(self, ratings, n=5):
        raise NotImplementedError()
        
    def __predict_test(self, row, profiles):
        try:
            user_id = int(row['userId'])
            user_profile = profiles.loc[user_id]['profile']
        except (IndexError, KeyError):
            print('User {} profile not known.'.format(row['userId']))
            return 3

        prediction = user_profile.predict(row.drop(['userId']))
        return prediction[0]
        
    def __get_attrs_target_user(self, ratings):
        data = movies_data.merge(ratings[['movieId', 'userId', 'rating']], left_index=True, right_on='movieId')
        attrs = data.drop(['title', 'movieId', 'userId', 'rating'], axis=1).fillna(0).astype(int)
        target = data['rating']
        user = data['userId']
        return attrs, target, user
    
    def __get_profiles(self, ratings):
        users = pd.DataFrame(ratings['userId'].unique(), columns=['userId'])
        users['profile'] = users['userId'].apply(lambda x: self.__get_user_profile(x, ratings))
        return users.set_index('userId')
            
    def __get_user_profile(self, user_id, ratings):
        user_ratings = ratings[ratings['userId']==user_id]
        attrs, target, user = self.__get_attrs_target_user(user_ratings)
        classifier = LinearRegression()
        classifier.fit(attrs, target)
        return classifier

            




## Collaborative filtering

In [None]:
class CollaborativeFilteringRecommender(BaseRecommender):
    pass

## Cross-validation

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.cross_validation import KFold
from random import sample

def rmse(actual, predictions):
    return np.sqrt(mean_squared_error(actual, predictions))

def evaluate_predictions(ratings, recommender, metrics=rmse):
    kf = KFold(n=ratings.shape[0], n_folds=5, shuffle=True)

    predictions = []
    actual = []

    for train, test in kf:
        train_data = ratings_data.iloc[train]
        test_data = ratings_data.iloc[test]
        actual.extend(test_data['rating'])
        test_pred = recommender.test_predictions(train_data, test_data)
        predictions.extend(test_pred)
    
    return metrics(actual, predictions)

error = evaluate_predictions(ratings_data, ContentBasedRecommender())
error