In [1]:
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from surprise import SVD
from surprise import Dataset
from surprise import evaluate
from surprise import print_perf
from surprise import Reader

# Suppress every warning for the rest of the session.
# NOTE(review): a blanket 'ignore' also hides library warnings that may point at
# real problems in later cells -- consider narrowing it to specific categories.
warnings.filterwarnings('ignore')
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Read and process the data

In [2]:
# Read the ratings, users and movies tables.
# The files use the multi-character delimiter '::', which is why the python
# parsing engine is requested explicitly.
ratings = pd.read_table('./data/ratings.dat', sep=r'::', header=None, engine='python',
                        names=['user', 'movie', 'rating', 'timestamp'])

users = pd.read_table('./data/users.dat', sep=r'::', header=None, engine='python',
                      names=['user', 'gender', 'age', 'occupation', 'zip-code'])

movies = pd.read_table('./data/movies.dat', sep=r'::', header=None, engine='python',
                       names=['movie', 'title', 'genres'])

# Collapse every multi-genre entry (genres joined by '|') into the single label
# 'Multiple'. `regex=False` makes '|' a literal character, and `na=False` treats
# a missing genre as "not multi-genre" instead of raising.
idx = movies['genres'].str.contains('|', regex=False, na=False).values
# Assign through .loc in one step. Chained indexing (movies['genres'][idx] = ...)
# may write to a temporary copy and leave `movies` unchanged.
movies.loc[idx, 'genres'] = 'Multiple'

In [3]:
# Preview the first five rating records.
ratings.head(n=5)

Unnamed: 0,user,movie,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [4]:
# Preview the first five user records.
users.head(n=5)

Unnamed: 0,user,gender,age,occupation,zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [5]:
# Preview the first five movie records (multi-genre titles show as 'Multiple').
movies.head(n=5)

Unnamed: 0,movie,title,genres
0,1,Toy Story (1995),Multiple
1,2,Jumanji (1995),Multiple
2,3,Grumpier Old Men (1995),Multiple
3,4,Waiting to Exhale (1995),Multiple
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
# Attach title and genres to each rating by inner-joining on the 'movie' id,
# then preview the combined frame.
merged = ratings.merge(movies, how='inner', on='movie')
merged.head(n=5)

Unnamed: 0,user,movie,rating,timestamp,title,genres
0,1,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama
1,2,1193,5,978298413,One Flew Over the Cuckoo's Nest (1975),Drama
2,12,1193,4,978220179,One Flew Over the Cuckoo's Nest (1975),Drama
3,15,1193,4,978199279,One Flew Over the Cuckoo's Nest (1975),Drama
4,17,1193,5,978158471,One Flew Over the Cuckoo's Nest (1975),Drama


# Build recommender system

In [7]:
# Describe the layout of the training file: four fields per line
# (user, item, rating, timestamp) separated by '::'.
reader = Reader(
    sep=r'::',
    line_format='user item rating timestamp',
)

# Load the training ratings through that reader.
train = Dataset.load_from_file('./train.dat', reader=reader)

In [8]:
# Split the training data into 5 shuffled folds for cross-validation.
train.split(shuffle=True, n_folds=5)

# Model under evaluation: SVD with its default settings.
algo = SVD()

# Cross-validate on the folds, report RMSE per fold and on average.
# NOTE(review): `algo` is reused below for the final predictions; presumably it
# is left fitted by the last fold rather than on the full training set -- confirm.
perf = evaluate(
    algo,
    train,
    verbose=False,
    measures=['RMSE'],
)
print_perf(perf)

        Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    
RMSE    0.9039  0.9035  0.9089  0.9087  0.9024  0.9055  


# Make predictions and calculate RMSE

In [9]:
# Load the held-out (user, movie) pairs and their true ratings.
test, label = (
    pd.read_csv(csv_path) for csv_path in ('./test.csv', './label.csv')
)

In [10]:
# Predict a rating for every (user, movie) pair in the test set and keep only
# the estimated value (`.est`) of each prediction.
#
# The id columns are iterated directly instead of fetching rows via test.loc[i]:
#   * test.loc[i] only works when the index is exactly 0..n-1;
#   * a row Series has a single dtype, so any float column in `test` would
#     upcast the ids and stringify them as '1.0' instead of '1';
#   * row-by-row label lookup is much slower than walking two columns.
# The ids are passed as strings -- presumably matching the raw ids the model
# saw when the training data was loaded from file (TODO confirm).
prediction = [
    algo.predict(str(user_id), str(movie_id), verbose=False).est
    for user_id, movie_id in zip(test['user'], test['movie'])
]

In [11]:
# Spot-check one prediction: user '1' on movie '1193'.
# The result is stored in `pred` but not displayed by this cell.
pred = algo.predict(
    uid='1',
    iid='1193',
    verbose=False,
)

In [12]:
# Root-mean-squared error between the predicted and the true ratings.
y_hat = np.array(prediction)
y = label['rating'].values

squared_error = (y_hat - y) ** 2
rmse = np.sqrt(squared_error.mean())

print('Final RMSE is:\t', rmse, '\n')

Final RMSE is:	 0.90389473186 

