In [1]:
import time
import warnings
import numpy as np
import pandas as pd

from surprise import KNNWithMeans
from surprise import SVD
from surprise import Dataset
from surprise import Reader

# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# used below, so API changes can go unnoticed.
warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
# Record the wall-clock start time; the last cell subtracts it from the end
# time to report the total running time of the notebook.
t_start = time.time()

# Read data

In [3]:
# Each line of the ratings file holds four '::'-separated fields:
# user, item, rating, timestamp.
reader = Reader(sep=r'::', line_format='user item rating timestamp')

# Parse the ratings file, then turn *all* of its ratings into the training
# set (no hold-out split is made here).
train = Dataset.load_from_file('./train.dat', reader=reader)
trainset = train.build_full_trainset()

In [4]:
# Load the reference ratings; the 'rating' column is compared with the
# predictions when the RMSE is computed further down.
label = pd.read_csv('./label.csv')

# Build the recommender system

## KNN with Means Algorithm

In [5]:
# Similarity settings: item-item similarities (user_based=False) computed
# with the mean-squared-difference measure.
# NOTE(review): per the Surprise docs 'shrinkage' is only used by the
# 'pearson_baseline' similarity, so it presumably has no effect with 'msd' --
# confirm whether 'pearson_baseline' was intended.
sim_options = {'name': 'msd',
               'user_based': False,
               'shrinkage': 100}

# Build the neighbourhood model: at most 40 neighbours per prediction,
# at least 1 required.
knn = KNNWithMeans(k=40, min_k=1, sim_options=sim_options)
# fit() is the supported training entry point; the older train() method was
# deprecated and later removed from Surprise.
knn.fit(trainset)

Computing the msd similarity matrix...
Done computing similarity matrix.


In [6]:
# Load the (user, movie) pairs to score.
test = pd.read_csv('./test.csv')

# Predict one rating per test row.
# The two id columns are iterated directly instead of fetching test.loc[i]:
# that avoids building a Series per row, and avoids the dtype upcast a
# mixed-type row would undergo (an integer id could become '1.0' after str()).
# Ids are passed as strings, as in the original loop.
y_hat = np.fromiter(
    (knn.predict(str(uid), str(iid), verbose=False).est
     for uid, iid in zip(test['user'], test['movie'])),
    dtype=float,
    count=len(test),
)

# Attach the predictions and write the submission file.
test['rating'] = y_hat
test.to_csv('./mysubmission1.csv', index=False)

In [7]:
# Score the current predictions against the ratings in label.csv.
# NOTE(review): assumes the rows of `label` are in the same order as the rows
# of the test file -- confirm.
y = label['rating'].values

# root-mean-square error between predicted and reference ratings
rmse = np.sqrt(np.square(y_hat - y).mean())
print('Final RMSE is:\t', rmse, '\n')

Final RMSE is:	 0.89277009045 



## SVD Algorithm

In [8]:
# Build the matrix-factorisation model: 100 latent factors, 20 training
# epochs, with bias terms. The per-parameter learning rates and
# regularisation terms are left as None here; only the global lr_all /
# reg_all values are given.
# NOTE(review): no random seed is set, so the factor initialisation (see
# init_mean / init_std_dev) and hence the reported RMSE presumably vary
# between runs -- consider seeding.
svd = SVD(n_factors=100, n_epochs=20, biased=True, init_mean=0,
          init_std_dev=.1, lr_all=.005, reg_all=.02, lr_bu=None,
          lr_bi=None, lr_pu=None, lr_qi=None, reg_bu=None,
          reg_bi=None, reg_pu=None, reg_qi=None, verbose=False)
# fit() is the supported training entry point; the older train() method was
# deprecated and later removed from Surprise.
svd.fit(trainset)

In [9]:
# Re-load the (user, movie) pairs so this cell starts from a clean frame.
test = pd.read_csv('./test.csv')

# Predict one rating per test row.
# The two id columns are iterated directly instead of fetching test.loc[i]:
# that avoids building a Series per row, and avoids the dtype upcast a
# mixed-type row would undergo (an integer id could become '1.0' after str()).
# Ids are passed as strings, as in the original loop.
y_hat = np.fromiter(
    (svd.predict(str(uid), str(iid), verbose=False).est
     for uid, iid in zip(test['user'], test['movie'])),
    dtype=float,
    count=len(test),
)

# Attach the predictions and write the submission file.
test['rating'] = y_hat
test.to_csv('./mysubmission2.csv', index=False)

In [10]:
# Score the current predictions against the ratings in label.csv.
# NOTE(review): assumes the rows of `label` are in the same order as the rows
# of the test file -- confirm.
y = label['rating'].values

# root-mean-square error between predicted and reference ratings
rmse = np.sqrt(np.square(y_hat - y).mean())
print('Final RMSE is:\t', rmse, '\n')

Final RMSE is:	 0.891754252343 



# Running time information

In [11]:
# Report the wall-clock time elapsed since t_start was recorded at the top of
# the notebook.
t_end = time.time()
print("Program running time")
# elapsed seconds, right-aligned in 8 columns with 2 decimals
print('Total time:\t{0:8.2f} seconds'.format(t_end - t_start))

Program running time
Total time:	  146.71 seconds
