In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the raw training ratings. header=None tells pandas the file has no
# header row, so the three column names are assigned explicitly afterwards.
train = pd.read_csv('training.txt', sep=',', header=None)
train.columns = ['user', 'item', 'rating']

In [3]:
# Preview the first five rows to confirm the columns were parsed as expected.
train.head(n=5)

Unnamed: 0,user,item,rating
0,1,1,4.0
1,2,2,2.5
2,4,4,3.0
3,5,5,5.0
4,6,6,3.5


In [4]:
# (rows, columns) of the training frame.
train.shape

(19996916, 3)

In [5]:
# Store the ratings as 32-bit floats rather than the dtype read_csv inferred.
train['rating'] = train['rating'].astype(np.float32)

In [6]:
from surprise import Dataset
from surprise import Reader
from surprise import SVD
from surprise.model_selection import KFold
from surprise import accuracy

In [7]:
# Describe the rating range to surprise, then wrap the DataFrame as a Dataset.
# load_from_df expects the columns in (user, item, rating) order.
# NOTE(review): the scale is declared as 0-5; confirm the lowest rating in the
# data really is 0, since this range bounds the predicted ratings.
reader = Reader(rating_scale=(0, 5))
data = Dataset.load_from_df(df=train, reader=reader)

In [8]:
# Cross-validation splitter and the SVD model it will evaluate; both are given
# a fixed random_state so that re-running the notebook is repeatable.
cv = KFold(n_splits=10, random_state=0)
svd = SVD(n_factors=100, n_epochs=20, random_state=0)

In [9]:
# Cross-validate: for every split yielded by `cv`, refit the model on the
# training part and record the RMSE measured on the held-out part.
cv_rmse = []
cv_counter = 0
for cv_counter, (trainset, valset) in enumerate(cv.split(data), start=1):
    svd.fit(trainset)
    pred = svd.test(valset)
    # verbose=True makes accuracy.rmse print the fold's "RMSE: ..." line itself.
    fold_rmse = accuracy.rmse(pred, verbose=True)
    cv_rmse.append(fold_rmse)
    print(cv_counter, "iteration(s) finished!")

RMSE: 0.7835
1 iteration(s) finished!
RMSE: 0.7841
2 iteration(s) finished!
RMSE: 0.7837
3 iteration(s) finished!
RMSE: 0.7827
4 iteration(s) finished!
RMSE: 0.7834
5 iteration(s) finished!
RMSE: 0.7835
6 iteration(s) finished!
RMSE: 0.7835
7 iteration(s) finished!
RMSE: 0.7835
8 iteration(s) finished!
RMSE: 0.7836
9 iteration(s) finished!
RMSE: 0.7830
10 iteration(s) finished!


In [10]:
# Summarise cross-validation as the mean of the per-fold RMSE values.
print('Average validation RMSE from 10-Fold CV:', np.asarray(cv_rmse).mean())

Average validation RMSE from 10-Fold CV: 0.7834471012261274


In [11]:
# Build a trainset from all of `data` (no held-out split); this is what the
# model is fitted on before predicting the test pairs.
trainset = data.build_full_trainset()

In [13]:
# Read the user-item pairs to be scored. header=None tells pandas the file has
# no header row, so the column names are assigned explicitly afterwards.
test = pd.read_csv('testing.txt', sep=',', header=None)
test.columns = ['user', 'item', 'rating']

In [14]:
# (rows, columns) of the test frame.
test.shape

(5003179, 3)

In [15]:
# Preview the first five test rows.
test.head(n=5)

Unnamed: 0,user,item,rating
0,3,3,?
1,9,9,?
2,10,10,?
3,11,11,?
4,19,19,?


In [16]:
# Inspect the column dtypes of the test frame as loaded by read_csv.
test.dtypes

user       int64
item       int64
rating    object
dtype: object

In [17]:
# Fit the final model on the full training set, then estimate a rating for
# every (user, item) pair in the test frame.
svd.fit(trainset)

# Iterate the two id columns directly instead of doing a scalar .iloc lookup
# per row: the result is the same, without per-row indexing overhead on
# millions of rows, and without reusing the loop index name for the item id.
# `.est` is the estimated rating of the Prediction returned by svd.predict
# (the field previously read positionally as [3]).
predictions = [
    svd.predict(user_id, item_id).est
    for user_id, item_id in zip(test['user'], test['item'])
]

In [18]:
# Number of predictions produced; should equal the number of test rows.
len(predictions)

5003179

In [19]:
# Replace the rating column of the test frame with the model's estimates.
test = test.assign(rating=predictions)

In [20]:
# Round the predicted ratings to two decimals and preview the result.
test['rating'] = test['rating'].round(2)
test.head(n=5)

Unnamed: 0,user,item,rating
0,3,3,3.48
1,9,9,4.36
2,10,10,2.43
3,11,11,3.42
4,19,19,3.97


In [21]:
# Write the scored test pairs to disk as comma-separated text, without the
# DataFrame index.
# NOTE(review): to_csv emits a header row by default, whereas the input files
# were read with header=None; confirm the expected output format.
test.to_csv('predictions_final.txt', index=False, sep=',')