In [2]:
# Imports
import pandas as pd
import numpy as np
import scipy as sp
from matplotlib import pyplot as plt

In [5]:
# Read the ratings file and accumulate one entry per rating into column lists.
# The file interleaves "<movieId>:" header lines with "<userId>,<rating>,..."
# lines that belong to the most recent header.
movieDict = {'movieId':[],'userId':[],'rating':[]}
with open('combined_data_1.txt', 'r') as inFile:
    curMovie = 0
    # Iterate over the file object directly so lines are streamed instead of
    # materialising the whole file in memory with readlines().
    for rawLine in inFile:
        line = rawLine.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            continue
        if len(line.split(":")) == 2:
            # Header line: remember which movie the following ratings belong to
            curMovie = int(line.strip(':'))
        else:
            fields = line.split(",")
            # Parse both values before appending so the three lists can never
            # get out of step if one of the conversions fails.
            userVal, ratingVal = int(fields[0]), int(fields[1])
            movieDict['movieId'].append(curMovie)
            movieDict['userId'].append(userVal)
            movieDict['rating'].append(ratingVal)

In [6]:
# Build the ratings DataFrame from the accumulated column lists, then drop
# the intermediate dictionary so its memory can be reclaimed.
movieFrame = pd.DataFrame(data=movieDict)
del movieDict

In [7]:
# Preview the first rows rather than rendering the whole ratings frame
movieFrame.head(10)

Unnamed: 0,movieId,userId,rating
0,1,1488844,3
1,1,822109,5
2,1,885013,4
3,1,30878,4
4,1,823519,3
5,1,893988,3
6,1,124105,4
7,1,1248029,3
8,1,1842128,4
9,1,2238063,3


In [8]:
# Imports
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate
from surprise.prediction_algorithms.matrix_factorization import SVD, SVDpp
from surprise.prediction_algorithms import BaselineOnly
from surprise.prediction_algorithms import KNNBasic, SlopeOne
from surprise.model_selection import train_test_split, KFold
import surprise.accuracy as accuracy
import time

# Shared cross-validation splitter with a fixed seed (random_state=0).
# Guarantees all folds are equivalent across algorithms, provided it is
# passed to cross_validate as `cv=kf`.
kf = KFold(random_state=0)

In [36]:
# Wrap the ratings frame in a Surprise Dataset; the columns are passed in
# (user, item, rating) order and ratings are declared to lie on a 1-5 scale.
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(movieFrame[['userId', 'movieId', 'rating']], reader)

# Hold out 30% of the ratings as a common test set. The split is seeded so
# the same hold-out is produced on every run.
# NOTE(review): the models below are fitted via cross_validate on all of
# `data`, which includes these held-out ratings, so RMSE on `testing` is
# optimistic - confirm whether a disjoint training set was intended.
_, testing = train_test_split(data, test_size=.3, random_state=0)

In [11]:
# SVD algorithm
algoSVD = SVD()

# start
tS = time.time()

# Cross-validate on the full dataset. Use the shared seeded splitter `kf`
# (instead of cv=5, which builds a fresh unseeded splitter) so that every
# algorithm is scored on the same folds.
resSVD = cross_validate(algoSVD, data, measures=['RMSE'], cv=kf, verbose=True)

# measure overall time
tE = time.time()
print("Time Elapsed [SVD]: {}".format(tE-tS))

Evaluating RMSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9000  0.9000  0.9007  0.9005  0.9008  0.9004  0.0003  
Fit time          930.54  965.23  948.88  956.08  945.77  949.30  11.53   
Test time         69.06   60.97   73.53   56.09   65.58   65.05   6.09    
Time Elapsed [SVD]: 5313.096236944199


### SVD++ would generally perform the best, but with a training time of roughly 2 hours per 1M entries, it is not realistic to run it repeatedly on even one part of the Netflix data within the span of the project. Therefore, we train it on only 10% of the data and evaluate it on the same 30% hold-out used for the other algorithms. We should be careful when comparing its performance to the other values: even though the solution space is the same, accuracy obviously takes a hit because far less data is available to support the model.

In [26]:
# SVD++ is MUCH more computationally intensive than the biased SVD
# implementation (runtime was generally around 2 hours for 1M entries),
# so fitting it on the full dataset is not possible within the time
# constraints of the project. As a result, we only fit it on 10% of the data.

# split [Yes this is very little, but it's mostly done as an example]
# test_size=.90 leaves 10% for training; seeded so the sample is reproducible.
trainingSVDpp, _ = train_test_split(data, test_size=.90, random_state=0)

# SVD++ algorithm
algo2 = SVDpp(verbose=True)

# start
tS = time.time()

# fit on the 10% training sample (no cross-validation here)
resSVDpp = algo2.fit(trainingSVDpp)

# measure overall time
tE = time.time()
print("Time Elapsed [SVD++]: {}".format(tE-tS))

 processing epoch 0
 processing epoch 1
 processing epoch 2
 processing epoch 3
 processing epoch 4
 processing epoch 5
 processing epoch 6
 processing epoch 7
 processing epoch 8
 processing epoch 9
 processing epoch 10
 processing epoch 11
 processing epoch 12
 processing epoch 13
 processing epoch 14
 processing epoch 15
 processing epoch 16
 processing epoch 17
 processing epoch 18
 processing epoch 19
Time Elapsed [SVD++]: 549.9810678958893


In [22]:
# baseline algorithm
algo3 = BaselineOnly()

# start
tS = time.time()

# Cross-validate on the full dataset. Use the shared seeded splitter `kf`
# (instead of cv=5, which builds a fresh unseeded splitter) so that every
# algorithm is scored on the same folds.
resBASE = cross_validate(algo3, data, measures=['RMSE'], cv=kf, verbose=True)

# measure overall time
tE = time.time()
print("Time Elapsed [Baseline]: {}".format(tE-tS))

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Evaluating RMSE of algorithm BaselineOnly on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9329  0.9330  0.9324  0.9325  0.9333  0.9328  0.0003  
Fit time          92.88   102.66  103.08  101.77  106.83  101.44  4.62    
Test time         60.14   58.09   59.15   59.09   52.84   57.86   2.59    
Time Elapsed [Baseline]: 1044.3228101730347


In [27]:
# Score each previously fitted model on the shared hold-out set.
# accuracy.rmse prints the value as well as returning it, so the models are
# evaluated in a fixed order: SVD, SVD++, baseline.
holdoutScores = []
for fittedAlgo in (algoSVD, algo2, algo3):
    predictions = fittedAlgo.test(testing)
    holdoutScores.append(accuracy.rmse(predictions))
SVDacc, SVDppacc, BASEacc = holdoutScores

RMSE: 0.7888
RMSE: 0.9382
RMSE: 0.9222


In [39]:
# slope one algorithm
algoSO = SlopeOne()

# start
tS = time.time()

# Cross-validate the Slope One model itself (the call previously referenced
# `algoKNN`, which is not the model created above, leaving `algoSO` unfitted).
# The shared seeded splitter `kf` keeps the folds identical across algorithms.
resSO = cross_validate(algoSO, data, measures=['RMSE'], cv=kf, verbose=True)

# measure overall time
tE = time.time()
print("Time Elapsed [SlopeOne]: {}".format(tE-tS))

Evaluating RMSE of algorithm SlopeOne on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9307  0.9309  0.9310  0.9309  0.9300  0.9307  0.0003  
Fit time          132.39  136.86  131.75  138.27  126.13  133.08  4.28    
Test time         494.14  480.06  483.93  467.03  465.15  478.06  10.82   
Time Elapsed [SlopeOne]: 3294.1948626041412


In [40]:
# Score the Slope One model on the shared hold-out set (prints and returns RMSE)
predictions = algoSO.test(testing)
SOacc = accuracy.rmse(predictions, verbose=True)

RMSE: 0.9074


### At this point, we evaluate everything on a tiny subset, because a naive algorithm like KNN would generate a similarity matrix far too large to store once there are more than 50k users.

In [41]:
# Number of movies (by id, starting from the lowest) kept in the small subset
movieMaxCount = 10

# Filter once and reuse the result; the mask was previously recomputed over
# the whole frame for each of the three uses.
newDataSet = movieFrame[movieFrame['movieId'] <= movieMaxCount]
print("In the first {} movies, there are {} unique users and {} entries".format(movieMaxCount,
        len(newDataSet.userId.unique()),
        newDataSet.shape[0]))

# Now proceed to perform predictions based on smaller dataset
dataSmall = Dataset.load_from_df(newDataSet[['userId', 'movieId', 'rating']], reader)

In the first 10 movies, there are 19519 unique users and 20352 entries


In [35]:
# similarity algorithm
# pearson_baseline similarity with the shrinkage term set to 0.
# NOTE(review): baselines are still estimated during fitting (see the
# "Estimating biases" lines in the output) - confirm this is the intended
# "no baseline" configuration.
sim_options = {'name': 'pearson_baseline', 'shrinkage': 0}
algoKNNSmall = KNNBasic(sim_options=sim_options, verbose = True)

# start
tS = time.time()

# Cross-validate the KNN model defined above on the small dataset (the call
# previously referenced `algoKNN`, which is not defined in this notebook).
# The shared seeded splitter `kf` keeps the folds identical across algorithms.
resKNN = cross_validate(algoKNNSmall, dataSmall, measures=['RMSE'], cv=kf, verbose=True)

# measure overall time
tE = time.time()
print("Time Elapsed [KNN-Pearson-NoBaseLine]: {}".format(tE-tS))

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Evaluating RMSE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.2747  1.3056  1.2968  1.3010  1.2908  1.2938  0.0107  
Fit time          17.08   17.59   17.94   17.87   17.11   17.52   0.36    
Test time         0.40    0.45    0.39    0.51    0.40    0.43    0.04    
Time Elapsed [KNN-Pearson-NoBaseLine]: 89.82383394241333


### We also measure the other algorithms on the same subset. This time the results are directly comparable, since every algorithm uses the same small dataset with cross-validation.

In [45]:
# SVD algorithm
algoSVDsm = SVD()

# start
tS = time.time()

# Cross-validate on the small dataset. Use the shared seeded splitter `kf`
# (instead of cv=5, which builds a fresh unseeded splitter) so that every
# algorithm is scored on the same folds.
resSVDsm = cross_validate(algoSVDsm, dataSmall, measures=['RMSE'], cv=kf, verbose=True)

# measure overall time
tE = time.time()
print("Time Elapsed [SVD]: {}".format(tE-tS))

Evaluating RMSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.2801  1.2721  1.2827  1.2656  1.2508  1.2703  0.0114  
Fit time          0.74    0.74    0.74    0.75    0.74    0.74    0.01    
Test time         0.02    0.02    0.02    0.02    0.02    0.02    0.00    
Time Elapsed [SVD]: 3.8925845623016357


In [46]:
# SVD++ is MUCH more computationally intensive than the biased SVD
# implementation, which made a full run on the large dataset impossible
# within the time constraints of the project. The small dataset is cheap
# enough to cross-validate in full, so no further sub-sampling is done here.

# SVD++ algorithm
algo2sm = SVDpp()

# start
tS = time.time()

# Cross-validate on the small dataset. Use the shared seeded splitter `kf`
# (instead of cv=5, which builds a fresh unseeded splitter) so that every
# algorithm is scored on the same folds.
resSVDppsm = cross_validate(algo2sm, dataSmall, measures=['RMSE'], cv=kf, verbose=True)

# measure overall time
tE = time.time()
print("Time Elapsed [SVD++]: {}".format(tE-tS))

Evaluating RMSE of algorithm SVDpp on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.3198  1.2757  1.3125  1.2896  1.2851  1.2966  0.0168  
Fit time          1.17    1.20    1.20    1.17    1.17    1.18    0.02    
Test time         0.02    0.02    0.02    0.02    0.02    0.02    0.00    
Time Elapsed [SVD++]: 6.113745450973511


In [47]:
# slope one algorithm
algoSOsm = SlopeOne()

# start
tS = time.time()

# Cross-validate on the small dataset. Use the shared seeded splitter `kf`
# (instead of cv=5, which builds a fresh unseeded splitter) so that every
# algorithm is scored on the same folds.
resSOsm = cross_validate(algoSOsm, dataSmall, measures=['RMSE'], cv=kf, verbose=True)

# measure overall time
tE = time.time()
print("Time Elapsed [SlopeOne]: {}".format(tE-tS))

Evaluating RMSE of algorithm SlopeOne on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.3116  1.3011  1.3100  1.3107  1.3022  1.3071  0.0045  
Fit time          0.10    0.09    0.10    0.10    0.10    0.10    0.00    
Test time         0.02    0.02    0.02    0.02    0.02    0.02    0.00    
Time Elapsed [SlopeOne]: 5.866800785064697


In [50]:
# baseline algorithm
algo3sm = BaselineOnly()

# start
tS = time.time()

# Cross-validate on the small dataset. Use the shared seeded splitter `kf`
# (instead of cv=5, which builds a fresh unseeded splitter) so that every
# algorithm is scored on the same folds.
resBASEsm = cross_validate(algo3sm, dataSmall, measures=['RMSE'], cv=kf, verbose=True)

# measure overall time
tE = time.time()
print("Time Elapsed [Baseline]: {}".format(tE-tS))

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Evaluating RMSE of algorithm BaselineOnly on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.2740  1.2600  1.2795  1.2733  1.2602  1.2694  0.0079  
Fit time          0.04    0.04    0.04    0.04    0.05    0.04    0.01    
Test time         0.01    0.01    0.02    0.01    0.02    0.02    0.00    
Time Elapsed [Baseline]: 0.3749959468841553
