In [1]:
import surprise

In [2]:
import tensorflow as tf

In [66]:
from surprise import KNNWithMeans
from surprise import Dataset
from surprise.model_selection import GridSearchCV
from surprise.model_selection import cross_validate, train_test_split
import zipfile
from surprise import Reader, Dataset, SVD
from surprise import accuracy
import random
from random import randint

<h4> Read in data

In [41]:
# Read data into an array of strings
with open('./ml-100k/u.data') as f:
    all_lines = f.readlines()

# Prepare the data to be used in Surprise
reader_other = Reader(line_format='user item rating timestamp', sep='\t')
data_other = Dataset.load_from_file('./ml-100k/u.data', reader=reader_other)

In [5]:
def strip_content(data):
    return r_unwanted.sub(", ", data)

In [6]:
def create_dataframe(data):
    df_data = pd.DataFrame(data)

    df_data[0] = df_data[0].apply(strip_content)

    foo = lambda x: pd.Series([i for i in (x.split(','))])

    df_final = df_data[0].apply(foo)

    df_final.rename(columns={0:'userID', 1:'movieID', 2:'rating', 3: 'timestamp'}, 
                     inplace=True)

    df_final = df_final.drop(columns=[4])
    
    return df_final

In [549]:
df_final = create_dataframe(all_lines)

In [144]:
# sim_options = {
#     "name": ["msd", "cosine"],
#     "min_support": [3, 4, 5],
#     "user_based": [False, True],
# }

# param_grid = {"sim_options": sim_options}

# gs = GridSearchCV(KNNWithMeans, param_grid, measures=["rmse", "mae"], cv=3)
# gs.fit(data)

# print(gs.best_score["rmse"])
# print(gs.best_params["rmse"])

<h4> Grid search for best params </h4>

In [10]:
# data

In [11]:
# param_grid = {
#     "n_epochs": [5, 10],
#     "lr_all": [0.002, 0.005],
#     "reg_all": [0.4, 0.6]
# }
# gs = GridSearchCV(SVD, param_grid, measures=["rmse", "mae"], cv=3)

# gs.fit(data)

# print(gs.best_score["rmse"])
# print(gs.best_params["rmse"])

<h4> SVD algo chosen, with best params obtained from GS

In [7]:
algo = SVD(n_epochs=10, lr_all=0.005, reg_all=0.4)

<h4>Cross validation, 5 folds

In [12]:
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9558  0.9624  0.9699  0.9590  0.9600  0.9614  0.0047  
MAE (testset)     0.7654  0.7715  0.7773  0.7686  0.7693  0.7704  0.0040  
Fit time          4.64    3.77    4.87    4.72    4.45    4.49    0.39    
Test time         0.18    0.24    0.25    0.34    0.16    0.24    0.07    


{'test_rmse': array([0.95582906, 0.96236882, 0.96985865, 0.95903964, 0.95997643]),
 'test_mae': array([0.76539136, 0.77151636, 0.77731087, 0.76855886, 0.7693242 ]),
 'fit_time': (4.642159700393677,
  3.7680797576904297,
  4.870599031448364,
  4.724268674850464,
  4.45410943031311),
 'test_time': (0.17978334426879883,
  0.2381908893585205,
  0.2549247741699219,
  0.3442046642303467,
  0.1583545207977295)}

<h4>Create train and test set, apply predictions on test set

In [8]:
def RMSE_predict_train_test(data):
    # sample random trainset and testset
    # test set is made of 25% of the ratings.
    trainset, testset = train_test_split(data, test_size=.25)

    # We'll use the famous SVD algorithm.
    algo = SVD(n_epochs=10, lr_all=0.005, reg_all=0.4)

    # Train the algorithm on the trainset, and predict ratings for the testset
    algo.fit(trainset)
    predictions = algo.test(testset)

    # Then compute RMSE
    return accuracy.rmse(predictions)

<h4> Training Time
   

In [9]:

import timeit

def training_time(data):
    start = timeit.default_timer()

    trainset = data.build_full_trainset()

    algo.fit(trainset)


    stop = timeit.default_timer()

    print('Time: ', stop - start)  

In [10]:
training_time(data_other)

Time:  2.2829717


<h4>Predict scores for all users for all movies. Test the speed of the predictions.

In [11]:
def predict_scores(data):
    # Build Training set. Needed to fit to create model.
    trainset = data.build_full_trainset()
    
    algo.fit(trainset)
    
    # Get all the user and item IDs
    user_ids = trainset.all_users()
    item_ids = trainset.all_items()

    # Create empty list to store predictions
    ratings = []
    
    # For loop, estimate rating of each user for every movie.
    for user_id in user_ids:
        for item_id in item_ids:
            
            prediction = algo.predict(str(user_id), str(item_id)).est
#             print(prediction)
            ratings.append(prediction)
            
    return ratings

<h4> Time to predict all ratings

In [12]:
import timeit

start = timeit.default_timer()


ratings = predict_scores(data_other)
    
stop = timeit.default_timer()

print('Time: ', stop - start)  

Time:  11.7596633


<h4> Create Dataframe

In [511]:
def predict_scores_df(data, df):
    # Build Training set. Needed to fit to create model.
    trainset = data.build_full_trainset()
    
    algo.fit(trainset)
    
    # Get all the user and item IDs
    user_ids = trainset.all_users()
    item_ids = trainset.all_items()

    # Create empty list to store predictions
    ratings = {}
    ratings_list = []
    # For loop, estimate rating of each user for every movie.
    for user_id in user_ids:
        for item_id in item_ids:
            
            prediction = algo.predict(str(user_id), str(item_id)).est
#             print(prediction)
            ratings['userID'] = str(user_id)
            ratings['movieID'] = str(item_id)
            ratings['rating'] = prediction
            
            ratings_list.append(ratings)
            
            ratings = {}
            
    return ratings_list

In [515]:
import timeit

start = timeit.default_timer()

predicted_scores_dict =  predict_scores_df(data, df_final)
    
stop = timeit.default_timer()

print('Time: ', stop - start)  


Time:  10.704530599999998


<h4> Dataframe with labels

In [519]:
prediced_scores_dict = predict_scores_df(data, df_final)

In [520]:
df_ratings_all_users = df_final.append(predicted_scores_dict)

Unnamed: 0,userID,movieID,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
...,...,...,...,...
1586121,942,1677,3.8956,
1586122,942,1678,3.82475,
1586123,942,1679,3.93498,
1586124,942,1680,3.85935,


In [561]:
import random

In [61]:
# def create_data(data):
#     trainset = data.build_full_trainset()
    
#     user_ids = trainset.all_users()
    
#     data_list = []
#     for user_id in user_ids:
#         for movie in range(item_ids[-1]+1, item_ids[-1]*100):

#             new_data = {}

#             new_data['userID'] = str(user_id)
#             new_data['movieID'] = str(movie)
#             new_data['rating'] = random.randint(1,5)

#             data_list.append(new_data)
#     return data_list

In [62]:
# def create_data_other(data):
#     trainset = data.build_full_trainset()

#     user_ids = trainset.all_users()
#     item_ids = trainset.all_items()

#     data_list = []
#     for user_id in user_ids:
#         for movie in range(item_ids[-1]+1, item_ids[-1]*60):

#             new_data = str(user_id)+'\t'+str(movie)+'\t'+str(random.randint(1,5))+'\t'+'NaN\n'

#             data_list.append(new_data)
#     return data_list

<h4> Create extended dataset

In [56]:
def create_data_other(data):
    trainset = data.build_full_trainset()

    user_ids = trainset.all_users()
    item_ids = trainset.all_items()

    data_list = []
    # Create new movies (168200 in total)
    for movie in range(item_ids[-1]+1, item_ids[-1]*100):
        # For every movie, there will be 100 users rating the movie
        user_generated = [randint(0, user_ids[-1]) for p in range(0, 100)]
        for user in user_generated:
            # Create a random generated score for the movies
            new_data = str(user)+'\t'+str(movie)+'\t'+str(random.randint(1,5))+'\t'+'NaN\n'

            data_list.append(new_data)
    return data_list

In [58]:
data_newest = create_data_other(data_other)

Create the new data (as a list)

In [59]:
new_data = []

new_data.extend(all_lines)
new_data.extend(data_newest)

Check difference in sizes


In [60]:
len(data_newest)/len(all_lines)

166.418

Save file on filesystem

In [67]:
# # Read data into an array of strings
with open('./new_data.data') as f:
    all_lines = f.readlines()

Load data

In [69]:

# Prepare the data to be used in Surprise
reader = Reader(line_format='user item rating timestamp', sep='\t')
data = Dataset.load_from_file('./new_data.data', reader=reader)

Calculate prediction time

In [None]:
import timeit

start = timeit.default_timer()


ratings = predict_scores(data)
    
stop = timeit.default_timer()

print('Time: ', stop - start)  