# Model Building:

## Singular Value Decomposition ++ (SVD++):

### Import necessary libraries

In [1]:
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split

INTERIM_DATA_DIR = "../data/interim/"

### Load the dataset

In [5]:
from surprise import SVDpp
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate


reader = Reader(line_format="user item rating timestamp", sep=',', skip_lines=1, rating_scale=(0, 1))
data = Dataset.load_from_file(f'{INTERIM_DATA_DIR}merged.csv', reader=reader)

# Define the SVD++ algorithm
algo = SVDpp()

# Perform cross-validation to evaluate the algorithm
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

# To make predictions for a single user, you can train on the whole dataset and predict
trainset = data.build_full_trainset()
algo.fit(trainset)

# Predict a rating for a single user and item
user_id = 1  # Replace with the actual user ID
item_id = 1  # Replace with the actual item ID
predicted = algo.predict(user_id, item_id)

print(predicted.est)


Evaluating RMSE, MAE of algorithm SVDpp on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.1896  0.1891  0.1881  0.1893  0.1901  0.1892  0.0007  
MAE (testset)     0.1497  0.1494  0.1492  0.1493  0.1504  0.1496  0.0004  
Fit time          6.53    6.48    6.51    6.53    6.51    6.52    0.02    
Test time         1.62    1.59    1.60    1.63    1.59    1.60    0.02    
0.5059785978597859


In [7]:
from collections import defaultdict

def get_top_n_recommendations(predictions, n=10):
    # Map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the n highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

# First, train the SVDpp algorithm on the full dataset
trainset = data.build_full_trainset()
algo = SVDpp()
algo.fit(trainset)

# Then predict ratings for all pairs (u, i) that are NOT in the training set
testset = trainset.build_anti_testset()
predictions = algo.test(testset)

# Get top n recommendations for each user
top_n = get_top_n_recommendations(predictions, n=5)

# Print the recommended items for each user
for uid, user_ratings in top_n.items():
    print(uid, [iid for (iid, _) in user_ratings])


196 ['114', '408', '483', '318', '178']
305 ['114', '641', '657', '1524', '659']
6 ['114', '320', '1449', '603', '657']
234 ['408', '169', '114', '320', '272']
63 ['12', '318', '114', '64', '1449']
181 ['114', '64', '318', '483', '178']
201 ['114', '169', '1449', '178', '657']
249 ['657', '178', '1524', '134', '320']
13 ['408', '169', '513', '1449', '251']
279 ['483', '641', '318', '178', '1449']
145 ['114', '318', '408', '483', '169']
90 ['169', '408', '114', '251', '50']
271 ['408', '1449', '483', '114', '923']
18 ['114', '1449', '511', '251', '1143']
1 ['408', '318', '513', '483', '357']
207 ['169', '408', '178', '1449', '603']
14 ['64', '114', '318', '1449', '483']
113 ['1449', '64', '169', '1194', '641']
123 ['1449', '408', '169', '641', '114']
296 ['408', '178', '318', '169', '613']
154 ['408', '169', '1449', '64', '483']
270 ['963', '114', '318', '178', '1103']
240 ['408', '1449', '189', '169', '641']
144 ['408', '114', '483', '178', '513']
21 ['114', '64', '169', '641', '483']
