# SVD++

In [1]:
import pandas as pd
import numpy as np
from surprise import Dataset, Reader, SVDpp, accuracy
from surprise.model_selection import GridSearchCV, train_test_split
import matplotlib.pyplot as plt
from collections import defaultdict
from typing import Tuple, Callable
import matplotlib.pyplot as plt
import os
import pickle


# Set random seed for reproducibility
np.random.seed(42)

DATA_DIR = '../../Data/'
train_path = os.path.join(DATA_DIR, 'train_ratings.csv')
tbr_path = os.path.join(DATA_DIR, 'train_tbr.csv')
submission_path = os.path.join(DATA_DIR, 'sample_submission.csv')
output_path = os.path.join(DATA_DIR, 'sample_submissionSVDplus.csv')

In [2]:
# Load data
def load_data(train_path):
    """
    Read the ratings CSV and expose the composite key as integer columns.

    The file is expected to contain a ``sid_pid`` column of the form
    ``"<sid>_<pid>"``. That column is split on ``"_"`` into integer ``sid`` and
    ``pid`` columns (appended after the existing columns) and then removed.

    Parameters
    ----------
    train_path : path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame with the original non-key columns followed by ``sid``
    and ``pid``.
    """
    frame = pd.read_csv(train_path)
    # Split once and cast both halves together before attaching them.
    frame[["sid", "pid"]] = frame["sid_pid"].str.split("_", expand=True).astype(int)
    return frame.drop(columns=["sid_pid"])


def read_data_df(
    ratings_path: str = './Data/train_ratings.csv',
    tbr_path: str = './Data/train_tbr.csv',
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Read the explicit-ratings CSV and the implicit (tbr) CSV.

    Parameters
    ----------
    ratings_path : path of the ratings CSV; must contain a ``sid_pid`` column
        of the form ``"<sid>_<pid>"``. Defaults to the location that used to
        be hard-coded in this function.
    tbr_path : path of the tbr CSV; must contain ``sid`` and ``pid`` columns.
        Defaults to the location that used to be hard-coded in this function.

    Returns
    -------
    (train_ratings, train_tbr) : the ratings frame with ``sid_pid`` replaced by
        integer ``sid``/``pid`` columns, and the tbr frame with ``sid``/``pid``
        cast to int.
    """
    train_ratings = pd.read_csv(ratings_path)
    train_tbr = pd.read_csv(tbr_path)

    # Split sid_pid into sid and pid columns
    train_ratings[["sid", "pid"]] = train_ratings["sid_pid"].str.split("_", expand=True)
    train_ratings = train_ratings.drop("sid_pid", axis=1)
    train_ratings["sid"] = train_ratings["sid"].astype(int)
    train_ratings["pid"] = train_ratings["pid"].astype(int)

    train_tbr["sid"] = train_tbr["sid"].astype(int)
    train_tbr["pid"] = train_tbr["pid"].astype(int)

    return train_ratings, train_tbr

def build_dataset(ratings, rating_scale=(1,5)):
    """
    Wrap a ratings DataFrame in a Surprise ``Dataset``.

    Only the ``sid``, ``pid`` and ``rating`` columns are passed on, in that
    order, together with a ``Reader`` configured for ``rating_scale``.
    """
    scale_reader = Reader(rating_scale=rating_scale)
    wanted_columns = ["sid", "pid", "rating"]
    return Dataset.load_from_df(ratings[wanted_columns], scale_reader)


def generate_submission(algo, dataset, submission_path, output_path):
    """
    Train ``algo`` on every rating in ``dataset`` and write a submission CSV.

    The file at ``submission_path`` supplies the ``sid_pid`` keys to predict
    (its existing ``rating`` column is handed to ``algo.test`` as the third
    element of each test triple). The predicted estimates replace that column
    and the ``sid_pid``/``rating`` pair is written to ``output_path`` without
    the index.
    """
    algo.fit(dataset.build_full_trainset())

    submission = pd.read_csv(submission_path)
    key_parts = submission["sid_pid"].str.split("_", expand=True).astype(int)
    submission[["sid", "pid"]] = key_parts
    triples = list(zip(submission["sid"], submission["pid"], submission["rating"]))

    # algo.test returns one prediction per triple, in the same order.
    submission["rating"] = [prediction.est for prediction in algo.test(triples)]
    submission[["sid_pid", "rating"]].to_csv(output_path, index=False)
    print(f"Submission saved to {output_path}")

In [3]:
# Prepare the data for Surprise: load the explicit ratings from train_path and
# wrap them in a Surprise Dataset (build_dataset defaults to a 1-5 scale).
ratings = load_data(train_path)
data = build_dataset(ratings)

# Split the data for evaluation: 20% of the ratings are held out and
# random_state pins the split.
# NOTE(review): trainset/testset are not referenced by the later cells visible
# in this notebook — confirm whether an evaluation cell is missing.
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

In [4]:
def run_search(estimator_class, param_grid, data, cv=5, n_jobs=-1, search_type="grid", n_iter=50, random_state=42):
    """
    Run a cross-validated hyperparameter search (RMSE) and persist it to disk.

    Parameters
    ----------
    estimator_class : Surprise algorithm class to tune (e.g. ``SVDpp``).
    param_grid : dict mapping parameter names to candidate values. Used as the
        grid for ``search_type="grid"`` and as the sampling space for
        ``search_type="random"``.
    data : dataset handed to the search's ``fit``.
    cv : cross-validation setting forwarded to the search.
    n_jobs : parallelism setting forwarded to the search.
    search_type : ``"grid"`` or ``"random"``.
    n_iter : number of sampled combinations (random search only).
    random_state : seed for the sampling (random search only).

    Returns
    -------
    (search, results) : the fitted search object and a DataFrame built from
        its ``cv_results``.

    Raises
    ------
    ValueError : if ``search_type`` is neither ``"grid"`` nor ``"random"``.

    Side effects
    ------------
    Pickles the fitted search object to ``DATA_DIR/svdpp_model.pkl``.
    """
    if search_type == "grid":
        search = GridSearchCV(estimator_class, param_grid, measures=["rmse"], cv=cv, n_jobs=n_jobs)
    elif search_type == "random":
        # The notebook's import cell only pulls in GridSearchCV; import the
        # randomized variant here so this branch does not raise NameError.
        from surprise.model_selection import RandomizedSearchCV

        search = RandomizedSearchCV(
            estimator_class,
            param_grid,
            measures=["rmse"],
            n_iter=n_iter,
            cv=cv,
            random_state=random_state,
            n_jobs=n_jobs,
        )
    else:
        raise ValueError("search_type must be 'grid' or 'random'")

    # fit() stores its results on the search object and does not return it, so
    # the search itself (best params/scores, cv_results, best_estimator) is
    # what gets persisted below.
    search.fit(data)

    # === Save the fitted SVD++ search to disk ===
    model_path = os.path.join(DATA_DIR, 'svdpp_model.pkl')

    with open(model_path, 'wb') as f:
        pickle.dump(search, f)

    print(f"SVDpp model saved to {model_path}")
    results = pd.DataFrame(search.cv_results)
    return search, results

In [34]:
# Hyperparameter search
# Define parameter grid for grid search.
# Size: 11 (n_factors) x 10 (n_epochs) x 4 (lr_all) x 7 (reg_all) = 3080
# combinations; with run_search's default cv=5 that is 15400 SVD++ fits, each
# printing one line per epoch because verbose is True.
param_grid = {
    'n_factors': [24,27,30,35,40,50,55,60,65,70,200], 
    'n_epochs': [40,50,55,60,65,70,75,80,90,200], 
    'lr_all': [0.0001,0.002,0.003910657605171608,0.005], 
    'reg_all': [0.02,0.03,0.04,0.04309994347823631,0.05,0.055,0.06], 
    'verbose': [True]
}
               
# Perform grid search (run_search also pickles its result under DATA_DIR)
gs, results = run_search(SVDpp, param_grid, data, search_type='grid')

# Print best parameters and the corresponding cross-validated RMSE
best_params = gs.best_params['rmse']
print(f"Best parameters: {best_params}")
print(f"Best RMSE: {gs.best_score['rmse']}")

 processing epoch 0
 processing epoch 0
 processing epoch 0
 processing epoch 0
 processing epoch 0
 processing epoch 1
 processing epoch 1
 processing epoch 1
 processing epoch 1
 processing epoch 1
 processing epoch 2
 processing epoch 2
 processing epoch 2
 processing epoch 2
 processing epoch 2
 processing epoch 3
 processing epoch 3
 processing epoch 3
 processing epoch 3
 processing epoch 3
 processing epoch 4
 processing epoch 4
 processing epoch 4
 processing epoch 4
 processing epoch 4
 processing epoch 5
 processing epoch 5
 processing epoch 5
 processing epoch 5
 processing epoch 5
 processing epoch 6
 processing epoch 6
 processing epoch 6
 processing epoch 6
 processing epoch 6
 processing epoch 7
 processing epoch 7
 processing epoch 7
 processing epoch 7
 processing epoch 7
 processing epoch 8
 processing epoch 8
 processing epoch 8
 processing epoch 8
 processing epoch 8
 processing epoch 9
 processing epoch 9
 processing epoch 9
 processing epoch 9
 processing epoch 9


In [16]:
# Configure (not yet train) an SVD++ model with fixed hyperparameters; the
# fit happens inside generate_submission.
# NOTE(review): these values are hard-coded rather than read from best_params;
# presumably they were copied from an earlier search run — confirm.
best_svdpp = SVDpp(
    n_factors=65,
    n_epochs=70,
    lr_all=0.003910657605171608,
    reg_all=0.055,
    verbose=True
)

In [17]:
# Best model from the first part: fit best_svdpp on the full explicit-ratings
# dataset and write its predictions for the sample submission to output_path.
generate_submission(best_svdpp, data, submission_path, output_path)

 processing epoch 0
 processing epoch 1
 processing epoch 2
 processing epoch 3
 processing epoch 4
 processing epoch 5
 processing epoch 6
 processing epoch 7
 processing epoch 8
 processing epoch 9
 processing epoch 10
 processing epoch 11
 processing epoch 12
 processing epoch 13
 processing epoch 14
 processing epoch 15
 processing epoch 16
 processing epoch 17
 processing epoch 18
 processing epoch 19
 processing epoch 20
 processing epoch 21
 processing epoch 22
 processing epoch 23
 processing epoch 24
 processing epoch 25
 processing epoch 26
 processing epoch 27
 processing epoch 28
 processing epoch 29
 processing epoch 30
 processing epoch 31
 processing epoch 32
 processing epoch 33
 processing epoch 34
 processing epoch 35
 processing epoch 36
 processing epoch 37
 processing epoch 38
 processing epoch 39
 processing epoch 40
 processing epoch 41
 processing epoch 42
 processing epoch 43
 processing epoch 44
 processing epoch 45
 processing epoch 46
 processing epoch 47
 p

## Adding Implicit Ratings

In [20]:
def load_tbr_data(train_path):
    """
    Read a CSV of (sid, pid) pairs and cast both id columns to int.

    Parameters
    ----------
    train_path : path of the CSV file to read; it must contain ``sid`` and
        ``pid`` columns.

    Returns
    -------
    pandas.DataFrame with the file's columns, ``sid`` and ``pid`` as integers.
    """
    pairs = pd.read_csv(train_path)
    return pairs.astype({"sid": int, "pid": int})


def impute_weighted_wide(mat: pd.DataFrame, implicit_df: pd.DataFrame, row_weight: float = 1, col_weight: float = 0) -> pd.DataFrame:
    """
    Fill the cells named by ``implicit_df`` with a weighted row/column mean.

    Parameters
    ----------
    mat : pivoted ratings DataFrame, index=sid, columns=pid; missing ratings
        are NaN.
    implicit_df : DataFrame with ``sid`` and ``pid`` columns naming the cells
        to fill.
    row_weight : weight applied to the NaN-ignoring mean of the cell's row.
    col_weight : weight applied to the NaN-ignoring mean of the cell's column.

    Returns
    -------
    A new DataFrame with the same index and columns as ``mat`` in which every
    listed cell is set to ``row_weight * row_mean + col_weight * col_mean``.
    Means are computed once from the original ``mat``. Pairs whose sid or pid
    is not present in ``mat`` are skipped. ``mat`` itself is not modified.
    """

    # get the raw numpy matrix
    A = mat.values.copy()
    # compute row-means / column-means once, before any cell is filled
    rmean = np.nanmean(A, axis=1)
    cmean = np.nanmean(A, axis=0)

    # turn sids/pids into positional indices
    ridx = mat.index.get_indexer(implicit_df["sid"])
    cidx = mat.columns.get_indexer(implicit_df["pid"])

    # get_indexer reports labels missing from the matrix as -1, which NumPy
    # would read as "last row/column" and silently overwrite the wrong cell.
    # Keep only pairs where both labels exist.
    known = (ridx >= 0) & (cidx >= 0)
    ridx = ridx[known]
    cidx = cidx[known]

    # vectorized compute of all new entries
    vals = row_weight * rmean[ridx] + col_weight * cmean[cidx]
    A[ridx, cidx] = vals

    return pd.DataFrame(A, index=mat.index, columns=mat.columns)

In [21]:
# Load the explicit ratings and the implicit (tbr) sid/pid pairs
ratings = load_data(train_path)
implicit_data = load_tbr_data(tbr_path)

# Wide sid x pid matrix of explicit ratings; unrated cells are NaN
mat = ratings.pivot(index="sid", columns="pid", values="rating")

# Long-format copy of the explicit ratings
ratings_data = ratings[['sid', 'pid', 'rating']].copy()


# Mark implicit data: same structure, rating = np.nan (or ignored)
implicit_data = implicit_data[['sid', 'pid']].copy()
implicit_data['rating'] = np.nan  # Placeholder — used only to signal interaction

# NOTE(review): full_data is not consumed by the cells visible in this
# notebook; kept in case a later cell relies on it.
full_data = pd.concat([ratings_data, implicit_data], ignore_index=True)

# Impute values for the implicit pairs from weighted row/column means
imputed_data = impute_weighted_wide(mat, implicit_data, row_weight=0.75, col_weight=0.25)
# Back to long format. Only rated or imputed cells may reach Surprise, so drop
# NaN explicitly instead of relying on stack()'s legacy default of discarding
# them. The rename only matters if the index/column names were lost.
imputed_data = (
    imputed_data.stack()
    .dropna()
    .reset_index(name="rating")
    .rename(columns={"level_0":"sid", "level_1":"pid"})
)

# Build Dataset
dataset = build_dataset(imputed_data)

In [23]:
# Fresh SVD++ model for the imputed dataset, using the same hyperparameters as
# the explicit-only run above.
model = SVDpp(n_epochs=70, n_factors=65, lr_all=0.003910657605171608, reg_all=0.055, verbose= True)
# Pass the model created in this cell; the original passed best_svdpp, which
# left `model` unused and made the cell depend on an earlier cell's state.
# NOTE(review): this writes to the same output_path as the explicit-only
# submission and therefore overwrites it — confirm that is intended.
generate_submission(model, dataset, submission_path, output_path)

 processing epoch 0
 processing epoch 1
 processing epoch 2
 processing epoch 3
 processing epoch 4
 processing epoch 5
 processing epoch 6
 processing epoch 7
 processing epoch 8
 processing epoch 9
 processing epoch 10
 processing epoch 11
 processing epoch 12
 processing epoch 13
 processing epoch 14
 processing epoch 15
 processing epoch 16
 processing epoch 17
 processing epoch 18
 processing epoch 19
 processing epoch 20
 processing epoch 21
 processing epoch 22
 processing epoch 23
 processing epoch 24
 processing epoch 25
 processing epoch 26
 processing epoch 27
 processing epoch 28
 processing epoch 29
 processing epoch 30
 processing epoch 31
 processing epoch 32
 processing epoch 33
 processing epoch 34
 processing epoch 35
 processing epoch 36
 processing epoch 37
 processing epoch 38
 processing epoch 39
 processing epoch 40
 processing epoch 41
 processing epoch 42
 processing epoch 43
 processing epoch 44
 processing epoch 45
 processing epoch 46
 processing epoch 47
 p