In [34]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error as mae
import copy
from scipy.sparse.linalg.dsolve import linsolve
from scipy.sparse.linalg import cg
import matplotlib.pyplot as plt
import random
random.seed(42)
np.random.seed(42)
import warnings
warnings.filterwarnings("ignore")

In [27]:
columns = ['user_id', 'movie_id', 'rating', 'timestamp']
def makeDataset(fold):
    base = pd.read_csv("/content/drive/MyDrive/CFAssignmentsDataset/u" + str(fold) + ".base", sep = "\t", names = columns, encoding = "latin-1")
    test = pd.read_csv("/content/drive/MyDrive/CFAssignmentsDataset/u" + str(fold) + ".test", sep = "\t", names = columns, encoding = "latin-1")
    
    return base, test

def makeYandRMatrix(data):
    totalUsers = len(data['user_id'].unique())
    totalMovies = 1682
    Y = np.zeros([totalUsers+1, totalMovies+1])
    R = np.zeros([totalUsers+1, totalMovies+1])

    for i in range(data.shape[0]):
        userId = data['user_id'][i]
        movieId = data['movie_id'][i]
        curRating = data['rating'][i]

        Y[userId][movieId] = curRating 
        R[userId][movieId] = 1

    return Y, R

def uandVInitialization(f = 10, initType = "random", totalUsers = 943, totalMovies = 1682):
    if initType == "zero":
        U = np.zeros([totalUsers+1, f])
        V = np.zeros([f, totalMovies+1])

    elif initType == "random":
        U = np.random.uniform(size=(totalUsers+1, f), low=0, high=1)
        V = np.random.uniform(size=(f, totalMovies+1), low=0, high=1)

    return U, V


def train(Y, R, U, V, t, max_iters = 10, method = "als-inv"):
    nmaes = []
    n = U.shape[1]
    lmbd = 0.6*np.identity(n)
    X = np.random.uniform(size=(944, 1683), low=1, high=5)

    for i in range(max_iters):
        B = X + Y - np.multiply(R, np.matmul(U, V))
        
        if (method == "als-inv"): # PseudoInverse + regularization
            V = np.matmul(np.matmul(np.linalg.inv((np.matmul(U.T, U) + lmbd)), U.T), B)
            U = (np.matmul(np.matmul(np.linalg.inv(np.matmul(V, V.T) + lmbd), V), B.T)).T
        elif (method == "als-ds"): # Direct Solver + regularization
            V = linsolve.spsolve(np.matmul(U.T, U) + lmbd, np.matmul(U.T, B))
            U = linsolve.spsolve(np.matmul(V, V.T) + lmbd, np.matmul(V, B.T)).T
        elif (method == "als-cg"): # CG + regularization
            for j in range(1683):
                V[: , j], c1 = cg(np.matmul(U.T, U) + lmbd, np.matmul(U.T, B[:, j]))
            for j in range(944):
                u_, c2 = cg(np.matmul(V, V.T) + lmbd, np.matmul(V, B.T[:, j]))
                U[j, :] = u_
        
        X = np.matmul(U, V)
        rPred = predictRating(U, V, t)
        nmaes.append(nmae(t, rPred))
           
    return U, V, nmaes

def predictRating(U, V, test):
    ratingsPred = []
    X = np.matmul(U, V)

    for i in range(test.shape[0]):
        userId = test['user_id'][i]
        movieId = test['movie_id'][i]

        ratingPredicted = X[userId][movieId]
        ratingsPred.append(round(ratingPredicted))

    return ratingsPred

def nmae(test, ratingsPred):
    inds = np.nonzero(ratingsPred)
    ratingsTrue = np.array(test['rating'])[inds]
    ratingsPred = np.array(ratingsPred)[inds]
    error = mae(ratingsTrue, ratingsPred)/4
    return error


In [28]:
nmae_folds_cg_reg = []
for f in range(1, 6):
    nmae_folds_cg_reg.append([0.0]*2)
    b, t = makeDataset(f)
    Y, R = makeYandRMatrix(b)
    for k in range(5, 11, 5):
        U, V = uandVInitialization(f = k)
        U, V, nmaes = train(Y, R, U, V, t, 100, method="als-cg")
        nmae_folds_cg_reg[f-1][(k//5)-1] = min(nmaes)
        print("Fold:", f, "Factors:", k, "NMAE:", min(nmaes))
    print()

nmae_folds_cg_reg = np.array(nmae_folds_cg_reg).T
for k in range(2):
    print("Average for k:", (k+1)*5, nmae_folds_cg_reg[k].mean())


Fold: 1 Factors: 5 NMAE: 0.17336853167851066
Fold: 1 Factors: 10 NMAE: 0.18215627347858754

Fold: 2 Factors: 5 NMAE: 0.17152690863579476
Fold: 2 Factors: 10 NMAE: 0.17887542559583416

Fold: 3 Factors: 5 NMAE: 0.17108358358358358
Fold: 3 Factors: 10 NMAE: 0.1775940564338603

Fold: 4 Factors: 5 NMAE: 0.17175696633148232
Fold: 4 Factors: 10 NMAE: 0.17879287928792878

Fold: 5 Factors: 5 NMAE: 0.17250737905848215
Fold: 5 Factors: 10 NMAE: 0.17917542279595716

Average for k: 5 0.17204867385757067
Average for k: 10 0.17931881151843357


In [29]:
nmae_folds_inv_reg = []
for f in range(1, 6):
    nmae_folds_inv_reg.append([0.0]*2)
    b, t = makeDataset(f)
    Y, R = makeYandRMatrix(b)
    for k in range(5, 11, 5):
        U, V = uandVInitialization(f = k)
        U, V, nmaes = train(Y, R, U, V, t, 100, method="als-inv")  
        nmae_folds_inv_reg[f-1][(k//5)-1] = min(nmaes)
        print("Fold:", f, "Factors:", k, "NMAE:", min(nmaes))
    print()

nmae_folds_inv_reg = np.array(nmae_folds_inv_reg).T
for k in range(2):
    print("Average for k:", (k+1)*5, nmae_folds_inv_reg[k].mean())


Fold: 1 Factors: 5 NMAE: 0.17616223790221688
Fold: 1 Factors: 10 NMAE: 0.18333083120652555

Fold: 2 Factors: 5 NMAE: 0.1724176759083175
Fold: 2 Factors: 10 NMAE: 0.18048670572329878

Fold: 3 Factors: 5 NMAE: 0.1693701812356063
Fold: 3 Factors: 10 NMAE: 0.176494871153365

Fold: 4 Factors: 5 NMAE: 0.17237368684342172
Fold: 4 Factors: 10 NMAE: 0.1779781858207835

Fold: 5 Factors: 5 NMAE: 0.1731962373661563
Fold: 5 Factors: 10 NMAE: 0.18036518259129564

Average for k: 5 0.17270400385114373
Average for k: 10 0.1797311552990537


In [33]:
nmae_folds_ds_reg = []
for f in range(1, 6):
    nmae_folds_ds_reg.append([0.0]*2)
    b, t = makeDataset(f)
    Y, R = makeYandRMatrix(b)
    for k in range(5, 11, 5):
        U, V = uandVInitialization(f = k)
        U, V, nmaes = train(Y, R, U, V, t, 100, method="als-ds")  
        nmae_folds_ds_reg[f-1][(k//5)-1] = min(nmaes)
        print("Fold:", f, "Factors:", k, "NMAE:", min(nmaes))
    print()

nmae_folds_ds_reg = np.array(nmae_folds_ds_reg).T
for k in range(2):
    print("Average for k:", (k+1)*5, nmae_folds_ds_reg[k].mean())

Fold: 1 Factors: 5 NMAE: 0.17420944661262883
Fold: 1 Factors: 10 NMAE: 0.18308042868589744

Fold: 2 Factors: 5 NMAE: 0.17284677015523284
Fold: 2 Factors: 10 NMAE: 0.1806848903574647

Fold: 3 Factors: 5 NMAE: 0.17138781842750614
Fold: 3 Factors: 10 NMAE: 0.1781602002503129

Fold: 4 Factors: 5 NMAE: 0.17091964375062543
Fold: 4 Factors: 10 NMAE: 0.17655593356013607

Fold: 5 Factors: 5 NMAE: 0.17257943457593194
Fold: 5 Factors: 10 NMAE: 0.17862681340670336

Average for k: 5 0.17238862270438504
Average for k: 10 0.1794216532521029
