Dependencies

In [1]:
import pandas as pd
import numpy as np
from numpy.linalg import norm
from sklearn.metrics import mean_absolute_error as mae

In [2]:
columns = ['user_id', 'movie_id', 'rating', 'timestamp']
def makeDataset(fold):
    base = pd.read_csv("/content/drive/MyDrive/CFAssignmentsDataset/u" + str(fold) + ".base", sep = "\t", names = columns, encoding = "latin-1")
    test = pd.read_csv("/content/drive/MyDrive/CFAssignmentsDataset/u" + str(fold) + ".test", sep = "\t", names = columns, encoding = "latin-1")
    
    return base, test

def makeUbyMMatrix(data):
    totalUsers = len(data['user_id'].unique())
    totalMovies = 1682
    train = np.zeros([totalUsers+1, totalMovies+1])

    for i in range(data.shape[0]):
        userId = data['user_id'][i]
        movieId = data['movie_id'][i]
        curRating = data['rating'][i]

        train[userId][movieId] = curRating 

    return train

# User Based Recommendation

In [171]:
def getUMean(uByMMatrix):
    avgMatrix = [0] * uByMMatrix.shape[0]

    for i in range(1, uByMMatrix.shape[0]):
        curRow = uByMMatrix[i]
        summ = np.sum(curRow)
        totalCnt = np.count_nonzero(curRow)
        avgMatrix[i] = summ/totalCnt

    return avgMatrix

def makeSimilarityMatrixUserBased(uByMMatrix):
    totalUsers = uByMMatrix.shape[0]
    simMatrix = np.zeros([totalUsers, totalUsers])
    for i in range(1, totalUsers):
        for j in range(i+1, totalUsers):
            A = uByMMatrix[i]
            B = uByMMatrix[j]
            cosineSim = np.dot(A, B)/(norm(A)*norm(B))
            simMatrix[i][j] = cosineSim
            simMatrix[j][i] = cosineSim

    return simMatrix


def makeSignificanceMatrixUserBased(uByMMatrix):
    totalUsers = uByMMatrix.shape[0]
    simMatrix = np.zeros([totalUsers, totalUsers])
    for i in range(1, totalUsers):
        for j in range(i+1, totalUsers):
            A = uByMMatrix[i]
            B = uByMMatrix[j]
            noOfCommonM = np.count_nonzero(A*B)
            cosineSim = np.dot(A, B)/(norm(A)*norm(B))
            if (noOfCommonM < 50):
                cosineSim /= 50

            simMatrix[i][j] = cosineSim
            simMatrix[j][i] = cosineSim

    return simMatrix

def makeVarianceMatrixUserBased(uByMMatrix, variance, s_var):
    totalUsers = uByMMatrix.shape[0]
    simMatrix = np.zeros([totalUsers, totalUsers])
    for i in range(1, totalUsers):
        for j in range(i+1, totalUsers):
            A = uByMMatrix[i]
            B = uByMMatrix[j]
            sim = sum(A*B*variance)
            sim = sim/s_var
            simMatrix[i][j] = sim
            simMatrix[j][i] = sim

    return simMatrix


def getVarianceUserBased(uByMMatrix):
    variance = np.array([0.0] * uByMMatrix.shape[1])
    mByUMatrix = uByMMatrix.transpose()
    for movie in range(1, uByMMatrix.shape[1]):
        varMovie = np.var(mByUMatrix[movie]) 
        variance[movie] = varMovie
    varMax = max(variance)
    varMin = min(variance)
    variance -= varMin
    variance = variance/(varMax - varMin)
    return variance


def getNeighboursUserBased(simMatrix, k):
    kNeighboursForAllUsers = []
    for user in range(1, simMatrix.shape[0]):
        a = simMatrix[user]
        ind = np.argpartition(a, -k-10)[-k-10:]
        topk = a[ind]
        kNeighboursForAllUsers.append(list(zip(ind, topk)))

    return kNeighboursForAllUsers

def predictRatingSimilarityWeightingUserBased(kNeighboursOfAUser, uByMMatrix, m_index, avgMatrix, u_index):
    num = 0
    dnm = 0
    for (nbr, sim) in kNeighboursOfAUser:
        nbrRating = uByMMatrix[nbr][m_index]
        if (nbrRating):
            num += (nbrRating - avgMatrix[nbr])*sim
            dnm += sim
    if dnm == 0: return 0
    return num/dnm + avgMatrix[u_index]


def predictRatingTestUserBased(test, train, typeW = "Sim", k = 10):
    ratingsPred = []
    avgMatrix = getUMean(train)
    
    if typeW == "Sim":
        simMatrix = makeSimilarityMatrixUserBased(train)
        kNeighboursForAllUsers = getNeighboursUserBased(simMatrix, k)
    
    elif typeW == "Sig":
        simMatrix = makeSignificanceMatrixUserBased(train)
        kNeighboursForAllUsers = getNeighboursUserBased(simMatrix, k)

    else:
        variance = getVarianceUserBased(train)
        simMatrix = makeVarianceMatrixUserBased(train, variance, sum(variance))
        kNeighboursForAllUsers = getNeighboursUserBased(simMatrix, k)

    for i in range(test.shape[0]):
        userId = test['user_id'][i]
        movieId = test['movie_id'][i]
        ratingPredicted = predictRatingSimilarityWeightingUserBased(kNeighboursForAllUsers[userId-1], train, movieId, avgMatrix, userId)
        ratingsPred.append(round(ratingPredicted))

    return ratingsPred

     


In [172]:
mae_folds_sim_user = []
for f in range(1, 6):
    mae_folds_sim_user.append([0.0]*5)
    b, t = makeDataset(f)
    train = makeUbyMMatrix(b)
    for k in range(10, 51, 10):
        pred = predictRatingTestUserBased(t, train, "Sim", k)
        error = mae(pred, t['rating'])
        mae_folds_sim_user[f-1][(k//10)-1] = error
        print("Fold:", f, "Neighbours:", k, "MAE:", error)
    print()

mae_folds_sim_user = np.array(mae_folds_sim_user).transpose()
for k in range(5):
    print("Average for k:", (k+1)*10, mae_folds_sim_user[k].mean())


Fold: 1 Neighbours: 10 MAE: 0.85225
Fold: 1 Neighbours: 20 MAE: 0.8043
Fold: 1 Neighbours: 30 MAE: 0.781
Fold: 1 Neighbours: 40 MAE: 0.7655
Fold: 1 Neighbours: 50 MAE: 0.75545

Fold: 2 Neighbours: 10 MAE: 0.8484
Fold: 2 Neighbours: 20 MAE: 0.7886
Fold: 2 Neighbours: 30 MAE: 0.76465
Fold: 2 Neighbours: 40 MAE: 0.7509
Fold: 2 Neighbours: 50 MAE: 0.7399

Fold: 3 Neighbours: 10 MAE: 0.84035
Fold: 3 Neighbours: 20 MAE: 0.7908
Fold: 3 Neighbours: 30 MAE: 0.76505
Fold: 3 Neighbours: 40 MAE: 0.749
Fold: 3 Neighbours: 50 MAE: 0.7389

Fold: 4 Neighbours: 10 MAE: 0.8275
Fold: 4 Neighbours: 20 MAE: 0.77855
Fold: 4 Neighbours: 30 MAE: 0.75915
Fold: 4 Neighbours: 40 MAE: 0.74585
Fold: 4 Neighbours: 50 MAE: 0.7365

Fold: 5 Neighbours: 10 MAE: 0.83905
Fold: 5 Neighbours: 20 MAE: 0.7879
Fold: 5 Neighbours: 30 MAE: 0.7632
Fold: 5 Neighbours: 40 MAE: 0.74945
Fold: 5 Neighbours: 50 MAE: 0.74045

Average for k: 10 0.8415100000000001
Average for k: 20 0.79003
Average for k: 30 0.76661
Average for k: 40 0.75

In [173]:
mae_folds_sig_user = []
for f in range(1, 6):
    mae_folds_sig_user.append([0.0]*5)
    b, t = makeDataset(f)
    train = makeUbyMMatrix(b)
    for k in range(10, 51, 10):
        pred = predictRatingTestUserBased(t, train, "Sig", k)
        error = mae(pred, t['rating'])
        mae_folds_sig_user[f-1][(k//10)-1] = error
        print("Fold:", f, "Neighbours:", k, "MAE:", error)
    print()


mae_folds_sig_user = np.array(mae_folds_sig_user).transpose()
for k in range(5):
    print("Average for k:", (k+1)*10, mae_folds_sig_user[k].mean())

Fold: 1 Neighbours: 10 MAE: 0.84615
Fold: 1 Neighbours: 20 MAE: 0.80575
Fold: 1 Neighbours: 30 MAE: 0.7841
Fold: 1 Neighbours: 40 MAE: 0.7704
Fold: 1 Neighbours: 50 MAE: 0.7636

Fold: 2 Neighbours: 10 MAE: 0.83875
Fold: 2 Neighbours: 20 MAE: 0.7908
Fold: 2 Neighbours: 30 MAE: 0.76925
Fold: 2 Neighbours: 40 MAE: 0.75565
Fold: 2 Neighbours: 50 MAE: 0.7475

Fold: 3 Neighbours: 10 MAE: 0.83945
Fold: 3 Neighbours: 20 MAE: 0.79515
Fold: 3 Neighbours: 30 MAE: 0.77025
Fold: 3 Neighbours: 40 MAE: 0.7559
Fold: 3 Neighbours: 50 MAE: 0.7467

Fold: 4 Neighbours: 10 MAE: 0.8312
Fold: 4 Neighbours: 20 MAE: 0.79025
Fold: 4 Neighbours: 30 MAE: 0.7725
Fold: 4 Neighbours: 40 MAE: 0.7606
Fold: 4 Neighbours: 50 MAE: 0.75195

Fold: 5 Neighbours: 10 MAE: 0.84045
Fold: 5 Neighbours: 20 MAE: 0.79225
Fold: 5 Neighbours: 30 MAE: 0.7707
Fold: 5 Neighbours: 40 MAE: 0.76075
Fold: 5 Neighbours: 50 MAE: 0.75115

Average for k: 10 0.8392
Average for k: 20 0.79484
Average for k: 30 0.77336
Average for k: 40 0.76066
Ave

In [174]:
mae_folds_var_user = []
for f in range(1, 6):
    mae_folds_var_user.append([0.0]*5)
    b, t = makeDataset(f)
    train = makeUbyMMatrix(b)
    for k in range(10, 51, 10):
        pred = predictRatingTestUserBased(t, train, "Var", k)
        error = mae(pred, t['rating'])
        mae_folds_var_user[f-1][(k//10)-1] = error
        print("Fold:", f, "Neighbours:", k, "MAE:", error)
    print()


mae_folds_var_user = np.array(mae_folds_var_user).transpose()
for k in range(5):
    print("Average for k:", (k+1)*10, mae_folds_var_user[k].mean())

Fold: 1 Neighbours: 10 MAE: 0.79635
Fold: 1 Neighbours: 20 MAE: 0.7683
Fold: 1 Neighbours: 30 MAE: 0.7584
Fold: 1 Neighbours: 40 MAE: 0.752
Fold: 1 Neighbours: 50 MAE: 0.7494

Fold: 2 Neighbours: 10 MAE: 0.7893
Fold: 2 Neighbours: 20 MAE: 0.7594
Fold: 2 Neighbours: 30 MAE: 0.74765
Fold: 2 Neighbours: 40 MAE: 0.73975
Fold: 2 Neighbours: 50 MAE: 0.7347

Fold: 3 Neighbours: 10 MAE: 0.8034
Fold: 3 Neighbours: 20 MAE: 0.775
Fold: 3 Neighbours: 30 MAE: 0.7589
Fold: 3 Neighbours: 40 MAE: 0.75425
Fold: 3 Neighbours: 50 MAE: 0.74695

Fold: 4 Neighbours: 10 MAE: 0.794
Fold: 4 Neighbours: 20 MAE: 0.7715
Fold: 4 Neighbours: 30 MAE: 0.75425
Fold: 4 Neighbours: 40 MAE: 0.74805
Fold: 4 Neighbours: 50 MAE: 0.7402

Fold: 5 Neighbours: 10 MAE: 0.79275
Fold: 5 Neighbours: 20 MAE: 0.77065
Fold: 5 Neighbours: 30 MAE: 0.7563
Fold: 5 Neighbours: 40 MAE: 0.7491
Fold: 5 Neighbours: 50 MAE: 0.7449

Average for k: 10 0.79516
Average for k: 20 0.7689699999999999
Average for k: 30 0.7551
Average for k: 40 0.74863


# Item Based Recommendation

In [7]:
def makeSimilarityMatrixItemBased(uByMMatrix):
    mByUMatrix = uByMMatrix.transpose()
    totalMovies = uByMMatrix.shape[1]
    simMatrix = np.zeros([totalMovies, totalMovies])
    for i in range(1, totalMovies):
        for j in range(i+1, totalMovies):
            A = mByUMatrix[i]
            B = mByUMatrix[j]
            if (norm(A) == 0 or norm(B) == 0):
                cosineSim = 0
            else:
                cosineSim = np.dot(A, B)/(norm(A)*norm(B))
            simMatrix[i][j] = cosineSim
            simMatrix[j][i] = cosineSim
    
    simMatrix[np.isnan(simMatrix)] = 0
    return simMatrix

def makeSignificanceMatrixItemBased(uByMMatrix):
    mByUMatrix = uByMMatrix.transpose()
    totalMovies = mByUMatrix.shape[0]
    simMatrix = np.zeros([totalMovies, totalMovies])
    for i in range(1, totalMovies):
        for j in range(i+1, totalMovies):
            A = mByUMatrix[i]
            B = mByUMatrix[j]
            noOfCommonU = np.count_nonzero(A*B)
            if (norm(A) == 0 or norm(B) == 0):
                cosineSim = 0
            else:
                cosineSim = np.dot(A, B)/(norm(A)*norm(B))
            if (noOfCommonU < 50):
                cosineSim /= 50

            simMatrix[i][j] = cosineSim
            simMatrix[j][i] = cosineSim

    return simMatrix


def getMMean(uByMMatrix):
    mByUMatrix = uByMMatrix.transpose()
    avgMatrix = [0] * mByUMatrix.shape[0]

    for i in range(1, mByUMatrix.shape[0]):
        curRow = mByUMatrix[i]
        summ = np.sum(curRow)
        totalCnt = np.count_nonzero(curRow)
        if totalCnt == 0: 
            avgMatrix[i] = 0
        else:
            avgMatrix[i] = summ/totalCnt

    return avgMatrix


def makeVarianceMatrixItemBased(uByMMatrix, variance, s_var):
    mByUMatrix = uByMMatrix.transpose()
    totalMovies = mByUMatrix.shape[0]
    simMatrix = np.zeros([totalMovies, totalMovies])
    for i in range(1, totalMovies):
        for j in range(i+1, totalMovies):
            A = mByUMatrix[i]
            B = mByUMatrix[j]
            sim = sum(A*B*variance)
            sim = sim/s_var
            simMatrix[i][j] = sim
            simMatrix[j][i] = sim

    return simMatrix


def getVarianceItemBased(uByMMatrix):
    variance = np.array([0.0] * uByMMatrix.shape[0])
    mByUMatrix = uByMMatrix.transpose()
    for user in range(1, mByUMatrix.shape[1]):
        varUser = np.var(uByMMatrix[user]) 
        variance[user] = varUser
    varMax = max(variance)
    varMin = min(variance)
    variance -= varMin
    variance = variance/(varMax - varMin)
    return variance


def getNeighboursItemBased(simMatrixItemBased, k):
    kNeighboursForAllMovies = []
    for movie in range(1, simMatrixItemBased.shape[0]):
        a = simMatrixItemBased[movie]
        ind = np.argpartition(a, -k-10)[-k-10:]
        topk = a[ind]
        kNeighboursForAllMovies.append(list(zip(ind, topk)))

    return kNeighboursForAllMovies


def predictRatingSimilarityWeightingItemBased(kNeighboursOfAMovie, uByMMatrix, m_index, avgMatrix, u_index):
    num = 0
    dnm = 0
    for (nbrM, sim) in kNeighboursOfAMovie:
        nbrMRating = uByMMatrix[u_index][nbrM]
        if (nbrMRating):
            num += (nbrMRating - avgMatrix[nbrM])*sim
            dnm += sim
    if dnm == 0: return 0
    return num/dnm + avgMatrix[m_index]

def makeFinalPredictionsItemBased(test, simMatrix, k, train):
    ratingsPred = []
    kNeighboursForAllMovies = getNeighboursItemBased(simMatrix, k)
    avgMatrix = getMMean(train)
    
    for i in range(test.shape[0]):
        userId = test['user_id'][i]
        movieId = test['movie_id'][i]
        ratingPredicted = predictRatingSimilarityWeightingItemBased(kNeighboursForAllMovies[movieId-1], train, movieId, avgMatrix, userId)
        ratingsPred.append(round(ratingPredicted))
        
    return ratingsPred

def makeFinalSimMatrixItemBased(train, typeW = "Sim"): 
    if typeW == "Sim":
        simMatrix = makeSimilarityMatrixItemBased(train)
        
    elif typeW == "Sig":
        simMatrix = makeSignificanceMatrixItemBased(train)
    
    else:
        variance = getVarianceItemBased(train)
        simMatrix = makeVarianceMatrixItemBased(train, variance, sum(variance))
    return simMatrix


    

In [10]:
mae_folds_sim_item = []
for f in range(1, 6):
    mae_folds_sim_item.append([0.0]*5)
    b, t = makeDataset(f)
    train = makeUbyMMatrix(b)
    simMatrix = makeFinalSimMatrixItemBased(train, "Sim")
    for k in range(10, 51, 10):
        pred = makeFinalPredictionsItemBased(t, simMatrix, k, train)
        error = mae(pred, t['rating'])
        mae_folds_sim_item[f-1][(k//10)-1] = error
        print("Fold:", f, "Neighbours:", k, "MAE:", error)
    print()

mae_folds_sim_item = np.array(mae_folds_sim_item).transpose()
for k in range(5):
    print("Average for k:", (k+1)*10, mae_folds_sim_item[k].mean())

Fold: 1 Neighbours: 10 MAE: 0.85965
Fold: 1 Neighbours: 20 MAE: 0.79185
Fold: 1 Neighbours: 30 MAE: 0.7545
Fold: 1 Neighbours: 40 MAE: 0.732
Fold: 1 Neighbours: 50 MAE: 0.7246

Fold: 2 Neighbours: 10 MAE: 0.8057
Fold: 2 Neighbours: 20 MAE: 0.7542
Fold: 2 Neighbours: 30 MAE: 0.73125
Fold: 2 Neighbours: 40 MAE: 0.71685
Fold: 2 Neighbours: 50 MAE: 0.70675

Fold: 3 Neighbours: 10 MAE: 0.8047
Fold: 3 Neighbours: 20 MAE: 0.75565
Fold: 3 Neighbours: 30 MAE: 0.7295
Fold: 3 Neighbours: 40 MAE: 0.716
Fold: 3 Neighbours: 50 MAE: 0.70705

Fold: 4 Neighbours: 10 MAE: 0.80465
Fold: 4 Neighbours: 20 MAE: 0.7575
Fold: 4 Neighbours: 30 MAE: 0.7304
Fold: 4 Neighbours: 40 MAE: 0.71935
Fold: 4 Neighbours: 50 MAE: 0.70995

Fold: 5 Neighbours: 10 MAE: 0.8437
Fold: 5 Neighbours: 20 MAE: 0.78175
Fold: 5 Neighbours: 30 MAE: 0.7488
Fold: 5 Neighbours: 40 MAE: 0.73015
Fold: 5 Neighbours: 50 MAE: 0.7177

Average for k: 10 0.8236800000000001
Average for k: 20 0.76819
Average for k: 30 0.73889
Average for k: 40 0.7

In [8]:
mae_folds_sig_item = []
for f in range(1, 6):
    mae_folds_sig_item.append([0.0]*5)
    b, t = makeDataset(f)
    train = makeUbyMMatrix(b)
    simMatrix = makeFinalSimMatrixItemBased(train, "Sig")
    for k in range(10, 51, 10):
        pred = makeFinalPredictionsItemBased(t, simMatrix, k, train)
        error = mae(pred, t['rating'])
        mae_folds_sig_item[f-1][(k//10)-1] = error
        print("Fold:", f, "Neighbours:", k, "MAE:", error)
    print()

mae_folds_sig_item = np.array(mae_folds_sig_item).transpose()
for k in range(5):
    print("Average for k:", (k+1)*10, mae_folds_sig_item[k].mean())

Fold: 1 Neighbours: 10 MAE: 0.86385
Fold: 1 Neighbours: 20 MAE: 0.80335
Fold: 1 Neighbours: 30 MAE: 0.76635
Fold: 1 Neighbours: 40 MAE: 0.74975
Fold: 1 Neighbours: 50 MAE: 0.74185

Fold: 2 Neighbours: 10 MAE: 0.81315
Fold: 2 Neighbours: 20 MAE: 0.76675
Fold: 2 Neighbours: 30 MAE: 0.74485
Fold: 2 Neighbours: 40 MAE: 0.7293
Fold: 2 Neighbours: 50 MAE: 0.72

Fold: 3 Neighbours: 10 MAE: 0.81015
Fold: 3 Neighbours: 20 MAE: 0.7638
Fold: 3 Neighbours: 30 MAE: 0.7429
Fold: 3 Neighbours: 40 MAE: 0.72805
Fold: 3 Neighbours: 50 MAE: 0.7188

Fold: 4 Neighbours: 10 MAE: 0.81595
Fold: 4 Neighbours: 20 MAE: 0.77075
Fold: 4 Neighbours: 30 MAE: 0.7452
Fold: 4 Neighbours: 40 MAE: 0.73465
Fold: 4 Neighbours: 50 MAE: 0.7247

Fold: 5 Neighbours: 10 MAE: 0.8482
Fold: 5 Neighbours: 20 MAE: 0.78805
Fold: 5 Neighbours: 30 MAE: 0.75965
Fold: 5 Neighbours: 40 MAE: 0.74435
Fold: 5 Neighbours: 50 MAE: 0.7319

Average for k: 10 0.83026
Average for k: 20 0.77854
Average for k: 30 0.7517900000000001
Average for k: 40

In [9]:
mae_folds_var_item = []
for f in range(1, 6):
    mae_folds_var_item.append([0.0]*5)
    b, t = makeDataset(f)
    train = makeUbyMMatrix(b)
    simMatrix = makeFinalSimMatrixItemBased(train, "Var")
    for k in range(10, 51, 10):
        pred = makeFinalPredictionsItemBased(t, simMatrix, k, train)
        error = mae(pred, t['rating'])
        mae_folds_var_item[f-1][(k//10)-1] = error
        print("Fold:", f, "Neighbours:", k, "MAE:", error)
    print()

mae_folds_var_item = np.array(mae_folds_var_item).transpose()
for k in range(5):
    print("Average for k:", (k+1)*10, mae_folds_var_item[k].mean())

Fold: 1 Neighbours: 10 MAE: 0.8254
Fold: 1 Neighbours: 20 MAE: 0.78905
Fold: 1 Neighbours: 30 MAE: 0.7708
Fold: 1 Neighbours: 40 MAE: 0.7577
Fold: 1 Neighbours: 50 MAE: 0.7522

Fold: 2 Neighbours: 10 MAE: 0.7841
Fold: 2 Neighbours: 20 MAE: 0.75355
Fold: 2 Neighbours: 30 MAE: 0.7355
Fold: 2 Neighbours: 40 MAE: 0.72985
Fold: 2 Neighbours: 50 MAE: 0.72455

Fold: 3 Neighbours: 10 MAE: 0.77795
Fold: 3 Neighbours: 20 MAE: 0.7511
Fold: 3 Neighbours: 30 MAE: 0.73925
Fold: 3 Neighbours: 40 MAE: 0.73265
Fold: 3 Neighbours: 50 MAE: 0.723

Fold: 4 Neighbours: 10 MAE: 0.79245
Fold: 4 Neighbours: 20 MAE: 0.7635
Fold: 4 Neighbours: 30 MAE: 0.7455
Fold: 4 Neighbours: 40 MAE: 0.738
Fold: 4 Neighbours: 50 MAE: 0.73095

Fold: 5 Neighbours: 10 MAE: 0.83055
Fold: 5 Neighbours: 20 MAE: 0.7907
Fold: 5 Neighbours: 30 MAE: 0.76805
Fold: 5 Neighbours: 40 MAE: 0.75085
Fold: 5 Neighbours: 50 MAE: 0.74005

Average for k: 10 0.8020900000000001
Average for k: 20 0.76958
Average for k: 30 0.7518199999999999
Average f