In [99]:
import pandas as pd
import numpy as np
import numpy.ma as ma
from sklearn.model_selection import KFold

There are 6040(max user id) users and 3952(max movie id) Movies in our database

Strategy
<ol>
    <li>Create a Utility Matrix(M) with the size of Users and Movies</li>
    <li>Update M with the ratings from training dataset</li>
    <li>Decide for d dimension (for U & V)</li>
    <li>Implement the decomposition algorithm</li>
    <li>Initialize U & V matrices</li>
    <li>Train and find optimum U & V matrices</li>
    <li>Evaluate on test set using RMSE values</li>
</ol>

<img src="./images/IMG_1387.jpeg" style="width: 75%; height: auto;">

In [27]:
# Reading the data
user_col_names = ["UserID", "Gender", "Age", "Occupation", "Zipcode"]
udf = pd.read_csv("ml-1m/users.dat", sep="::", header=None, names=user_col_names, engine="python")

movies_col_names = ["MovieID", "Title", "Genres"]
mdf = pd.read_csv("ml-1m/movies.dat", sep="::", header=None, names=movies_col_names, engine="python")

ratings_col_names = ["UserID", "MovieID", "Rating", "Timestamp"]
rdf = pd.read_csv("ml-1m/ratings.dat", sep="::", header=None, names=ratings_col_names, engine="python")

In [28]:
print(f"UserID: min = {udf.UserID.min()}, max = {udf.UserID.max()}")
print(f"MovieID: min = {mdf.MovieID.min()}, max = {mdf.MovieID.max()}")

UserID: min = 1, max = 6040
MovieID: min = 1, max = 3952


In [49]:
# Step 1 : Create Utility Matrix (M)
M = np.zeros((udf.UserID.max(), mdf.MovieID.max()))
M.shape

(6040, 3952)

In [22]:
# Dividing the data into 5 folds
RandomState = 42

kf = KFold(n_splits=5, shuffle=True, random_state=RandomState)
Folds = []
for train_index, test_index in kf.split(rdf):
    Folds.append((rdf.iloc[train_index, :], rdf.iloc[test_index, :]))

In [50]:
# Step 2 : Update M
def update_M(m, df):
    for index, row in df.iterrows():
        m[row["UserID"]-1, row["MovieID"]-1] = row["Rating"]
    return m

In [32]:
#for i, (train, test) in enumerate(Folds):
#    break

train, test = Folds[0]

In [51]:
%%time
M_updated = update_M(M, train)

CPU times: user 51.8 s, sys: 130 ms, total: 51.9 s
Wall time: 51.9 s


In [52]:
M_updated[123,123]

0.0

In [53]:
M_updated[123,123] == 0

True

In [58]:
np.nonzero(M_updated[0, :])

(array([   0,   47,  149,  259,  526,  530,  587,  593,  594,  660,  719,
         744,  782,  913,  918,  937, 1021, 1027, 1028, 1096, 1196, 1206,
        1269, 1286, 1544, 1565, 1720, 1906, 1960, 1961, 2027, 2293, 2339,
        2354, 2686, 2761, 2790, 2796, 2917, 3113, 3407]),)

In [224]:
# RMSE calculation
def calc_rmse(M, U, V, root=True):
    masked = ma.masked_array(M, mask=M==0)
    P = np.dot(U, V)     
    s = np.sum((masked-P)**2)
    if not root:
        return s
    return np.sqrt(s/masked.count())

def decompose_U(M, U, V):
    n, d = U.shape
    for r in range(n):
        for s in range(d):
            U[r,s]= Urs(M, U, V, r, s)
    return U

def Urs(M, U, V, r, s):
    M_slice = ma.masked_array(M, mask=M==0)[r, :]
    V_slice = ma.masked_array(V[s, :], M_slice.mask)
    sum_array=np.matmul(U[r,:],V[:])-(U[r,s]*V[s,:])
    numerator = np.sum(V[s,:]*(M_slice-sum_array))
    denominator = np.sum(np.square(V_slice))
    return numerator/denominator


def decompose_V(M, U, V):
    d, m = V.shape
    for s in range(m):
        for r in range(d):
            V[r,s]= Vrs(M, U, V, r, s)
    return V

def Vrs(M, U, V, r, s):
    M_slice = ma.masked_array(M, mask=M==0)[:, s]
    U_slice = ma.masked_array(U[:, r], M_slice.mask)
    sum_array=np.matmul(U[:],V[:, s])-(V[r,s]*U[:, r])
    numerator = np.sum(U[:, r]*(M_slice-sum_array))
    denominator = np.sum(np.square(U_slice))
    return numerator/denominator

In [225]:
M = np.array(
    [[5, 2, 4, 4, 3],
     [3, 1, 2, 4, 1],
     [2, 0, 3, 1, 4],
     [2, 5, 4, 3, 5],
     [4, 4, 5, 4, 0]])

U = np.ones((5, 2))
V = np.ones((2, 5))

In [226]:
decompose_U(M, U, V)

array([[2.6 , 1.  ],
       [1.2 , 1.  ],
       [1.5 , 1.  ],
       [2.8 , 1.  ],
       [3.25, 1.  ]])

In [227]:
decompose_V(M, U, V)

array([[0.93059527, 0.88525515, 1.15761199, 0.92712936, 1.14270093],
       [1.08754874, 0.8200592 , 0.97222078, 1.09541634, 0.93603062]])