In [3]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import re
from sklearn.metrics.pairwise import pairwise_distances

Collected data for training and testing, and then performed some pre-processing steps to convert training data into matrix form

In [4]:
def get_data():
    cols = np.arange(1,25,1)
    data_item = pd.read_csv(r'ml-100k\u.item',
                 sep= '|'  , names = cols, encoding = 'latin-1')
    data_item_map = data_item.iloc[:,:2]
    data_item_map.columns = ['movie_id','name']
    year = []
    name = []
    for movie in data_item_map['name']:
        for item in re.finditer('(.*)\(([0-9]{4,4})\)', movie):
            yea = item.group(2)
            nam = item.group(1)
            year.append(yea)
            name.append(nam)

    data_item_map['name'] = pd.Series(name)
    data_item_map['year'] = pd.Series(year)
    
    r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
    data_train = pd.read_csv(r'ml-100k\ua.base',
                      sep = '\t', names = r_cols, encoding = 'latin-1')
    data_test = pd.read_csv(r'ml-100k\ua.test',
                      sep = '\t', names = r_cols, encoding = 'latin-1')
    
    return data_item_map, data_train, data_test

data_item_map, data_train, data_test = get_data()

data_train_matrix = data_train.pivot(index = 'user_id', columns = 'movie_id', values = 'rating')
data_train_matrix.fillna(value = 0, inplace = True)

This mf() function uses matrix factorisation technique to obtain the missing ratings of the movies, used gradient descent to 
minimize the error.

In [5]:
def mf(R,K,alpha, threshold, iterations, beta=None):
    R = np.array(R)
    n_users, n_items = R.shape
    P = np.random.rand(n_users,K)
    Q = np.random.rand(n_items, K)
    #Qt = Q.T
    
    for iteration in range(iterations):
        error_acc = np.array([])
        for i in range(n_users):
            for j in range(n_items):
                if R[i,j] > 0:
                    actual = R[i,j]
                    pred = np.dot(P[i,:],Q.T[:,j])
                    error = actual - pred
                    #print(error)
                    error_acc = np.append(error_acc,(error)**2)
                    #print(error_acc)
                    #SSE = error**2
                    for k in range(K):
                        P[i,k] = P[i,k] + alpha*(2*(error)*Q[j,k])
                        #print(P[i,k])
                        Q[j,k] = Q[j,k] + alpha*(2*(error)*P[i,k])
                        #print(Q[j,k])
        RMSE = np.sqrt(np.sum(error_acc)/len(error_acc))                
        print('length :', len(error_acc))
        print('error :', RMSE)
        print('Iteration no. :', iteration)
        if np.sum(error_acc) <= threshold:
            break
            
    return P, Q

P, Q = mf(R = data_train_matrix, K = 20, alpha = 0.001, iterations = 20, threshold = 0.1)
            
data_train_matrix_pred = np.dot(P,Q.T)                        
                       
user_similarity = pairwise_distances(data_train_matrix_pred, metric = 'cosine')
item_similarity = pairwise_distances(data_train_matrix_pred.T, metric = 'cosine')

length : 90570
error : 1.3720416930916772
Iteration no. : 0
length : 90570
error : 1.120003520718238
Iteration no. : 1
length : 90570
error : 1.0541316368273626
Iteration no. : 2
length : 90570
error : 1.019171534950153
Iteration no. : 3
length : 90570
error : 0.9958782469821701
Iteration no. : 4
length : 90570
error : 0.9786807452734541
Iteration no. : 5
length : 90570
error : 0.9652201186120118
Iteration no. : 6
length : 90570
error : 0.9542544294500892
Iteration no. : 7
length : 90570
error : 0.9450485903378487
Iteration no. : 8
length : 90570
error : 0.9371334550555106
Iteration no. : 9
length : 90570
error : 0.9301926880598483
Iteration no. : 10
length : 90570
error : 0.9240036851673086
Iteration no. : 11
length : 90570
error : 0.9184043373766309
Iteration no. : 12
length : 90570
error : 0.9132732310892628
Iteration no. : 13
length : 90570
error : 0.9085172918776768
Iteration no. : 14
length : 90570
error : 0.9040637692128561
Iteration no. : 15
length : 90570
error : 0.89985486474

It predicts the ratings based on the user-similarity or item-similarity whichever we specify.

In [6]:
def pred(data_matrix, similarity, type = None):
    data_matrix = np.array(data_matrix)
    if type == 'user':
        mean_user_rat = data_matrix.mean(axis = 1)
        ratings_diff = data_matrix - mean_user_rat[:,np.newaxis]
        pred = mean_user_rat[:,np.newaxis] + similarity.dot(ratings_diff)/np.array([np.abs(similarity).sum(axis = 1)]).T
        
    elif type == 'item':
        pred = data_matrix.dot(similarity)/np.array([np.abs(similarity).sum(axis = 1)])
        #pred = similarity.dot(data_matrix)/np.array([np.abs(similarity).sum(axis = 1)])
        
    return pred

pred = pred(data_matrix =data_train_matrix_pred, similarity =user_similarity, type = 'user'  )

Testing the predictions made and calculating the RMSE for test data

In [9]:
def pred_test(data_test, pred):
    data_test_pred = data_test.copy()
    data_test_pred['rating_pred'] = np.nan
    for i in range(len(data_test)):
        #print(i)
        m = data_test_pred['user_id'].loc[i]
        n = data_test_pred['movie_id'].loc[i]
        data_test_pred.loc[i,'rating_pred'] = pred[m-1,n-1]
    
    data_test_pred['error_sqr'] = (data_test_pred['rating'] - data_test_pred['rating_pred'])**2
    RMSE_test = np.sqrt(data_test_pred['error_sqr'].sum()/len(data_test_pred))
    
    return RMSE_test

RMSE_test = pred_test(data_test = data_test, pred = pred)
print('RMSE for test dataset :',RMSE_test)

RMSE for test dataset : 0.9653940422131255
