In [204]:
import numpy as np
import pandas as pd
import os.path
from random import randint

# -*- coding: utf-8 -*-
"""
### NOTES
This file is an example of what your code should look like. It is written in Python 3.6.
To know more about the expectations, please refer to the guidelines.
"""

#####
##
# DATA IMPORT
##
#####

# Where data is located
movies_file = './data/movies.csv'
users_file = './data/users.csv'
ratings_file = './data/ratings.csv'
predictions_file = './data/predictions.csv'
submission_file = './data/submission.csv'

# Read the data using pandas
movies_description = pd.read_csv(movies_file, delimiter=';', dtype={'movieID': 'int', 'year': 'int', 'movie': 'str'},
                                 names=['movieID', 'year', 'movie'])
users_description = pd.read_csv(users_file, delimiter=';',
                                dtype={'userID': 'int', 'gender': 'str', 'age': 'int', 'profession': 'int'},
                                names=['userID', 'gender', 'age', 'profession'])
ratings_description = pd.read_csv(ratings_file, delimiter=';',
                                  dtype={'userID': 'int', 'movieID': 'int', 'rating': 'int'},
                                  names=['userID', 'movieID', 'rating'])
predictions_description = pd.read_csv(predictions_file, delimiter=';', names=['userID', 'movieID'], header=None)

utility_matrix: pd.DataFrame = \
        ratings_description.pivot(index='userID', columns='movieID', values='rating').T
utility_matrix

userID,1,2,3,4,5,6,7,8,9,10,...,6031,6032,6033,6034,6035,6036,6037,6038,6039,6040
movieID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,,,,,4.0,,4.0,5.0,5.0,...,,,,,4.0,,,,,3.0
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,1.0,,,,,
4,,,,,,,,3.0,,,...,,,,,2.0,2.0,,,,
5,,,,,,,,,,,...,,,,,1.0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3702,,,,,,,,,3.0,4.0,...,,,,,,,,,,
3703,,,,,,,,,,,...,,,,,,,,,,
3704,,,,,,,,,,,...,,,,,,,,,,
3705,,,,,,,,,,,...,,,,,,,,,,


In [215]:
R = np.zeros((len(movies_description), len(users_description)))

for user, movie, rating in ratings_description.values:
    R[movie-1, user-1] = rating

In [216]:
R[R==0] = np.nan

In [219]:
R.shape

(3706, 6040)

In [229]:
learning_rate= 0.05
k = 2

np.random.seed(42)
Q = np.random.uniform(-1, 1, (utility_matrix.shape[0],k))
P = np.random.uniform(-1, 1, (k,utility_matrix.shape[1]))
div = (R.shape[0] * R.shape[1]) - np.isnan(R).sum()
RMSE = np.sqrt(((np.nan_to_num(R - np.matmul(Q, P), 0)**2).sum())/div)
print(f"Starting RMSE: {RMSE}")

for epoch in range(1000):
    R_pred = np.matmul(Q,P)
    curr_error = np.nan_to_num(R - R_pred, 0)
    Q_update = np.zeros(Q.shape)
    for i in range(len(Q_update)):
        for curr_k in range(k):
            Q_delta =(-2 * np.dot(P[curr_k, :], curr_error[i]))/np.isnan(R[i]).sum()
            Q_update[i, curr_k] = learning_rate * Q_delta

    P_update = np.zeros(P.shape)
    for i in range(P_update.shape[1]):
        for curr_k in range(k):
            P_delta =(-2 * np.dot(Q[:, curr_k], curr_error[:, i]))/np.isnan(R[:, i]).sum()
            P_update[curr_k, i] = learning_rate * P_delta

    Q -= Q_update
    P -= P_update


Starting RMSE: 3.781830053581588


In [230]:
RMSE = np.sqrt(((np.nan_to_num(R - np.matmul(Q, P), 0)**2).sum())/div)
print(f"Final RMSE: {RMSE}")

Final RMSE: 0.927614548906496


In [231]:
submission = []
for i, [user,movie] in enumerate(predictions_description.values):
    submission.append([i+1, R_pred[movie-1,user-1]])

submission

[[1, 3.474430655182154],
 [2, 2.261235162482041],
 [3, 3.5400326311644896],
 [4, 4.092255007716797],
 [5, 3.0139332735093665],
 [6, 1.6816272485662942],
 [7, 3.959154112699957],
 [8, 4.2130823613515345],
 [9, 3.3925228566412367],
 [10, 4.3078897450218125],
 [11, 3.3166649358805675],
 [12, 3.565994857536473],
 [13, 4.3615379620247055],
 [14, 3.8957509142983326],
 [15, 4.142966186473111],
 [16, 3.439074788988172],
 [17, 3.483602759267318],
 [18, 3.0771483214158786],
 [19, 4.611471104611722],
 [20, 2.2415808785834646],
 [21, 3.622777050194481],
 [22, 2.677308651906541],
 [23, 2.914018953752403],
 [24, 4.393791711263908],
 [25, 3.746138803667715],
 [26, 3.2588280235586637],
 [27, 3.0874350654244833],
 [28, 2.781265805261909],
 [29, 3.728958497472023],
 [30, 2.9520254874109475],
 [31, 3.4995861511926325],
 [32, 2.900912353748299],
 [33, 3.5285478760762996],
 [34, 3.4144675509132365],
 [35, 4.047040069350076],
 [36, 3.359856549473349],
 [37, 3.034477653091757],
 [38, 2.8170748090010065],
 [3

In [232]:
submission_df = pd.DataFrame(submission, columns=["Id", "# Rating"])
submission_df.to_csv("LF_submission.csv", index=False)