In [204]:
import numpy as np
import pandas as pd
import os.path
from random import randint

# -*- coding: utf-8 -*-
"""
### NOTES
This file is an example of what your code should look like. It is written in Python 3.6.
To know more about the expectations, please refer to the guidelines.
"""

#####
##
# DATA IMPORT
##
#####

# Where data is located
movies_file = './data/movies.csv'
users_file = './data/users.csv'
ratings_file = './data/ratings.csv'
predictions_file = './data/predictions.csv'
submission_file = './data/submission.csv'

# Read the data using pandas
movies_description = pd.read_csv(movies_file, delimiter=';', dtype={'movieID': 'int', 'year': 'int', 'movie': 'str'},
                                 names=['movieID', 'year', 'movie'])
users_description = pd.read_csv(users_file, delimiter=';',
                                dtype={'userID': 'int', 'gender': 'str', 'age': 'int', 'profession': 'int'},
                                names=['userID', 'gender', 'age', 'profession'])
ratings_description = pd.read_csv(ratings_file, delimiter=';',
                                  dtype={'userID': 'int', 'movieID': 'int', 'rating': 'int'},
                                  names=['userID', 'movieID', 'rating'])
predictions_description = pd.read_csv(predictions_file, delimiter=';', names=['userID', 'movieID'], header=None)

utility_matrix: pd.DataFrame = \
        ratings_description.pivot(index='userID', columns='movieID', values='rating').T
utility_matrix

userID,1,2,3,4,5,6,7,8,9,10,...,6031,6032,6033,6034,6035,6036,6037,6038,6039,6040
movieID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,,,,,4.0,,4.0,5.0,5.0,...,,,,,4.0,,,,,3.0
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,1.0,,,,,
4,,,,,,,,3.0,,,...,,,,,2.0,2.0,,,,
5,,,,,,,,,,,...,,,,,1.0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3702,,,,,,,,,3.0,4.0,...,,,,,,,,,,
3703,,,,,,,,,,,...,,,,,,,,,,
3704,,,,,,,,,,,...,,,,,,,,,,
3705,,,,,,,,,,,...,,,,,,,,,,


In [215]:
R = np.zeros((len(movies_description), len(users_description)))

for user, movie, rating in ratings_description.values:
    R[movie-1, user-1] = rating

In [216]:
R[R==0] = np.nan

In [219]:
R.shape

(3706, 6040)

In [221]:
learning_rate= 0.05
k = 2

np.random.seed(42)
Q = np.random.uniform(-1, 1, (utility_matrix.shape[0],k))
P = np.random.uniform(-1, 1, (k,utility_matrix.shape[1]))
div = (R.shape[0] * R.shape[1]) - np.isnan(R).sum()
RMSE = np.sqrt(((np.nan_to_num(R - np.matmul(Q, P), 0)**2).sum())/div)
print(f"Starting RMSE: {RMSE}")

for epoch in range(10):
    R_pred = np.matmul(Q,P)
    curr_error = np.nan_to_num(R - R_pred, 0)
    Q_update = np.zeros(Q.shape)
    for i in range(len(Q_update)):
        for curr_k in range(k):
            Q_delta =(-2 * np.dot(P[curr_k, :], curr_error[i]))/np.isnan(R[i]).sum()
            Q_update[i, curr_k] = learning_rate * Q_delta

    P_update = np.zeros(P.shape)
    for i in range(P_update.shape[1]):
        for curr_k in range(k):
            P_delta =(-2 * np.dot(Q[:, curr_k], curr_error[:, i]))/np.isnan(R[:, i]).sum()
            P_update[curr_k, i] = learning_rate * P_delta

    Q -= Q_update
    P -= P_update


Starting RMSE: 3.781830053581588


In [222]:
RMSE = np.sqrt(((np.nan_to_num(R - np.matmul(Q, P), 0)**2).sum())/div)
print(f"Final RMSE: {RMSE}")

Final RMSE: 3.775745342756671


In [223]:
R_pred

array([[ 0.73365133, -0.66281527, -0.11692458, ..., -0.26501398,
         0.54735318,  0.69163599],
       [-0.06534284,  0.16051343, -0.04002651, ..., -0.37647993,
         0.41744592,  0.26929327],
       [-0.19836117,  0.00476418,  0.1183212 , ...,  0.75940142,
        -0.94938729, -0.75581195],
       ...,
       [ 0.57291186, -0.39325019, -0.15311285, ..., -0.69718268,
         0.99867209,  0.94555433],
       [ 0.46851596, -0.54273611, -0.01529312, ...,  0.30171777,
        -0.19923797,  0.05217298],
       [-0.00745186, -0.03171862,  0.02029968, ...,  0.15428482,
        -0.18220289, -0.13240198]])

In [224]:
R_pred.shape

(3706, 6040)

In [225]:
submission = []
for i, [user,movie] in enumerate(predictions_description.values):
    submission.append([i+1, R_pred[movie-1,user-1]])

In [226]:
submission

[[1, 0.0043234742904606095],
 [2, 0.3214321778917072],
 [3, -0.06449557735904511],
 [4, 0.40805532823111784],
 [5, 0.0033402761788050724],
 [6, -0.49313168009469566],
 [7, -0.13796486629487967],
 [8, -0.7001734418222764],
 [9, -0.2421598194137694],
 [10, 0.1420860581742994],
 [11, 0.08441472587790153],
 [12, 0.5628016343721478],
 [13, -0.42253695742032077],
 [14, -0.2308926220696341],
 [15, -0.8581348778172305],
 [16, 0.3582966208733499],
 [17, 0.4114791855031742],
 [18, 0.49988694801650624],
 [19, 0.09207600324369884],
 [20, -0.8873259223046062],
 [21, -0.024568452386337168],
 [22, 0.07589038316191243],
 [23, 0.22388886988085654],
 [24, 0.35192437799946097],
 [25, -0.013334310497695678],
 [26, 0.2625319113273961],
 [27, 0.0004755907271325878],
 [28, 0.3790223275690434],
 [29, 0.3066294233048249],
 [30, 0.3244953125977016],
 [31, -0.06319745548631917],
 [32, -0.2619589638928005],
 [33, 0.06472436095690635],
 [34, 0.13421257447958784],
 [35, -0.6565936998333766],
 [36, 0.097703059642348

In [227]:
submission_df = pd.DataFrame(submission, columns=["Id", "# Rating"])
submission_df

Unnamed: 0,Id,# Rating
0,1,0.004323
1,2,0.321432
2,3,-0.064496
3,4,0.408055
4,5,0.003340
...,...,...
90014,90015,-0.414969
90015,90016,-0.447356
90016,90017,0.173627
90017,90018,0.616004


In [228]:
submission_df.to_csv("LF_submission.csv", index=False)