In [1]:
import numpy as np
import pandas as pd
import os.path
import random
from random import randint
from random import uniform

In [2]:
#####
##
## DATA IMPORT
##
#####

# -*- coding: utf-8 -*-
"""
### NOTES
This file is an example of what your code should look like. It is written in Python 3.6.
To know more about the expectations, please refer to the guidelines.
"""

#####
##
## DATA IMPORT
##
#####

# Where data is located
movies_file = '../data/movies.csv'
users_file = '../data/users.csv'
ratings_file = '../data/ratings.csv'
predictions_file = '../data/predictions.csv'
submission_file = '../data/submission.csv'

# movies_file = r'/prediction/data/movies.csv'
# users_file = '/prediction/data/users.csv'
# ratings_file = '/prediction/data/ratings.csv'
# predictions_file = '/prediction/data/predictions.csv'
# submission_file = '/data/submission.csv'

# Read the data using pandas
movies_description = pd.read_csv(movies_file, delimiter=';',
                                 dtype={'movieID': 'int', 'year': 'int', 'movie': 'str'},
                                 names=['movieID', 'year', 'movie'])
users_description = pd.read_csv(users_file, delimiter=';',
                                dtype={'userID': 'int', 'gender': 'str', 'age': 'int', 'profession': 'int'},
                                names=['userID', 'gender', 'age', 'profession'])
ratings_description = pd.read_csv(ratings_file, delimiter=';',
                                  dtype={'userID': 'int', 'movieID': 'int', 'rating': 'int'},
                                  names=['userID', 'movieID', 'rating'])
predictions_description = pd.read_csv(predictions_file, delimiter=';', names=['userID', 'movieID'], header=None)


In [8]:
#####
##
## LATENT FACTORS
##
#####

def predict_latent_factors(movies, users, ratings, predictions):
    ## TO COMPLETE

    # Processing predictions data in order to return it from this function
    predictions_ratings = []

    utility_matrix_none = ratings.pivot_table(index='userID', columns='movieID', values='rating',
                                              fill_value=None)

    # Add columns to the utility matrix for movies that are never rated
    cols = utility_matrix_none.columns
    for i in movies['movieID'].values:
        if i not in cols:
            utility_matrix_none[i] = np.nan

    utility_matrix_none.fillna(0, inplace=True)
    rating_numpy = utility_matrix_none.values

    ##########################
    #                        #
    # ALGORITHM STARTS HERE  #
    #                        #
    ##########################

    # Doing Matrix factorization Q * PT
    U, S, VT = np.linalg.svd(rating_numpy, full_matrices=False)

    print("U : ", len(U), " ", len(U[0]))
    print("S : ", len(S), " ", len(S))
    print("VT : ", len(VT), " ", len(VT[0]))
    Q = U
    S_diagonal = np.diag(S)
    P = S_diagonal.dot(VT)
    print("P : ", len(P), " ", len(P[0]))

    # Mean of the ratings
    mean_all_ratings = ratings['rating'].mean()

    # Predicting rating
    for i, user_movie in predictions.iterrows():
        if i % 100 == 0:
            print(i, "/", len(predictions))
            
        user = predictions.iloc[i][0]
        movie = predictions.iloc[i][1]

        qi = Q[user - 1, :]
        px = P[:, movie - 1]

        # Calculating global effects

        user_rating = utility_matrix_none.loc[user].values
        movie_rating = utility_matrix_none[movie].values

        mean_user_rating = np.nanmean(user_rating)
        mean_movie_rating = np.nanmean(movie_rating)

        b_x = mean_user_rating - mean_all_ratings
        b_i = mean_movie_rating - mean_all_ratings

        baseline = mean_all_ratings + b_i + b_x
        
        pred = np.dot(qi, px.T)
            
#         print(" ")
#         print("Prediction : ", pred)
#         print("Baseline : ", baseline)
#         print("qi * px : ", qi.dot(px.T))
#         print(" ")
        predictions_ratings.append((i + 1, pred))

    return predictions_ratings

In [9]:
preds_latent_factors = predict_latent_factors(movies_description, users_description, ratings_description, predictions_description)
predictions_latent_factors = pd.DataFrame(preds_latent_factors, columns=['Id', 'Rating'])
predictions_latent_factors.to_csv('submission_latent_factors.csv', index=False)

U :  6040   3706
S :  3706   3706
VT :  3706   3706
P :  3706   3706
0 / 90019
100 / 90019
200 / 90019
300 / 90019
400 / 90019
500 / 90019
600 / 90019
700 / 90019
800 / 90019
900 / 90019
1000 / 90019
1100 / 90019
1200 / 90019
1300 / 90019
1400 / 90019
1500 / 90019
1600 / 90019
1700 / 90019
1800 / 90019
1900 / 90019
2000 / 90019
2100 / 90019
2200 / 90019
2300 / 90019
2400 / 90019
2500 / 90019
2600 / 90019
2700 / 90019
2800 / 90019
2900 / 90019
3000 / 90019
3100 / 90019
3200 / 90019
3300 / 90019
3400 / 90019
3500 / 90019
3600 / 90019
3700 / 90019
3800 / 90019
3900 / 90019
4000 / 90019
4100 / 90019
4200 / 90019
4300 / 90019
4400 / 90019
4500 / 90019
4600 / 90019
4700 / 90019
4800 / 90019
4900 / 90019
5000 / 90019
5100 / 90019
5200 / 90019
5300 / 90019
5400 / 90019
5500 / 90019
5600 / 90019
5700 / 90019
5800 / 90019
5900 / 90019
6000 / 90019
6100 / 90019
6200 / 90019
6300 / 90019
6400 / 90019
6500 / 90019
6600 / 90019
6700 / 90019
6800 / 90019
6900 / 90019
7000 / 90019
7100 / 90019
7200 / 

58900 / 90019
59000 / 90019
59100 / 90019
59200 / 90019
59300 / 90019
59400 / 90019
59500 / 90019
59600 / 90019
59700 / 90019
59800 / 90019
59900 / 90019
60000 / 90019
60100 / 90019
60200 / 90019
60300 / 90019
60400 / 90019
60500 / 90019
60600 / 90019
60700 / 90019
60800 / 90019
60900 / 90019
61000 / 90019
61100 / 90019
61200 / 90019
61300 / 90019
61400 / 90019
61500 / 90019
61600 / 90019
61700 / 90019
61800 / 90019
61900 / 90019
62000 / 90019
62100 / 90019
62200 / 90019
62300 / 90019
62400 / 90019
62500 / 90019
62600 / 90019
62700 / 90019
62800 / 90019
62900 / 90019
63000 / 90019
63100 / 90019
63200 / 90019
63300 / 90019
63400 / 90019
63500 / 90019
63600 / 90019
63700 / 90019
63800 / 90019
63900 / 90019
64000 / 90019
64100 / 90019
64200 / 90019
64300 / 90019
64400 / 90019
64500 / 90019
64600 / 90019
64700 / 90019
64800 / 90019
64900 / 90019
65000 / 90019
65100 / 90019
65200 / 90019
65300 / 90019
65400 / 90019
65500 / 90019
65600 / 90019
65700 / 90019
65800 / 90019
65900 / 90019
66000 