# Recommender systems

- Implementing collaborative filtering and latent factors


In [1]:
import numpy as np
import pandas as pd
import os.path
import random
from random import randint
from random import uniform
print(np.version.version)

1.18.5


In [2]:
# -*- coding: utf-8 -*-
"""
### NOTES
This file is an example of what your code should look like. It is written in Python 3.6.
To know more about the expectations, please refer to the guidelines.
"""

#####
##
## DATA IMPORT
##
#####

#Where data is located
movies_file = '../data/movies.csv'
users_file = '../data/users.csv'
ratings_file = '../data/ratings.csv'
predictions_file = '../data/predictions.csv'
submission_file = '../data/submission.csv'

# movies_file = r'/prediction/data/movies.csv'
# users_file = '/prediction/data/users.csv'
# ratings_file = '/prediction/data/ratings.csv'
# predictions_file = '/prediction/data/predictions.csv'
# submission_file = '/data/submission.csv'

# Read the data using pandas
movies_description = pd.read_csv(movies_file, delimiter=';', 
                                 dtype={'movieID':'int', 'year':'int', 'movie':'str'}, names=['movieID', 'year', 'movie'])
users_description = pd.read_csv(users_file, delimiter=';', 
                                dtype={'userID':'int', 'gender':'str', 'age':'int', 'profession':'int'}, names=['userID', 'gender', 'age', 'profession'])
ratings_description = pd.read_csv(ratings_file, delimiter=';', 
                                  dtype={'userID':'int', 'movieID':'int', 'rating':'float64'}, names=['userID', 'movieID', 'rating'])
predictions_description = pd.read_csv(predictions_file, delimiter=';', 
                                      dtype={'userID':'int', 'movieID':'int'}, names=['userID', 'movieID'], header=None)

In [None]:
# THIS IS USELESS, THIS IS JUST TO SAVE MY CUSTOM PEARSON CORRELATION

def pearson(r1, r2, min_len):
     
    # Removing all zero elements from both arrays
    r1zero = np.where(r1 == 0)[0]
    r2zero = np.where(r2 == 0)[0]
    r_1 = np.delete(r1, r2zero)
    r_2 = np.delete(r2, r1zero)
    r_1 = r_1[r_1 != 0]
    r_2 = r_2[r_2 != 0]
#     print("RESULTS SHOULD BE: ", stats.pearsonr(r_1, r_2))


    # test if arrays only have a few elements in common that aren't 0
    if len(r_1) < min_len:
        return None
    
    top = np.sum((r_1 - np.mean(r_1)) * (r_2 - np.mean(r_2)))
    botleft = np.sqrt(np.sum(np.square(r_1 - np.mean(r_1))))
    botright = np.sqrt(np.sum(np.square(r_2 - np.mean(r_2))))
    if botleft * botright == 0:
        return None
    res = top / (botleft * botright)
    return res

In [9]:
#####
##
## COLLABORATIVE FILTERING
##
#####

# minimal elements to have a rating on for two movies to be considered a neighbour. 
# Otherwise a movie with one rating and rest all zeroes is a good neighbour to all movies with that rating by that one user

def predict_collaborative_filtering(movies, users, ratings, predictions, neighbours, min_periods, print_output = False):
    
    predictions_ratings = []
    
    
#     Creating utility matrix 'u' : User x Movie -> Rating 
    utility_matrix = ratings.pivot_table(index='movieID', columns='userID', values='rating',
                                         fill_value=0)
    utility_matrix_none = ratings.pivot_table(index='userID', columns='movieID', values='rating',
                                         fill_value=None)
 
    # Add columns to the utility matrix for movies that are never rated
    cols = utility_matrix_none.columns
    for i in movies['movieID'].values:
        if i not in cols:
            utility_matrix_none[str(i)] = np.nan

        
    corr = utility_matrix_none.corr(min_periods=min_periods)
    
    # I don't know why, but somehow saving this in a csv and loading it back up again fixes some errors
    corr.to_csv(r'tempcorr.csv')
    corr = pd.read_csv(r'tempcorr.csv')

    if print_output:
        print("\n>>>UTILITY MATRIX\n")
        print(utility_matrix_none)
        print("\n>>>CORR MATRIX\n")
        print(corr)
        print("\n>>>TO PREDICT")
        print(predictions)
        print("\n\n>>>STARTING PREDICTION \n\n")
    
    # For every prediction to make (item/item, or movie/movie in this case)
    for i in range(len(predictions)):
#         if i % 100 == 0:
#             print(i, "/", len(predictions))
        user = predictions.iloc[i][0]
        movie = predictions.iloc[i][1]
          
        
        c = corr[['movieID', str(movie)]]
        
        # Sort the pearson correlation for all movies to the current movie to predict
        sorted_pearson = c.sort_values(by = [str(movie)], axis = 0, ascending = False)
        
        # Delete the movie itself, it should not be checked
        sorted_pearson = sorted_pearson[sorted_pearson.movieID != movie]
        
        # Get the movie id's of the sorted movies
        sorted_movies = sorted_pearson['movieID'].values
        sorted_corr = sorted_pearson[str(movie)].values
        
        # Add a certain amount of neirest neighbours, this amount is specified by the n_neighbours variable
        relevant_ratings = []
        for m in range(0, len(sorted_movies)):
            mov = sorted_movies[m]
            rating = utility_matrix_none.at[user, mov]
            if not np.isnan(rating):
                relevant_ratings.append((rating, sorted_corr[m]))
                if len(relevant_ratings) == neighbours:
                    break
        
        relevant_ratings = np.array(relevant_ratings)
        
        total_weight = np.sum(relevant_ratings, axis = 0)[1]
        pred = 0
        for j in range(len(relevant_ratings)):
            pred += relevant_ratings[j, 0] * relevant_ratings[j, 1] / total_weight
        
        # If the rating can't be calculated, set it to 3 as average
        if np.isnan(pred) or pred == 0:
            pred = 2.5
        
        if print_output:
            print("\n>>>>>>>>>>>>STARTING PREDICTION NUMBER", i + 1, "\nUser:", user, "\nMovie:", movie, "\n")
            print("\n>>SORTED PEARSON CORRELATION MATRIX\n")
            print(sorted_pearson)
            print("\n>>RELEVANT RATINGS AND THEIR WEIGHTS\n")
            print(relevant_ratings)
            print("\n>>FINAL PREDICTION: ", pred)
        predictions_ratings.append((i + 1, pred))
    return predictions_ratings

In [6]:
# Predict the submission and put it in csv
min_elements_non_zero = 5
n_neighbours = 5

predictions = predict_collaborative_filtering(movies_description,
                                                     users_description, ratings_description, predictions_description.head, n_neighbours, min_elements_non_zero)
print(predictions)
predictions_df = pd.DataFrame(predictions, columns = ['Id', 'Rating'])
predictions_df.to_csv('submission.csv', index=False)

      movieID      1914
1533     1539  1.000000
379       380  1.000000
1905     1914  1.000000
1968     1977  0.975665
3161     3172  0.973329
...       ...       ...
3701     1642       NaN
3702     1645       NaN
3703     2395       NaN
3704     3153       NaN
3705     3226       NaN

[3706 rows x 2 columns]
      movieID      1914
1533     1539  1.000000
379       380  1.000000
1968     1977  0.975665
3161     3172  0.973329
177       178  0.968330
...       ...       ...
3701     1642       NaN
3702     1645       NaN
3703     2395       NaN
3704     3153       NaN
3705     3226       NaN

[3705 rows x 2 columns]
      movieID      2124
2115     2124  1.000000
3682     3694  1.000000
3052     3062  0.988212
3107     3117  0.968246
7           8  0.968246
...       ...       ...
3701     1642       NaN
3702     1645       NaN
3703     2395       NaN
3704     3153       NaN
3705     3226       NaN

[3706 rows x 2 columns]
      movieID      2124
3682     3694  1.000000
3052     3062

In [None]:
#delete import later
from datetime import datetime

# Creates random test prediction scores (so we can test our RMSE)
def create_random_tests(ratings, amount, seed):
    if seed != -1:
        random.seed(seed)
    predictions = []
    solutions = []
    for i in range(amount):
        r = random.randint(0, 910189)
        predictions.append(ratings.loc[r][0:2])
        solutions.append(ratings.loc[r][2])
    predictions_df = pd.DataFrame(predictions, columns = ['userID', 'movieID'], dtype = 'int')
    
    return (predictions_df, solutions)


# Calculates root mean squared error
def rmse(pred, sol):
    return np.sqrt(((np.array(pred) - np.array(sol)) ** 2).mean())

# Runs the create tests method, and then executes these tests
def run_tests(amount, seed = -1, neighbours = [5], min_periods = [5]):
    (random_test_predictions, random_test_solutions) = create_random_tests(ratings_description, amount, seed = seed)
    test_results = []
    
    total_tests = len(neighbours) * len(min_periods)
    curr_test = 1
    # For each pair of neighbour/min_period, get the rmse of the result
    for n in neighbours:
        for p in min_periods:
            predictions = predict_collaborative_filtering(movies_description,
                                                     users_description, ratings_description, random_test_predictions, n, p)
            predictions = [x[1] for x in predictions]
            test_results.append((n, p, seed, rmse(predictions, random_test_solutions)))
            
            #print the time, so we know how long it takes, delete later
            now = datetime.now()
            current_time = now.strftime("%H:%M:%S")
            
            print("Done with test", curr_test, "/", total_tests, " Time = ", current_time)
            curr_test += 1
            
    return pd.DataFrame(test_results, columns = ['n_neighbours', 'min_periods', 'seed', 'rmse'], dtype = 'int')


neighbours_to_test = [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
min_periods_to_test = [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
test_results = run_tests(100, seed = 42, neighbours = neighbours_to_test, min_periods = min_periods_to_test)
test_results.to_csv('test_results.csv', index=False)

Done with test 1 / 169  Time =  09:42:01


In [None]:
#     '''
#     Creating matrix for cosine similarity
#     '''
#     r = ratings \
#     .groupby('movieID', as_index=False, sort=False) \
#     .mean() \
#     .rename(columns={'movieID': 'movieID', 'rating' : 'mean_rating'})
#     r.drop('userID', axis=1, inplace=True)
    
#     new_r = ratings.merge(r, how='left', on='movieID', sort=False)
#     new_r['centered_cosine'] = new_r['rating'] - new_r['mean_rating']
    
#     centered_cosine = new_r \
#     .pivot_table(index='movieID', columns='userID', values='centered_cosine') \
#     .fillna(0)
    
    
#     all_movies_numpy = centered_cosine.values
#     for i, row in centered_cosine.iterrows():
#         if(i in range_missing):
#             all_movies_numpy = np.vstack([all_movies_numpy, row.values])
            
            
#     '''
#     Cosine similarity - find similar users for a certain user based on |N|,
#     also making a prediction with Pearson correlation
#     '''
#     for i, user_movie in predictions.iterrows():
#         print("CURRENT MOVIE : ", user_movie['movieID'])
#         current_movie = all_movies_numpy[user_movie['movieID'] - 1]
#         current_rating = original_rating[user_movie['movieID'] - 1][user_movie['userID'] - 1]
#         if(current_rating > 0):
#              predictions_ratings.at[i, 'Rating'] = current_rating
#              continue
        
#         current_denominator = np.sqrt(sum([np.square(x) for x in current_movie]))
#         top_N_similar_movies = []
        
#         # Computing similarities to current movie that we want to predict for particular user
#         for id_movie, movie in enumerate(all_movies_numpy):
            
#             numerator = [x*y for x, y in zip(current_movie, movie)]
#             other_denominator = np.sqrt(sum([np.square(x) for x in movie]))
#             costheta = sum(numerator) / (current_denominator * other_denominator)
#             top_N_similar_movies.append((id_movie + 1, costheta))
            
#         # Get N similar items
#         top_N_similar_movies.sort(key=lambda pair: pair[1], reverse=True)
#         similar_movies = top_N_similar_movies[0:5]
#         print("PAIR : ", "first element =" , similar_movies[0][0], "second element =", similar_movies[0][1])
        
#         #Predicting the rating with Pearson correlation
#         pearson_denominator = sum([pair[1] for pair in similar_movies])
#         pearson_numerator = 0
#         for i in range(0, 5):
#             pearson_numerator += similar_movies[i][1] * original_rating[similar_movies[i][0] - 1][user_movie['userID'] - 1]
        
#         print("Predicting...", pearson_numerator, " / ", pearson_denominator)
#         predictions_ratings.at[i, 'Rating'] = (pearson_numerator / pearson_denominator)
#         print("Predicted rating : ", predictions_ratings.at[i, 'Rating'])
    
    return predictions_ratings
            
    
    
    pass


#####
##
## LATENT FACTORS
##
#####

def predict_latent_factors(movies, users, ratings, predictions):
    ## TO COMPLETE

    pass


#####
##
## FINAL PREDICTORS
##
#####

def predict_final(movies, users, ratings, predictions):
    ## TO COMPLETE

    pass


rating_predictions = predict_collaborative_filtering(movies_description,
                                                     users_description, ratings_description, predictions_description)


#####
##
## RANDOM PREDICTORS
## //!!\\ TO CHANGE
##
#####

#By default, predicted rate is a random classifier
def predict_random(movies, users, ratings, predictions):
    number_predictions = len(predictions)

    return [[idx, randint(1, 5)] for idx in range(1, number_predictions + 1)]

#####
##
## SAVE RESULTS
##
#####    


# ## //!!\\ TO CHANGE by your prediction function
# submission_read = pd.read_csv(submission_file)
# submission_read.columns = ['id', 'rating']

# predictions = predict_random(movies_description, users_description, ratings_description, predictions_description)
# print(predictions)
# predictions_df = pd.DataFrame(predictions, columns = ['Id', 'Rating'])

# submission_result = submission_read.merge(predictions_df, how='left', left_on='id', right_on='Id')
# submission_result.drop('id', axis=1, inplace=True)
# submission_result.drop('rating', axis=1, inplace=True)
# submission_result.head()
# submission_result.to_csv('submission.csv', index=False)