In [1]:
import numpy as np
import pandas as pd
import os.path
import random
from random import randint
from random import uniform
# Record the NumPy version this notebook was executed with.
print(np.__version__)

1.16.5


In [2]:
# -*- coding: utf-8 -*-
"""
### NOTES
This file is an example of what your code should look like. It is written in Python 3.6.
To know more about the expectations, please refer to the guidelines.
"""

#####
##
## DATA IMPORT
##
#####

# Input/output locations, relative to the notebook's working directory.
movies_file = '../data/movies.csv'
users_file = '../data/users.csv'
ratings_file = '../data/ratings.csv'
predictions_file = '../data/predictions.csv'
submission_file = '../data/submission.csv'

# Alternative absolute locations (kept for reference, disabled):
# movies_file = r'/prediction/data/movies.csv'
# users_file = '/prediction/data/users.csv'
# ratings_file = '/prediction/data/ratings.csv'
# predictions_file = '/prediction/data/predictions.csv'
# submission_file = '/data/submission.csv'

# Every input is a ';'-separated file without a header row, so the column
# names are supplied here and the first line is parsed as data.
movies_description = pd.read_csv(
    movies_file,
    sep=';',
    header=None,
    names=['movieID', 'year', 'movie'],
    dtype={'movieID':'int', 'year':'int', 'movie':'str'},
)
users_description = pd.read_csv(
    users_file,
    sep=';',
    header=None,
    names=['userID', 'gender', 'age', 'profession'],
    dtype={'userID':'int', 'gender':'str', 'age':'int', 'profession':'int'},
)
ratings_description = pd.read_csv(
    ratings_file,
    sep=';',
    header=None,
    names=['userID', 'movieID', 'rating'],
    dtype={'userID':'int', 'movieID':'int', 'rating':'float64'},
)
# The (user, movie) pairs for which a rating has to be predicted.
predictions_description = pd.read_csv(
    predictions_file,
    sep=';',
    header=None,
    names=['userID', 'movieID'],
    dtype={'userID':'int', 'movieID':'int'},
)

In [3]:
#####
##
## COLLABORATIVE FILTERING
##
#####

# `min_periods` is the minimum number of users that must have rated BOTH movies
# before the pair is given a correlation (and can therefore be neighbours).
# Without this threshold, a movie with a single rating would look like a good
# neighbour of every other movie rated by that same user.

def predict_collaborative_filtering(movies, users, ratings, predictions, neighbours, min_periods, print_output = False, corr = None):
    """Predict ratings with item-item collaborative filtering on top of a baseline.

    For a (user x, movie i) pair the prediction is

        b_xi + sum_j s_ij * (r_xj - b_xj) / sum_j |s_ij|

    where b_xi = global mean + user bias + movie bias, s_ij is the Pearson
    correlation between movies i and j, and j runs over the movies rated by
    the user that are most correlated with i. The result is clipped to [1, 5].

    Parameters
    ----------
    movies : DataFrame with a 'movieID' column; movies that nobody rated are
        added to the utility matrix as all-NaN columns.
    users : unused, kept so existing callers keep working.
    ratings : DataFrame with 'userID', 'movieID' and 'rating' columns.
    predictions : DataFrame whose first column holds user ids and whose second
        column holds movie ids.
    neighbours : maximum number of neighbour movies per prediction. A
        non-positive value (or None) uses every eligible neighbour.
    min_periods : minimum number of co-ratings for a movie pair to get a
        correlation (forwarded to ``DataFrame.corr``).
    print_output : when True, print the intermediate matrices and the details
        of every prediction.
    corr : optional precomputed movie/movie correlation matrix. Either a frame
        indexed by movie id, or a frame with a 'movieID' column; column labels
        are movie ids (ints or their string form).

    Returns
    -------
    list of (int, float)
        One ``(row_number, rating)`` tuple per row of ``predictions``, with
        row numbers starting at 1.
    """
    # Utility matrix: one row per user, one column per movie, NaN = not rated.
    utility_matrix_none = ratings.pivot_table(index='userID', columns='movieID', values='rating')

    # Add (all-NaN) columns for movies that were never rated.
    rated_movies = set(utility_matrix_none.columns)
    unrated_movies = [m for m in dict.fromkeys(movies['movieID'].tolist()) if m not in rated_movies]
    if unrated_movies:
        utility_matrix_none = utility_matrix_none.reindex(
            columns=list(utility_matrix_none.columns) + unrated_movies)

    if corr is None:
        corr = utility_matrix_none.corr(min_periods=min_periods)

    # Bring the correlations into one fixed layout, in memory: a 'movieID'
    # column plus one column per movie labelled with the movie id as a string.
    # (This is the layout a CSV round trip would produce, without touching disk.)
    if 'movieID' not in corr.columns:
        corr = corr.rename_axis(index='movieID').reset_index()
    corr = corr.rename(columns=str)
    corr_movie_ids = corr['movieID'].values

    if print_output:
        print("\n>>>UTILITY MATRIX\n")
        print(utility_matrix_none)
        print("\n>>>CORR MATRIX\n")
        print(corr)
        print("\n>>>TO PREDICT")
        print(predictions)
        print("\n\n>>>STARTING PREDICTION \n\n")

    # Global, per-user and per-movie means are loop invariant: compute them once.
    mean_all_ratings = ratings['rating'].mean()
    user_means = utility_matrix_none.mean(axis=1)
    movie_means = utility_matrix_none.mean(axis=0)

    limit = neighbours if (neighbours is not None and neighbours > 0) else None
    user_ids = predictions.iloc[:, 0].values
    movie_ids = predictions.iloc[:, 1].values
    n_predictions = len(predictions)

    predictions_ratings = []

    # For every prediction to make (item/item, or movie/movie in this case)
    for i in range(n_predictions):
        if i % 100 == 0:
            print(i, "/", n_predictions)
        user = user_ids[i]
        movie = movie_ids[i]

        # Baseline. An unknown user or an unrated movie has no observed bias,
        # so its bias is taken as 0 instead of poisoning the estimate with NaN.
        mean_user_rating = user_means.get(user, np.nan)
        mean_movie_rating = movie_means.get(movie, np.nan)
        b_x = 0.0 if np.isnan(mean_user_rating) else mean_user_rating - mean_all_ratings
        b_i = 0.0 if np.isnan(mean_movie_rating) else mean_movie_rating - mean_all_ratings
        baseline = mean_all_ratings + b_i + b_x

        # Nearest neighbours: movies rated by this user, excluding the target
        # movie and every movie without a defined correlation to it, ordered by
        # decreasing correlation.
        column = str(movie)
        if user in utility_matrix_none.index and column in corr.columns:
            user_ratings = utility_matrix_none.loc[user].dropna()
            similarities = pd.Series(corr[column].values, index=corr_movie_ids)
            similarities = similarities[corr_movie_ids != movie]
            similarities = similarities.reindex(user_ratings.index).dropna()
            similarities = similarities.sort_values(ascending=False, kind='mergesort').iloc[:limit]
        else:
            user_ratings = pd.Series(dtype=float)
            similarities = pd.Series(dtype=float)

        weights = similarities.values
        neighbour_ratings = user_ratings.reindex(similarities.index).values
        total_weight = np.abs(weights).sum()

        pred = baseline
        # Without usable neighbours (none, or all weights zero) keep the baseline.
        if total_weight > 0:
            b_j = movie_means.reindex(similarities.index).values - mean_all_ratings
            b_xj = mean_all_ratings + b_j + b_x
            pred = baseline + np.dot(neighbour_ratings - b_xj, weights) / total_weight

        # Last resort if the rating still can't be calculated: the global mean.
        if np.isnan(pred):
            pred = mean_all_ratings

        # Ratings live on a 1-5 scale.
        pred = float(min(5.0, max(1.0, pred)))

        if print_output:
            print("\n>>>>>>>>>>>>STARTING PREDICTION NUMBER", i + 1, "\nUser:", user, "\nMovie:", movie, "\n")
            print("\n>>SORTED PEARSON CORRELATION MATRIX\n")
            print(similarities)
            print("\n>>RELEVANT RATINGS AND THEIR WEIGHTS\n")
            print(pd.DataFrame({'rating': neighbour_ratings, 'weight': weights}, index=similarities.index))
            print("\n>>FINAL PREDICTION: ", pred)
        predictions_ratings.append((i + 1, pred))
    return predictions_ratings

In [4]:
# Hyper-parameters of the neighbourhood model:
#   min_elements_non_zero - passed as `min_periods` to the prediction function
#   n_neighbours          - passed as `neighbours` to the prediction function
min_elements_non_zero = 27
n_neighbours = 30

# Predict every (user, movie) pair of the predictions file.
preds = predict_collaborative_filtering(
    movies=movies_description,
    users=users_description,
    ratings=ratings_description,
    predictions=predictions_description,
    neighbours=n_neighbours,
    min_periods=min_elements_non_zero,
)
print(preds)

# One row per prediction: 1-based id and the predicted rating.
predictions_df = pd.DataFrame(preds, columns=['Id', 'Rating'])

# Prediction ids previously listed as `missing_rating` (kept for reference, disabled):
# missing_rating = [12472, 13332, 30741, 48165, 49314, 51435, 54181, 62588, 65517, 68123, 85972]
# for id_rating in missing_rating:
#     predictions_df.at[id_rating, 'Id']

# Write the submission file without the DataFrame index.
predictions_df.to_csv('submission_baseline_enhanced.csv', index=False)

0 / 90019
100 / 90019
200 / 90019
300 / 90019
400 / 90019
500 / 90019
600 / 90019
700 / 90019
800 / 90019
900 / 90019
1000 / 90019
1100 / 90019
1200 / 90019
1300 / 90019
1400 / 90019
1500 / 90019
1600 / 90019
1700 / 90019
1800 / 90019
1900 / 90019
2000 / 90019
2100 / 90019
2200 / 90019
2300 / 90019
2400 / 90019
2500 / 90019
2600 / 90019
2700 / 90019
2800 / 90019
2900 / 90019
3000 / 90019
3100 / 90019
3200 / 90019
3300 / 90019
3400 / 90019
3500 / 90019
3600 / 90019
3700 / 90019
3800 / 90019
3900 / 90019
4000 / 90019
4100 / 90019
4200 / 90019
4300 / 90019
4400 / 90019
4500 / 90019
4600 / 90019
4700 / 90019
4800 / 90019
4900 / 90019
5000 / 90019
5100 / 90019
5200 / 90019
5300 / 90019
5400 / 90019
5500 / 90019
5600 / 90019
5700 / 90019
5800 / 90019
5900 / 90019
6000 / 90019
6100 / 90019
6200 / 90019
6300 / 90019
6400 / 90019
6500 / 90019
6600 / 90019
6700 / 90019
6800 / 90019
6900 / 90019
7000 / 90019
7100 / 90019
7200 / 90019
7300 / 90019
7400 / 90019
7500 / 90019
7600 / 90019
7700 / 9001



12500 / 90019
12600 / 90019
12700 / 90019
12800 / 90019
12900 / 90019
13000 / 90019
13100 / 90019
13200 / 90019
13300 / 90019
13400 / 90019
13500 / 90019
13600 / 90019
13700 / 90019
13800 / 90019
13900 / 90019
14000 / 90019
14100 / 90019
14200 / 90019
14300 / 90019
14400 / 90019
14500 / 90019
14600 / 90019
14700 / 90019
14800 / 90019
14900 / 90019
15000 / 90019
15100 / 90019
15200 / 90019
15300 / 90019
15400 / 90019
15500 / 90019
15600 / 90019
15700 / 90019
15800 / 90019
15900 / 90019
16000 / 90019
16100 / 90019
16200 / 90019
16300 / 90019
16400 / 90019
16500 / 90019
16600 / 90019
16700 / 90019
16800 / 90019
16900 / 90019
17000 / 90019
17100 / 90019
17200 / 90019
17300 / 90019
17400 / 90019
17500 / 90019
17600 / 90019
17700 / 90019
17800 / 90019
17900 / 90019
18000 / 90019
18100 / 90019
18200 / 90019
18300 / 90019
18400 / 90019
18500 / 90019
18600 / 90019
18700 / 90019
18800 / 90019
18900 / 90019
19000 / 90019
19100 / 90019
19200 / 90019
19300 / 90019
19400 / 90019
19500 / 90019
19600 

71100 / 90019
71200 / 90019
71300 / 90019
71400 / 90019
71500 / 90019
71600 / 90019
71700 / 90019
71800 / 90019
71900 / 90019
72000 / 90019
72100 / 90019
72200 / 90019
72300 / 90019
72400 / 90019
72500 / 90019
72600 / 90019
72700 / 90019
72800 / 90019
72900 / 90019
73000 / 90019
73100 / 90019
73200 / 90019
73300 / 90019
73400 / 90019
73500 / 90019
73600 / 90019
73700 / 90019
73800 / 90019
73900 / 90019
74000 / 90019
74100 / 90019
74200 / 90019
74300 / 90019
74400 / 90019
74500 / 90019
74600 / 90019
74700 / 90019
74800 / 90019
74900 / 90019
75000 / 90019
75100 / 90019
75200 / 90019
75300 / 90019
75400 / 90019
75500 / 90019
75600 / 90019
75700 / 90019
75800 / 90019
75900 / 90019
76000 / 90019
76100 / 90019
76200 / 90019
76300 / 90019
76400 / 90019
76500 / 90019
76600 / 90019
76700 / 90019
76800 / 90019
76900 / 90019
77000 / 90019
77100 / 90019
77200 / 90019
77300 / 90019
77400 / 90019
77500 / 90019
77600 / 90019
77700 / 90019
77800 / 90019
77900 / 90019
78000 / 90019
78100 / 90019
78200 

In [None]:
12472
13332
30741
48165
49314
51435
54181
62588
65517
68123
85972

In [None]:
# Sanity check of a previously written submission file.
submission_test = pd.read_csv(r'submission_baseline_test.csv')

# Rows whose rating is above 5.5.
# NOTE(review): despite the name, `test_ok` holds the rows ABOVE the threshold, and
# 5.5 differs from the cap of 5 applied when predicting - confirm this is intended.
test_ok = submission_test.loc[submission_test['Rating'].gt(5.5)]
# Rows whose rating is below 1.
test_below = submission_test.loc[submission_test['Rating'].lt(1)]

test_ok.info()
test_below.info()

# Rating stored under index label 12471.
print(submission_test.loc[12471, 'Rating'])