##### Import the required packages

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import os
import seaborn as sns
import pickle
from collections import Counter
from datetime import datetime
from sortedcontainers import SortedList
# We can override the default matplotlib styles with those of Seaborn
sns.set()## Importing the relevant libraries


In [5]:
# Load the data from a .csv
ratings_data = pd.read_csv(os.path.join(os.path.pardir,'data','raw','ratings.csv'))
movies_data = pd.read_csv(os.path.join(os.path.pardir,'data','raw','movies.csv'))

In [6]:
ratings_df = ratings_data.copy()
movies_df = movies_data.copy()

In [7]:
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [8]:
ratings_df.describe(include='all')

Unnamed: 0,userId,movieId,rating,timestamp
count,100004.0,100004.0,100004.0,100004.0
mean,347.01131,12548.664363,3.543608,1129639000.0
std,195.163838,26369.198969,1.058064,191685800.0
min,1.0,1.0,0.5,789652000.0
25%,182.0,1028.0,3.0,965847800.0
50%,367.0,2406.5,4.0,1110422000.0
75%,520.0,5418.0,4.0,1296192000.0
max,671.0,163949.0,5.0,1476641000.0


In [9]:
ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100004 entries, 0 to 100003
Data columns (total 4 columns):
userId       100004 non-null int64
movieId      100004 non-null int64
rating       100004 non-null float64
timestamp    100004 non-null int64
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [10]:
# Make the user IDs go from 0 to N-1
ratings_df['userId'] = ratings_df['userId'] - 1

In [11]:
# Create a mapper for movie IDs
unique_movie_ids = list(ratings_df['movieId'].unique())
movie_id_mapper = {}
count = 0
for movie_id in unique_movie_ids:
    movie_id_mapper[movie_id] = count
    count +=1

# Add them to the databrame
ratings_df['movieId'] = ratings_df['movieId'].apply(lambda x: movie_id_mapper[x])
ratings_df = ratings_df.drop(columns='timestamp', axis=1)

In [12]:
N = ratings_df['userId'].max() + 1 # Number of users
M = ratings_df['movieId'].max() + 1 # number of movies


# user_id_count = dict(ratings_df['userId'].value_counts())
# movie_id_count = dict(ratings_df['movieId'].value_counts())

user_id_count = Counter(ratings_df['userId'])
movie_id_count = Counter(ratings_df['movieId'])

# Number of users and movies we would like to keep
n = 10000
m = 2000

user_ids = [u for u, c in user_id_count.most_common(n)]
movie_ids = [m for m, c in user_id_count.most_common(m)]

# Make a copy, otherwise the original df won't be overwritten
ratings_df_small = ratings_df[ratings_df['userId'].isin(user_ids) & ratings_df['movieId'].isin(movie_ids)].copy()

# Need to remake user_ids and movie_ids since they are no longer sequential
new_user_id_mapper = {}
i = 0
for old in user_ids:
    new_user_id_mapper[old] = i
    i += 1

new_movie_id_mapper = {}
j = 0
for old in movie_ids:
    new_movie_id_mapper[old] = j
    j += 1
    
    
ratings_df_small['userId'] = ratings_df_small['userId'].apply(lambda x: new_user_id_mapper[x])
ratings_df_small['movieId'] = ratings_df_small['movieId'].apply(lambda x: new_movie_id_mapper[x])

ratings_df_small.to_csv(os.path.join(os.path.pardir,'data','processed','small_ratings.csv'))


In [13]:
ratings_df_processed = ratings_df_small.copy()
N_ = ratings_df_processed['userId'].max() + 1 #Number of users
M_ = ratings_df_processed['movieId'].max() + 1 # Number of movies

# Split the df into train and test
ratings_df_train, ratings_df_test = train_test_split(ratings_df_processed, test_size=0.2)

# A dictionary to tell us, which users have rated which movie
user_movie = {}

# A dictionary to tell us, which movies have been rated by which users
movie_user = {}

# A dictionary to lookup ratings
user_movie_ratings = {}

cutoff = int(0.8 * len(ratings_df_processed))
count = 0
def update_user_movie_and_movie_user(row):
    global count
    count += 1
    if count % 100000 == 0:
        print("Processed: %.3f" % (float(count)/cutoff))
    i = int(row['userId'])
    j = int(row['movieId'])
    if i not in user_movie:
        user_movie[i] = [j]
    else:
        user_movie[i].append(j)
    
    if j not in movie_user:
        movie_user[j] = [i]
    else:
        movie_user[j].append(i)
    user_movie_ratings[(i,j)] = row['rating']

ratings_df_train.apply(update_user_movie_and_movie_user, axis=1)
user_movie

{310: [262,
  572,
  582,
  358,
  88,
  623,
  615,
  528,
  259,
  485,
  634,
  597,
  501,
  618,
  583,
  486,
  320,
  131,
  307,
  438,
  150,
  58,
  621,
  635,
  329,
  157,
  466,
  15,
  598,
  587,
  264,
  79,
  359,
  443,
  628,
  533],
 140: [120,
  422,
  167,
  620,
  580,
  539,
  99,
  147,
  485,
  461,
  189,
  520,
  654,
  471,
  184,
  16,
  177,
  24,
  315,
  317,
  69,
  39,
  586,
  393,
  632,
  66,
  587,
  621,
  641,
  669,
  304,
  572,
  497,
  267,
  532,
  142,
  195,
  135,
  478,
  88,
  645,
  458,
  492,
  385,
  452,
  466,
  275,
  463,
  138,
  171,
  663,
  578,
  582,
  18,
  527,
  547,
  274,
  262,
  53,
  490,
  6,
  634,
  121,
  309,
  57,
  4,
  501,
  118,
  215,
  633,
  178,
  161,
  421,
  43,
  535,
  622,
  655,
  616,
  398,
  629,
  639,
  194,
  116,
  339,
  9,
  442,
  62,
  329,
  263,
  638,
  407,
  73],
 260: [341,
  548,
  543,
  537,
  668,
  646,
  660,
  490,
  654,
  138,
  309,
  301,
  428,
  413,
  492,
  389

In [15]:
user_movie_ratings_test = {}

def update_user_movie_and_movie_user_test(row):
    global count
    count += 1
    if count % 100000 == 0:
        print("Processed: %.3f" % (float(count)/cutoff))
    i = int(row['userId'])
    j = int(row['movieId'])
    user_movie_ratings_test[(i, j)] = row['rating']


ratings_df_test.apply(update_user_movie_and_movie_user_test, axis=1)

with open(os.path.join(os.path.pardir,'data','interim','user_movie.json'), 'wb') as f:
    pickle.dump(user_movie, f)

with open(os.path.join(os.path.pardir,'data','interim','movie_user.json'), 'wb') as f:
    pickle.dump(movie_user, f)

with open(os.path.join(os.path.pardir,'data','interim','user_movie_rating.json'), 'wb') as f:
    pickle.dump(user_movie_ratings_test, f)
    

In [16]:
with open(os.path.join(os.path.pardir,'data','interim','user_movie.json'), 'rb') as f:
    user_movie = pickle.load(f)

with open(os.path.join(os.path.pardir,'data','interim','movie_user.json'), 'rb') as f:
    movie_user = pickle.load(f)

with open(os.path.join(os.path.pardir,'data','interim','user_movie_rating.json'), 'rb') as f:
    user_movie_ratings_test = pickle.load(f)


In [18]:
N = max(user_movie.keys()) + 1
# the test set may contain movies that the train set doesn't have
m1 = max(movie_user.keys())
m2 =  max([m for (u,m), r in user_movie_ratings_test.items()])
M = max(m1, m2) + 1

# To find the user-user similarity, you have to do O(N^2 * M) calculations
# In the real world you'd have to parallelize this
# Note: we only have to do half the calculations since w_ij is symetric
K = 25 # The number of neighbours we'd like to consider
limit = 5 # Minimum number of movies the users must have in common
neighbors = [] # store the neighbors in a list
averages = [] # each user's average rating 
deviations = [] #each user's deviation

for i in range(N):
    # find the 25 closes users to i
    try:
        movies_i = user_movie[i]
        movies_i_set = set(movies_i)

        # calculate average and deviation
        ratings_i = {movie:user_movie_ratings[(i, movie)] for movie in movies_i}
        avg_i = np.mean(list(ratings_i.values()))
        dev_i = {movie: (rating-avg_i) for movie, rating in ratings_i.items()}
        dev_i_values = np.array(list(dev_i.values()))
        
        # convert all the values in the deviations dictionary into a numpy array.
        # This is because the denominator in the Pearson correlation is the square root of the sum of squares
        # of the deviations.
        sigma_i = np.sqrt(dev_i_values.dot(dev_i_values))
        
        # Save these for later use
        averages.append(avg_i)
        deviations.append(dev_i)
        sl = SortedList()
        for j in range(N):
            # Don't calculate the correlation with yourself
            if j !=i:
                movies_j = user_movie[j]
                movies_j_set = set(movies_j)
                common_movies = (movies_i_set & movies_j_set) # intersetion
                if len(common_movies) > limit:
                    # calculate average and deviation
                    ratings_j = {movie:user_movie_ratings[(j, movie)] for movie in movies_j}
                    avg_j = np.mean(list(ratings_j.values()))
                    dev_j = {movie: (rating-avg_j) for movie, rating in ratings_j.items()}
                    dev_j_values = np.array(list(dev_j.values()))

                    sigma_j = np.sqrt(dev_j_values.dot(dev_j_values))

                    # calculate the correlation coefficient
                    numerator = sum(dev_i[m] for m in common_movies)
                    w_ij = numerator/(sigma_i * sigma_j)

                    # insert into a sorted list and truncate
                    # negate the weight, because the list is sorted in ascending
                    # maximum values (1) is "closest"
                    sl.add((w_ij, j))
                    if len(sl) > K:
                        del sl[-1]

            # store the neighbors
            neighbors.append(sl)
            
            # print out useful things
            # if i%i == 0:
                # print(i)

    except:
        pass




In [23]:
# using neighbors to calculate train and test MSE
def predict(i, m):
    # calculate the weighted sum of deviation
    numerator = 0
    denominator = 0
    
    for neg_w, j in neighbors[i]:
        # remember, the weight is sorted as its negative
        # so, the negative of the negative weight is positive
        try:
            numerator += -neg_w * deviations[j][m]
            denominator += abs(neg_w)
        except KeyError:
            # neighbors may not have rated the same movie
            # don't want to do dictionary lookup twice
            # so, just throw an exception
            pass
        if denominator == 0:
            prediction = averages[i]
        
        else:
            prediction = numerator / denominator * averages[i]
        prediction = min(5, prediction)
        prediction = max(0.5, prediction) # max rating is 0.5
        return prediction

train_predictions = []
train_targets = []

for (i, m), target in user_movie_ratings.items():
    # calculate the predictions of the movie
    prediction =predict(i, m)
    
    # save the prediction and target
    train_predictions.append(prediction)
    train_targets.append(target)
    

test_predictions = []
test_targets = []

for (i, m), target in user_movie_ratings_test.items():
    # calculate the predictions of the movie
    prediction =predict(i, m)
    
    # save the prediction and target
    test_predictions.append(prediction)
    test_targets.append(target)
    
# calculate accuracy
def mse(p, t):
    p = np.array(p)
    t = np.array(t)
    return np.mean((p - t) ** 2)

print("Train mse", mse(train_predictions, train_targets))

print("Test mse", mse(test_predictions, test_targets))
    

Train mse 1.4861617279246864
Test mse 1.5174217455908632
