In [9]:
import numpy as np
import pandas as pd
import scipy
from scipy.stats import pearsonr
from sklearn.preprocessing import normalize
import math

In [10]:
rs_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']

In [11]:
ratings = pd.read_csv('ml-100k/ua.base', sep='\t', names=rs_cols, encoding='latin-1')

In [12]:
ratings

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,1,1,5,874965758
1,1,2,3,876893171
2,1,3,4,878542960
3,1,4,3,876893119
4,1,5,3,889751712
...,...,...,...,...
90565,943,1047,2,875502146
90566,943,1074,4,888640250
90567,943,1188,3,888640250
90568,943,1228,3,888640275


In [16]:
user_ratings = ratings.groupby('user_id')

In [17]:
user_ratings = {}
for k, v in ratings.groupby('user_id'):
    user_ratings[k] = dict(zip(v['movie_id'].values, v['rating'].values))
user_ratings_test = dict(list(user_ratings.items())[:1000])

In [18]:
class userUserBasedCF(object):
    def __init__(self, dataset):
        self.dataset = dataset
        self.means = {}
        for user, user_ratings in self.dataset.items():
            mean = np.mean(np.array(list(user_ratings.values())))
            self.means[user] = mean

    def pearson_correlation(self, user1, user2):
        """
        user1, user2: dictionaries
        """
        common_movies = sorted(set(user1).intersection(set(user2)))
        if len(common_movies) != 0 and len(common_movies) != 1:
            user1_ratings = np.squeeze(normalize(np.array([user1[movie] for movie in common_movies])[np.newaxis, :]))
            user2_ratings = np.squeeze(normalize(np.array([user2[movie] for movie in common_movies])[np.newaxis, :]))
            corr = pearsonr(user1_ratings, user2_ratings)[0]
        else:
            corr = 0
            #print("No common movies")
        return corr
    
    def knn(self, user, k):
        """
        user: user_id
        k: number of KNN
        """
        neighbours={}
        i = 0
        for user_id, user_data in self.dataset.items():
            if user_id == user:
                continue
            corr = self.pearson_correlation(self.dataset[user], user_data)
            neighbours[user_id] = corr
            i+=1
        sort = sorted(neighbours.items(), key=lambda x: x[1], reverse = True)
        knn = sort[:k]
        knn_user_ids = [user_id for user_id, user_corr in knn]
        print("KNN")
        return knn_user_ids
    
    def predict(self, user, movie_id, knn):
        """
        user: user_id
        movie_id: movie_id
        knn: knn_user_ids
        
        prediction = mean_rating_of_active_user + sum_over_knn(user_rating_for_i * pearson(user, active_user))/sum_over_knn(pearson(user, active_user))
        """
        mean_user_rating = self.means[user]
        print("Mean user rating for the user is ", mean_user_rating)
        iter_rating = 0.0
        pear_corr = 0.0
        for i, element in enumerate(knn):
            temp_corr = self.pearson_correlation(self.dataset[user], self.dataset[element])
            if math.isnan(temp_corr):
                continue
            if movie_id in self.dataset[element].keys():
                iter_rating += (self.dataset[element][movie_id]-self.means[element]) * temp_corr
            else:
                iter_rating += 0
            pear_corr += temp_corr
            
        pred = mean_user_rating + iter_rating/pear_corr
        return pred

In [19]:
cf = userUserBasedCF(user_ratings_test)

In [20]:
import warnings
warnings.filterwarnings('ignore')
knn = cf.knn(3, 100)
cf.predict(3, 45, knn)

KNN
Mean user rating for the user is  2.909090909090909


2.8676009731020873

In [31]:
knn

[27,
 34,
 36,
 41,
 42,
 45,
 46,
 48,
 481,
 56,
 236,
 29,
 24,
 52,
 14,
 10,
 44,
 49,
 57,
 47,
 8,
 54,
 15,
 33,
 5,
 9,
 11,
 12,
 17,
 20,
 22,
 23,
 25,
 32,
 37,
 40,
 51,
 53,
 55,
 13,
 7,
 1,
 4,
 35,
 2,
 43,
 18,
 16,
 21,
 26,
 30,
 58,
 6,
 31,
 39,
 28,
 19,
 50,
 38,
 60,
 65,
 66,
 71,
 75,
 81,
 82,
 84,
 74,
 83,
 86,
 96,
 101,
 117,
 88,
 68,
 72,
 91,
 64,
 85,
 62,
 76,
 113,
 59,
 104,
 116,
 90,
 69,
 109,
 73,
 94,
 67,
 93,
 95,
 97,
 98,
 103,
 106,
 108,
 114,
 63]

In [21]:
ratings_base = pd.read_csv('ml-100k/ua.base', sep='\t', names=rs_cols, encoding='latin-1')

In [22]:
n_users_base = ratings_base['user_id'].unique().max()
n_items_base = ratings_base['movie_id'].unique().max()

n_users_base,n_items_base

(943, 1682)

In [23]:
train_matrix = np.zeros((n_users_base, n_items_base))
for line in ratings_base.itertuples():
    train_matrix[line[1]-1,line[2]-1] = line[3]

In [25]:
train_matrix[3,45]

0.0

In [26]:
user_ratings

{1: {1: 5,
  2: 3,
  3: 4,
  4: 3,
  5: 3,
  6: 5,
  7: 4,
  8: 1,
  9: 5,
  10: 3,
  11: 2,
  12: 5,
  13: 5,
  14: 5,
  15: 5,
  16: 5,
  17: 3,
  18: 4,
  19: 5,
  21: 1,
  22: 4,
  23: 4,
  24: 3,
  25: 4,
  26: 3,
  27: 2,
  28: 4,
  29: 1,
  30: 3,
  31: 3,
  32: 5,
  34: 2,
  35: 1,
  36: 2,
  37: 2,
  38: 3,
  39: 4,
  40: 3,
  41: 2,
  42: 5,
  43: 4,
  44: 5,
  45: 5,
  46: 4,
  47: 4,
  48: 5,
  49: 3,
  50: 5,
  51: 4,
  52: 4,
  53: 3,
  54: 3,
  55: 5,
  56: 4,
  57: 5,
  58: 4,
  59: 5,
  60: 5,
  62: 3,
  63: 2,
  64: 5,
  65: 4,
  66: 4,
  67: 3,
  68: 4,
  69: 3,
  70: 3,
  71: 3,
  72: 4,
  73: 3,
  74: 1,
  75: 4,
  76: 4,
  77: 4,
  78: 1,
  79: 4,
  80: 4,
  81: 5,
  82: 5,
  83: 3,
  84: 4,
  85: 3,
  86: 5,
  87: 5,
  88: 4,
  89: 5,
  90: 4,
  91: 5,
  92: 3,
  93: 5,
  94: 2,
  95: 4,
  96: 5,
  97: 3,
  98: 4,
  99: 3,
  100: 5,
  101: 2,
  102: 2,
  103: 1,
  104: 1,
  105: 2,
  106: 4,
  107: 4,
  108: 5,
  109: 5,
  110: 1,
  111: 5,
  112: 1,
  113: 5,
  

In [27]:
user_ratings_test

{1: {1: 5,
  2: 3,
  3: 4,
  4: 3,
  5: 3,
  6: 5,
  7: 4,
  8: 1,
  9: 5,
  10: 3,
  11: 2,
  12: 5,
  13: 5,
  14: 5,
  15: 5,
  16: 5,
  17: 3,
  18: 4,
  19: 5,
  21: 1,
  22: 4,
  23: 4,
  24: 3,
  25: 4,
  26: 3,
  27: 2,
  28: 4,
  29: 1,
  30: 3,
  31: 3,
  32: 5,
  34: 2,
  35: 1,
  36: 2,
  37: 2,
  38: 3,
  39: 4,
  40: 3,
  41: 2,
  42: 5,
  43: 4,
  44: 5,
  45: 5,
  46: 4,
  47: 4,
  48: 5,
  49: 3,
  50: 5,
  51: 4,
  52: 4,
  53: 3,
  54: 3,
  55: 5,
  56: 4,
  57: 5,
  58: 4,
  59: 5,
  60: 5,
  62: 3,
  63: 2,
  64: 5,
  65: 4,
  66: 4,
  67: 3,
  68: 4,
  69: 3,
  70: 3,
  71: 3,
  72: 4,
  73: 3,
  74: 1,
  75: 4,
  76: 4,
  77: 4,
  78: 1,
  79: 4,
  80: 4,
  81: 5,
  82: 5,
  83: 3,
  84: 4,
  85: 3,
  86: 5,
  87: 5,
  88: 4,
  89: 5,
  90: 4,
  91: 5,
  92: 3,
  93: 5,
  94: 2,
  95: 4,
  96: 5,
  97: 3,
  98: 4,
  99: 3,
  100: 5,
  101: 2,
  102: 2,
  103: 1,
  104: 1,
  105: 2,
  106: 4,
  107: 4,
  108: 5,
  109: 5,
  110: 1,
  111: 5,
  112: 1,
  113: 5,
  

In [30]:
len(user_ratings_test.keys())

943