In [1]:
import pandas as pd

data = pd.read_csv('http://antoninofurnari.it/downloads/movieratings.csv')
data.info()
data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100003 entries, 0 to 100002
Data columns (total 4 columns):
user_id       100003 non-null int64
item_id       100003 non-null int64
rating        100003 non-null int64
item_title    100003 non-null object
dtypes: int64(3), object(1)
memory usage: 3.1+ MB


Unnamed: 0,user_id,item_id,rating,item_title
0,0,50,5,Star Wars (1977)
1,290,50,5,Star Wars (1977)
2,79,50,4,Star Wars (1977)
3,2,50,5,Star Wars (1977)
4,8,50,5,Star Wars (1977)


In [2]:
#Count the number of unique users and item
print("Number of unique users: {}".format(data['user_id'].nunique()))
print("Number of unique items: {}".format(data['item_id'].nunique()))

Number of unique users: 944
Number of unique items: 1682


# Domanda 1

If each user had rated each film we would have a total of 944x1682.

In the case in which every user had rated each item we would have a dense matrix. Since it is very hard that every user rate every film we have a sparse matrix

In [3]:
from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(data, test_size=0.25)

In [4]:
# build utility matrix using a pivot table

um = train_data.pivot_table(index='user_id', columns='item_id',values='rating')
um.head()

item_id,1,2,3,4,5,6,7,8,9,10,...,1670,1672,1673,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,,,,,,,,,,...,,,,,,,,,,
1,5.0,,4.0,,3.0,5.0,,1.0,5.0,,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,


# Collaborative Filtering

Considering a user who has not rated an item, build a profile for each user by considering the columns subtracting the mean, find a set of N similar users, estimate the utility value computing a weighted average of rating given by similar users

In [5]:
# Example with this values

N = 2
user = 0
item = 1

In [6]:
# we need to replace all missing values with zeros and then normalize

profiles = um.fillna(0)-um.mean(0)
profiles.head()

item_id,1,2,3,4,5,6,7,8,9,10,...,1670,1672,1673,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,-3.877676,-3.212766,-2.986301,-3.616352,-3.33871,-3.681818,-3.8,-4.00641,-3.916667,-3.9,...,-3.0,-2.0,-3.0,-2.0,-3.0,-1.0,-3.0,-2.0,-3.0,-3.0
1,1.122324,-3.212766,1.013699,-3.616352,-0.33871,1.318182,-3.8,-3.00641,1.083333,-3.9,...,-3.0,-2.0,-3.0,-2.0,-3.0,-1.0,-3.0,-2.0,-3.0,-3.0
2,0.122324,-3.212766,-2.986301,-3.616352,-3.33871,-3.681818,-3.8,-4.00641,-3.916667,-1.9,...,-3.0,-2.0,-3.0,-2.0,-3.0,-1.0,-3.0,-2.0,-3.0,-3.0
3,-3.877676,-3.212766,-2.986301,-3.616352,-3.33871,-3.681818,-3.8,-4.00641,-3.916667,-3.9,...,-3.0,-2.0,-3.0,-2.0,-3.0,-1.0,-3.0,-2.0,-3.0,-3.0
4,-3.877676,-3.212766,-2.986301,-3.616352,-3.33871,-3.681818,-3.8,-4.00641,-3.916667,-3.9,...,-3.0,-2.0,-3.0,-2.0,-3.0,-1.0,-3.0,-2.0,-3.0,-3.0


# Domanda 2

We need to subtract the mean because since we are treating blank values as negative ratings, subtracting the mean will make bad ratings negative and good rating positive

In [7]:
#Now that we have calculated the user profile we need to
#compute the cosine distance between this and the item features
import numpy as np

cd = lambda x,y: np.dot(x,y)/ (np.sqrt(np.dot(x,x)) * np.sqrt(np.dot(y,y)))

profiles.apply(lambda x: cd(profiles.loc[user],x), axis=1).head()
user


0

In [8]:
# We need to remove the cosine distance between the same user

distances = profiles.apply(lambda x: cd(profiles.loc[user], x), axis=1).drop(user)
distances.head()

user_id
1    0.901555
2    0.976302
3    0.988023
4    0.988614
5    0.959092
dtype: float64

In [9]:
# We need now to select only the users who have rated the item

selected_users = um.loc[:,item].dropna().index
selected_users

Int64Index([  1,   2,   5,   6,  10,  13,  15,  16,  20,  25,
            ...
            918, 919, 921, 922, 924, 932, 933, 934, 938, 941],
           dtype='int64', name='user_id', length=327)

# Domanda 3

We need to select the user who have rated the item in order to average the ratings given by the k users and compute the utility value

In [10]:
distances.loc[selected_users].sort_values().tail()
#the last row is the most similar

user_id
231    0.993000
723    0.993263
609    0.993567
289    0.993925
202    0.995183
dtype: float64

In [11]:
#take the 3 most similar
selected_distances = distances.loc[selected_users].sort_values()[-3:]
selected_distances

user_id
609    0.993567
289    0.993925
202    0.995183
dtype: float64

In [12]:
#store the user ids in an array
similar_users = selected_distances.index

In [13]:
#let's see how this users have rated the item
print(um.loc[similar_users[0],item],um.loc[similar_users[1],item], um.loc[similar_users[2],item])

1.0 3.0 3.0


In [14]:
#compute average rating

predicted_rating = (um.loc[similar_users, item]*selected_distances).sum()/selected_distances.sum()

predicted_rating

2.333774669005145

In [15]:
class CollaborativeFilter():
    def __init__(self, N):
        self.N = N
    
    def fit(self, um):
        self.um = um
        self.profiles = profiles = self.um.fillna(0)-self.um.mean(0)
    
    def predict(self, user, item):
        distances = self.profiles.apply(lambda x: cd(self.profiles.loc[user], x), axis=1).drop(user)
        selected_users = self.um.loc[:,item].dropna().index
        selected_distances = distances.loc[selected_users].sort_values()[-self.N:]
        similar_users = selected_distances.index
        predicted_rating = (self.um.loc[similar_users, item]*selected_distances).sum()/selected_distances.sum()
        return predicted_rating

In [16]:
cf = CollaborativeFilter(N = 3)
cf.fit(um)
print(cf.predict(user,item))
print(cf.predict(5,18))

2.333774669005145
1.9967481405133534


In [17]:
test_data.head()

Unnamed: 0,user_id,item_id,rating,item_title
95805,533,713,2,Othello (1995)
44227,177,12,5,"Usual Suspects, The (1995)"
92841,130,1013,4,Anaconda (1997)
34176,802,56,3,Pulp Fiction (1994)
11457,429,219,4,"Nightmare on Elm Street, A (1984)"


In [18]:
#function to iterate over the rows of test and compute ratings
from tqdm import tqdm

def predict(cf, test_data):
    predicted_ratings = []
    for i,ann in tqdm(test_data.iterrows(), total=len(test_data)):
        try:
            rating = cf.predict(ann['user_id'], ann['item_id'])
        except:
            rating = np.nan
        predicted_ratings.append(rating)
    return np.array(predicted_ratings)

In [19]:
#use a small portion of data
_, test_small = train_test_split(test_data, test_size=0.002)
len(test_small)

51

In [20]:
pr= predict(cf, test_small)

100%|██████████| 51/51 [00:28<00:00,  1.77it/s]


In [21]:
#measure performance with mae
def mae(y_true, y_pred):
    return (y_true-y_pred).abs().mean()
mae(test_small['rating'], pr)

0.9924501371275489

In [22]:
#We obtain this result since we have not many users.
cf50 = CollaborativeFilter(50)
cf50.fit(um)
pr_50 = predict(cf50, test_small)
mae(test_small['rating'], pr_50)

100%|██████████| 51/51 [00:34<00:00,  1.50it/s]


0.8886873433569799