# Environment Setup

In [3]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt

from utils import *

from tqdm import tqdm

In [2]:
config = yaml_read('config.yaml')
config

{'dataset': {'users': './dataset/users.csv',
  'movies': './dataset/movies.csv',
  'ratings': './dataset/ratings.csv'}}

# Utils

# Dataset

We need to build: 
+ Utlity matrix: can be used to employ user-based recommender.
+ Item-item matrix: used for item-based recommender.

In [16]:
dconf = config['dataset']
movies_df = pd.read_csv(dconf['movies']).drop(columns=['Unnamed: 0'])
print(movies_df.info())
print("="*42)
ratings_df = pd.read_csv(dconf['ratings']).drop(columns=['Unnamed: 0'])
print(ratings_df.info())
print("="*42)
users_df = pd.read_csv(dconf['users']).drop(columns=['Unnamed: 0'])
print(users_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3883 entries, 0 to 3882
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   MovieID  3883 non-null   int64 
 1   Title    3883 non-null   object
 2   Genres   3883 non-null   object
dtypes: int64(1), object(2)
memory usage: 91.1+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000209 entries, 0 to 1000208
Data columns (total 4 columns):
 #   Column     Non-Null Count    Dtype
---  ------     --------------    -----
 0   UserID     1000209 non-null  int64
 1   MovieID    1000209 non-null  int64
 2   Rating     1000209 non-null  int64
 3   Timestamp  1000209 non-null  int64
dtypes: int64(4)
memory usage: 30.5 MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6040 entries, 0 to 6039
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   UserID      6040 non-null   int64 
 1   Gender      6040 non-null 

## Utility matrix

We would build 2 types of utiltiy matrix: 

1. Binary user-item matrix which only considering the existence between the users and the items.
2. "Usual" user-item matrix which would record the ratings of movies according to an user. 

In [17]:
master_df = pd.merge(users_df, ratings_df, on='UserID', how='left')
master_df = pd.merge(master_df, movies_df, on='MovieID', how='left')
master_df = master_df.drop(columns=['Timestamp', 'Zip-code'])
master_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000209 entries, 0 to 1000208
Data columns (total 8 columns):
 #   Column      Non-Null Count    Dtype 
---  ------      --------------    ----- 
 0   UserID      1000209 non-null  int64 
 1   Gender      1000209 non-null  object
 2   Age         1000209 non-null  int64 
 3   Occupation  1000209 non-null  int64 
 4   MovieID     1000209 non-null  int64 
 5   Rating      1000209 non-null  int64 
 6   Title       1000209 non-null  object
 7   Genres      1000209 non-null  object
dtypes: int64(5), object(3)
memory usage: 61.0+ MB


In [35]:
# master_df['UserID'].unique()
len(master_df['MovieID'])

1000209

In [39]:
# Building binary utility matrix
n_user = 6040
n_movie = 3952 # The dataset is mis-indexed
bin_table = np.zeros((n_user, n_movie))
table_name = 'bin_utility_matrix.csv'

for user in range(n_user):
    user_idx = user + 1
    for movie in ratings_df[ratings_df['UserID'] == user_idx]['MovieID']: 
        bin_table[user, movie-1] = 1

In [58]:
movie_col = movies_df.iloc[:, [0, 1]]
movie_col_dict = {movie_col.iloc[row, 0]: movie_col.iloc[row, 1] for row in range(len(movie_col.iloc[:, 0]))}
movie_col_dict

{1: 'Toy Story (1995)',
 2: 'Jumanji (1995)',
 3: 'Grumpier Old Men (1995)',
 4: 'Waiting to Exhale (1995)',
 5: 'Father of the Bride Part II (1995)',
 6: 'Heat (1995)',
 7: 'Sabrina (1995)',
 8: 'Tom and Huck (1995)',
 9: 'Sudden Death (1995)',
 10: 'GoldenEye (1995)',
 11: 'American President, The (1995)',
 12: 'Dracula: Dead and Loving It (1995)',
 13: 'Balto (1995)',
 14: 'Nixon (1995)',
 15: 'Cutthroat Island (1995)',
 16: 'Casino (1995)',
 17: 'Sense and Sensibility (1995)',
 18: 'Four Rooms (1995)',
 19: 'Ace Ventura: When Nature Calls (1995)',
 20: 'Money Train (1995)',
 21: 'Get Shorty (1995)',
 22: 'Copycat (1995)',
 23: 'Assassins (1995)',
 24: 'Powder (1995)',
 25: 'Leaving Las Vegas (1995)',
 26: 'Othello (1995)',
 27: 'Now and Then (1995)',
 28: 'Persuasion (1995)',
 29: 'City of Lost Children, The (1995)',
 30: 'Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)',
 31: 'Dangerous Minds (1995)',
 32: 'Twelve Monkeys (1995)',
 33: 'Wings of Courage (1995)',
 34: 'Babe (1

In [62]:
col_name = [f'None_{i+1}' for i in range(n_movie)]
for idx in range(len(col_name)): 
    try: 
        col_name[idx] = movie_col_dict[idx + 1]
    except: 
        continue

print(col_name)

# bin_ult_df = pd.DataFrame(bin_table, columns=col_name)
# print(bin_ult_df.head())
# bin_ult_df.to_csv(f'./dataset/{table_name}')

['Toy Story (1995)',
 'Jumanji (1995)',
 'Grumpier Old Men (1995)',
 'Waiting to Exhale (1995)',
 'Father of the Bride Part II (1995)',
 'Heat (1995)',
 'Sabrina (1995)',
 'Tom and Huck (1995)',
 'Sudden Death (1995)',
 'GoldenEye (1995)',
 'American President, The (1995)',
 'Dracula: Dead and Loving It (1995)',
 'Balto (1995)',
 'Nixon (1995)',
 'Cutthroat Island (1995)',
 'Casino (1995)',
 'Sense and Sensibility (1995)',
 'Four Rooms (1995)',
 'Ace Ventura: When Nature Calls (1995)',
 'Money Train (1995)',
 'Get Shorty (1995)',
 'Copycat (1995)',
 'Assassins (1995)',
 'Powder (1995)',
 'Leaving Las Vegas (1995)',
 'Othello (1995)',
 'Now and Then (1995)',
 'Persuasion (1995)',
 'City of Lost Children, The (1995)',
 'Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)',
 'Dangerous Minds (1995)',
 'Twelve Monkeys (1995)',
 'Wings of Courage (1995)',
 'Babe (1995)',
 'Carrington (1995)',
 'Dead Man Walking (1995)',
 'Across the Sea of Time (1995)',
 'It Takes Two (1995)',
 'Clueless (

In [91]:
# Building usual utility matrix
n_user = 6040
n_movie = 3952 # The dataset is mis-indexed
utl_table = np.zeros((n_user, n_movie))
table_name = 'utility_matrix.csv'

for user in tqdm(range(n_user)):
    user_idx = user + 1
    for movie in ratings_df[ratings_df['UserID'] == user_idx]['MovieID']: 
        utl_table[user, movie-1] = ratings_df[(ratings_df['UserID'] == user_idx) & (ratings_df['MovieID'] == movie)]['Rating']

  utl_table[user, movie-1] = ratings_df[(ratings_df['UserID'] == user_idx) & (ratings_df['MovieID'] == movie)]['Rating']
100%|██████████| 6040/6040 [41:33<00:00,  2.42it/s]  


In [92]:
# col_name = [f'None_{i+1}' for i in range(n_movie)]
# for idx in range(len(col_name)): 
#     try: 
#         col_name[idx] = movie_col_dict[idx + 1]
#     except: 
#         continue

# print(col_name)

# ult_df = pd.DataFrame(utl_table, columns=col_name)
# print(ult_df.head())
# ult_df.to_csv(f'./dataset/{table_name}')

['Toy Story (1995)', 'Jumanji (1995)', 'Grumpier Old Men (1995)', 'Waiting to Exhale (1995)', 'Father of the Bride Part II (1995)', 'Heat (1995)', 'Sabrina (1995)', 'Tom and Huck (1995)', 'Sudden Death (1995)', 'GoldenEye (1995)', 'American President, The (1995)', 'Dracula: Dead and Loving It (1995)', 'Balto (1995)', 'Nixon (1995)', 'Cutthroat Island (1995)', 'Casino (1995)', 'Sense and Sensibility (1995)', 'Four Rooms (1995)', 'Ace Ventura: When Nature Calls (1995)', 'Money Train (1995)', 'Get Shorty (1995)', 'Copycat (1995)', 'Assassins (1995)', 'Powder (1995)', 'Leaving Las Vegas (1995)', 'Othello (1995)', 'Now and Then (1995)', 'Persuasion (1995)', 'City of Lost Children, The (1995)', 'Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)', 'Dangerous Minds (1995)', 'Twelve Monkeys (1995)', 'Wings of Courage (1995)', 'Babe (1995)', 'Carrington (1995)', 'Dead Man Walking (1995)', 'Across the Sea of Time (1995)', 'It Takes Two (1995)', 'Clueless (1995)', 'Cry, the Beloved Country (199

# Experiments

## Item-based Collaborative Filtering

In [105]:
bin_df = pd.read_csv('./dataset/bin_utility_matrix.csv').drop(columns=['Unnamed: 0'])
bin_df.head()

Unnamed: 0,Toy Story (1995),Jumanji (1995),Grumpier Old Men (1995),Waiting to Exhale (1995),Father of the Bride Part II (1995),Heat (1995),Sabrina (1995),Tom and Huck (1995),Sudden Death (1995),GoldenEye (1995),...,Bamboozled (2000),Bootmen (2000),Digimon: The Movie (2000),Get Carter (2000),Get Carter (1971),Meet the Parents (2000),Requiem for a Dream (2000),Tigerland (2000),Two Family House (2000),"Contender, The (2000)"
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [106]:
i2_df = pd.DataFrame(index=bin_df.columns, 
                    columns=bin_df.columns)
i2_df.head().iloc[:,0:6]

Unnamed: 0,Toy Story (1995),Jumanji (1995),Grumpier Old Men (1995),Waiting to Exhale (1995),Father of the Bride Part II (1995),Heat (1995)
Toy Story (1995),,,,,,
Jumanji (1995),,,,,,
Grumpier Old Men (1995),,,,,,
Waiting to Exhale (1995),,,,,,
Father of the Bride Part II (1995),,,,,,


In [107]:
from scipy.spatial.distance import cosine

# Calculate similarily
for i in tqdm(range(0, len(i2_df.columns))):
    # Loop through the columns for each column
    for j in range(0, len(i2_df.columns)):
        # Calculate similarity
        i2_df.iloc[i, j] = 1 - cosine(bin_df.iloc[:, i], bin_df.iloc[:, j])


  dist = 1.0 - uv / np.sqrt(uu * vv)
100%|██████████| 3952/3952 [1:17:58<00:00,  1.18s/it]


In [112]:
i2_df.head(5)

Unnamed: 0,Toy Story (1995),Jumanji (1995),Grumpier Old Men (1995),Waiting to Exhale (1995),Father of the Bride Part II (1995),Heat (1995),Sabrina (1995),Tom and Huck (1995),Sudden Death (1995),GoldenEye (1995),...,Bamboozled (2000),Bootmen (2000),Digimon: The Movie (2000),Get Carter (2000),Get Carter (1971),Meet the Parents (2000),Requiem for a Dream (2000),Tigerland (2000),Two Family House (2000),"Contender, The (2000)"
Toy Story (1995),1.0,0.406916,0.288038,0.188484,0.271654,0.367143,0.320917,0.135706,0.126011,0.396148,...,0.109734,0.029256,0.103731,0.096546,0.056215,0.323606,0.197581,0.098537,0.041633,0.190486
Jumanji (1995),0.406916,1.0,0.264313,0.16222,0.265632,0.283338,0.287671,0.19695,0.183247,0.411925,...,0.077097,0.02518,0.092157,0.101978,0.081485,0.240563,0.160301,0.087376,0.029859,0.136139
Grumpier Old Men (1995),0.288038,0.264313,1.0,0.217497,0.32434,0.222284,0.326998,0.105387,0.158509,0.271677,...,0.070023,0.060985,0.069751,0.1052,0.086344,0.208755,0.131165,0.074691,0.021696,0.116102
Waiting to Exhale (1995),0.188484,0.16222,0.217497,1.0,0.298679,0.152595,0.247282,0.065106,0.098723,0.151852,...,0.062622,0.025565,0.011696,0.046018,0.031025,0.138452,0.118769,0.031311,0.03638,0.105129
Father of the Bride Part II (1995),0.271654,0.265632,0.32434,0.298679,1.0,0.176308,0.32863,0.09868,0.172653,0.251615,...,0.035593,0.058124,0.088638,0.052311,0.023512,0.176194,0.106676,0.055368,0.01838,0.120982


In [110]:
# i2_df.to_csv('./dataset/item_item_dataset.csv')

In [124]:
top_k = 10
data_neighbors = {}
for i in tqdm(range(0, len(i2_df.columns))): 
    data_neighbors[i2_df.columns[i]] = i2_df.iloc[0: , i].sort_values(ascending=False)[: top_k].index.to_list()

data_neighbors

100%|██████████| 3952/3952 [00:15<00:00, 256.98it/s]


{'Toy Story (1995)': ['Toy Story (1995)',
  'Woman in Question, The (1950)',
  'Spanish Fly (1998)',
  'Hedd Wyn (1992)',
  'Stefano Quantestorie (1993)',
  'None_817',
  'Breaks, The (1999)',
  'Magic Hunter (1994)',
  'Old Lady Who Walked in the Sea, The (Vieille qui marchait dans la mer, La) (1991)',
  'Somebody to Love (1994)'],
 'Jumanji (1995)': ['None_1403',
  'None_1634',
  'Two Crimes (1995)',
  'Goodbye, 20th Century (Zbogum na dvadesetiot vek) (1998)',
  'War Stories (1995)',
  'Two Friends (1986)',
  'Halfmoon (Paul Bowles - Halbmond) (1995)',
  "Lilian's Story (1995)",
  'Of Love and Shadows (1994)',
  'Century of Cinema, A (1994)'],
 'Grumpier Old Men (1995)': ['None_1072',
  'Goodbye, 20th Century (Zbogum na dvadesetiot vek) (1998)',
  'Garcu, Le (1995)',
  'Honigmond (1996)',
  'None_740',
  'Men of Means (1998)',
  'British Intelligence (1940)',
  'Vermin (1998)',
  'None_1737',
  'None_221'],
 'Waiting to Exhale (1995)': ['None_1745',
  'None_221',
  'Under the Domin 

## User-based Filtering

In [4]:
ult_df = pd.read_csv('./dataset/utility_matrix.csv').drop(columns=['Unnamed: 0'])
ult_df.head()

Unnamed: 0,Toy Story (1995),Jumanji (1995),Grumpier Old Men (1995),Waiting to Exhale (1995),Father of the Bride Part II (1995),Heat (1995),Sabrina (1995),Tom and Huck (1995),Sudden Death (1995),GoldenEye (1995),...,Bamboozled (2000),Bootmen (2000),Digimon: The Movie (2000),Get Carter (2000),Get Carter (1971),Meet the Parents (2000),Requiem for a Dream (2000),Tigerland (2000),Two Family House (2000),"Contender, The (2000)"
0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
#  Function to get similarity scores

def similarity_score(history, similarities):
    return sum(history*similarities) / sum(similarities)


In [7]:
ult_sims = pd.DataFrame(index=ult_df.index,
                        columns=ult_df.index)
ult_sims = ult_sims.fillna(0.)
ult_sims.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,6030,6031,6032,6033,6034,6035,6036,6037,6038,6039
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
range(len(ult_sims.columns))

range(0, 6040)

In [10]:
from scipy.spatial.distance import cosine

# Calculate similarily
for i in tqdm(range(len(ult_sims.columns))):
    # Loop through the columns for each column
    for j in range(len(ult_sims.columns)):
        # Calculate similarity between user i and user j
        ult_sims.iloc[i, j] = 1 - cosine(ult_df.iloc[i, :], ult_df.iloc[j, :])

100%|██████████| 6040/6040 [4:10:20<00:00,  2.49s/it]  


In [11]:
ult_sims.to_csv('./dataset/user_user_dataset.csv')

In [12]:
ult_sims.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,6030,6031,6032,6033,6034,6035,6036,6037,6038,6039
0,1.0,0.096382,0.12061,0.132455,0.090158,0.179222,0.059678,0.138241,0.226148,0.255288,...,0.170588,0.082006,0.069807,0.033663,0.114877,0.186329,0.135979,0.0,0.174604,0.13359
1,0.096382,1.0,0.151479,0.171176,0.114394,0.100865,0.305787,0.203337,0.190198,0.226861,...,0.112503,0.091222,0.268565,0.014286,0.183384,0.228241,0.206274,0.066118,0.066457,0.218276
2,0.12061,0.151479,1.0,0.151227,0.062907,0.074603,0.138332,0.077656,0.126457,0.213655,...,0.09296,0.125864,0.161507,0.0,0.097308,0.143264,0.107744,0.120234,0.094675,0.133144
3,0.132455,0.171176,0.151227,1.0,0.045094,0.013529,0.130339,0.100856,0.093651,0.120738,...,0.163629,0.093041,0.382803,0.0,0.082097,0.170583,0.127464,0.062907,0.064634,0.137968
4,0.090158,0.114394,0.062907,0.045094,1.0,0.047449,0.126257,0.220817,0.26133,0.117052,...,0.100652,0.035732,0.061806,0.054151,0.179083,0.293365,0.172686,0.020459,0.027689,0.241437


In [None]:
ult_neighbors = {}

for i in range(0, len(ult_sims.index)):
    for j in range(1, len(ult_sims.columns)):
        user = ult_df.index[i]
        product = ult_df.columns[j]
        if type(ult_neighbors[user]) != list: 
            ult_neighbors[user] = []
        ult_neighbors[user].append(product)
        if ult_df.iloc[i, j] != 0:
            ult_sims.iloc[i, j] = 0
        else:
            product_top_names = ult_neighbors[user][1:10]
            product_top_sims = ult_df.iloc[0: , i].sort_values(ascending=False)[: top_k].index.to_list()
            user_purchases = ult_df.iloc[user, product_top_names]

            ult_sims.iloc[i, j] = similarity_score(user_purchases, product_top_sims)



In [138]:
ult_df.columns[0]

'Toy Story (1995)'