In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from lightfm import LightFM
from lightfm.data import Dataset
from scipy.sparse import coo_matrix
from lightfm.evaluation import precision_at_k,recall_at_k,auc_score,reciprocal_rank
from lightfm.cross_validation import random_train_test_split

In [2]:
dir = "../dataset/"

def load_dataset():
    items_df = pd.read_csv(f"{dir}/itemset_preprocessed.csv")
    items_df.set_index("ASIN", inplace=True)
 
    df_utility = pd.read_csv(f"{dir}/utility_topn.csv", index_col=0)
    df_utility.set_index("reviewerID", inplace=True)

    reviews_df = pd.read_csv(f"{dir}/reviews.csv")
    reviews_df.set_index('reviewerID', inplace=True)
    
    asins_df = pd.read_csv(f"{dir}/asin_product_mapping.csv")
    asins_df.set_index('ASIN', inplace=True)

    print(f"shape of df_utility: {df_utility.shape}")
    print(f"shape of items_df: {items_df.shape}")
    print(f"shape of asins_df: {asins_df.shape}")
    return items_df, df_utility, reviews_df, asins_df

items_df, df_utility, reviews_df, asins_df = load_dataset()

shape of df_utility: (445, 33510)
shape of items_df: (33510, 2429)
shape of asins_df: (33510, 1)


In [34]:
melted_user_df = df_utility.copy().reset_index().melt(
    'reviewerID', var_name='ASIN', value_name='rating').dropna()

# Remove Generic Accounts
reviewers_to_remove = [
    'Amazon CustomerAmazon Customer_',
    'Amazon Customer_',
    'Cliente de Amazon_',
    'Cliente Amazon_',
    'Kindle Customer_',
    'Client d\'Amazon_',
    'Amazon Customer',
    'Amazon Customer_',
    'Amazon Kunde_',
    'Amazon カスタマー_',
    'Cliente Kindle_',
    'Cliente de Kindle_'
]

udf = melted_user_df[~melted_user_df['reviewerID'].isin(reviewers_to_remove)]
udf.head(10)

Unnamed: 0,reviewerID,ASIN,rating
356,Richard_,b001f30182,5.0
945,Bob_,b00askv7fe,5.0
3227,Dave_,b01jcuyul8,5.0
3294,JJ_,b01jcuyul8,2.0
4758,Mitch_,b07dj594h7,5.0
4863,Tammy_,b07dj594h7,5.0
5801,AlexAlex_,b07h9rsg7l,1.0
6803,Elena_,b07kw82btn,1.0
9400,Bob_,b07v64g152,5.0
10793,David_,b07wghzmh1,5.0


In [52]:
top_100_df = df_utility.head(10_000)

melted_user_df_top_100 = top_100_df.reset_index().melt(
    'reviewerID', var_name='ASIN', value_name='rating').dropna()

reviewers_to_remove = [
    'Amazon CustomerAmazon Customer_',
    'Amazon Customer_',
    'Cliente de Amazon_',
    'Cliente Amazon_',
    'Kindle Customer_',
    'Client d\'Amazon_',
    'Amazon Customer',
    'Amazon Customer_',
    'Amazon Kunde_',
    'Amazon カスタマー_',
    'Cliente Kindle_',
    'Cliente de Kindle_'
]

udf_top_100 = melted_user_df_top_100[~melted_user_df_top_100['reviewerID'].isin(reviewers_to_remove)]
udf_top_100.head()

Unnamed: 0,reviewerID,ASIN,rating
356,Richard_,b001f30182,5.0
945,Bob_,b00askv7fe,5.0
3227,Dave_,b01jcuyul8,5.0
3294,JJ_,b01jcuyul8,2.0
4758,Mitch_,b07dj594h7,5.0


In [53]:
# Create a dataset object
dataset = Dataset()
dataset.fit(users=udf_top_100['reviewerID'].unique(),
            items=udf_top_100['ASIN'].unique())

# Build the interactions matrix
(interactions, _) = dataset.build_interactions(zip(udf_top_100['reviewerID'], 
                                                   udf_top_100['ASIN'],
                                                   udf_top_100['rating']))

In [54]:
Train, Test = random_train_test_split(interactions, test_percentage=0.2, random_state=42)

model = LightFM(loss='warp', no_components=30)

model.fit(Train, epochs=30, num_threads=2)

<lightfm.lightfm.LightFM at 0x7f1d77e99de0>

In [55]:
from tqdm import tqdm

k_values = range(1, 11)
auc_scores = []
precision_scores = []
recall_scores = []

for k in tqdm(k_values, desc="Processing Train"):
    auc = auc_score(model, Train, ).mean()
    precision = precision_at_k(model, Train, k=k).mean()
    recall = recall_at_k(model, Train, k=k).mean()
    
    auc_scores.append(auc)
    precision_scores.append(precision)
    recall_scores.append(recall)

print("Train Results:")
for k, auc, precision, recall in zip(k_values, auc_scores, precision_scores, recall_scores):
    print(f'K = {k}: AUC score: {auc}, Precision: {precision}, Recall: {recall}')

Processing Train: 100%|██████████| 10/10 [00:06<00:00,  1.61it/s]

Train Results:
K = 1: AUC score: 0.9999808073043823, Precision: 0.9909706711769104, Recall: 0.08588239994356696
K = 2: AUC score: 0.9999808073043823, Precision: 0.9887133240699768, Recall: 0.17146654972996728
K = 3: AUC score: 0.9999808073043823, Precision: 0.9902182817459106, Recall: 0.2574176388223588
K = 4: AUC score: 0.9999808073043823, Precision: 0.990406334400177, Recall: 0.3433493060273827
K = 5: AUC score: 0.9999808073043823, Precision: 0.991422176361084, Recall: 0.42943713760978985
K = 6: AUC score: 0.9999808073043823, Precision: 0.9890894889831543, Recall: 0.5129196268295183
K = 7: AUC score: 0.9999808073043823, Precision: 0.984521210193634, Recall: 0.5923275757162414
K = 8: AUC score: 0.9999808073043823, Precision: 0.9754514694213867, Recall: 0.6657867267241441
K = 9: AUC score: 0.9999808073043823, Precision: 0.9581139087677002, Recall: 0.7276735022770529
K = 10: AUC score: 0.9999808073043823, Precision: 0.9316027164459229, Recall: 0.7755372116059923





In [56]:
k_values = range(1, 11)
auc_scores = []
precision_scores = []
recall_scores = []

for k in tqdm(k_values, desc="Processing Test"):
    auc = auc_score(model, Test).mean()
    precision = precision_at_k(model, Test, k=k).mean()
    recall = recall_at_k(model, Test, k=k).mean()
    
    auc_scores.append(auc)
    precision_scores.append(precision)
    recall_scores.append(recall)

print("Test Results:")
for k, auc, precision, recall in zip(k_values, auc_scores, precision_scores, recall_scores):
    print(f'K = {k}: AUC score: {auc}, Precision: {precision}, Recall: {recall}')

Processing Test: 100%|██████████| 10/10 [00:06<00:00,  1.54it/s]

Test Results:
K = 1: AUC score: 0.3328709900379181, Precision: 0.0, Recall: 0.0
K = 2: AUC score: 0.3328709900379181, Precision: 0.0, Recall: 0.0
K = 3: AUC score: 0.3328709900379181, Precision: 0.0, Recall: 0.0
K = 4: AUC score: 0.3328709900379181, Precision: 0.0005854800692759454, Recall: 0.000585480093676815
K = 5: AUC score: 0.3328709900379181, Precision: 0.000936768192332238, Recall: 0.001053864168618267
K = 6: AUC score: 0.3328709900379181, Precision: 0.0011709601385518909, Recall: 0.001522248243559719
K = 7: AUC score: 0.3328709900379181, Precision: 0.0010036802850663662, Recall: 0.001522248243559719
K = 8: AUC score: 0.3328709900379181, Precision: 0.0008782201330177486, Recall: 0.001522248243559719
K = 9: AUC score: 0.3328709900379181, Precision: 0.0007806401699781418, Recall: 0.001522248243559719
K = 10: AUC score: 0.3328709900379181, Precision: 0.000936768192332238, Recall: 0.002107728337236534



