# Recommender System

Blah blah blah

## Import Libraries and Define File Paths

In [2]:
import pandas as pd
import numpy as np
from glob import glob
from collections import defaultdict
from surprise import AlgoBase, Dataset, accuracy, Reader
from surprise.model_selection import cross_validate, train_test_split, KFold
from surprise import PredictionImpossible
from surprise.prediction_algorithms.knns import KNNBasic, KNNWithMeans, KNNWithZScore
from surprise.prediction_algorithms.matrix_factorization import SVD, SVDpp, NMF
from surprise.prediction_algorithms.co_clustering import CoClustering

users_dir = "../dataset/utility/users"
items_dir = "../dataset/utility/items"

## Load Dataset

In [3]:
items_df = pd.read_csv(f"{items_dir}/itemset.csv", low_memory=False)
items_df.rename(columns={"Unnamed: 0": "ASIN"}, inplace=True)
items_df.set_index("ASIN", inplace=True)

df_utility = pd.read_csv(f"{users_dir}/utility_topn.csv")
df_utility.set_index("reviewerID", inplace=True)

reviews_df = pd.read_csv(f"{users_dir}/reviews.csv")
items_with_info_df = pd.read_csv(f"{items_dir}/itemset_with_info.csv", low_memory=False)

### Align ASINs between items matrix and utility matrix

In [4]:
diff_asins = list(set(df_utility.columns).difference(set(items_df.index.tolist())))
df_utility.drop(diff_asins, inplace=True, axis=1)
diff_asins = list(set(items_df.index.tolist()).difference(set(df_utility.columns)))
items_df.drop(index=diff_asins, inplace=True)
print(f"shape of df_utility: {df_utility.shape}")
print(f"shape of items_df: {items_df.shape}")

shape of df_utility: (9992, 31348)
shape of items_df: (31348, 3024)


# Try all algorithms
* Neighborhood-based collaborative filtering
* Latent-factor based collaborative filtering
* Content-based collaborative filtering

## Neighborhood-based Collaborative Filtering

From Surprise

In [5]:
melted_user_df = df_utility.copy().reset_index().melt(
    'reviewerID', var_name='ASIN', value_name='rating').dropna()
reader = Reader(rating_scale=(0,5))
dataset = Dataset.load_from_df(melted_user_df, reader)
algos = [KNNBasic(), KNNWithMeans(), KNNWithZScore(), SVD(), SVDpp(), NMF(), CoClustering()]

In [6]:
for algo in algos:
    cross_validate(algo, dataset, verbose=True)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8812  0.9297  0.8456  0.8876  0.8414  0.8771  0.0321  
MAE (testset)     0.5934  0.6081  0.5814  0.5927  0.5782  0.5908  0.0105  
Fit time          0.16    0.15    0.15    0.15    0.15    0.15    0.00    
Test time         0.01    0.01    0.01    0.01    0.01    0.01    0.00    
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing s

### Precision and Recall

Taken from https://github.com/NicolasHug/Surprise/blob/master/examples/precision_recall_at_k.py

In [7]:
def precision_recall_at_k(predictions, k=10, threshold=3.5):
    """Return precision and recall at k metrics for each user"""

    # First map the predictions to each user.
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions = dict()
    recalls = dict()
    for uid, user_ratings in user_est_true.items():

        # Sort user ratings by estimated value
        user_ratings.sort(key=lambda x: x[0], reverse=True)

        # Number of relevant items
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)

        # Number of recommended items in top k
        n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])

        # Number of relevant and recommended items in top k
        n_rel_and_rec_k = sum(
            ((true_r >= threshold) and (est >= threshold))
            for (est, true_r) in user_ratings[:k]
        )

        # Precision@K: Proportion of recommended items that are relevant
        # When n_rec_k is 0, Precision is undefined. We here set it to 0.

        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0

        # Recall@K: Proportion of relevant items that are recommended
        # When n_rel is 0, Recall is undefined. We here set it to 0.

        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 0

    return precisions, recalls

kf = KFold(n_splits=5)

for algo in algos:
    print(f"computing precision and recall for {algo}")
    precision_list = []
    recall_list = []
    for trainset, testset in kf.split(dataset):
        algo.fit(trainset)
        predictions = algo.test(testset)
        precisions, recalls = precision_recall_at_k(predictions, k=5, threshold=4)

        # Precision and recall can then be averaged over all users
        precision_list.append(sum(prec for prec in precisions.values()) / len(precisions))
        recall_list.append(sum(rec for rec in recalls.values()) / len(recalls))
    average_precision = np.mean(precision_list)
    average_recall = np.mean(recall_list)
    print(f"average precision: {average_precision}")
    print(f"average recall: {average_recall}\n")

computing precision and recall for <surprise.prediction_algorithms.knns.KNNBasic object at 0x10e1ad790>
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
average precision: 0.9138040042149631
average recall: 0.9138040042149631

computing precision and recall for <surprise.prediction_algorithms.knns.KNNWithMeans object at 0x108110ad0>
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing sim

### Get TopN Recommendations for a user

Taken from https://github.com/NicolasHug/Surprise/blob/master/examples/top_n_recommendations.py

In [8]:
def get_top_n(predictions, n=10):
    """Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    """

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

# Than predict ratings for all pairs (u, i) that are NOT in the training set.
testset = trainset.build_anti_testset()
predictions = algos[6].test(testset) # CoClustering

top_n = get_top_n(predictions, n=10)

# Get the recommended items for each user
recommendations = {}
for uid, user_ratings in top_n.items():
    recommendations[uid] = [iid for (iid, _) in user_ratings]

### Pick out users and explore their history and recommendations

In [9]:
reviews_df = pd.read_csv(f"{users_dir}/reviews.csv")
reviews_df.set_index('reviewerID', inplace=True)
items_with_info_df = pd.read_csv(f"{items_dir}/itemset_with_info.csv", low_memory=False)
items_with_info_df.set_index("ASIN", inplace=True)

In [10]:
def show_user_history_and_recos(user_id):
    """
    Display user purchase history and recommendations
    """
    asin = user_id.split('_')[-1]
    print("user history")
    display(reviews_df.loc[[asin]][['ProductName', 'reviewRating']])
    reco_list = recommendations[user_id]
    print("recommended items")
    display(items_with_info_df.loc[reco_list][["category", "name"]])
    print('\n')

In [12]:
user_ids = [x for x in list(recommendations.keys()) if len(x.split('_')[-1]) > 1][:10]
for user_id in user_ids:
    show_user_history_and_recos(user_id)

user history


Unnamed: 0_level_0,ProductName,reviewRating
reviewerID,Unnamed: 1_level_1,Unnamed: 2_level_1
AEWGR7LXGVRR3PNE4NCGJKQ3FI2A,acer SA100 480GB SATA III 2.5 Inch Internal SS...,5.0
AEWGR7LXGVRR3PNE4NCGJKQ3FI2A,"Rayovac AAA Batteries, Triple A Battery Alkali...",5.0
AEWGR7LXGVRR3PNE4NCGJKQ3FI2A,Wrangler Men's Relaxed Fit Jean,5.0


recommended items


Unnamed: 0_level_0,category,name
ASIN,Unnamed: 1_level_1,Unnamed: 2_level_1
B09DLJPBHN,fashion,chaoren leather reversible belts for men - dou...
B0BLVMGD65,fashion,narwey travel toiletry bag for men and women t...
B0CHMFBNJZ,peripheral devices,"aula gaming keyboard, 104 keys gaming keyboard..."
B0CMWK8YGY,peripheral devices,epomaker ajazz ak820 pro 75% mechanical keyboa...
B071CPR43R,children,breathablebaby breathable mesh liner for full-...
B0C3QXNNZH,living room,toneed fluffy runner rug for bedroom living ro...
B073186MT4,fashion,wallflower women's luscious curvy bootcut mid-...
B00FX4VAES,office supplies,"oxford ruled index cards, 3"" x 5"", white, line..."
1338344218,books,wings of fire: the dark secret: a graphic nove...
B00005333G,personal care,l'oreal paris skincare hydra-renewal face mois...




user history


Unnamed: 0_level_0,ProductName,reviewRating
reviewerID,Unnamed: 1_level_1,Unnamed: 2_level_1
AGCU3Y67BRKGTN7GDNCWPP5KATKQ,"BAGSMART TSA Approved Toiletry Bag, 2 Pack Cle...",5.0
AGCU3Y67BRKGTN7GDNCWPP5KATKQ,"BAGSMART TSA Approved Toiletry Bag, 2 Pack Cle...",5.0
AGCU3Y67BRKGTN7GDNCWPP5KATKQ,Narwey Travel Toiletry Bag for Men and Women T...,5.0


recommended items


Unnamed: 0_level_0,category,name
ASIN,Unnamed: 1_level_1,Unnamed: 2_level_1
B09DLJPBHN,fashion,chaoren leather reversible belts for men - dou...
B0007CKMIQ,fashion,wrangler men's relaxed fit jean
B0CHMFBNJZ,peripheral devices,"aula gaming keyboard, 104 keys gaming keyboard..."
B0CMWK8YGY,peripheral devices,epomaker ajazz ak820 pro 75% mechanical keyboa...
B071CPR43R,children,breathablebaby breathable mesh liner for full-...
B0C3QXNNZH,living room,toneed fluffy runner rug for bedroom living ro...
B073186MT4,fashion,wallflower women's luscious curvy bootcut mid-...
B00FX4VAES,office supplies,"oxford ruled index cards, 3"" x 5"", white, line..."
1338344218,books,wings of fire: the dark secret: a graphic nove...
B00005333G,personal care,l'oreal paris skincare hydra-renewal face mois...




user history


Unnamed: 0_level_0,ProductName,reviewRating
reviewerID,Unnamed: 1_level_1,Unnamed: 2_level_1
AFNAUW24VEF4NUQDDMSFBJZZYBYQ,Toneed Fluffy Runner Rug for Bedroom Living Ro...,5.0
AFNAUW24VEF4NUQDDMSFBJZZYBYQ,ECLIPSE Kendall Modern Blackout Thermal Rod Po...,5.0
AFNAUW24VEF4NUQDDMSFBJZZYBYQ,Toneed Fluffy Runner Rug for Bedroom Living Ro...,5.0


recommended items


Unnamed: 0_level_0,category,name
ASIN,Unnamed: 1_level_1,Unnamed: 2_level_1
B09DLJPBHN,fashion,chaoren leather reversible belts for men - dou...
B0007CKMIQ,fashion,wrangler men's relaxed fit jean
B0BLVMGD65,fashion,narwey travel toiletry bag for men and women t...
B0CHMFBNJZ,peripheral devices,"aula gaming keyboard, 104 keys gaming keyboard..."
B0CMWK8YGY,peripheral devices,epomaker ajazz ak820 pro 75% mechanical keyboa...
B071CPR43R,children,breathablebaby breathable mesh liner for full-...
B073186MT4,fashion,wallflower women's luscious curvy bootcut mid-...
B00FX4VAES,office supplies,"oxford ruled index cards, 3"" x 5"", white, line..."
1338344218,books,wings of fire: the dark secret: a graphic nove...
B00005333G,personal care,l'oreal paris skincare hydra-renewal face mois...




user history


Unnamed: 0_level_0,ProductName,reviewRating
reviewerID,Unnamed: 1_level_1,Unnamed: 2_level_1
AHNWO5R6A2P4GSRUPEDJMO3XSPRQ,[Upgraded] USB Computer Speakers for Desktop P...,4.0
AHNWO5R6A2P4GSRUPEDJMO3XSPRQ,LQH TECH Portable Solid State Drive 512 GB PRO...,4.0
AHNWO5R6A2P4GSRUPEDJMO3XSPRQ,LQH TECH Portable Solid State Drive 512 GB PRO...,4.0


recommended items


Unnamed: 0_level_0,category,name
ASIN,Unnamed: 1_level_1,Unnamed: 2_level_1
B07B7K7N3P,office supplies,furmax office chair mid back swivel lumbar sup...
B07B7K7N3P,living room,furmax office chair mid back swivel lumbar sup...
B09T6KH1GW,kitchen,yetene 6 pack reusable liners for toaster oven...
B0B4ZSKPNL,bedroom,"air purifiers for bedroom, fulminare h13 true ..."
B0B4ZSKPNL,bathroom,"air purifiers for bedroom, fulminare h13 true ..."
B0BTT92MVQ,cleaning material,"szfixez electric spin scrubber, electric clean..."
B09XV5PLP7,electronic devices,"elecder i45 black wired headphones, immersive ..."
B0BQG1X8M3,kitchen,flasld heat resistant mat for air fryer site o...
B08PCXJVFH,bathroom,afloia air purifiers for home bedroom large ro...
B09DLJPBHN,fashion,chaoren leather reversible belts for men - dou...




user history


Unnamed: 0_level_0,ProductName,reviewRating
reviewerID,Unnamed: 1_level_1,Unnamed: 2_level_1
AFWSJQKTCNYNXAVN7L2OJ7YKGLOQ,Keeper of the Lost Cities The Graphic Novel Pa...,5.0
AFWSJQKTCNYNXAVN7L2OJ7YKGLOQ,Keeper of the Lost Cities The Graphic Novel Pa...,5.0
AFWSJQKTCNYNXAVN7L2OJ7YKGLOQ,"Oxford Ruled Index Cards, 3"" x 5"", White, Line...",5.0


recommended items


Unnamed: 0_level_0,category,name
ASIN,Unnamed: 1_level_1,Unnamed: 2_level_1
B09DLJPBHN,fashion,chaoren leather reversible belts for men - dou...
B0007CKMIQ,fashion,wrangler men's relaxed fit jean
B0BLVMGD65,fashion,narwey travel toiletry bag for men and women t...
B0CHMFBNJZ,peripheral devices,"aula gaming keyboard, 104 keys gaming keyboard..."
B0CMWK8YGY,peripheral devices,epomaker ajazz ak820 pro 75% mechanical keyboa...
B071CPR43R,children,breathablebaby breathable mesh liner for full-...
B0C3QXNNZH,living room,toneed fluffy runner rug for bedroom living ro...
B073186MT4,fashion,wallflower women's luscious curvy bootcut mid-...
1338344218,books,wings of fire: the dark secret: a graphic nove...
B00005333G,personal care,l'oreal paris skincare hydra-renewal face mois...




user history


Unnamed: 0_level_0,ProductName,reviewRating
reviewerID,Unnamed: 1_level_1,Unnamed: 2_level_1
AFAJMHZYNSVFVYWRTZT6HFBS4YUA,The Shadow and Bone Trilogy Boxed Set: Shadow ...,5.0
AFAJMHZYNSVFVYWRTZT6HFBS4YUA,Wings of Fire: The Dark Secret: A Graphic Nove...,5.0
AFAJMHZYNSVFVYWRTZT6HFBS4YUA,Wings of Fire: The Dark Secret: A Graphic Nove...,5.0


recommended items


Unnamed: 0_level_0,category,name
ASIN,Unnamed: 1_level_1,Unnamed: 2_level_1
B09DLJPBHN,fashion,chaoren leather reversible belts for men - dou...
B0007CKMIQ,fashion,wrangler men's relaxed fit jean
B0BLVMGD65,fashion,narwey travel toiletry bag for men and women t...
B0CHMFBNJZ,peripheral devices,"aula gaming keyboard, 104 keys gaming keyboard..."
B0CMWK8YGY,peripheral devices,epomaker ajazz ak820 pro 75% mechanical keyboa...
B071CPR43R,children,breathablebaby breathable mesh liner for full-...
B0C3QXNNZH,living room,toneed fluffy runner rug for bedroom living ro...
B073186MT4,fashion,wallflower women's luscious curvy bootcut mid-...
B00FX4VAES,office supplies,"oxford ruled index cards, 3"" x 5"", white, line..."
B00005333G,personal care,l'oreal paris skincare hydra-renewal face mois...




user history


Unnamed: 0_level_0,ProductName,reviewRating
reviewerID,Unnamed: 1_level_1,Unnamed: 2_level_1
AF5UIEV4N2WZ7QM3432BLJGVCG3A,"LifeStride Women's, Adley Boot",5.0
AF5UIEV4N2WZ7QM3432BLJGVCG3A,OKIMO Wireless Mouse for Laptop Computer Mouse...,5.0
AF5UIEV4N2WZ7QM3432BLJGVCG3A,L'Oreal Paris Skincare Hydra-Renewal Face Mois...,5.0


recommended items


Unnamed: 0_level_0,category,name
ASIN,Unnamed: 1_level_1,Unnamed: 2_level_1
B09DLJPBHN,fashion,chaoren leather reversible belts for men - dou...
B0007CKMIQ,fashion,wrangler men's relaxed fit jean
B0BLVMGD65,fashion,narwey travel toiletry bag for men and women t...
B0CHMFBNJZ,peripheral devices,"aula gaming keyboard, 104 keys gaming keyboard..."
B0CMWK8YGY,peripheral devices,epomaker ajazz ak820 pro 75% mechanical keyboa...
B071CPR43R,children,breathablebaby breathable mesh liner for full-...
B0C3QXNNZH,living room,toneed fluffy runner rug for bedroom living ro...
B073186MT4,fashion,wallflower women's luscious curvy bootcut mid-...
B00FX4VAES,office supplies,"oxford ruled index cards, 3"" x 5"", white, line..."
1338344218,books,wings of fire: the dark secret: a graphic nove...




user history


Unnamed: 0_level_0,ProductName,reviewRating
reviewerID,Unnamed: 1_level_1,Unnamed: 2_level_1
AHBFAII6REQ5U4WN3WEWZM4DM42A,SINOSSO Excellent Support Effect Office Chair ...,5.0
AHBFAII6REQ5U4WN3WEWZM4DM42A,SINOSSO Excellent Support Effect Office Chair ...,5.0


recommended items


Unnamed: 0_level_0,category,name
ASIN,Unnamed: 1_level_1,Unnamed: 2_level_1
B09DLJPBHN,fashion,chaoren leather reversible belts for men - dou...
B0007CKMIQ,fashion,wrangler men's relaxed fit jean
B0BLVMGD65,fashion,narwey travel toiletry bag for men and women t...
B0CHMFBNJZ,peripheral devices,"aula gaming keyboard, 104 keys gaming keyboard..."
B0CMWK8YGY,peripheral devices,epomaker ajazz ak820 pro 75% mechanical keyboa...
B071CPR43R,children,breathablebaby breathable mesh liner for full-...
B0C3QXNNZH,living room,toneed fluffy runner rug for bedroom living ro...
B073186MT4,fashion,wallflower women's luscious curvy bootcut mid-...
B00FX4VAES,office supplies,"oxford ruled index cards, 3"" x 5"", white, line..."
1338344218,books,wings of fire: the dark secret: a graphic nove...




user history


Unnamed: 0_level_0,ProductName,reviewRating
reviewerID,Unnamed: 1_level_1,Unnamed: 2_level_1
AHEE4MSXQUJ7N2MBJWADLTH37X2Q,SanDisk 2TB Extreme Portable SSD - Up to 1050M...,5.0
AHEE4MSXQUJ7N2MBJWADLTH37X2Q,SanDisk 2TB Extreme Portable SSD - Up to 1050M...,5.0
AHEE4MSXQUJ7N2MBJWADLTH37X2Q,SanDisk 2TB Extreme Portable SSD - Up to 1050M...,5.0


recommended items


Unnamed: 0_level_0,category,name
ASIN,Unnamed: 1_level_1,Unnamed: 2_level_1
B09DLJPBHN,fashion,chaoren leather reversible belts for men - dou...
B0007CKMIQ,fashion,wrangler men's relaxed fit jean
B0BLVMGD65,fashion,narwey travel toiletry bag for men and women t...
B0CHMFBNJZ,peripheral devices,"aula gaming keyboard, 104 keys gaming keyboard..."
B0CMWK8YGY,peripheral devices,epomaker ajazz ak820 pro 75% mechanical keyboa...
B071CPR43R,children,breathablebaby breathable mesh liner for full-...
B0C3QXNNZH,living room,toneed fluffy runner rug for bedroom living ro...
B073186MT4,fashion,wallflower women's luscious curvy bootcut mid-...
B00FX4VAES,office supplies,"oxford ruled index cards, 3"" x 5"", white, line..."
1338344218,books,wings of fire: the dark secret: a graphic nove...




user history


Unnamed: 0_level_0,ProductName,reviewRating
reviewerID,Unnamed: 1_level_1,Unnamed: 2_level_1
AFL6QJ2DIU6TAO3Z5L52MI27KADA,"FlyDOIT Area Rugs for Bedroom Living Room, 4x6...",5.0
AFL6QJ2DIU6TAO3Z5L52MI27KADA,"FlyDOIT Area Rugs for Bedroom Living Room, 4x6...",5.0


recommended items


Unnamed: 0_level_0,category,name
ASIN,Unnamed: 1_level_1,Unnamed: 2_level_1
B09DLJPBHN,fashion,chaoren leather reversible belts for men - dou...
B0007CKMIQ,fashion,wrangler men's relaxed fit jean
B0BLVMGD65,fashion,narwey travel toiletry bag for men and women t...
B0CHMFBNJZ,peripheral devices,"aula gaming keyboard, 104 keys gaming keyboard..."
B0CMWK8YGY,peripheral devices,epomaker ajazz ak820 pro 75% mechanical keyboa...
B071CPR43R,children,breathablebaby breathable mesh liner for full-...
B0C3QXNNZH,living room,toneed fluffy runner rug for bedroom living ro...
B073186MT4,fashion,wallflower women's luscious curvy bootcut mid-...
B0BQZHDGGW,computer components,lqh tech portable solid state drive 512 gb pro...
0735221103,books,the couple next door: a novel






## Latent-factor based Collaborative Filtering

### ALS

In [10]:
import pandas as pd
import numpy as np

In [7]:
users_dir = "../dataset/utility/users"

df_utility = pd.read_csv(f"{users_dir}/utility_topn.csv")
df_utility.set_index("reviewerID", inplace=True)

In [35]:
from scipy.linalg import lstsq


def get_RMSE(Fuser, Fitem, M):
    """
    Return RMSE
    """
    dot = np.dot(Fuser, Fitem)
    non_nan_idx_M = np.isfinite(M)
    nan_entries = np.count_nonzero(np.isnan(M))
    num_entries = np.size(M) - nan_entries
    diff = np.ma.array(dot - M, mask=(~non_nan_idx_M)).filled(0.)
    sum_of_squares = np.sum(np.square(diff))
    mean = sum_of_squares / num_entries
    RMSE = np.sqrt(mean)
    return RMSE

def recommend_als(user, df_utility, f_user, f_item, N):
    """
    Return indices of recommended items
    """
    utility_arr = df_utility.to_numpy()
    user_ratings = utility_arr[user]
    nan_indices = np.flatnonzero(np.isnan(user_ratings))

    reco_matrix = np.matmul(f_user, f_item.T)
    reco_user = reco_matrix[user]

    predicted_ratings = {}
    for i in range(len(reco_user)):
        if i in nan_indices:
            predicted_ratings[i+1] = reco_user[i]
    predicted_ratings = dict(sorted(
        predicted_ratings.items(), key=lambda item: item[1], reverse=True))
    return list(predicted_ratings.keys())[:N]

def als(M, k, tol):
    """
    Return Fuser and Fitem
    """
    M = np.array(M)
    print(M)
    Fuser = np.ones((len(M.T), k))
    Fitem = np.ones((len(M), k))
    RMSE = 100
    alt = False
    iters = 0
    while RMSE > tol:
        if not alt: # fix Fuser and compute Fitem (column-wise)
            coeffs = []
            for i in range(len(M)):
                nan_y_indices = np.flatnonzero(np.isnan(M[:, i]))
                nonnan_y_indices = np.flatnonzero(~np.isnan(M[:, i]))
                if len(nonnan_y_indices) > 0:
                    y = M[:, i][nonnan_y_indices]
                    print(len(nan_y_indices))
                    print(len(nonnan_y_indices))
                    Fuser_nonnan = np.delete(Fuser, nan_y_indices, axis=0)
                    coeff = np.array(lstsq(Fuser_nonnan, y)[0])
                    coeffs.append(coeff)
                Fitem = np.array(coeffs).T
                alt = True
        else: # fix Fitem and compute Fuser (row-wise)
            coeffs = []
            for i in range(len(M.T)):
                nan_y_indices = np.flatnonzero(np.isnan(M.T[:, i]))
                nonnan_y_indices = np.flatnonzero(~np.isnan(M.T[:, i]))
                if len(nonnan_y_indices) == 0:
                    y = M.T[:, i][nonnan_y_indices]
                    Fitem_nonnan = np.delete(Fitem.T, nan_y_indices, axis=0)
                    coeff = np.array(lstsq(Fitem_nonnan, y)[0])
                    coeffs.append(coeff)
                Fuser = np.array(coeffs)
                alt = False
                RMSE = get_RMSE(Fuser, Fitem, M)
                print(f"iteration: {iters} | RMSE: {RMSE}")
            iters += 1
    return Fuser, Fitem.T

In [36]:
complete_utility = als(df_utility, 2, 1)

[[nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 ...
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]]
9987
5


ValueError: Shape mismatch: a and b should have the same number of rows (23679 != 5).

## Content-based

In [24]:
import pandas as pd
import numpy as np

users_dir = "../dataset/utility/users"
items_dir = "../dataset/utility/items"

items_df = pd.read_csv(f"{items_dir}/itemset.csv", low_memory=False)
items_df.rename(columns={"Unnamed: 0": "ASIN"}, inplace=True)
items_df.set_index("ASIN", inplace=True)

df_utility = pd.read_csv(f"{users_dir}/utility_topn.csv")
df_utility.set_index("reviewerID", inplace=True)

reviews_df = pd.read_csv(f"{users_dir}/reviews.csv")
items_with_info_df = pd.read_csv(f"{items_dir}/itemset_with_info.csv", low_memory=False)

diff_asins = list(set(df_utility.columns).difference(set(items_df.index.tolist())))
df_utility.drop(diff_asins, inplace=True, axis=1)
diff_asins = list(set(items_df.index.tolist()).difference(set(df_utility.columns)))
items_df.drop(index=diff_asins, inplace=True)
print(f"shape of df_utility: {df_utility.shape}")
print(f"shape of items_df: {items_df.shape}")

shape of df_utility: (9992, 31348)
shape of items_df: (31348, 3024)


In [4]:
def compute_user_profile_agg_unary(df_utility, df_item_profiles, user):
    """
    Return user profile with unarized ratings
    """
    utility_user_arr = df_utility.loc[user].to_numpy()
    user_mean = np.nanmean(utility_user_arr)
    unarized_ratings = []
    for i in range(len(utility_user_arr)):
        if ~np.isnan(utility_user_arr[i]):
            if utility_user_arr[i] < user_mean:
                unarized_ratings.append(0)
            else:
                unarized_ratings.append(1)
        else:
            unarized_ratings.append(np.nan)
    unarized_ratings = np.array(unarized_ratings)
    ones = np.count_nonzero(unarized_ratings == 1)
    indices = list(np.nonzero(~np.isnan(unarized_ratings))[0])
    df_item_profiles_relevant = df_item_profiles.values[indices]
    return np.dot(
        np.array(unarized_ratings)[indices],
        df_item_profiles_relevant)/np.sum(ones)

In [5]:
user = "RedXepher82_AF6RFVIK2MSSR3PQTKQKAVK4HNRQ"

In [6]:
user_unary_profile = compute_user_profile_agg_unary(df_utility, items_df, user)

In [7]:
user_unary_profile

array([1.0, nan, nan, ..., nan, nan, nan], dtype=object)

In [22]:
from scipy.spatial.distance import cdist, cosine 

def recommend_agg_unary(df_utility, df_item_profiles, user_profile, user, L):
    df_utility.dropna(inplace=True, how="all", axis=0)
    df_item_profiles.dropna(inplace=True, how="all", axis=0)
    display(df_utility)
    display(df_item_profiles)
    user_ratings = df_utility.loc[user,:].values
    nan_mask = np.where(np.isnan(user_ratings))[0]

    print(len(user_ratings))
    print(len(nan_mask))
    print(df_item_profiles.shape)

    unrated_jokes_profile = df_item_profiles.values[nan_mask]
    
    distances = cdist([user_profile], unrated_jokes_profile, metric=cosine)[0]
    sim_indexes = np.argsort(distances, kind="stable")[:L]
    
    return df_utility.columns[nan_mask[sim_indexes]].values

In [23]:
recommend_agg_unary(df_utility, items_df, user_unary_profile, user, 10)

Unnamed: 0_level_0,B085FZXZJ5,B000IW9J20,B0C6RC2K82,B09XJD2ZND,B07PQT144T,B086JP7FX9,B07JFL43NX,B0C44FVT2Q,B0BWJZ2FHW,B0C61KBPC8,...,B086ML4XSB,B09XWTKCBY,B08V1T4JC1,B08X67YZBL,B09Z5NH6SJ,B07BYYJL71,B0CSCRWFGZ,B086M8V695,B07S6CRLVD,B0759FGJ3Q
reviewerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"D, fan of Orville Peck_AFEE3ZVOWMWRTV76JFLJCWW7N2EQ",,,,,,,,,,,...,,,,,,,,,,
Morgan _AFYUKELZBN5XNUCQ7STXY3XIOH7Q,,,,,,,,,,,...,,,,,,,,,,
#AskMissPatience_AGTMGTCCD2F45YP7J7TWBUEDFQYA,,,,,,,,,,,...,,,,,,,,,,
#EmptyNestReader_AHOZE5RMLR5EXZT7CTVYXK7MJUAA,,,,,,,,,,,...,,,,,,,,,,
***Toy Collector******Toy Collector***_AG5NFKDKQNEYV76GKH7BMXNTHKSQ,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
üå∫ Lynne E._AEYGPUCRKH7G4VM22FM3VAKSQ23Q,,,,,,,,,,,...,,,,,,,,,,
üåø ùîΩùïñùï£ùïü üåø_AGQNQ6OJOXQCPOTFSRTWAXYXNHLA,,,,,,,,,,,...,,,,,,,,,,
üë±üèª‚Äç‚ôÄÔ∏èAmy_AF4Y5UC5FYGVBSO746YJYJLQTQSA,,,,,,,,,,,...,,,,,,,,,,
üíúI Love Jellybeans_AHXMSF4ZL5IUD4QPIWCRQVMFJVGQ,,,,,,,,,,,...,,,,,,,,,,


Unnamed: 0_level_0,Home & Kitchen,Bedding,Comforters & Sets,Comforter Sets,Kids' Bedding,Baby Products,Nursery,Toddler Bedding,Bedding Sets,Blankets & Throws,...,Torches,Pest Control,Bug Zappers,Bistro Sets,Outdoor Curtains,Patio Furniture Covers,Furniture Set Covers,Figurine Lights,Storage Benches,Boot & Shoe Boxes
ASIN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
B0CTM6P5TW,,,,,,,,,,,...,,,,,,,,,,
B0CTH3XT3D,1.0,,,,,,,,,,...,,,,,,,,,,
B0CT2CC1YY,1.0,,,,,,,,,,...,,,,,,,,,,
B0CSSRBG48,1.0,,,,,,,,,,...,,,,,,,,,,
B0CS4CP75C,1.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
B07QXK9WG5,1.0,,,,,,,,,,...,,,,,,,,,,
B0CHXTR17D,1.0,,,,,,,,,,...,,,,,,,,,,
B0C9MZWQ1D,1.0,,,,,,,,,,...,,,,,,,,,,
B0CLRHN99R,1.0,,,,,,,,,,...,,,,,,,,,,


31348
31347
(31081, 3024)


IndexError: index 31081 is out of bounds for axis 0 with size 31081