In [24]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from sklearn.neighbors import NearestNeighbors
import pyarrow.feather as feather
from scipy.sparse import csr_matrix
import tqdm

In [3]:
df = feather.read_feather('data_train.feather')
df

Unnamed: 0,overall,reviewerID,asin
0,3.0,AZ1119BK1WNOP,B00007FFL9
1,5.0,A1VQL3LLOS1ZVX,B00007FFL9
2,5.0,A30WNM7RK2Z2VG,B00007FPNN
3,5.0,A30WOXPI820KFJ,B0000B35DA
4,5.0,A1U8VI6I2MFEU8,B0000B35DA
...,...,...,...
141008,4.0,A1MUFKXECGHPNR,B01HII35LC
141009,5.0,AVPZO0KSI5NBS,B01HJ79A4S
141010,5.0,A3SV0PTMJ54B0Z,B01HJC17Y4
141011,5.0,APDCPF4R6188U,B01HJC17Y4


In [19]:
df_test = feather.read_feather('data_test.feather')

In [21]:
asin_list = df['asin'].unique()
asin_index_mapping = {i: asin for i, asin in enumerate(asin_list)}

In [22]:
asin_list = df_test['asin'].unique()
asin_index_mapping_test = {i: asin for i, asin in enumerate(asin_list)}

In [4]:
pivot_users_df = pd.pivot_table(df, index="reviewerID", columns="asin", values="overall")
pivot_users_df = pivot_users_df.fillna(0)
pivot_users_df

asin,0871167042,B00007FFL9,B00007FPNN,B00007FPQZ,B00007FPTC,B00007GNHR,B0000B35DA,B00011V8YU,B00014ZHTY,B00023JMXQ,...,B01HI6GWI2,B01HI6I1HM,B01HI8EJE4,B01HIA9B0Y,B01HII35LC,B01HISBC4E,B01HJ79A4S,B01HJBEZ3A,B01HJC0WSQ,B01HJC17Y4
reviewerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A0281906287OAYIZ1EFS,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A036513549TVB6QSFK04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A0488385844WNV2OWO9X,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A05467882E05R82HOCOI,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A0737687P6BTN9XQGAWA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AZZJLO4BYM8WK,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AZZLHDUWKJUU1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AZZMQ85DPFEG3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AZZOR0M3N3W00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
pivot_users_df = pd.pivot_table(df_test, index="reviewerID", columns="asin", values="overall")
pivot_users_df_test = pivot_users_df.fillna(0)

In [5]:
assert pivot_users_df.index.nunique() == pivot_users_df.shape[0]
assert pivot_users_df.keys().nunique() == pivot_users_df.shape[1]

In [7]:
mat_users = csr_matrix(pivot_users_df.values)
mat_users

<15184x31491 sparse matrix of type '<class 'numpy.float64'>'
	with 130584 stored elements in Compressed Sparse Row format>

In [10]:
model_knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20, n_jobs=-1)
model_knn.fit(mat_users)


In [16]:
def get_recommendations(user_index, data, model, n_recommendations, asin_index_mapping):
    distances, indices = model.kneighbors(data[user_index], n_neighbors=n_recommendations+1)
    recommendations = []
    for i in range(1, len(distances.flatten())):
        neighbor_index = indices.flatten()[i]
        for asin_idx in data[neighbor_index].nonzero()[1]:
            if asin_idx not in data[user_index].nonzero()[1]:
                asin_name = asin_index_mapping[asin_idx]
                if asin_name not in [asin_index_mapping[idx] for idx in data[user_index].nonzero()[1]]:
                    recommendations.append(asin_name)
                    if len(recommendations) == n_recommendations:
                        return recommendations
    return recommendations

In [17]:
user_index = 0 
n_recommendations = 10
recommended_asins = get_recommendations(user_index, mat_users, model_knn, n_recommendations, asin_index_mapping)
print(recommended_asins)

['B00F58NQJE', 'B00JIY3VS2', 'B00N5LJ4ZI', 'B00U12OU6W', 'B017E0MB4Q', 'B005GDRQ5S', 'B00786YAEQ', 'B013JGI9EU', 'B01DJ5A3AY', 'B00NM5GN2E']


Validation

In [25]:
def compute_mrr(ground_truth, predictions):
    for idx, item in enumerate(predictions):
        if item in ground_truth:
            return 1 / (idx + 1)
    return 0

def compute_precision(ground_truth, predictions):
    true_positives = sum(1 for item in predictions if item in ground_truth)
    return true_positives / len(predictions) if predictions else 0

def jaccard_index(list1, list2):
    set1, set2 = set(list1), set(list2)
    intersection = len(set1.intersection(set2))
    union = len(set1.union(set2))
    return intersection / union if union != 0 else 0

In [31]:
user_validation_scores = []
n_recommendations = 10 

for user_index in range(min(10, pivot_users_df_test.shape[0])):
    target_user_id = pivot_users_df_test.index[user_index]
    recommended_asins = get_recommendations(user_index, mat_users, model_knn, n_recommendations, asin_index_mapping)

    user_row = pivot_users_df_test.loc[target_user_id].values
    non_zero_indices = np.nonzero(user_row)[0]

    actual_asins = [pivot_users_df_test.columns[idx] for idx in non_zero_indices]

    jaccard = jaccard_index(recommended_asins, actual_asins)
    mrr = compute_mrr(actual_asins, recommended_asins)
    precision = compute_precision(actual_asins, recommended_asins)

    user_validation_scores.append({
        'user': target_user_id,
        'jaccard_index': jaccard,
        'mean_reciprocal_rank': mrr,
        'precision': precision
    })

evaluation_results = pd.DataFrame(user_validation_scores)
print(evaluation_results)

                   user  jaccard_index  mean_reciprocal_rank  precision
0  A0281906287OAYIZ1EFS            0.0                     0        0.0
1  A036513549TVB6QSFK04            0.0                     0        0.0
2  A0488385844WNV2OWO9X            0.0                     0        0.0
3  A05467882E05R82HOCOI            0.0                     0        0.0
4  A0737687P6BTN9XQGAWA            0.0                     0        0.0
5  A09643921E186LVHYON2            0.0                     0        0.0
6  A0986263H7SX62P1SRDD            0.0                     0        0.0
7        A1002LJCM20EZ5            0.0                     0        0.0
8        A100ELBI8BSXR1            0.0                     0        0.0
9        A101CAMZDHU1V9            0.0                     0        0.0
