In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

df = pd.read_csv("cleaned_reviews.csv")
df.head()

Unnamed: 0,user_id,product_id,rating
0,A1K0TEB46098KW,B0019FEOS4,5.0
1,ACVLDQU6PAM0V,B004KA6VHC,5.0
2,A2X1AMZCSA3H6S,B001BSBFUO,5.0
3,A26Z6YDMYP7EJO,B003ESRF5I,4.0
4,A1LFXR2GB56XWA,B001BCVY4W,5.0


In [4]:
ratings = df[['user_id', 'product_id', 'rating']].dropna()
ratings = ratings[ratings['rating'] > 0]

ratings.head()

Unnamed: 0,user_id,product_id,rating
0,A1K0TEB46098KW,B0019FEOS4,5.0
1,ACVLDQU6PAM0V,B004KA6VHC,5.0
2,A2X1AMZCSA3H6S,B001BSBFUO,5.0
3,A26Z6YDMYP7EJO,B003ESRF5I,4.0
4,A1LFXR2GB56XWA,B001BCVY4W,5.0


In [5]:
train_data, test_data = train_test_split(
    ratings,
    test_size=0.2,
    random_state=42
)

print("Train data size:", train_data.shape)
print("Test data size:", test_data.shape)

Train data size: (24000, 3)
Test data size: (6000, 3)


In [6]:
train_matrix = train_data.pivot_table(
    index='user_id',
    columns='product_id',
    values='rating'
).fillna(0)

train_matrix.head()

product_id,0006641040,7310172001,7310172101,B00002N8SM,B00004RAMS,B00004RAMX,B00004RAMY,B00004RYGX,B00004S1C5,B00004S1C6,...,B009KPU6LO,B009KUFALA,B009NTCO4O,B009PG8MVO,B009QEBGIQ,B009QNJRSS,B009RSR8HO,B009SA5NNW,B009UOFTUI,B009UOFU20
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
#oc-R115TNMSPFT9I7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
#oc-R11DNU2NBKQ23Z,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
#oc-R1791MZMDMM68R,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
#oc-R19W3DMF9X0I7C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
#oc-R1B9W981WGB5D0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
from sklearn.metrics.pairwise import cosine_similarity

item_similarity = cosine_similarity(train_matrix.T)

item_similarity_df = pd.DataFrame(
    item_similarity,
    index=train_matrix.columns,
    columns=train_matrix.columns
)

item_similarity_df.head()

product_id,0006641040,7310172001,7310172101,B00002N8SM,B00004RAMS,B00004RAMX,B00004RAMY,B00004RYGX,B00004S1C5,B00004S1C6,...,B009KPU6LO,B009KUFALA,B009NTCO4O,B009PG8MVO,B009QEBGIQ,B009QNJRSS,B009RSR8HO,B009SA5NNW,B009UOFTUI,B009UOFU20
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0006641040,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7310172001,0.0,1.0,0.102757,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7310172101,0.0,0.102757,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B00002N8SM,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B00004RAMS,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
def recommend_products(product_id, n=5):
    if product_id not in item_similarity_df.columns:
        return []

    similarity_scores = item_similarity_df[product_id].sort_values(ascending=False)
    return similarity_scores.iloc[1:n+1].index.tolist()

In [15]:
sample_product = train_data['product_id'].iloc[0]
recommend_products(sample_product, n=20)

['B002V8V7HE',
 'B002VAU9DU',
 'B002VBVCC6',
 'B002VC943E',
 'B002VH58R0',
 'B002VKMA1Y',
 'B002VROXD0',
 'B002VRQJC8',
 'B002V5D3YM',
 'B002VX8MGS',
 'B002VZUW0A',
 'B002VZY7UG',
 'B002W1D890',
 'B002W4DLJE',
 'B002W4KZM0',
 'B002W5SDEQ',
 'B002W7V184',
 'B002VRSJVM',
 'B002UUQHEQ',
 'B002UUQUAC']

In [16]:
from sklearn.metrics import precision_score, recall_score, f1_score

In [17]:
def get_actual_products(test_data, product_id):
    return test_data[test_data['product_id'] == product_id]['product_id'].tolist()

In [18]:
# Pick one product from test data
test_product = test_data['product_id'].iloc[0]

# Get recommendations
recommended = recommend_products(test_product, n=5)

# Get actual products from test data
actual = get_actual_products(test_data, test_product)

recommended, actual

(['B0039LVLS2', 'B006N3IG4K', 'B003VXHGPK', 'B000CQ4DS2', 'B000LKZD4W'],
 ['B0039ZOZ86', 'B0039ZOZ86', 'B0039ZOZ86'])

In [19]:
# Create a combined list
all_products = list(set(recommended + actual))

# Binary vectors
y_true = [1 if p in actual else 0 for p in all_products]
y_pred = [1 if p in recommended else 0 for p in all_products]

y_true, y_pred

([0, 0, 1, 0, 0, 0], [1, 1, 0, 1, 1, 1])

In [20]:
precision = precision_score(y_true, y_pred, zero_division=0)
recall = recall_score(y_true, y_pred, zero_division=0)
f1 = f1_score(y_true, y_pred, zero_division=0)

print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

Precision: 0.0
Recall: 0.0
F1-score: 0.0


In [21]:
def hit_at_k(actual, recommended, k=10):
    recommended_k = recommended[:k]
    return int(any(item in actual for item in recommended_k))

In [22]:
hit_10 = hit_at_k(actual, recommended, k=10)
hit_20 = hit_at_k(actual, recommended, k=20)

print("Hit@10:", hit_10)
print("Hit@20:", hit_20)

Hit@10: 0
Hit@20: 0
