### 1. Import libraries & Load data meta_Appliances.json & Appliances.json

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import random
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import TensorDataset, DataLoader
from collections import defaultdict

In [None]:
# Load the product metadata (JSON Lines: one record per line) and discard the
# columns that are not read anywhere later in this notebook.
unused_meta_columns = [
    "feature", "fit", "description", "imageURL", "imageURLHighRes",
    "tech1", "tech2", "rank", "similar_item",
]
df = pd.read_json("meta_Appliances.json", lines=True).drop(columns=unused_meta_columns)

In [None]:
# Flatten the list-valued `category` field into one comma-separated string
# (non-list values are simply stringified).
df['category'] = df['category'].apply(lambda x: ', '.join(x) if isinstance(x, list) else str(x))

# Drop exact duplicate rows. Some columns hold lists/dicts, which are not
# hashable, so duplicates are detected on a string view of the frame.
# The previous `df.astype(str).drop_duplicates(inplace=True)` only modified
# that temporary string copy and left `df` itself untouched.
df = df.loc[~df.astype(str).duplicated()].reset_index(drop=True)
df.head()

Unnamed: 0,category,title,also_buy,brand,also_view,details,main_cat,date,price,asin
0,"Appliances, Refrigerators, Freezers & Ice Makers",Tupperware Freezer Square Round Container Set ...,[],Tupperware,[],{},Appliances,"November 19, 2008",,7301113188
1,"Appliances, Refrigerators, Freezers & Ice Makers",2 X Tupperware Pure &amp; Fresh Unique Covered...,[],Tupperware,[B004RUGHJW],{},Appliances,"June 5, 2016",$3.62,7861850250
2,"Appliances, Parts &amp; Accessories",The Cigar - Moments of Pleasure,[],The Cigar Book,"[B01HCAVSLK, 1632206579]",{},Amazon Home,,$150.26,8792559360
3,"Appliances, Parts & Accessories",Caraselle 2X 50G Appliance Descalene,[],Caraselle,[],{},Tools & Home Improvement,"December 17, 2014",.a-box-inner{background-color:#fff}#alohaBuyBo...,9792954481
4,"Appliances, Parts & Accessories, Range Parts &...",EATON Wiring 39CH-SP-L Arrow Hart 1-Gang Chrom...,[],EATON Wiring,[],{},Tools & Home Improvement,"January 16, 2007",$3.43,B00002N5EL


In [None]:
# Load the review records (JSON Lines) and discard the columns that are not
# read anywhere later in this notebook.
unused_review_columns = ["style", "unixReviewTime", "image"]
df2 = pd.read_json("Appliances.json", lines=True).drop(columns=unused_review_columns)

In [None]:
# Drop exact duplicate reviews. Duplicates are detected on a string view of
# the frame so that any unhashable cell values cannot break the comparison.
# The previous `df2.astype(str).drop_duplicates(inplace=True)` only modified
# that temporary string copy and left `df2` itself untouched.
df2 = df2.loc[~df2.astype(str).duplicated()].reset_index(drop=True)
df2.head()

Unnamed: 0,overall,vote,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary
0,5,2.0,False,"11 27, 2013",A3NHUQ33CFH3VM,1118461304,Greeny,Not one thing in this book seemed an obvious o...,Clear on what leads to innovation
1,5,,False,"11 1, 2013",A3SK6VNBQDNBJE,1118461304,Leif C. Ulstrup,I have enjoyed Dr. Alan Gregerman's weekly blo...,Becoming more innovative by opening yourself t...
2,5,,False,"10 10, 2013",A3SOFHUR27FO3K,1118461304,Harry Gilbert Miller III,Alan Gregerman believes that innovation comes ...,The World from Different Perspectives
3,5,,False,"10 9, 2013",A1HOG1PYCAE157,1118461304,Rebecca Ripley,"Alan Gregerman is a smart, funny, entertaining...",Strangers are Your New Best Friends
4,5,10.0,False,"09 7, 2013",A26JGAM6GZMM4V,1118461304,Robert Morris,"As I began to read this book, I was again remi...","How and why it is imperative to engage, learn ..."


### 2. Tiền xử lý dữ liệu sơ lược trước khi chia tập train-test

In [None]:
def clean_price(price):
    """Convert a raw price value to a float.

    Args:
        price: Raw value from the metadata `price` column. Strings such as
            ``"$1,150.26"`` are parsed after removing ``$`` and ``,``.
            Values that are already numeric are passed through, so applying
            the function to an already-cleaned column does not erase it.

    Returns:
        The price as ``float``, or ``np.nan`` when the value is missing,
        empty, of an unsupported type, or not parseable as a number
        (e.g. scraped CSS/HTML residue).
    """
    # Already numeric (including NaN): keep the value. bool is excluded
    # because it is an int subclass but never a meaningful price.
    if isinstance(price, (int, float, np.integer, np.floating)) and not isinstance(price, bool):
        return float(price)
    if not isinstance(price, str):
        return np.nan
    try:
        return float(price.replace('$', '').replace(',', '').strip())
    except ValueError:
        # Only parse failures are expected here; anything else should surface.
        return np.nan

# Normalise the raw metadata columns: dates become timestamps (NaT when they
# cannot be parsed) and prices are converted element-wise by `clean_price`.
df['date'] = pd.to_datetime(df['date'], errors='coerce')
df['price'] = df['price'].map(clean_price)

In [None]:
# Review clean-up in one pass: parse the review date (NaT when unparseable),
# drop rows lacking a product or reviewer id, and store ratings as floats.
df2['reviewTime'] = pd.to_datetime(df2['reviewTime'], errors='coerce')
df2 = (
    df2
    .dropna(subset=['asin', 'reviewerID'])
    .astype({'overall': float})
)

### 3. Merge 2 dataframe lại

In [None]:
# Attach product attributes to every review.
# The metadata is reduced to one row per `asin` first: if a product appears
# more than once in the metadata, a plain left join would emit the same review
# several times and inflate the interaction counts used later.
product_columns = ['asin', 'title', 'brand', 'price', 'category']
products = df[product_columns].drop_duplicates(subset='asin')
# `validate` makes pandas raise if the one-row-per-asin assumption is broken.
df_merged = df2.merge(products, on='asin', how='left', validate='many_to_one')

# Keep only reviews whose product has a title and a category. `.copy()` gives
# an independent frame so that columns can be added to it later without
# pandas emitting SettingWithCopyWarning.
df_merged_valid = df_merged.dropna(subset=['title', 'category']).copy()

### 4. Trước khi recommend, chọn ngẫu nhiên 50 users, xem họ từng mua mặt hàng gì, xuất ra csv

In [None]:
# Reviewers that have at least one review with usable product metadata.
required_users = 50
valid_users = df_merged_valid['reviewerID'].unique()
if len(valid_users) < required_users:
    # Raise instead of calling exit(): in a notebook exit() shuts the kernel
    # down, whereas an exception stops "Run All" and shows the reason.
    raise ValueError(f"Error: Only {len(valid_users)} users with purchase history found!")
random.seed(42)  # For reproducibility

In [None]:
# Draw 50 random reviewers to generate recommendations for.
# A dedicated generator seeded with 42 is used so that re-running this cell
# on its own always yields the same users; it produces the same draw as
# calling random.seed(42) immediately before random.sample.
selected_users = random.Random(42).sample(list(valid_users), 50)

In [None]:
# Purchase history of the sampled reviewers, formatted for export.
# `.loc` selects rows and columns in a single step and `assign` returns a new
# frame, so no column is written into a slice of `df_merged_valid`
# (the earlier chained indexing + assignment triggers SettingWithCopyWarning).
history_columns = ['reviewerID', 'reviewerName', 'category', 'title', 'brand', 'price']
is_selected = df_merged_valid['reviewerID'].isin(selected_users)
purchase_history = (
    df_merged_valid.loc[is_selected, history_columns]
    .assign(
        # Missing prices are shown as 'Unknown'; known ones with two decimals.
        price=lambda frame: frame['price'].map(lambda x: 'Unknown' if pd.isna(x) else f"{x:.2f}"),
        brand=lambda frame: frame['brand'].fillna('Unknown'),
    )
    .drop_duplicates()
)

In [None]:
# Remove rows whose reviewer name contains "Amazon Customer"; rows with a
# missing name are kept (na=False).
is_anonymous = purchase_history['reviewerName'].str.contains("Amazon Customer", na=False)
purchase_history = purchase_history[~is_anonymous]

In [None]:
# Preview the first five rows of the purchase history.
purchase_history.head(5)

Unnamed: 0,reviewerID,reviewerName,category,title,brand,price
6925,A255JHGWY8PJHY,Diesel Dad,"Appliances, Refrigerators, Freezers & Ice Make...",Haier HBF05EBSS Draft Beer Dispenser,Haier,Unknown
26315,AEZQGS6WCAVA9,desarae craig,"Appliances, Parts & Accessories",BestAir ES12 Kenmore 14911 / Emerson HDC-12 Re...,BestAir,Unknown
27144,A5FE42ROUWIY8,Clyde Massaro,"Appliances, Ranges, Ovens &amp; Cooktops, Rang...","Broan QS130SS 220 CFM Under Cabinet Hood, 30-...",Broan,Unknown
28899,A1PUD5DRQR84B9,Papa Homer,"Appliances, Parts & Accessories, Humidifier Pa...",Aprilaire 35 Replacement Water Panel for April...,Aprilaire,12.31
29959,A2XAXPYE4Z9Y5O,Michael,"Appliances, Parts & Accessories, Humidifier Pa...",Aprilaire 35 Replacement Water Panel for April...,Aprilaire,12.31


In [None]:
# Write the sampled purchase history to disk without the row index.
history_csv_path = 'purchase_history.csv'
purchase_history.to_csv(history_csv_path, index=False)

### 5. Mô hình UltraGCN cho Recommend

#### 5.1. Chuẩn bị dữ liệu

In [None]:
# Encode raw reviewer / product identifiers as integer ids.
user_encoder = LabelEncoder()
item_encoder = LabelEncoder()
# `assign` builds a new frame rather than writing columns into
# `df_merged_valid` in place: that frame comes from filtering `df_merged`, so
# in-place column writes can raise SettingWithCopyWarning. It also keeps the
# cell safe to re-run.
df_merged_valid = df_merged_valid.assign(
    user=user_encoder.fit_transform(df_merged_valid['reviewerID']),
    item=item_encoder.fit_transform(df_merged_valid['asin']),
)

# Interactions of the sampled reviewers only. `.loc[...].copy()` yields an
# independent frame because further columns are added to it afterwards.
interaction_columns = ['user', 'item', 'overall', 'reviewerID', 'reviewerName']
interactions = df_merged_valid.loc[
    df_merged_valid['reviewerID'].isin(selected_users), interaction_columns
].copy()

In [None]:
# Re-index the encoded ids that occur in the sampled interactions to dense
# ranges 0..n-1, numbered in order of first appearance.
user_ids = interactions['user'].unique()
item_ids = interactions['item'].unique()

user_map = dict(zip(user_ids, range(len(user_ids))))
item_map = dict(zip(item_ids, range(len(item_ids))))

num_users, num_items = len(user_map), len(item_map)

# Dense ids used as graph node indices.
interactions['user_id'] = interactions['user'].map(user_map)
interactions['item_id'] = interactions['item'].map(item_map)

#### 5.2. Tạo tập edge và edge weight

In [None]:
# (user_id, item_id) pair of every interaction row, in row order.
pairs = list(zip(interactions['user_id'].tolist(), interactions['item_id'].tolist()))

# Number of interaction rows per user and per item (node degrees).
user_freq = defaultdict(int)
item_freq = defaultdict(int)
for u, i in pairs:
    user_freq[u] += 1
    item_freq[i] += 1

# Bipartite graph: users occupy nodes [0, num_users), items are offset by
# num_users. Every interaction contributes one edge in each direction.
edge_index = []
edge_weight = []
for u, i in pairs:
    item_node = num_users + i
    # Symmetric degree normalisation: 1 / sqrt(deg(u) * deg(i)).
    w = 1.0 / ((user_freq[u]**0.5) * (item_freq[i]**0.5))
    edge_index.extend([[u, item_node], [item_node, u]])
    edge_weight.extend([w, w])

# Shape (2, num_edges): first row holds one endpoint, second row the other.
edge_index = torch.tensor(edge_index).t().contiguous()
edge_weight = torch.tensor(edge_weight, dtype=torch.float32)

#### 5.3. Xây dựng mô hình UltraGCN

In [None]:
class UltraGCN(nn.Module):
    """Embedding table with one round of weighted neighbour aggregation.

    Args:
        num_nodes: Number of rows in the embedding table.
        emb_dim: Size of each embedding vector.
    """

    def __init__(self, num_nodes, emb_dim):
        super().__init__()
        self.emb = nn.Embedding(num_nodes, emb_dim)
        # Replace the default initialisation with Xavier-uniform weights.
        nn.init.xavier_uniform_(self.emb.weight)

    def forward(self, edge_index, edge_weight):
        """Aggregate weighted neighbour embeddings for every node.

        Args:
            edge_index: Tensor with exactly two rows; for each edge the first
                row is the node that receives the message and the second row
                is the node whose embedding is sent.
            edge_weight: One scalar weight per edge.

        Returns:
            Tensor shaped like the embedding table; row ``n`` is the sum of
            ``weight * embedding[sender]`` over all edges received by ``n``
            (all zeros for nodes without incoming edges).
        """
        target_idx, source_idx = edge_index.unbind(0)
        messages = self.emb.weight[source_idx] * edge_weight.unsqueeze(1)
        aggregated = torch.zeros_like(self.emb.weight)
        return aggregated.index_add(0, target_idx, messages)

#### 5.4. Train mô hình

In [None]:
embedding_dim = 32
num_epochs = 10
reg_weight = 1e-4  # strength of the embedding-norm regulariser

# Seed torch so that the initial embeddings and the sampled negatives (and
# therefore the recommendations) are reproducible across runs.
torch.manual_seed(42)

model_ultra = UltraGCN(num_users + num_items, embedding_dim)
optimizer = torch.optim.Adam(model_ultra.parameters(), lr=1e-3)

# Observed (user, item) pairs act as the positive training examples.
pos_users = torch.tensor(interactions['user_id'].to_numpy(), dtype=torch.long)
pos_items = torch.tensor(interactions['item_id'].to_numpy(), dtype=torch.long)

for epoch in range(num_epochs):
    model_ultra.train()
    optimizer.zero_grad()
    emb = model_ultra(edge_index, edge_weight)
    user_emb, item_emb = emb[:num_users], emb[num_users:]

    # One uniformly sampled item per positive pair serves as the negative.
    # A sampled item may occasionally be one the user interacted with; this
    # noise is accepted to keep the sampling simple.
    neg_items = torch.randint(0, num_items, pos_items.shape)
    pos_scores = (user_emb[pos_users] * item_emb[pos_items]).sum(dim=1)
    neg_scores = (user_emb[pos_users] * item_emb[neg_items]).sum(dim=1)

    # Pairwise ranking (BPR) loss: observed items should outscore sampled ones.
    # Minimising only torch.norm(emb), as before, never used the interactions
    # as a training signal and merely shrank the embeddings.
    ranking_loss = -nn.functional.logsigmoid(pos_scores - neg_scores).mean()
    loss = ranking_loss + reg_weight * torch.norm(emb)

    loss.backward()
    optimizer.step()
    print(f"Epoch {epoch}: Loss = {loss.item():.4f}")

Epoch 0: Loss = 6.5654
Epoch 1: Loss = 6.5146
Epoch 2: Loss = 6.4640
Epoch 3: Loss = 6.4136
Epoch 4: Loss = 6.3634
Epoch 5: Loss = 6.3133
Epoch 6: Loss = 6.2634
Epoch 7: Loss = 6.2137
Epoch 8: Loss = 6.1642
Epoch 9: Loss = 6.1148


#### 5.5. Xuất ra recommend cho 50 users trên, xuất ra csv

In [None]:
def recommend_for_multiple_users(model, edge_index, edge_weight, user_map, item_map, df, item_encoder, user_encoder, selected_users,
                                 top_k=5, exclude_seen=True, reviews=None):
    """Build a table of top-ranked products for each requested reviewer.

    Args:
        model: Callable ``model(edge_index, edge_weight)`` returning one
            embedding row per graph node (users first, then items); must
            also provide ``eval()``.
        edge_index: Integer tensor with two rows describing the graph edges;
            user nodes are ``[0, len(user_map))`` and item nodes follow.
        edge_weight: One weight per edge, forwarded to ``model``.
        user_map: Mapping from encoder user code to dense user index.
        item_map: Mapping from encoder item code to dense item index.
        df: Product metadata with columns ``asin``, ``category``, ``title``,
            ``brand`` and a numeric ``price``.
        item_encoder: Fitted encoder exposing ``inverse_transform`` for items.
        user_encoder: Fitted encoder exposing ``classes_`` for reviewers.
        selected_users: Iterable of raw ``reviewerID`` values.
        top_k: Maximum number of products returned per reviewer.
        exclude_seen: When True, items that already share an edge with the
            reviewer in ``edge_index`` are never recommended.
        reviews: Frame with ``reviewerID`` and ``reviewerName`` used to look
            up display names. Defaults to the notebook-level
            ``df_merged_valid`` for backward compatibility.

    Returns:
        DataFrame with columns ``reviewerID``, ``reviewerName``, ``category``,
        ``title``, ``brand``, ``price``; rows of each reviewer are ordered
        from best to worst score. Empty (same columns) when no reviewer could
        be resolved.
    """
    output_columns = ['reviewerID', 'reviewerName', 'category', 'title', 'brand', 'price']

    if reviews is None:
        # Historical behaviour: names were read from this notebook global.
        reviews = df_merged_valid

    num_users = len(user_map)

    # Lookups that do not depend on the user are built once, not per user.
    inv_item_map = {idx: encoded for encoded, idx in item_map.items()}
    encoded_user = {raw_id: code for code, raw_id in enumerate(user_encoder.classes_)}
    reviewer_names = reviews.drop_duplicates(subset='reviewerID').set_index('reviewerID')['reviewerName']
    # One metadata row per asin so every recommendation maps to exactly one row.
    catalog = df.drop_duplicates(subset='asin').set_index('asin')[['category', 'title', 'brand', 'price']]

    # Items each user already interacted with, read from the user -> item edges.
    seen_items = {}
    if exclude_seen:
        for src, dst in edge_index.t().tolist():
            if src < num_users <= dst:
                seen_items.setdefault(src, set()).add(dst - num_users)

    model.eval()
    results = []

    with torch.no_grad():
        x = model(edge_index, edge_weight)
        user_emb = x[:num_users]
        item_emb = x[num_users:]

        for user_id_full in selected_users:
            if user_id_full not in encoded_user:
                print(f"User {user_id_full} not in encoder!")
                continue
            user_id_encoded = encoded_user[user_id_full]
            if user_id_encoded not in user_map:
                print(f"User {user_id_full} not in user_map!")
                continue
            user_id = user_map[user_id_encoded]

            scores = (user_emb[user_id] @ item_emb.T).cpu().numpy()
            already_seen = seen_items.get(user_id, set())
            # Item indices from highest to lowest score, skipping seen items.
            ranked = [int(i) for i in np.argsort(scores)[::-1] if int(i) not in already_seen][:top_k]
            if not ranked:
                continue

            recommended_asins = item_encoder.inverse_transform([inv_item_map[i] for i in ranked])
            # reindex keeps the ranking order; asins absent from the metadata
            # are dropped rather than emitted as all-NaN rows.
            products = catalog.reindex(recommended_asins)
            products = products[products.index.isin(catalog.index)]
            results.append(
                products.reset_index(drop=True).assign(
                    reviewerID=user_id_full,
                    reviewerName=reviewer_names.get(user_id_full),
                )
            )

    if not results:
        # pd.concat raises on an empty list; return an empty, well-formed table.
        return pd.DataFrame(columns=output_columns)

    final_results = pd.concat(results, ignore_index=True)
    final_results['price'] = final_results['price'].apply(lambda x: 'Unknown' if pd.isna(x) else f"{x:.2f}")
    final_results['brand'] = final_results['brand'].fillna('Unknown')
    return final_results[output_columns]

In [None]:
# Score every sampled reviewer against the item embeddings.
recommendations = recommend_for_multiple_users(
    model=model_ultra,
    edge_index=edge_index,
    edge_weight=edge_weight,
    user_map=user_map,
    item_map=item_map,
    df=df,
    item_encoder=item_encoder,
    user_encoder=user_encoder,
    selected_users=selected_users,
)

# Remove rows whose reviewer name contains "Amazon Customer"; rows with a
# missing name are kept (na=False).
is_anonymous = recommendations['reviewerName'].str.contains("Amazon Customer", na=False)
recommendations = recommendations[~is_anonymous]
recommendations.head()

Unnamed: 0,reviewerID,reviewerName,category,title,brand,price
0,A3HV1HIUNZ61D,Marcella Wright,"Appliances, Parts & Accessories, Range Parts &...",Corelle Coordinates by Reston Lloyd Square Gas...,Corelle,15.53
1,A3HV1HIUNZ61D,Marcella Wright,"Appliances, Parts & Accessories, Refrigerator ...",Whirlpool Part Number 9750641,Whirlpool,15.0
2,A3HV1HIUNZ61D,Marcella Wright,"Appliances, Parts & Accessories, Refrigerator ...","LG LT700P Refrigerator Water Filter, Filters u...",LG,49.15
3,A3HV1HIUNZ61D,Marcella Wright,"Appliances, Parts & Accessories, Dryer Parts &...",Whirlpool 3406107 Door Switch for Dryer,Whirlpool,8.93
4,A3HV1HIUNZ61D,Marcella Wright,"Appliances, Parts & Accessories, Humidifier Pa...",Integra Boost Medium 8 Gram Humidity Pack 62% ...,Integra Boost,11.25


In [None]:
# Write the recommendations to disk without the row index.
recommendations_csv_path = 'recommendations.csv'
recommendations.to_csv(recommendations_csv_path, index=False)