<a href="https://colab.research.google.com/github/Mancupfire/Retrieval_Model/blob/main/Retrieval_Model(sBERT).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install torch_geometric
!pip install transformers

Collecting torch_geometric
  Downloading torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/63.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.6.1-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m29.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch_geometric
Successfully installed torch_geometric-2.6.1


In [28]:
import json
import numpy as np
import torch
from torch_geometric.data import Data
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

In [29]:
# Load Sentence-BERT model
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

In [30]:
# Đọc dữ liệu huấn luyện
with open('edinburgh-keywords_train.json', 'r') as f:
    train_data = json.load(f)

keywords = list(train_data['np2count'].keys())

In [31]:
# Loại bỏ những từ bị trùng
keyword_set = set(keywords)

In [32]:
keywords[:10]

['good',
 'chinese takeaway',
 'ribs',
 'great starters',
 'mains',
 'kung',
 'decent portion',
 'way',
 'thai food',
 'edinburgh']

In [33]:
def extract_users(info):
    l_user, user2kw = [], []
    for ii in info:
        lus = info[ii]
        for u in lus:
            if u not in l_user:
                l_user.append(u)
                user2kw.append([])
            idx = l_user.index(u)
            user2kw[idx].append(ii)
    return l_user, user2kw

In [34]:
train_users, train_users2kw = extract_users(train_data['np2users'])

In [35]:
restaurant_set = set()
listres = []
for kw in train_data['np2rests'].keys():
    listres.extend(train_data['np2rests'][kw].keys())
restaurant_set = set(listres)

In [36]:
# Tạo ma trận liên kết từ keywords và restaurant
keyword_set = list(keyword_set)
restaurant_set = list(restaurant_set)
restaurants = len(listres)
num_keywords = len(keyword_set)
num_restaurants = len(restaurant_set)
a = np.zeros((num_keywords, num_restaurants))

for kw in train_data['np2rests'].keys():
    for res in train_data['np2rests'][kw].keys():
        idx_kw = keyword_set.index(kw)
        idx_res = restaurant_set.index(res)
        a[idx_kw][idx_res] = 1

In [39]:
# Mã hóa các từ khóa trong tập huấn luyện
keyword_embeddings = model.encode(list(keyword_set))

In [41]:
# Đọc dữ liệu kiểm tra
with open('edinburgh-keywords_test.json', 'r') as r:
    test_data = json.load(r)

user_keywords = list(test_data['np2reviews'].keys())
user_keywords_list = list(user_keywords)

In [43]:
test_users, test_users2kw = extract_users(test_data['np2users'])

In [44]:
# Mã hóa các từ khóa trong tập kiểm tra
test_keywords = [kw for sublist in test_users2kw for kw in sublist]
test_keyword_embeddings = model.encode(test_keywords)

In [45]:
# Tính toán độ tương đồng giữa từ khóa kiểm tra và từ khóa huấn luyện
similarity_scores = cosine_similarity(test_keyword_embeddings, keyword_embeddings)


In [47]:
filtered_keywords = []
for i, user_kw in enumerate(test_users2kw):
    updated_user_kw = []
    for kw in user_kw:
        if kw not in keyword_set:
            # Lấy vị trí của test keyword trong ma trận similarity_scores
            test_idx = test_keywords.index(kw)
            sim_scores = similarity_scores[test_idx]

            # Tìm keyword trong traindata có cosine similarity lớn nhất
            best_match_idx = np.argmax(sim_scores)
            best_match_keyword = keyword_set[best_match_idx]

            # Thay thế nhau
            updated_user_kw.append(best_match_keyword)
        else:
            updated_user_kw.append(kw)

In [49]:
# Cập nhật danh sách từ khóa người dùng
filtered_keywords.append(updated_user_kw)

In [51]:
# Cập nhật lại test_users2kw sau khi thay thế
test_users2kw = filtered_keywords

In [52]:
for kw in test_users2kw:
    t = np.zeros((1, len(keyword_set)))
    keywords = kw[:10]
    for keys in keywords:
        if keys in keyword_set:
            idx_kw = keyword_set.index(keys)
            t[0][idx_kw] = 1
    R = np.dot(t, a)
    result = np.argsort(R[0])[::-1][:10]

In [53]:
result

array([399, 887, 208,  80, 740, 159, 564, 534, 171, 436])

In [55]:
def precision_at_k(predictions, ground_truth, k):
    """
    Calculate Precision@k for a single user.
    """
    relevant_items = set(ground_truth)
    top_k_predictions = set(predictions[:k])
    return len(relevant_items.intersection(top_k_predictions)) / k

def recall_at_k(predictions, ground_truth, k):
    """
    Calculate Recall@k for a single user.
    """
    relevant_items = set(ground_truth)
    top_k_predictions = set(predictions[:k])
    return len(relevant_items.intersection(top_k_predictions)) / len(relevant_items)

def f1_at_k(predictions, ground_truth, k):
    """
    Calculate F1@k for a single user.
    """
    precision = precision_at_k(predictions, ground_truth, k)
    recall = recall_at_k(predictions, ground_truth, k)
    if precision + recall == 0:
        return 0
    return 2 * (precision * recall) / (precision + recall)

In [57]:
k = 10
precision_scores = []
recall_scores = []
f1_scores = []

for i, user_kw_list in enumerate(test_users2kw):
    ground_truth = set()
    for user_kw in user_kw_list:
        ground_truth.update(test_data['np2rests'].get(user_kw, {}).keys())

    if not ground_truth:
        continue

    # Generate ra top-k recommendations
    t = np.zeros((1, len(keyword_set)))
    keywords = user_kw_list[:10]  # Top 10 keywords cho user
    for keys in keywords:
        if keys in keyword_set:
            idx_kw = keyword_set.index(keys)
            t[0][idx_kw] = 1

    R = np.dot(t, a)
    result = np.argsort(R[0])[::-1][:k]

    predicted_restaurants = [restaurant_set[idx] for idx in result]

    # Evaluate metrics
    precision_scores.append(precision_at_k(predicted_restaurants, ground_truth, k))
    recall_scores.append(recall_at_k(predicted_restaurants, ground_truth, k))
    f1_scores.append(f1_at_k(predicted_restaurants, ground_truth, k))

avg_precision = np.mean(precision_scores)
avg_recall = np.mean(recall_scores)
avg_f1 = np.mean(f1_scores)

print(f"Precision@{k}: {avg_precision:.4f}")
print(f"Recall@{k}: {avg_recall:.4f}")
print(f"F1@{k}: {avg_f1:.4f}")

Precision@10: 0.6000
Recall@10: 0.0228
F1@10: 0.0440
