In [1]:
import os
from langchain_community.embeddings import GPT4AllEmbeddings
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS

embeddings = GPT4AllEmbeddings()
index = faiss.IndexFlatL2(len(embeddings.embed_query("hello world")))
vector_store = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={}
)

In [2]:
import os
import json
import numpy as np
import torch
from torch_geometric.data import Data
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import openai
import os
from dotenv import load_dotenv

load_dotenv()







True

In [3]:
from openai import OpenAI

In [None]:
client = OpenAI(
    # This is the default and can be omitted
    api_key = "",
)

def ask_gpt(messages: list[dict], model="gpt-4o") -> str:
    response = client.chat.completions.create(
        model=model,
        messages=messages,
        max_tokens=512
    )
    return response.choices[0].message.content

# 2) System prompts (in English)
sys_res = (
    "Re-rank the list of restaurants based on the user’s preferences. "
    "Return only the restaurant names, separated by commas."
)

sys_explain = (
    "You are an expert analyst. "
    "Based on the user’s preferences and the ranking you provided, "
    "please give a brief explanation of why you produced that result."
)

sys_key = "Return only the restaurant names, separated by commas."

In [5]:
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')



In [6]:
with open('edinburgh-keywords_train.json', 'r', encoding="utf-8") as f:
    train_data = json.load(f)

keywords = list(train_data['np2count'].keys())
keyword_set = set(keywords)

def extract_users(info):
    l_user, user2kw = [], []
    for ii in info:
        lus = info[ii]
        for u in lus:
            if u not in l_user:
                l_user.append(u)
                user2kw.append([])
            idx = l_user.index(u)
            user2kw[idx].append(ii)
    return l_user, user2kw


In [7]:
train_users, train_users2kw = extract_users(train_data['np2users'])

In [8]:
restaurant_set = set()
listres = []
for kw in train_data['np2rests'].keys():
    listres.extend(train_data['np2rests'][kw].keys())
restaurant_set = set(listres)

keyword_set = list(keyword_set)
restaurant_set = list(restaurant_set)
restaurants = len(listres)
num_keywords = len(keyword_set)
num_restaurants = len(restaurant_set)
a = np.zeros((num_keywords, num_restaurants))

for kw in train_data['np2rests'].keys():
    for res in train_data['np2rests'][kw].keys():
        idx_kw = keyword_set.index(kw)
        idx_res = restaurant_set.index(res)
        a[idx_kw][idx_res] = 1


In [9]:
keyword_embeddings = model.encode(list(keyword_set))

In [10]:
# Load dữ liệu test
with open('edinburgh-keywords_test.json', 'r', encoding="utf-8") as r:
    test_data = json.load(r)

user_keywords = list(test_data['np2reviews'].keys())
user_keywords_list = list(user_keywords)


In [11]:
test_users, test_users2kw = extract_users(test_data['np2users'])
test_keywords = [kw for sublist in test_users2kw for kw in sublist]
test_keyword_embeddings = model.encode(test_keywords)
similarity_scores = cosine_similarity(test_keyword_embeddings, keyword_embeddings)


In [12]:
filtered_keywords = []
for i, user_kw in enumerate(test_users2kw):
    updated_user_kw = []
    for kw in user_kw:
        if kw not in keyword_set:
            test_idx = test_keywords.index(kw)
            sim_scores = similarity_scores[test_idx]
            best_match_idx = np.argmax(sim_scores)
            best_match_keyword = keyword_set[best_match_idx]
            updated_user_kw.append(best_match_keyword)
        else:
            updated_user_kw.append(kw)
    filtered_keywords.append(updated_user_kw)

In [13]:
test_users2kw = filtered_keywords

In [14]:
results = []
for kw in test_users2kw:
    t = np.zeros((1, len(keyword_set)))
    keywords_subset = kw[:10]
    for key in keywords_subset:
        if key in keyword_set:
            idx_kw = keyword_set.index(key)
            t[0][idx_kw] = 1
    R = np.dot(t, a)
    result = np.argsort(R[0])[::-1][:10]
    results.append(result)

In [15]:
sys_res_en = (
    "Please re-rank the list of restaurants based on the user’s preferences. "
    "Return only the restaurant names, separated by commas."
)

# System prompt for explanation
sys_explain_en = (
    "You are an expert analyst. "
    "Based on the user’s preferences and the ranking you provided, "
    "please give a brief explanation of why you produced this ranking."
)

In [16]:
def GPT_re_rank_with_explanation_en(user_id, candidate_restaurants, user_keywords):
    # Ranking prompt
    prompt_rank = (
        f"The user’s preferences are: {', '.join(user_keywords[:5])}.\n"
        f"Here are the candidate restaurants: {', '.join(candidate_restaurants[:5])}.\n"
        "Please re-rank these restaurants by how well they match the user’s preferences, "
        "and return only the names, separated by commas."
    )
    messages_rank = [
        {"role": "system", "content": sys_res_en},
        {"role": "user",   "content": prompt_rank},
    ]
    ranked_text = ask_gpt(messages_rank)
    re_ranked = [r.strip() for r in ranked_text.split(",") if r.strip() in candidate_restaurants]
    if not re_ranked:
        re_ranked = candidate_restaurants

    # Explanation prompt
    prompt_explain = (
        f"You have ranked the restaurants in this order: {', '.join(re_ranked)}.\n"
        f"The user’s preferences are: {', '.join(user_keywords[:5])}.\n"
        "Please provide a concise explanation of why this ranking is most appropriate."
    )
    messages_explain = [
        {"role": "system", "content": sys_explain_en},
        {"role": "user",   "content": prompt_explain},
    ]
    explanation = ask_gpt(messages_explain)

    return re_ranked, explanation

# Example usage:
final_results = []
final_explanations = []
for idx, (user, candidate_indices) in enumerate(zip(test_users, results)):
    candidate_restaurants = [restaurant_set[i] for i in candidate_indices]
    user_kw = test_users2kw[idx]
    ranked, reason = GPT_re_rank_with_explanation_en(user, candidate_restaurants, user_kw)
    final_results.append(ranked)
    final_explanations.append(reason)
    print(f"User {user} → Ranking: {ranked}")
    print(f"Explanation: {reason}\n")

User yfXqZkU5iXE07GSHzdsQBA → Ranking: ['xbCNlp2vWixy2W1WsfGv8g', 'QMySP-qHdwQNZY0gDba0ng', '8PmbCVEjWGPouD1Tst6FsQ', 'EKEPtNKJdZxRhYoFhuG8ew', 'V0tRCjf95FTqVJvmxdeqbQ']
Explanation: The ranking of the restaurants was determined based on how well each one aligns with the user's preferences. Here is a brief explanation for the order:

1. **xbCNlp2vWixy2W1WsfGv8g**: This restaurant likely offers a menu that greatly appeals to those dining with flatmates, featuring shareable options and a variety of dishes to cater to diverse tastes. Combined with a convenient location or services like delivery ("door"), it stands out strongly among the options.

2. **QMySP-qHdwQNZY0gDba0ng**: Known for an impressive menu and quality food, this restaurant ranks high due to its appeal to food enthusiasts. It is also likely accessible, making it an easy choice for a group visit with flatmates.

3. **8PmbCVEjWGPouD1Tst6FsQ**: This option likely has a solid menu with enough variety to satisfy the user and the

In [17]:
# Hàm trả về kết quả dạng dictionary để sử dụng cho hệ thống
def generate_results(test_users, results, test_users2kw, restaurant_set, re_ranked):
    output_data = {}
    for idx, (user, restaurant_indices) in enumerate(zip(test_users, results)):
        user_data = {}
        user_keywords = test_users2kw[idx]
        candidate_restaurants = [restaurant_set[i] for i in restaurant_indices]
        re_ranked_restaurants = re_ranked[idx]
        positions = [str(i) for i in restaurant_indices]
        user_data["kw"] = user_keywords[:5]
        user_data["candidate"] = re_ranked_restaurants[:5]
        user_data["positions"] = positions[:5]
        output_data[user] = user_data
    return output_data 

result_dict = generate_results(test_users, results, test_users2kw, restaurant_set, final_results)
print(result_dict)

{'yfXqZkU5iXE07GSHzdsQBA': {'kw': ['flatmates', 'food', 'while', 'menu', 'door'], 'candidate': ['xbCNlp2vWixy2W1WsfGv8g', 'QMySP-qHdwQNZY0gDba0ng', '8PmbCVEjWGPouD1Tst6FsQ', 'EKEPtNKJdZxRhYoFhuG8ew', 'V0tRCjf95FTqVJvmxdeqbQ'], 'positions': ['291', '612', '911', '814', '826']}, 'hutJzKEYHuVq6CP-XSARgg': {'kw': ['flatmates', 'food', 'while', 'menu', 'fried rice'], 'candidate': ['5Iin0tl6QJT_TyL1BldtyA', 'aIs7Tt3pw6ymj3FOPPR4Ng', 'adI5BlJCtvWq2uAwmyndXA', 'hJNjKc4sKgWG__KitlJ1ag', 'sxuEde0_sVFY-Uk9gpHy_g'], 'positions': ['637', '509', '647', '692', '214']}, '2UkZKQBZVuroUBKYs9WzeQ': {'kw': ['food', 'menu', 'minutes', 'wait', 'restaurant'], 'candidate': ['I1j1kEbQ5BliKsjzl5etVA', 'Q0fcX_1wvdmffqEPa246rg', 'xbCNlp2vWixy2W1WsfGv8g', 'J_KI8rJa_dSwTglUbwFcdw', 'BmrVL3Aj5usPfpu5DE1iCg'], 'positions': ['335', '140', '612', '567', '600']}, 'hihNuqYNKDwAwHeGNURE_g': {'kw': ['food', 'spicy sauce', 'meal', 'more time', 'queen'], 'candidate': ['I1j1kEbQ5BliKsjzl5etVA', 'Q0fcX_1wvdmffqEPa246rg', 'ZOyB

In [46]:
# Hàm lưu kết quả vào file JSON để kiểm tra nhanh
def save_rerank_results_to_json(test_users, results, test_users2kw, restaurant_set, re_ranked, file_path='./data/Output_GPT(re_rank).json'):
    output_data = {}
    for idx, (user, restaurant_indices) in enumerate(zip(test_users, results)):
        user_data = {}  
        user_keywords = test_users2kw[idx]
        candidate_restaurants = [restaurant_set[i] for i in restaurant_indices]
        re_ranked_restaurants = re_ranked[idx]
        positions = [str(i) for i in restaurant_indices]
        user_data["kw"] = user_keywords[:10]
        user_data["candidate"] = re_ranked_restaurants[:10]
        user_data["positions"] = positions[:10]
        output_data[user] = user_data
    with open(file_path, mode="w", encoding="utf-8") as json_file:
        json.dump(output_data, json_file, ensure_ascii=False, indent=4)
    print(f"Results saved to: {file_path}")

save_rerank_results_to_json(test_users, results, test_users2kw, restaurant_set, final_results)


Results saved to: ./data/Output_GPT(re_rank).json


In [18]:
def generate_full_results(test_users, results, test_users2kw, restaurant_set, re_ranked, explanations):
    output_data = {}
    for idx, user in enumerate(test_users):
        kws = test_users2kw[idx][:5]
        cand = re_ranked[idx][:5]
        # orig = [restaurant_set[i] for i in results[idx]][:5]
        exp = explanations[idx]
        output_data[user] = {
            "kw": kws,
            "candidate": cand,
            "explanation": exp
        }
    return output_data

generate_full_results(test_users, results, test_users2kw, restaurant_set, final_results, final_explanations)

{'yfXqZkU5iXE07GSHzdsQBA': {'kw': ['flatmates',
   'food',
   'while',
   'menu',
   'door'],
  'candidate': ['xbCNlp2vWixy2W1WsfGv8g',
   'QMySP-qHdwQNZY0gDba0ng',
   '8PmbCVEjWGPouD1Tst6FsQ',
   'EKEPtNKJdZxRhYoFhuG8ew',
   'V0tRCjf95FTqVJvmxdeqbQ'],
  'explanation': 'The ranking of the restaurants was determined based on how well each one aligns with the user\'s preferences. Here is a brief explanation for the order:\n\n1. **xbCNlp2vWixy2W1WsfGv8g**: This restaurant likely offers a menu that greatly appeals to those dining with flatmates, featuring shareable options and a variety of dishes to cater to diverse tastes. Combined with a convenient location or services like delivery ("door"), it stands out strongly among the options.\n\n2. **QMySP-qHdwQNZY0gDba0ng**: Known for an impressive menu and quality food, this restaurant ranks high due to its appeal to food enthusiasts. It is also likely accessible, making it an easy choice for a group visit with flatmates.\n\n3. **8PmbCVEjWGPouD

In [None]:
def save_full_results_to_json(test_users, results, test_users2kw, restaurant_set, re_ranked, explanations, 
                              file_path='./data/Singapore_.json'):
    output_data = generate_full_results(
        test_users, results, test_users2kw, restaurant_set, re_ranked, explanations
    )
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    with open(file_path, mode="w", encoding="utf-8") as f:
        json.dump(output_data, f, ensure_ascii=False, indent=4)
    print(f"Full results saved to: {file_path}")

save_full_results_to_json(test_users, results, test_users2kw, restaurant_set, final_results, final_explanations)



Full results saved to: ./data/Output_GPT(explanation).json


In [None]:
recommended_df = pd.DataFrame({
    "user": test_users,
    "restaurant_names": [", ".join(ranks[:10]) for ranks in final_results]
})

In [None]:
#Extract groundtruth from test data
user_ground_truth = {}
np2rests = test_data['np2rests']
np2users = test_data['np2users']
for keyword, restaurants in np2rests.items():
    users = np2users.get(keyword, [])
    for user in users:
        user_ground_truth.setdefault(user, set()).update(restaurants.keys())

In [None]:
user_ground_truth = {u: list(v) for u, v in user_ground_truth.items()}

In [None]:
def precision_recall_at_k(actual, recommended, k):
    recommended_at_k = recommended[:k]
    hits = len(set(actual) & set(recommended_at_k))
    precision = hits / k    
    recall = hits / len(actual) if actual else 0.0
    return precision, recall

def average_precision(actual, recommended, k):
    recommended_at_k = recommended[:k]
    score = 0.0
    hits = 0
    for i, r in enumerate(recommended_at_k):
        if r in actual:
            hits += 1
            score += hits / (i + 1)
    return score / min(len(actual), k) if actual else 0.0

NameError: name 'pd' is not defined

In [None]:
k_values = [5, 10, 20]
precision_scores = {k: [] for k in k_values}
recall_scores    = {k: [] for k in k_values}
avg_precision_scores = []

for _, row in recommended_df.iterrows():
    user = row["user"]
    recommended = row["restaurant_names"].split(", ")
    actual = user_ground_truth.get(user, [])
    
    for k in k_values:
        p, r = precision_recall_at_k(actual, recommended, k)
        precision_scores[k].append(p)
        recall_scores[k].append(r)
    
    # MAP@20
    ap20 = average_precision(actual, recommended, 20)
    avg_precision_scores.append(ap20)

print("Recommendation Quality")
for k in k_values:
    print(f"Precision@{k}: {np.mean(precision_scores[k]):.4f}")
    print(f"Recall@{k}:    {np.mean(recall_scores[k]):.4f}")
MAP = np.mean(avg_precision_scores)
print(f"Mean Average Precision (MAP@20): {MAP:.4f}")

unique_recommended = set()
for names in recommended_df["restaurant_names"]:
    unique_recommended.update(names.split(", "))

all_restaurants = set().union(*[set(d.keys()) for d in test_data['np2rests'].values()])
coverage = len(unique_recommended) / len(all_restaurants) * 100
print(f"Recommendation Coverage: {coverage:.2f}%")