In [1]:
import json
import numpy as np
import torch
import csv
from torch_geometric.data import Data
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import matplotlib.pyplot as plt




In [2]:
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

In [3]:
with open('singapore-keywords_train.json', 'r') as f:
    train_data = json.load(f)

keywords = list(train_data['np2count'].keys())

keyword_set = set(keywords)

In [4]:
def extract_users(info):
    l_user, user2kw = [], []
    for ii in info:
        lus = info[ii]
        for u in lus:
            if u not in l_user:
                l_user.append(u)
                user2kw.append([])
            idx = l_user.index(u)
            user2kw[idx].append(ii)
    return l_user, user2kw


In [5]:
train_users, train_users2kw = extract_users(train_data['np2users'])

In [6]:
restaurant_set = set()
listres = []
for kw in train_data['np2rests'].keys():
    listres.extend(train_data['np2rests'][kw].keys())
restaurant_set = set(listres)

keyword_set = list(keyword_set)
restaurant_set = list(restaurant_set)
restaurants = len(listres)
num_keywords = len(keyword_set)
num_restaurants = len(restaurant_set)
a = np.zeros((num_keywords, num_restaurants))

In [7]:
for kw in train_data['np2rests'].keys():
    for res in train_data['np2rests'][kw].keys():
        idx_kw = keyword_set.index(kw)
        idx_res = restaurant_set.index(res)
        a[idx_kw][idx_res] = 1

In [8]:
keyword_embeddings = model.encode(list(keyword_set))

In [9]:
with open('singapore-keywords_test.json', 'r') as r:
    test_data = json.load(r)

user_keywords = list(test_data['np2reviews'].keys())
user_keywords_list = list(user_keywords)

In [10]:
test_users, test_users2kw = extract_users(test_data['np2users'])

In [11]:
test_keywords = [kw for sublist in test_users2kw for kw in sublist]
test_keyword_embeddings = model.encode(test_keywords)

similarity_scores = cosine_similarity(test_keyword_embeddings, keyword_embeddings)


In [12]:
filtered_keywords = []
for i, user_kw in enumerate(test_users2kw):
    updated_user_kw = []
    for kw in user_kw:
        if kw not in keyword_set:
            test_idx = test_keywords.index(kw)
            sim_scores = similarity_scores[test_idx]

            best_match_idx = np.argmax(sim_scores)
            best_match_keyword = keyword_set[best_match_idx]

            updated_user_kw.append(best_match_keyword)
        else:
            updated_user_kw.append(kw)

    filtered_keywords.append(updated_user_kw)

test_users2kw = filtered_keywords

In [13]:
results = []
for kw in test_users2kw:
    t = np.zeros((1, len(keyword_set)))
    keywords = kw[:10]
    for keys in keywords:
        if keys in keyword_set:
            idx_kw = keyword_set.index(keys)
            t[0][idx_kw] = 1
    R = np.dot(t, a)
    result = np.argsort(R[0])[::-1][:10]
    results.append(result)

In [14]:
result

array([843, 730,  19, 743, 224, 279, 500, 238, 911, 554], dtype=int64)

In [18]:

if __name__ == "__main__":
        for i, result in enumerate(results):
            restaurant_names = [restaurant_set[idx] for idx in result]
            print(f"The result for user {i} is: {restaurant_names}")


The result for user 0 is: ['2tGFUtUrE0DhwlX59pbArA', 'fY1IkBnRft1KR0O2tqu7pg', 'MPzZuWpKpeLBa833zzL9IQ', 'izEBByeNB835I5makSbdew', 'fxQKJjAs7sayW3Zg45kE_g', 'SxT7tgTNxkVuk67XEl7p8g', '-vVXB-MBTTa8EUmJhDafQQ', 'J9rMt_V1NX49rU3YUjh_7Q', '69-vLz9AF84FsJmX3cpU9g', 'N8KDtnIu6x0uQ-JogBNeAg']
The result for user 1 is: ['n2CnBRKK82cWZ9a2OjS2xQ', 'vVqxGrqt5ALxQjJGnntpKQ', 'VEy3-SnvsKgWfGM_b2irbA', 'Rwzp59f3Ia-v2V8Ku0zZWQ', 'x44o1_Dw4QsQZRtBJgwZaA', '7LFdimtQU3yrgiKyE53klQ', '0W5UWAcjwGuCJOv8rXGMqw', 'lycfBKtcR7keDQQOeMMsoA', 'EJNkmXMDWurCPjOZe_WXAg', 'f6SN9YUSpb9AjS4hnFgyxg']
The result for user 2 is: ['_HMiq7vSET4uhNScBdOJnw', 'Rwzp59f3Ia-v2V8Ku0zZWQ', 'ymAw53bQVppVV7T40fNgRw', '_7nMWBmfFvk9S3tKG3Ip_g', 'LFknVw6uTIaL-MY_RqkN7g', 'J9rMt_V1NX49rU3YUjh_7Q', 'BYwoAKKtdSeylSN7QV8oNw', 'I1wArJIYbFl2txUL6unC6A', 'BFT_7kFOCq8FjoINnxycLQ', 'txwpCDw5Utu9WbM0VtYJqg']
The result for user 3 is: ['YnnYwfMEeSru_cJ_qKeofw', 'LgCFgXhmPU5pT38BqlVC5Q', 'WlT8rU9YYRjGSG-gX_hnCg', '7LFdimtQU3yrgiKyE53klQ', 'vVqxGrq

In [28]:
csv_file_path = "./result/results.csv"

with open(csv_file_path, mode="w", newline="") as file:
    fieldnames = ['number', 'user', 'restaurant_name']
    writer = csv.DictWriter(file, fieldnames=fieldnames)
    writer.writeheader()
    number = 1
    for user, restaurant_indices in zip(test_users, results):
        for idx in restaurant_indices:
            restaurant_name = restaurant_set[idx]
            writer.writerow({'number': number, 'user': user, 'restaurant_name': restaurant_name})
            number += 1

print(f"\nResults saved to: {csv_file_path}")


Results saved to: ./result/results.csv


In [19]:
restaurant_name = restaurant_set[idx]

NameError: name 'restaurant_name' is not defined