In [2]:
from langchain_community.embeddings import GPT4AllEmbeddings
from langchain_community.vectorstores import FAISS
import json
import numpy as np


In [26]:
with open('edinburgh-keywords_train.json','r',encoding='utf-8') as f:
    train_data = json.load(f)
with open('edinburgh-keywords_test.json','r',encoding='utf-8') as f:
    test_data = json.load(f)

In [27]:
listres = []
for kw, rests in train_data['np2rests'].items():
    listres.extend(rests.keys())
restaurant_set = list(dict.fromkeys(listres))  # preserve order, unique

embeddings = GPT4AllEmbeddings()      # implements embed_query & embed_documents
vector_store = FAISS.from_texts(
    texts=restaurant_set,
    embedding=embeddings,
)


In [28]:
def extract_users(info):
    users, map2kw = [], []
    for kw, us in info.items():
        for u in us:
            if u not in users:
                users.  append(u)
                map2kw.append([])
            map2kw[users.index(u)].append(kw)
    return users, map2kw


In [29]:
test_users, test_users2kw = extract_users(test_data['np2users'])

In [30]:
top_k = 5
all_recs = []
for user_kw_list in test_users2kw:
    # only keep first 10 keywords (you did this before)
    query = " ".join(user_kw_list[:10])
    docs = vector_store.similarity_search(query, k=top_k)
    # docs is a list of langchain.schema.Document
    recs = [d.page_content for d in docs]
    all_recs.append(recs)


In [31]:
for user, recs in zip(test_users, all_recs):
    print(f"User {user!r} → recommendations:", recs)

User 'yfXqZkU5iXE07GSHzdsQBA' → recommendations: ['MPHG8OrZLbwzHoumajRyFA', 'htZylXMCqWb5PWaW_hdkGQ', 'ui99AtCcTZRCHEfNi1mYew', 'YFZtDqYdz00E1RlI_Vo89w', '5qc4qIXNh7_nbtYhNEn1lQ']
User 'hutJzKEYHuVq6CP-XSARgg' → recommendations: ['2nQ2_o5pPUMByU86bDjOeg', 'MPHG8OrZLbwzHoumajRyFA', 'ntomYrGHGViYGBFmsBGRpQ', 'taqRnVCqoQmiDuKp4E6FkA', 'TDgV70xYZOR5CHOqlOIYyQ']
User '2UkZKQBZVuroUBKYs9WzeQ' → recommendations: ['ntomYrGHGViYGBFmsBGRpQ', 'taqRnVCqoQmiDuKp4E6FkA', '8PmbCVEjWGPouD1Tst6FsQ', 'Iexv0Un2MfGEjHk8ejL8Ww', 'rdRNl2QPoIbMwX9DREXYWA']
User 'hihNuqYNKDwAwHeGNURE_g' → recommendations: ['8PmbCVEjWGPouD1Tst6FsQ', 'yHWNPxccm4T3PV3mHTQGJw', 'ntomYrGHGViYGBFmsBGRpQ', 'cBHMUESPj4SNs65Xv6xWRA', 'SpTBdWHuqpX-fXpPANHsJA']
User 'fmpcYRlirLlHuH_R7U-mew' → recommendations: ['taqRnVCqoQmiDuKp4E6FkA', '5qc4qIXNh7_nbtYhNEn1lQ', 'qu840hf1-I08p3Lcvdycyw', '4LlSn5JnHiiBkQFRYMHqXA', 'PJnLYU26wYp6TedhQ8PF8Q']
User 'B3DNjsZ3gRRsWYro4auztg' → recommendations: ['JNTZInpqCdpdliedN9bpcQ', 'rQ6dpYJSZ8zPeBoV5nPvUQ'

In [32]:
from typing import List, Dict

def get_recs_with_positions(
    vector_store,
    query: str,
    embedding_fn,
    top_k: int = 10
) -> (List[str], List[str]):

    q_vec = np.array([embedding_fn.embed_query(query)], dtype="float32")
    # 2) search the raw faiss index
    distances, indices = vector_store.index.search(q_vec, top_k)
    indices = indices[0].tolist()
    
    candidates = []
    positions = []
    for idx in indices:
        # map FAISS index → docstore_id → Document → .page_content
        doc_id = vector_store.index_to_docstore_id[idx]
        doc = vector_store.docstore.search(doc_id)
        candidates.append(doc.page_content)
        positions.append(str(idx))
    
    return candidates, positions

def build_user_rec_dict(
    users: List[str],
    users2kw: List[List[str]],
    vector_store,
    embedding_fn,
    top_k: int = 10
) -> Dict[str, Dict]:

    recs = {}
    for user, kw_list in zip(users, users2kw):
        top_kw = kw_list[:10]
        query = " ".join(top_kw)
        candidates, positions = get_recs_with_positions(
            vector_store, query, embedding_fn, top_k=top_k
        )
        recs[user] = {
            "kw":        top_kw,
            "candidate": candidates,
            "positions": positions
        }
    return recs

rec_dict = build_user_rec_dict(
    test_users,
    test_users2kw,
    vector_store,
    embeddings,
    top_k=10
)

print(json.dumps(rec_dict, indent=2, ensure_ascii=False))

{
  "yfXqZkU5iXE07GSHzdsQBA": {
    "kw": [
      "flatmates",
      "food",
      "while",
      "menu",
      "door",
      "fresher week",
      "hall",
      "usual range",
      "chinese classic dishes",
      "crispy chilli shredded beef"
    ],
    "candidate": [
      "MPHG8OrZLbwzHoumajRyFA",
      "htZylXMCqWb5PWaW_hdkGQ",
      "ui99AtCcTZRCHEfNi1mYew",
      "YFZtDqYdz00E1RlI_Vo89w",
      "5qc4qIXNh7_nbtYhNEn1lQ",
      "Nu1yhfnJxvdHW70r8FNvrg",
      "b8-lt7cnQpcvHM6Qre-WZA",
      "7ALAn4YYj9n4UQXaXrCaIg",
      "XFC0k6QbLstWAJ8_ppIVYw",
      "YmCN2ZdfCQ5InqT8KaA3lw"
    ],
    "positions": [
      "915",
      "205",
      "108",
      "523",
      "826",
      "190",
      "53",
      "916",
      "220",
      "842"
    ]
  },
  "hutJzKEYHuVq6CP-XSARgg": {
    "kw": [
      "flatmates",
      "food",
      "while",
      "menu",
      "fried rice",
      "prawn crackers",
      "minutes",
      "good quality",
      "bit",
      "wait"
    ],
    "candidate": [
      

In [33]:
output_path = "LangChain_re_ranking(Edinburgh).json"
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(rec_dict, f, ensure_ascii=False, indent=2)

print(f"Saved recommendations to {output_path}")

Saved recommendations to LangChain_re_ranking(Edinburgh).json
