In [1]:
import os
import re

import torch
import numpy as np
from golemai.nlp.llm_resp_gen import LLMRespGen
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment.
load_dotenv()
# NOTE(review): hard-coded, user-specific cache path. Hugging Face libraries may read
# HF_HOME when they are first imported, so setting it after the imports above might
# not take effect — TODO confirm and consider setting it before those imports.
os.environ['HF_HOME'] = '/net/tscratch/people/plgkonkie311/cache/'

In [2]:
cd spatial

/net/tscratch/people/plgkonkie311/spatial


In [3]:
from prompts import PROMPT_NER_DISTANCE_WHERE, PROMPT_FIND_LATITUDE_LONGITUDE, PROMPT_END

In [4]:
EMBEDDER_ID="intfloat/multilingual-e5-large-instruct"

In [5]:
# Read the CLARIN API key from the environment rather than hard-coding it.
# NOTE(review): load_dotenv() was already called in the first cell, so this second
# call looks redundant.
load_dotenv()
# os.getenv returns None when CLARIN_KEY is not set.
api_key = os.getenv("CLARIN_KEY")

In [6]:
# Shared LLM client used by every helper function in this notebook.
# The prompt template starts out empty; each helper installs its own template
# through llm_rg.set_prompt_template(...) before generating.
llm_rg = LLMRespGen(
    id_col='id',
    model_type='api',
    system_msg='You are a helpful assistant.',
    prompt_template='',
    batch_size=1,
    api_url='https://services.clarin-pl.eu/api/v1/oapi',
    api_key=api_key,
    timeout=200,  # presumably seconds — TODO confirm against LLMRespGen
).set_generation_config(
    model_id='mixtral-8x22B',

)


Model type is 'api'. Setting generation config for API.


In [7]:
# Embedding model used to embed queries against the FAISS index.
# NOTE(review): presumably the same model the index was built with — verify.
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDER_ID)

# Load the persisted FAISS index stored under 'all_wroclaw_new'.
# SECURITY: allow_dangerous_deserialization=True permits pickle loading, which can
# execute arbitrary code; only load index files you created or fully trust.
location_vector_store = FAISS.load_local(
    'all_wroclaw_new',
    embeddings,
    allow_dangerous_deserialization=True,
)

2025-01-27 14:37:14.664397: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1737985034.788559 3445372 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1737985034.850351 3445372 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-27 14:37:15.117012: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [8]:
import textwrap
TEXT_WRAP_WIDTH = 120

def generate_response(QUERY, verbose=False, **kwargs):
    """Build a prompt for ``QUERY`` with the shared ``llm_rg`` client and return the LLM reply.

    Args:
        QUERY: Value passed as ``query`` to ``llm_rg.prepare_prompt``.
        verbose: When true, print the prepared prompt, a blank line and then the
            reply, each wrapped to ``TEXT_WRAP_WIDTH`` columns.
        **kwargs: Extra keyword arguments forwarded unchanged to
            ``llm_rg.prepare_prompt`` (callers in this notebook pass ``documents``).

    Returns:
        Whatever ``llm_rg._generate_llm_response`` returns for the prepared prompt.
    """

    def _echo(text):
        # textwrap.fill(text, width) is "\n".join(textwrap.wrap(text, width)).
        print(textwrap.fill(text, width=TEXT_WRAP_WIDTH))

    rendered_prompt = llm_rg.prepare_prompt(query=QUERY, **kwargs)
    if verbose:
        _echo(rendered_prompt)
        print()

    llm_reply = llm_rg._generate_llm_response(inputs=rendered_prompt)
    if verbose:
        _echo(llm_reply)

    return llm_reply

def extract_entities_from_documents(query, documents):
    """Ask the LLM to extract the location, place type and distance from ``query``.

    The reply is expected to contain the fields `LOCATION_VECTOR_DB_PROMPT`,
    `PLACE_VECTOR_DB_PROMPT` and `DISTANCE`, in that order, each followed by a colon.

    Args:
        query: The user's natural-language request.
        documents: Unused; kept so existing callers keep working.

    Returns:
        A ``(location, place, distance)`` tuple. ``location`` and ``place`` are the
        stripped text of their fields. ``distance`` is a ``float`` when the field
        holds a plain non-negative number (integer or decimal, optional "km"
        suffix); otherwise it is the raw stripped text (e.g. "near"), which callers
        must handle before using it as a radius.
    """
    llm_rg.set_prompt_template(PROMPT_NER_DISTANCE_WHERE)

    result = generate_response(query, verbose=True)

    # Each field runs from its own marker up to the next marker in the reply.
    location = result.split("`LOCATION_VECTOR_DB_PROMPT`:")[-1].split("`PLACE_VECTOR_DB_PROMPT`")[0].strip()
    place = result.split("`PLACE_VECTOR_DB_PROMPT`:")[-1].split("`DISTANCE`")[0].strip()
    distance = result.split("`DISTANCE`:")[-1].split("km")[0].strip()

    # str.isdigit() rejected decimals such as "1.5", leaving a string that later
    # broke the radius arithmetic; accept integers and decimals alike.
    if re.fullmatch(r"\d+(?:\.\d+)?", distance):
        distance = float(distance)

    return location, place, distance


def extract_lat_lon_from_documents(query, documents):
    """Ask the LLM for the coordinates of the queried location and parse them.

    The reply is expected to contain `LATITUDE`: and `LONGITUDE`: fields. The first
    number following each marker is used, so trailing punctuation or explanatory
    text after a value no longer breaks parsing.

    Args:
        query: Description of the location to resolve.
        documents: Text of the candidate documents, passed to the prompt as
            ``documents``.

    Returns:
        A ``(latitude, longitude)`` tuple of floats.

    Raises:
        ValueError: If the reply lacks either marker, or no number follows one.
    """
    llm_rg.set_prompt_template(PROMPT_FIND_LATITUDE_LONGITUDE)

    result = generate_response(query, documents=documents, verbose=True)

    if "`LATITUDE`:" not in result or "`LONGITUDE`:" not in result:
        raise ValueError(f"LLM response lacks `LATITUDE`/`LONGITUDE` fields: {result!r}")

    # Text following each marker; the latitude part stops at the longitude marker.
    sections = {
        "latitude": result.split("`LATITUDE`:")[-1].split("`LONGITUDE`")[0],
        "longitude": result.split("`LONGITUDE`:")[-1],
    }

    number = re.compile(r"[-+]?(?:\d+\.?\d*|\.\d+)")
    parsed = {}
    for name, text in sections.items():
        found = number.search(text)
        if found is None:
            raise ValueError(f"No numeric {name} found in LLM response: {result!r}")
        parsed[name] = float(found.group())

    return parsed["latitude"], parsed["longitude"]

def summarize_results(query, documents):
    """Produce the final LLM answer for ``query`` based on the retrieved ``documents``.

    Installs ``PROMPT_END`` as the active prompt template on the shared ``llm_rg``
    client, then generates verbosely (prompt and reply are printed).

    Args:
        query: The user's original request.
        documents: Text of the retrieved documents, passed to the prompt as
            ``documents``.

    Returns:
        The value returned by ``generate_response``.
    """
    llm_rg.set_prompt_template(PROMPT_END)
    return generate_response(query, documents=documents, verbose=True)

def documents_to_string(results):
    """Render retrieved documents as a numbered, newline-separated text listing.

    Args:
        results: Iterable of objects exposing ``metadata`` and ``page_content``.

    Returns:
        One entry per document, numbered from 0, in the form
        ``"<i>. Metadata: <metadata>"`` followed by an indented ``"Rest: <page_content>"``
        line. An empty input yields an empty string.
    """
    entries = []
    for position, document in enumerate(results):
        entries.append(
            f"{position}. Metadata: {document.metadata}\n   Rest: {document.page_content}"
        )
    return "\n".join(entries)


def generate_square_filter(center, radius_km=5, amenity=None):
    """Build a metadata filter selecting entries inside a box around ``center``.

    Args:
        center: ``(latitude, longitude)`` pair in degrees.
        radius_km: Half the side length of the box, in kilometres.
        amenity: Optional value; when truthy, an exact-match condition on the
            ``amenity`` metadata field is added.

    Returns:
        A dict of the form ``{"$and": [...]}`` holding four range conditions
        (min/max latitude, min/max longitude) plus the optional amenity condition.
        The latitude key is spelled ``"lattitude"`` to match the metadata stored
        with the documents.
    """
    lat, lon = center

    # Approximate kilometres spanned by one degree; a degree of longitude shrinks
    # with the cosine of the latitude.
    km_per_lat_degree = 110.574
    km_per_lon_degree = 111.320 * np.cos(np.radians(lat))

    lat_delta = radius_km / km_per_lat_degree
    lon_delta = radius_km / km_per_lon_degree

    conditions = [
        {"lattitude": {"$gte": lat - lat_delta}},  # southern edge
        {"lattitude": {"$lte": lat + lat_delta}},  # northern edge
        {"longitude": {"$gte": lon - lon_delta}},  # western edge
        {"longitude": {"$lte": lon + lon_delta}},  # eastern edge
    ]
    if amenity:
        conditions.append({"amenity": amenity})

    return {"$and": conditions}

In [9]:
# QUERY = "Find me a Polish restaurant near ulica Legnicka 65, Wrocław"

QUERY = "Find me a place where i can buy a good coffee near to plac Grunwaldzki, Wrocław"

In [10]:
location, place, distance = extract_entities_from_documents(QUERY, None)

 Please extract the following fields from the user's query:  1. `LOCATION_VECTOR_DB_PROMPT`: Provide a representation of
the location mentioned in the user's query. This should correspond to a specific place or landmark (e.g., "Dworzec
Wrocław Główny"). It should have been extracted in OSM format, e.g., 'amenity': 'restaurant', 'cuisine': 'polish'. if
you can find address also pass it with prefix addr:city, addr:street, addr:housenumber, addr:postcode.     2.
`PLACE_VECTOR_DB_PROMPT`: Provide a representation of the type of place or business mentioned in the query (e.g.,
"restaurant", "hotel", etc.). In the given example, it would be a "Polish restaurant". It should have been extracted in
OSM format, e.g., 'amenity': 'restaurant', 'cuisine': 'polish'.  3. `DISTANCE`: If the query contains information about
the proximity or distance, either explicitly or implicitly, include it. This could be a specific distance (e.g., "5 km
away") or a general description (e.g., "near"). If available if

In [11]:
location, place, distance

("'name': 'plac Grunwaldzki', 'description': 'square', 'addr:city': 'Wrocław'",
 "'amenity': 'cafe'",
 1.0)

In [12]:
# Retrieve the five stored places most similar to the extracted location description;
# their metadata supplies candidate coordinates for the reference point.
results = location_vector_store.similarity_search(
    location,
    k=5,
)

results

[Document(id='660d15c3-d6b9-4427-b02e-511770e4fce4', metadata={'amenity': 'cafe', 'name': 'Gorąco Polecam', 'address': 'plac Grunwaldzki 22, Wrocław', 'lattitude': 51.1129748, 'longitude': 17.0596745}, page_content='{"addr:city": "Wroc\\u0142aw", "addr:housenumber": "22", "addr:postcode": "50-363", "addr:street": "plac Grunwaldzki", "amenity": "cafe", "level": "0", "name": "Gor\\u0105co Polecam", "shop": "bakery"}'),
 Document(id='438b9138-5484-4c38-976b-06bf46a0deee', metadata={'amenity': 'restaurant', 'name': 'Bulvarova', 'address': 'Rybacka 23, Wrocław', 'lattitude': 51.1155595, 'longitude': 17.0184947}, page_content='{"addr:city": "Wroc\\u0142aw", "addr:housenumber": "23", "addr:postcode": "53-656", "addr:street": "Rybacka", "amenity": "restaurant", "cuisine": "pizza;international", "name": "Bulvarova", "website": "https://bulvarova.pl"}'),
 Document(id='270bbfb7-7410-4f57-85ed-cf303a1904fe', metadata={'amenity': 'cafe', 'name': 'Noon', 'address': 'Henryka Sienkiewicza 20B, Wrocław

In [13]:
# Flatten the retrieved documents into a numbered text listing for the prompt.
documents = documents_to_string(results)


# Let the LLM pick the reference point's coordinates out of the document listing.
latitude, longitude = extract_lat_lon_from_documents(location, documents)

 From provided Documents, find the latitude and longitude of the location mentioned in the user's query. If it is not
explicitly mentioned, try to extract it from the metadata of the documents. It can be near the location mentioned in the
query.  QUERY: "'name': 'plac Grunwaldzki', 'description': 'square', 'addr:city': 'Wrocław'"  DOCUMENTS: 0. Metadata:
{'amenity': 'cafe', 'name': 'Gorąco Polecam', 'address': 'plac Grunwaldzki 22, Wrocław', 'lattitude': 51.1129748,
'longitude': 17.0596745}    Rest: {"addr:city": "Wroc\u0142aw", "addr:housenumber": "22", "addr:postcode": "50-363",
"addr:street": "plac Grunwaldzki", "amenity": "cafe", "level": "0", "name": "Gor\u0105co Polecam", "shop": "bakery"} 1.
Metadata: {'amenity': 'restaurant', 'name': 'Bulvarova', 'address': 'Rybacka 23, Wrocław', 'lattitude': 51.1155595,
'longitude': 17.0184947}    Rest: {"addr:city": "Wroc\u0142aw", "addr:housenumber": "23", "addr:postcode": "53-656",
"addr:street": "Rybacka", "amenity": "restaurant", "cuisine

In [14]:
latitude, longitude, place

(51.1139291, 17.06995, "'amenity': 'cafe'")

In [15]:
# Search for the requested kind of place, restricted to a bounding box around the
# resolved coordinates.
# NOTE(review): `distance` is whatever extract_entities_from_documents returned; if it
# is not numeric, the division inside generate_square_filter will fail — check the
# type before this call.
results = location_vector_store.similarity_search(
    place,
    k=5,
    filter=generate_square_filter((latitude, longitude), radius_km=distance),
)


results

[Document(id='3daa421d-33ed-42cc-907a-5dafa50ffbb2', metadata={'amenity': 'cafe', 'name': 'Dodo Cafe', 'address': 'Unknown , Unknown', 'lattitude': 51.1053122, 'longitude': 17.0745833}, page_content='{"amenity": "cafe", "name": "Dodo Cafe", "street_vendor": "yes"}')]

In [16]:
# Flatten the filtered search hits into a numbered text listing for the prompt.
documents = documents_to_string(results)

# Ask the LLM for a short recommendation based on the original query and the hits.
answer = summarize_results(QUERY, documents)

 Provide short suummarization based on user query and the documents provided. Recommend the most relevant document.
Write it short and concise.  QUERY: "Find me a place where i can buy a good coffee near to plac Grunwaldzki, Wrocław"
DOCUMENTS: 0. Metadata: {'amenity': 'cafe', 'name': 'Dodo Cafe', 'address': 'Unknown , Unknown', 'lattitude':
51.1053122, 'longitude': 17.0745833}    Rest: {"amenity": "cafe", "name": "Dodo Cafe", "street_vendor": "yes"}
RECOMMENDATION:

 Recommendation: Dodo Cafe is a suitable option for you. It's a cafe located near Plac Grunwaldzki in Wrocław. However,
the exact address is unknown. It's worth noting that it's a street vendor, so you might find it in the vicinity of Plac
Grunwaldzki.


In [17]:
print('\n'.join(textwrap.wrap(answer, width=TEXT_WRAP_WIDTH)))

 Recommendation: Dodo Cafe is a suitable option for you. It's a cafe located near Plac Grunwaldzki in Wrocław. However,
the exact address is unknown. It's worth noting that it's a street vendor, so you might find it in the vicinity of Plac
Grunwaldzki.


# Booksy

In [18]:
# NOTE(review): this re-creates the embedding model with the same EMBEDDER_ID that an
# earlier cell already loaded into `embeddings`; reusing that object would avoid a
# second model load.
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDER_ID)

# Load the persisted FAISS index stored under 'booksy_wroclaw_new'.
# SECURITY: allow_dangerous_deserialization=True permits pickle loading, which can
# execute arbitrary code; only load index files you created or fully trust.
booksy_vector_store = FAISS.load_local(
    'booksy_wroclaw_new',
    embeddings,
    allow_dangerous_deserialization=True,
)

In [19]:
# QUERY = "Find me a hairdresser that offers hair coloring near to plac Grunwaldzki, Wrocław"

QUERY = "Find ma a hairdresser that costs less than 100 PLN for a men's haircut near to Galeria Dominikańska, Wrocław"

In [20]:
location, place, distance = extract_entities_from_documents(QUERY, None)

 Please extract the following fields from the user's query:  1. `LOCATION_VECTOR_DB_PROMPT`: Provide a representation of
the location mentioned in the user's query. This should correspond to a specific place or landmark (e.g., "Dworzec
Wrocław Główny"). It should have been extracted in OSM format, e.g., 'amenity': 'restaurant', 'cuisine': 'polish'. if
you can find address also pass it with prefix addr:city, addr:street, addr:housenumber, addr:postcode.     2.
`PLACE_VECTOR_DB_PROMPT`: Provide a representation of the type of place or business mentioned in the query (e.g.,
"restaurant", "hotel", etc.). In the given example, it would be a "Polish restaurant". It should have been extracted in
OSM format, e.g., 'amenity': 'restaurant', 'cuisine': 'polish'.  3. `DISTANCE`: If the query contains information about
the proximity or distance, either explicitly or implicitly, include it. This could be a specific distance (e.g., "5 km
away") or a general description (e.g., "near"). If available if

In [21]:
location, place, distance

("'name': 'Galeria Dominikańska', 'description': 'shopping mall', 'addr:city': 'Wrocław'",
 "'shop': 'hairdresser', 'price': 'less than 100 PLN for a men's haircut'",
 1.0)

In [22]:
# The reference location is resolved against the general places index (not the
# Booksy index): fetch the five entries most similar to the location description.
results = location_vector_store.similarity_search(
    location,
    k=5,
)

results

[Document(id='f7fd476d-a60b-4dd2-82da-3d8397e3dd1c', metadata={'amenity': 'marketplace', 'name': 'C.H. TARGET', 'address': 'Henryka Michała Kamieńskiego 14, Wrocław', 'lattitude': 51.1412245, 'longitude': 17.0343133}, page_content='{"addr:city": "Wroc\\u0142aw", "addr:housenumber": "14", "addr:postcode": "51-124", "addr:street": "Henryka Micha\\u0142a Kamie\\u0144skiego", "amenity": "marketplace", "name": "C.H. TARGET"}'),
 Document(id='247ba7fe-a420-45e5-8f01-feae96f9e435', metadata={'amenity': 'monastery', 'name': 'Refektarz', 'address': 'plac Dominikański 2-4, Wrocław', 'lattitude': 51.1092091, 'longitude': 17.0403065}, page_content='{"addr:city": "Wroc\\u0142aw", "addr:country": "PL", "addr:housenumber": "2-4", "addr:postcode": "50-159", "addr:street": "plac Dominika\\u0144ski", "addr:street:sym_ul": "04103", "amenity": "monastery", "image": "https://photos.app.goo.gl/PnhinPXQh9QtAxwYA", "name": "Refektarz"}'),
 Document(id='61bd736b-d450-4568-9099-0815b8e4ea49', metadata={'amenity

In [23]:
# Flatten the retrieved documents into a numbered text listing for the prompt.
documents = documents_to_string(results)

# Let the LLM pick the reference point's coordinates out of the document listing.
latitude, longitude = extract_lat_lon_from_documents(location, documents)

 From provided Documents, find the latitude and longitude of the location mentioned in the user's query. If it is not
explicitly mentioned, try to extract it from the metadata of the documents. It can be near the location mentioned in the
query.  QUERY: "'name': 'Galeria Dominikańska', 'description': 'shopping mall', 'addr:city': 'Wrocław'"  DOCUMENTS: 0.
Metadata: {'amenity': 'marketplace', 'name': 'C.H. TARGET', 'address': 'Henryka Michała Kamieńskiego 14, Wrocław',
'lattitude': 51.1412245, 'longitude': 17.0343133}    Rest: {"addr:city": "Wroc\u0142aw", "addr:housenumber": "14",
"addr:postcode": "51-124", "addr:street": "Henryka Micha\u0142a Kamie\u0144skiego", "amenity": "marketplace", "name":
"C.H. TARGET"} 1. Metadata: {'amenity': 'monastery', 'name': 'Refektarz', 'address': 'plac Dominikański 2-4, Wrocław',
'lattitude': 51.1092091, 'longitude': 17.0403065}    Rest: {"addr:city": "Wroc\u0142aw", "addr:country": "PL",
"addr:housenumber": "2-4", "addr:postcode": "50-159", "addr:stre

In [24]:
latitude, longitude, place

(51.1092091,
 17.0403065,
 "'shop': 'hairdresser', 'price': 'less than 100 PLN for a men's haircut'")

In [25]:
# Search the Booksy index for the requested service, restricted to a bounding box
# around the resolved coordinates.
# NOTE(review): `distance` is whatever extract_entities_from_documents returned; if it
# is not numeric, the division inside generate_square_filter will fail — check the
# type before this call.
results = booksy_vector_store.similarity_search(
    place,
    k=5,
    filter=generate_square_filter((latitude, longitude), radius_km=distance),
)

results

[Document(id='4b0f0456-16cb-484c-8b6f-b56ad6e59e1f', metadata={'name': 'Liudmyla Kuzmuk Studio', 'rating': '5,0', 'lattitude': 51.1031697, 'longitude': 17.0430977, 'address': 'Komuny Paryskiej, 14A, 50-110, Wrocław'}, page_content="Name: Liudmyla Kuzmuk Studio Rating: 5,0 Address: Komuny Paryskiej, 14A, 50-110, Wrocław  Services: [{'price': '100,00 zł', 'name': 'strzyżenie męskie'}, {'price': '150,00 zł', 'name': 'combo włosy broda'}, {'price': '130,00 zł', 'name': 'strzyzenie nozyczkami classic'}, {'price': '100,00 zł', 'name': 'strzyżenie męskie'}, {'price': '150,00 zł', 'name': 'combo włosy broda'}, {'price': '130,00 zł', 'name': 'strzyzenie nozyczkami classic'}, {'price': '70,00 zł', 'name': 'strzyżenie brody'}]"),
 Document(id='87b73d98-ad73-407d-9250-113ddbcd7f7f', metadata={'name': 'PAN OD WŁOSÓW', 'rating': '5,0', 'lattitude': 51.1034862, 'longitude': 17.0292795, 'address': 'plac Tadeusza Kościuszki 5, 50-029, Wrocław'}, page_content="Name: PAN OD WŁOSÓW Rating: 5,0 Address: pl

In [26]:
# Flatten the Booksy search hits into a numbered text listing for the prompt.
documents = documents_to_string(results)

# Ask the LLM for a short recommendation based on the original query and the hits.
answer = summarize_results(QUERY, documents)

 Provide short suummarization based on user query and the documents provided. Recommend the most relevant document.
Write it short and concise.  QUERY: "Find ma a hairdresser that costs less than 100 PLN for a men's haircut near to
Galeria Dominikańska, Wrocław"  DOCUMENTS: 0. Metadata: {'name': 'Liudmyla Kuzmuk Studio', 'rating': '5,0', 'lattitude':
51.1031697, 'longitude': 17.0430977, 'address': 'Komuny Paryskiej, 14A, 50-110, Wrocław'}    Rest: Name: Liudmyla Kuzmuk
Studio Rating: 5,0 Address: Komuny Paryskiej, 14A, 50-110, Wrocław  Services: [{'price': '100,00 zł', 'name':
'strzyżenie męskie'}, {'price': '150,00 zł', 'name': 'combo włosy broda'}, {'price': '130,00 zł', 'name': 'strzyzenie
nozyczkami classic'}, {'price': '100,00 zł', 'name': 'strzyżenie męskie'}, {'price': '150,00 zł', 'name': 'combo włosy
broda'}, {'price': '130,00 zł', 'name': 'strzyzenie nozyczkami classic'}, {'price': '70,00 zł', 'name': 'strzyżenie
brody'}] 1. Metadata: {'name': 'PAN OD WŁOSÓW', 'rating': '5,0'

In [27]:
print('\n'.join(textwrap.wrap(answer, width=TEXT_WRAP_WIDTH)))

 Recommendation: PAN OD WŁOSÓW  Summary: PAN OD WŁOSÓW is a hairdresser located at plac Tadeusza Kościuszki 5, 50-029,
Wrocław, with a rating of 5.0. They offer men's haircuts for 60.00 PLN, which is within your budget. The other
hairdresser, Liudmyla Kuzmuk Studio, offers men's haircuts for 100.00 PLN, which is above your budget.
