In [1]:
###############
# Authors: Justin Zarb
###############

import os
import osmnx as ox
import requests
import sys
from urllib.parse import urlencode
import pandas as pd
import json
sys.path.append("..")
from src.streamlit_functions import get_nodes_with_tags_in_bbox, count_tag_frequency_in_nodes, gdf_data

In [2]:
from tqdm import tqdm
import openai
from config import OPENAI_API_KEY

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.document_loaders import JSONLoader, CSVLoader

from langchain.chains import RetrievalQA
from langchain.llms import OpenAI
from langchain.indexes import VectorstoreIndexCreator

openai.api_key = OPENAI_API_KEY

In [3]:
PLACE_NAME = "Alt-Treptow"

#### Get all nodes in location

In [10]:
overpass_url = "http://overpass-api.de/api/interpreter"
overpass_query = f"""
[out:json][timeout:25];
area[name="{PLACE_NAME}"]->.searchArea;
(node(area.searchArea);
way(area.searchArea);
relation(area.searchArea););out body;>;out skel qt;
"""
response = requests.get(overpass_url, data=overpass_query)

if response.status_code == 200:
    # Request succeeded, process the data here (response.json() or response.text)
    data = response.json()
else:
    # Request failed, check the error message
    error_message = response.text
    print(f"Error: {response.status_code} - {error_message}")

In [11]:
def tag_frequency_from_result(data):
    nodes = [element for element in data["elements"]]
    tag_frequency = json.dumps(count_tag_frequency_in_nodes(nodes))
    return tag_frequency

tag_frequency = tag_frequency_from_result(data)
d = json.loads(tag_frequency)

# Convert the dictionary to a DataFrame
tag_frequency_df = pd.DataFrame(list(d.items()), columns=['key', 'value'])

# Write each value to its own row
tag_frequency_df = tag_frequency_df.explode("value")

#Output option 3:
dirty_file = f"{PLACE_NAME}_node_tags.csv"
tag_frequency_df.to_csv(dirty_file, index=False, header=True)

print(tag_frequency_df.shape)
print(tag_frequency_df.head())

(4026, 2)
       key            value
0  highway  traffic_signals
0  highway   turning_circle
0  highway         crossing
0  highway         bus_stop
0  highway     construction


In [5]:
# Output option 1: Json file with full node data: {"elements":[]}
tagged_nodes_filename = f"{PLACE_NAME}_tagged_nodes.json"

if os.path.exists(tagged_nodes_filename):
    tagged_nodes = [d for d in data["elements"] if "tags" in d]
    data_tagged_only = {"elements":tagged_nodes}

    with open(tagged_nodes_filename, "w") as file:
        # write to file
        file.write(json.dumps(data_tagged_only))

In [6]:
# Output option 2: 


node_tags_filename = f"{PLACE_NAME}_node_tags.json"
if os.path.exists(node_tags_filename):
    with open(node_tags_filename, "w") as file:
        # write to file
        file.write(tag_frequency)

# Vector store of all tags in area

In [29]:
loader = CSVLoader(
    file_path="Alt-Treptow_node_tags.csv"
)
raw_data = loader.load()

# Load the document, split it into chunks, embed each chunk and load it into the vector store.
text_splitter = CharacterTextSplitter(chunk_size=2000, chunk_overlap=0)
documents = text_splitter.split_documents(raw_data)
db = Chroma.from_documents(documents, OpenAIEmbeddings())

In [43]:
db.save(f"{PLACE_NAME}_db")

AttributeError: 'Chroma' object has no attribute 'save'

similarity search

In [48]:
# Do a simple cosine search
db.similarity_search("trinkbrünnen", k=5)

[Document(page_content='key: vending\nvalue: drinks', metadata={'row': 2104, 'source': 'Alt-Treptow_node_tags_dirty.csv'}),
 Document(page_content='key: vending\nvalue: drinks', metadata={'row': 2104, 'source': 'Alt-Treptow_node_tags_dirty.csv'}),
 Document(page_content='key: vending\nvalue: drinks', metadata={'row': 2104, 'source': 'Alt-Treptow_node_tags_dirty.csv'}),
 Document(page_content='key: vending\nvalue: drinks', metadata={'row': 3160, 'source': 'Alt-Treptow_node_tags.csv'}),
 Document(page_content='key: emergency\nvalue: drinking_water', metadata={'row': 1907, 'source': 'Alt-Treptow_node_tags_dirty.csv'})]

#### Retriever
 A retriever gets the most relevant documents for an unstructured query. ([Langchain Retrievers]( https://python.langchain.com/docs/modules/data_connection/retrievers/))


In [14]:
index = VectorstoreIndexCreator().from_loaders([loader])

In [35]:
query = "Provide five possible key:value pairs to search for falafel restaurants"
index.query_with_sources(query)

{'question': 'Provide five possible key:value pairs to search for falafel restaurants',
 'answer': " Possible key:value pairs to search for falafel restaurants include: \n- name: Jimmy's Falafel \n- cuisine: falafel \n- type: restaurant \n- food: falafel \n- location: city \n",
 'sources': 'Alt-Treptow_node_tags_dirty.csv'}

Multiquery Retriever

In [22]:
from typing import List
from pydantic import BaseModel, Field
from langchain.prompts import PromptTemplate
from langchain import LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain.output_parsers import PydanticOutputParser

In [18]:
# Set logging for the queries
import logging

logging.basicConfig()
logging.getLogger("langchain.retrievers.multi_query").setLevel(logging.INFO)

In [20]:
unique_docs = retriever_from_llm.get_relevant_documents(question)
len(unique_docs)

INFO:langchain.retrievers.multi_query:Generated queries: ['1. Can you provide key:value pairs for Zierbrunnen?', '2. What are some key:value pairs for Badestellen?', '3. Could you give me key:value pairs for Strandbäder, Freibäder, Schwimmhallen, Bänke, Picknicktische, and Trinkbrunnen?']


5

In [42]:
# Output parser will split the LLM result into a list of queries
class LineList(BaseModel):
    # "lines" is the key (attribute name) of the parsed output
    lines: List[str] = Field(description="Lines of text")


class LineListOutputParser(PydanticOutputParser):
    def __init__(self) -> None:
        super().__init__(pydantic_object=LineList)

    def parse(self, text: str) -> LineList:
        lines = text.strip().split("\n")
        return LineList(lines=lines)

output_parser = LineListOutputParser()

QUERY_PROMPT = PromptTemplate(
    input_variables=["question"],
    template="""You are an AI language model assistant. Your task is to generate five 
    different versions of the given user question to retrieve relevant documents from a vector 
    database. By generating multiple perspectives on the user question, your goal is to help
    the user overcome some of the limitations of the distance-based similarity search. 
    Provide these alternative questions seperated by newlines.
    Original question: {question}""",
)

llm = ChatOpenAI(temperature=0)

# Chain
llm_chain = LLMChain(llm=llm, prompt=QUERY_PROMPT, output_parser=output_parser)

# Other inputs
question = "places where people can drink water, splash around or swim"

# Run
retriever = MultiQueryRetriever(
    retriever=db.as_retriever(), llm_chain=llm_chain, parser_key="lines"
)  # "lines" is the key (attribute name) of the parsed output

# Results
unique_docs = retriever.get_relevant_documents(
    question
)
unique_docs


INFO:langchain.retrievers.multi_query:Generated queries: ['1. What are some locations where individuals can access drinking water, engage in water activities, or swim?', '2. Can you suggest places where people have the opportunity to drink water, play in the water, or go swimming?', "3. I'm looking for places that offer drinking water, water play areas, or swimming opportunities. Any recommendations?", '4. Where can people go to find drinking water, areas to splash around, or places to swim?', '5. Are there any specific locations that provide access to drinking water, areas for water play, or swimming spots?']


[Document(page_content='key: sport\nvalue: swimming', metadata={'row': 2687, 'source': 'Alt-Treptow_node_tags.csv'}),
 Document(page_content='key: drinking_water\nvalue: yes', metadata={'row': 1300, 'source': 'Alt-Treptow_node_tags_dirty.csv'}),
 Document(page_content='key: drinking_water\nvalue: yes', metadata={'row': 1898, 'source': 'Alt-Treptow_node_tags.csv'})]

# Get data with OSMNX or Overpy

In [None]:
# OSMNX
fuel_stations = ox.geometries_from_place(PLACE_NAME, {"wikipedia": "de:Liste der Straßenbrunnen im Berliner Bezirk Treptow-Köpenick"})
fuel_stations

# Old stuff

In [None]:
# Gives the number of unique values for each of the tags. Maybe useful
num_unique_values = {k: len(v) for k, v in json.loads(tag_frequency).items()}
num_unique_values = {
    k: v
    for k, v in sorted(
        num_unique_values.items(), key=lambda item: item[1], reverse=True
    )
}


In [None]:
places = ["Mitte, Berlin"]
places_gdf = ox.geocode_to_gdf(places)
# bbox = [S, W, N, E]
bounding_boxes = places_gdf.loc[:, ["bbox_south", "bbox_west", "bbox_north", "bbox_east",]]
places_gdf

In [None]:
import pandas as pd

ping_pong = {'version': 0.6, 'generator': 'Overpass API 0.7.60.6 e2dc3e5b', 'osm3s': {'timestamp_osm_base': '2023-06-29T15:35:14Z', 'timestamp_areas_base': '2023-06-29T12:13:45Z', 'copyright': 'The data included in this document is from www.openstreetmap.org. The data is made available under ODbL.'}, 'elements': [{'type': 'node', 'id': 6835150496, 'lat': 52.5226885, 'lon': 13.3979877, 'tags': {'leisure': 'pitch', 'sport': 'table_tennis', 'wheelchair': 'yes'}}, {'type': 'node', 'id': 6835150497, 'lat': 52.5227083, 'lon': 13.3978939, 'tags': {'leisure': 'pitch', 'sport': 'table_tennis', 'wheelchair': 'yes'}}, {'type': 'node', 'id': 6835150598, 'lat': 52.5229822, 'lon': 13.3965893, 'tags': {'access': 'customers', 'leisure': 'pitch', 'sport': 'table_tennis'}}, {'type': 'node', 'id': 6835150599, 'lat': 52.5229863, 'lon': 13.3964894, 'tags': {'access': 'customers', 'leisure': 'pitch', 'sport': 'table_tennis'}}]}

df = pd.DataFrame(ping_pong["elements"])
df["tags"]

In [None]:
import json
keys = []
nodes = []
for _, row in bounding_boxes.iterrows():
    nodes.append(get_nodes_with_tags_in_bbox(list(row)))
    unique_tags_dict = count_tag_frequency(nodes)
    num_unique_values = {k:len(v) for k, v in unique_tags_dict.items()}
    num_unique_values = {
        k: v
        for k, v in sorted(
            num_unique_values.items(), key=lambda item: item[1], reverse=True
        )
    }

unique_tags_dict_sorted = [unique_tags_dict[k] for k in list(num_unique_values.keys())]

unique_tags_dict

In [None]:
def search_dict(d, substring):
    search_words = [s.strip() for s in substring.split(",")]
    print(search_words)
    matches = {}
    for s in search_words:
    # Add key value pairs if a substring appears in either key or value. Value is a list of strings. return only the matching string
        for key, value in d.items():
            if s in key:
                matches[key] = value
            else:
                for v in value:
                    if s in v:
                        if key in matches:
                            matches[key].append(v)
                        else:
                            matches[key] = [v]
    return matches

search_dict(unique_tags_dict, "history, historical")

In [None]:

dict(zip(places_gdf["display_name"], places_gdf["projected_area"]))

In [None]:
places_gdf[["projected_area", "area_unit"]] = places_gdf.apply(lambda row: gdf_data(row, places_gdf.crs), axis=1)
places_gdf

In [None]:
for _, row in bounding_boxes.iterrows():
    nodes.append(get_nodes_with_tags_in_bbox(list(row)))
    keys = list(count_tag_frequency(nodes).keys())
keys

In [None]:
import utm
from pyproj import CRS
import geopandas as gpd
def gdf_data(gdf):
    """Get the area of a polygon
    This method is not directly callable by the LLM"""
    places_dict = {}
    for index, row in gdf.iterrows():
        print(index)
        utm_zone = utm.latlon_to_zone_number(gdf.loc[[index], "lat"].values[0], gdf.loc[[index], "lon"].values[0])
        south = gdf.loc[[index], "lat"].values[0] < 0
        crs = CRS.from_dict({"proj": "utm", "zone": utm_zone, "south": south})
        epsg_code = crs.to_authority()[1]
        unit = list({ai.unit_name for ai in crs.axis_info})[0]
        gdf_projected = gdf.loc[[index],:].to_crs(epsg_code)
        area = gdf_projected.area.values[0]
        places_dict[row["display_name"]] = {"area":area,
                                            "unit":unit}

    return places_dict

In [None]:
gdf_data(places_gdf)