In [16]:
import configparser
import os
import json
import pandas as pd
import wikipediaapi
import requests
from bs4 import BeautifulSoup
import numpy as np

In [11]:
from langchain.document_loaders import TextLoader
from operator import itemgetter
from langchain.text_splitter import CharacterTextSplitter
from langchain.prompts import ChatPromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain.schema.output_parser import StrOutputParser
from langchain.vectorstores import FAISS

In [77]:
config = configparser.ConfigParser()
config.read('config.ini')

# Define the API Key.
API_KEY = config['OPENAI_API']['API_KEY']
os.environ['OPENAI_API_KEY'] = API_KEY
HF_API_KEY = config['HF_API']['API_KEY']

In [4]:
from sqlalchemy import create_engine

# Read the MySQL configuration from the JSON file
with open('config.json', 'r') as config_file:
    config = json.load(config_file)

# Extract MySQL connection details
mysql_config = config.get('mysql', {})
username = mysql_config.get('username', 'default_username')
password = mysql_config.get('password', 'default_password')
host = mysql_config.get('host', 'localhost')
database_name = mysql_config.get('database_name', 'your_database')

# Create the MySQL database connection string
db_url = f"mysql+mysqlconnector://{username}:{password}@{host}/{database_name}"

# Create an SQLAlchemy engine
engine = create_engine(db_url)

# Use the engine to connect to the database
connection = engine.connect()

# Specify the SQL query to retrieve data from a table
query = "SELECT * FROM rome_geo_tags"

# Use Pandas to read data from the database into a DataFrame
df = pd.read_sql(query, connection)

# Close the database connection
connection.close()

df.head()

Unnamed: 0,gt_id,gt_lat,gt_lon,gt_page_id,url
0,306683262,41.884,12.491,48234033,http://en.wikipedia.org/?curid=48234033
1,306683263,41.886,12.495,48234033,http://en.wikipedia.org/?curid=48234033
2,306683264,41.891389,12.480278,48234033,http://en.wikipedia.org/?curid=48234033
3,306683265,41.89385,12.48194,48234033,http://en.wikipedia.org/?curid=48234033
4,306683266,41.907222,12.498611,48234033,http://en.wikipedia.org/?curid=48234033


In [5]:
wiki = wikipediaapi.Wikipedia(user_agent="krystek.pietrzak@gmail.com", language='en', extract_format=wikipediaapi.ExtractFormat.WIKI)

In [58]:
def haversine(lat1, lon1, lat2, lon2):
    """
    Calculate the great circle distance in kilometers between two points 
    on the earth (specified in decimal degrees)
    """
    # Convert decimal degrees to radians 
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])

    # Haversine formula 
    dlat = lat2 - lat1 
    dlon = lon2 - lon1 
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a)) 
    r = 6371 # Radius of earth in kilometers. Use 3956 for miles. Determines return value units.
    return c * r

def getURLs(location):
    """
    Returns the 5 closest locations to the given location.
    """
    lat, lon = location
    # Calculate distances from the given location to all locations in the dataframe
    df['distance'] = df.apply(lambda row: haversine(lat, lon, row['gt_lat'], row['gt_lon']), axis=1)

    # Filter locations within 500 meters
    close_df = df[df['distance'] <= 0.5]

    if close_df.empty:
        return [df.nsmallest(1, 'distance')['url'].iloc[0]]
    
    # Otherwise, return up to 5 closest locations within 500 meters
    return close_df.nsmallest(5, 'distance')['url'].tolist()

# Example usage:
location = (41.8952293,12.4764618)


In [59]:
urls = getURLs(location)

In [62]:
def getText(url):
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        # Extract data from the webpage and append it to scraped_data
        title = soup.find_all('title')[0].text[:-12]
        if title[:4] == 'User' or title[:4] == 'List':
           return ''
        else:
            print(f'Title of the page: {title}')
            
            page_py = wiki.page(title)
            if page_py.exists():
                print(f"Page ID; {page_py.pageid}")
                return page_py.text
            else:
                print('Page does not exist')
                return ''
            
    else:
        print(f"Failed to fetch data from {url}")
        return ''


In [63]:
texts = ''
for url in urls:
    texts += getText(url) + '\n'

with open('text.txt', 'w') as f:
    f.write(texts)

Title of the page: Largo di Torre Argentina
Page ID; 2233857
Title of the page: Assassination of Julius Caesar
Page ID; 15775663
Title of the page: Teatro Argentina
Page ID; 4266246
Title of the page: San Giuliano dei Fiamminghi
Page ID; 24145533


In [64]:
# Document Loader
loader = TextLoader('text.txt')
documents = loader.load()

In [65]:
# Text Splitter
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
docs = text_splitter.split_documents(documents)

Created a chunk of size 1210, which is longer than the specified 1000
Created a chunk of size 2237, which is longer than the specified 1000
Created a chunk of size 1145, which is longer than the specified 1000
Created a chunk of size 4066, which is longer than the specified 1000
Created a chunk of size 7141, which is longer than the specified 1000
Created a chunk of size 5144, which is longer than the specified 1000
Created a chunk of size 1447, which is longer than the specified 1000
Created a chunk of size 4144, which is longer than the specified 1000
Created a chunk of size 1304, which is longer than the specified 1000
Created a chunk of size 1251, which is longer than the specified 1000
Created a chunk of size 4264, which is longer than the specified 1000
Created a chunk of size 1005, which is longer than the specified 1000


In [66]:
vectorstore = FAISS.from_documents(docs, embedding=OpenAIEmbeddings())
retriever = vectorstore.as_retriever()

model = ChatOpenAI()

In [74]:
template = """You are a travel guide. Answer the question based only on the following context from locations close to you:
{context}
And on the descripiton of what I currently see:
{image_description}
Question: {question}
Answer this question prioritizing context rather than image description.
"""
prompt = ChatPromptTemplate.from_template(template)

chain = {
    "context": itemgetter("question") | retriever, 
    "question": itemgetter("question"), 
    "image_description": itemgetter("image_description")
} | prompt | model | StrOutputParser()

In [78]:
import requests

API_URL = "https://api-inference.huggingface.co/models/microsoft/git-large-coco"
headers = {"Authorization": f"Bearer {HF_API_KEY}"}

def query(filename):
    with open(filename, "rb") as f:
        data = f.read()
    response = requests.post(API_URL, headers=headers, data=data)
    return response.json()

output = query("Screenshot 2023-10-06 at 15.22.58.png")

In [75]:
chain.invoke({"question": "What is this place", "image_description": f"{output}"})

'Based on the provided context, the place you are currently seeing is the Roman Forum in Rome, Italy.'