In [None]:
from dotenv import load_dotenv

# Load settings from a local .env file into the process environment
# (presumably this is where OPENAI_API_KEY, read via os.getenv below, is defined).
load_dotenv()

In [None]:
import os 
from langchain.chat_models import ChatOpenAI

# Look up the OpenAI key in the environment; the result is None when it is unset.
api_key = os.environ.get('OPENAI_API_KEY')

# Chat model client used by every chat(...) call in the cells below.
chat = ChatOpenAI(openai_api_key=api_key, model='gpt-3.5-turbo', request_timeout=120)

In [None]:
from langchain.schema import (
    SystemMessage,
    HumanMessage,
    AIMessage
)

# Seed conversation: a system instruction, a short greeting exchange, and the
# user's question about Hubert Hurkacz's ATP ranking.
messages = [
    SystemMessage(content="You are a helpful assistant."),
    HumanMessage(content="Hi AI, how are you today?"),
    AIMessage(content="I'm great thank you. How can I help you?"),
    HumanMessage(content="What is the highest position Hubert Hurkacz has ever achieved in the ATP ranking?"),
]

In [None]:
# Send the conversation as it stands (no retrieved context yet) to the chat model;
# the bare `res` on the last line displays the returned message as the cell output.
res = chat(messages)
res

In [None]:
# Web pages about Hubert Hurkacz that the loader cells below fetch as source material.
urls = [
    'https://en.wikipedia.org/wiki/Hubert_Hurkacz',
    'https://www.atptour.com/en/players/hubert-hurkacz/hb71/overview',
    'https://www.espn.com/tennis/player/_/id/2726/hubert-hurkacz',
    'https://www.reuters.com/sports/tennis/hurkacz-wins-thriller-with-rublev-take-shanghai-title-2023-10-15/',
    'https://www.atptour.com/en/news/hurkacz-rublev-shanghai-2023-final',
    'https://www.atptour.com/en/players/hubert-hurkacz/hb71/player-stats'
]

In [None]:
from langchain.document_loaders import UnstructuredURLLoader

# Fetch the pages listed in `urls` with LangChain's UnstructuredURLLoader.
# NOTE(review): `data` is reassigned by the BeautifulSoupWebReader cell that
# follows, so this result looks unused in the visible cells - confirm whether
# this load is still needed.
loader = UnstructuredURLLoader(urls=urls)
data = loader.load()

In [None]:
from pathlib import Path

from llama_index import BeautifulSoupWebReader


# Scrape every page in `urls`; this rebinds `data` to the reader's documents.
data = BeautifulSoupWebReader().load_data(urls)

# open(..., 'w') does not create missing directories, so make sure the target
# directory exists before writing.
output_path = Path('data/hurkacz_info.txt')
output_path.parent.mkdir(parents=True, exist_ok=True)

# Write UTF-8 explicitly: scraped web text may contain characters that the
# platform's default encoding cannot represent.
with open(output_path, 'w', encoding='utf-8') as f:
    for d in data:
        f.write(d.text)
        

In [None]:
from langchain.document_loaders import TextLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma


# Read the scraped text from 'data/hurkacz_info.txt', the path the scraping cell
# writes to ('./hurkacz_info.txt' is not created by any visible cell).
# The file is decoded as UTF-8 rather than with the platform default encoding.
raw_documents = TextLoader('data/hurkacz_info.txt', encoding='utf-8').load()

# Split the text on '.' into chunks (chunk_size=1000, chunk_overlap=100).
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100, separator=".")
documents = text_splitter.split_documents(raw_documents)

# Embed the chunks with OpenAI embeddings and index them in a Chroma vector store.
db = Chroma.from_documents(documents, OpenAIEmbeddings())


In [None]:
# Question used both for retrieval here and for the augmented prompt below.
query = "What is the highest position Hubert Hurkacz has ever achieved in the ATP ranking?"

# Inspect the three chunks the vector store ranks as most similar to the query.
docs = db.similarity_search(query, k=3)
docs

In [None]:
def augment_prompt(query: str, k: int = 2, vectorstore=None) -> str:
    """Build a retrieval-augmented prompt for ``query``.

    The ``k`` most similar chunks are fetched from the vector store, their
    ``page_content`` values are joined with newlines, and the result is
    embedded in a fixed prompt template together with the query.

    Args:
        query: The user's question; used for retrieval and repeated in the prompt.
        k: Number of chunks to retrieve. Defaults to 2.
        vectorstore: Object exposing ``similarity_search(query, k=...)``.
            When ``None``, the notebook-level ``db`` is used.

    Returns:
        The prompt text containing the retrieved contexts followed by the query.
    """
    # Fall back to the notebook's global store only when none is passed in, so
    # the function can also be called against another index.
    store = db if vectorstore is None else vectorstore
    results = store.similarity_search(query, k=k)
    source_knowledge = "\n".join(doc.page_content for doc in results)
    # The leading spaces inside the template are part of the prompt text that
    # is sent to the model; they are kept as-is.
    augmented_prompt = f"""Using the contexts below, answer the query.

    The context is regarding the best Polish tennis player, Hubert Hurkacz.
    Contexts:
    {source_knowledge}

    Query: {query}"""
    return augmented_prompt

In [None]:
# Preview the augmented prompt (retrieved contexts plus the query) before sending it.
print(augment_prompt(query))

In [None]:
# Wrap the retrieval-augmented query in a user message.
prompt = HumanMessage(content=augment_prompt(query))

# Extend the running conversation in place (re-running this cell appends the
# prompt again), then ask the model with the full history.
messages.append(prompt)
res = chat(messages)

print(res.content)
