In [None]:
from dotenv import load_dotenv
import os

# Load environment variables from .env file
load_dotenv()

# Credential passed as `api_key` to the OpenAI-compatible clients later in this
# notebook. Despite the variable name, it is read from GITHUB_TOKEN.
openai_key = os.getenv('GITHUB_TOKEN')
if not openai_key:
    # Stop here with an actionable message rather than letting a later API
    # client be constructed with api_key=None.
    raise RuntimeError(
        "GITHUB_TOKEN is not set; define it in your .env file or environment."
    )



In [2]:
from langchain_community.document_loaders import PyPDFLoader
from rich import print  # rebinds `print` to rich's version for the rest of the notebook

# Load the Dobermann PDF into LangChain documents.
file_path = "./data/Dobermann.pdf"
loader = PyPDFLoader(file_path)
dobermann_documents = loader.load()

# Dump the loaded documents for inspection.
print(dobermann_documents)

In [3]:
from langchain_community.document_loaders import PyPDFLoader
from rich import print  # rebinds `print` to rich's version for the rest of the notebook

# Load the Rottweiler PDF into LangChain documents.
file_path = "./data/Rottweiler.pdf"
loader = PyPDFLoader(file_path)
rottweiler_documents = loader.load()

# Dump the loaded documents for inspection.
print(rottweiler_documents)

In [4]:
# Combine both breeds' documents into one list, Dobermann entries first.
merged_documents = [*dobermann_documents, *rottweiler_documents]

print(merged_documents)

In [5]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Break the merged PDF documents into overlapping chunks before embedding.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=3000,  # chunk size (characters)
    chunk_overlap=200,  # chunk overlap (characters)
    add_start_index=True,  # track index in original document
)
all_splits = text_splitter.split_documents(merged_documents)

# The inputs are the merged PDF documents, not a blog post, so report them as such.
print(f"Split {len(merged_documents)} documents into {len(all_splits)} sub-documents.")

In [6]:
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_openai import OpenAIEmbeddings

# Embedding client pointed at the OpenAI-compatible inference endpoint.
embeddings = OpenAIEmbeddings(
    model="text-embedding-3-large",
    api_key=openai_key,
    base_url="https://models.inference.ai.azure.com",
)

# Add every chunk to an in-memory vector store, keeping the returned ids.
vector_store = InMemoryVectorStore(embeddings)
document_ids = vector_store.add_documents(documents=all_splits)

In [7]:
from langchain import hub

# Pull the "rlm/rag-prompt" template from the hub.
prompt = hub.pull("rlm/rag-prompt")

# Render the template with placeholder values to preview the final prompt text.
placeholder_inputs = {
    "context": "(context goes here)",
    "question": "(question goes here)",
}
example_messages = prompt.invoke(placeholder_inputs).to_messages()

# The preview below assumes the template renders to exactly one message.
assert len(example_messages) == 1
print(example_messages[0].content)



In [9]:
from langchain_openai import ChatOpenAI

# gpt-4o is a chat model, so it has to be called through a chat-completions
# client. The legacy `langchain.llms.OpenAI` wrapper used previously targets the
# text-completions API and is a deprecated import path.
llm = ChatOpenAI(
    model="gpt-4o",
    api_key=openai_key,
    base_url="https://models.inference.ai.azure.com",
)

question = "which dog is more aggressive, Dobermann or Rottweiler?"

# Retrieve the chunks most similar to the question and join their text,
# separated by blank lines, to form the prompt context.
retrieved_docs = vector_store.similarity_search(question)
docs_content = "\n\n".join(doc.page_content for doc in retrieved_docs)

# Fill the RAG prompt template with the question and the retrieved context.
promptAnswer = prompt.invoke({"question": question, "context": docs_content})

# A chat model returns a message object; keep `answer` as the plain reply text.
answer = llm.invoke(promptAnswer).content
print(answer)