In [1]:
import os
import uuid
from urllib.parse import quote

import pandas as pd
from dotenv import load_dotenv
from langchain.chat_models import init_chat_model
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings
from langchain_postgres import PGEngine, PGVectorStore
from langgraph.graph import START, StateGraph
from typing_extensions import List, TypedDict

load_dotenv()


CONNECTION_STRING = (
    f"postgresql+asyncpg://{os.environ['POSTGRES_USER']}:{os.environ['POSTGRES_PASSWORD']}@{os.environ['POSTGRES_HOST']}"
    f":{os.environ['POSTGRES_PORT']}/{os.environ['POSTGRES_DB']}"
)

pg_engine = PGEngine.from_connection_string(url=CONNECTION_STRING)

# await pg_engine.ainit_vectorstore_table(
#     table_name=os.environ['TABLE_NAME'],
#     vector_size=os.environ['VECTOR_SIZE'],
# )

embedding = OpenAIEmbeddings(model="text-embedding-3-large")

store = await PGVectorStore.create(
    engine=pg_engine,
    table_name=os.environ['TABLE_NAME'],
    embedding_service=embedding,
)

In [2]:
docs = [
    Document(
        id=str(uuid.uuid4()),
        page_content="Red Apple",
        metadata={"description": "red", "content": "1", "category": "fruit"},
    ),
    Document(
        id=str(uuid.uuid4()),
        page_content="Banana Cavendish",
        metadata={"description": "yellow", "content": "2", "category": "fruit"},
    ),
    Document(
        id=str(uuid.uuid4()),
        page_content="Orange Navel",
        metadata={"description": "orange", "content": "3", "category": "fruit"},
    ),
]

await store.aadd_documents(docs)

['77a42683-e507-4e28-a155-3a0def33c04b',
 '40045658-1f55-44cd-b3b5-766e2d49d19e',
 'f3e45e75-6fd1-4e46-8500-35f1b4e4fb1f']

In [3]:
all_texts = ["Apples and oranges", "Cars and airplanes", "Pineapple", "Train", "Banana"]
metadatas = [{"len": len(t)} for t in all_texts]
ids = [str(uuid.uuid4()) for _ in all_texts]

await store.aadd_texts(all_texts, metadatas=metadatas, ids=ids)

['2487bbba-089b-4d18-aaa3-d8ca9d12acfa',
 'baca5bb3-d61a-40a8-9470-500754ce62e5',
 '72765976-6165-43b3-a8da-8930df4f0f4f',
 'a8faf5cb-8a1b-40b0-b699-cb093b584494',
 'df1f2fb3-6ec2-47be-8fd7-f0f0849f4ccf']

In [6]:
from langchain import hub

# Chat model invoked by generate() to produce the final answer.
llm = init_chat_model(model="gpt-4o-mini", model_provider="openai")

def extract_data(pdf_dir=os.path.join('content_source', 'pdf')):
    """Split every PDF found in ``pdf_dir`` into chunks and add them to ``store``.

    Files are selected by a case-insensitive ``.pdf`` suffix and processed in
    sorted order. Each file is loaded with ``PyPDFLoader`` and split with a
    ``RecursiveCharacterTextSplitter`` (1024-character chunks, 30-character
    overlap).

    Storing is best-effort per file: if ``store.add_documents`` raises, the
    error is printed together with the file name and processing continues
    with the next file. Errors raised while loading or splitting a file are
    not caught.

    Args:
        pdf_dir: Directory to scan for PDF files. Defaults to
            ``content_source/pdf`` relative to the current working directory.

    Returns:
        The number of chunks successfully handed to ``store.add_documents``
        (0 when the directory contains no PDF files).

    Raises:
        FileNotFoundError: If ``pdf_dir`` does not exist (from ``os.listdir``).
    """
    pdf_names = sorted(
        name for name in os.listdir(pdf_dir) if name.lower().endswith(".pdf")
    )
    if not pdf_names:
        # Nothing to ingest; skip building the splitter altogether.
        return 0

    # The splitter configuration is identical for every file, so build it once.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1024,
        chunk_overlap=30,
        length_function=len,
        separators=["\n\n", "\n", ".", " "],
    )

    stored = 0
    for name in pdf_names:
        loader = PyPDFLoader(os.path.join(pdf_dir, name))
        chunks = loader.load_and_split(text_splitter=splitter)
        if not chunks:
            # No chunks were produced for this file, so there is nothing to store.
            continue
        try:
            store.add_documents(chunks)
        except Exception as e:
            # Keep going with the remaining files, but say which one failed.
            print(f"Failed to store chunks from {name}: {e}")
        else:
            stored += len(chunks)
    return stored

# Pull the question-answering prompt template from the LangChain hub.
# Note: when working against a non-US LangSmith endpoint, hub.pull may need
# an explicit api_url="https://api.smith.langchain.com" argument.
prompt = hub.pull(owner_repo_commit="rlm/rag-prompt")

# Load the local PDFs into the vector store before the graph is built.
extract_data()


# Define state for application
class State(TypedDict):
    """State passed between the graph steps ``retrieve`` and ``generate``.

    The graph is invoked with ``question``; ``retrieve`` returns ``context``
    and ``generate`` returns ``answer``.
    """

    # The user's question; read by both retrieve() and generate().
    question: str
    # Documents returned by retrieve(); generate() joins their page_content.
    context: List[Document]
    # Answer text returned by generate().
    answer: str


# Define application steps
def retrieve(state: State):
    """Look up documents similar to the question in the vector store.

    Reads ``state["question"]`` and returns a partial state update whose
    ``context`` entry holds the result of ``store.similarity_search``.
    """
    question = state["question"]
    matches = store.similarity_search(question)
    return dict(context=matches)


def generate(state: State):
    """Answer the question using the retrieved documents as context.

    Joins the ``page_content`` of every document in ``state["context"]`` with
    blank lines, fills the prompt with that text and the question, invokes the
    chat model, and returns a partial state update with the reply's content
    under ``answer``.
    """
    passages = [doc.page_content for doc in state["context"]]
    prompt_input = {
        "question": state["question"],
        "context": "\n\n".join(passages),
    }
    reply = llm.invoke(prompt.invoke(prompt_input))
    return {"answer": reply.content}


# Wire the two steps into a linear graph (START -> retrieve -> generate)
# and compile it into a runnable application.
graph_builder = StateGraph(State)
graph_builder.add_sequence([retrieve, generate])
graph_builder.add_edge(START, "retrieve")
graph = graph_builder.compile()



In [14]:
# Run the graph on a sample question and print only the generated answer.
response = graph.invoke(input={"question": "what are the football rules?"})
print(response["answer"])

I don't know.
