### Notebook to run a basic RAG setup (v1)

### Step 1: Initialize notebook

In [6]:
import os
import json
import bs4 
import getpass
import requests
import faiss 
import numpy as np

from typing import Literal
from langchain import hub
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langgraph.graph import START, StateGraph
from typing_extensions import List, Annotated, TypedDict
from langchain_core.prompts import PromptTemplate

from dotenv import load_dotenv, find_dotenv
from utils import *
# Pull settings from a .env file, if find_dotenv() locates one, before the
# API-key check in the next cell reads os.environ.
load_dotenv(find_dotenv())

In [7]:
# Fall back to an interactive prompt only when the key is missing or empty.
if not os.getenv("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter API key for OpenAI: ")

### Step 2: Initialize model objects

In [8]:
# Chat model that the `generate` step calls to produce the structured answer.
llm = ChatOpenAI(model="gpt-4o-mini")

In [9]:
# Embedding model used for the chunk vectors in the single-document path and
# for the query vector in `retrieve`.
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

### Single document extract 

In [None]:
# Load the blog page, keeping only elements that carry one of these CSS classes.
# (Chunking happens in the next cell.)
content_classes = (
    "entry-content single-page",
    "entry-title",
    "entry-meta uppercase is-xsmall",
)
loader = WebBaseLoader(
    web_paths=("https://petguide.dk/hundefoder-maerker/",),
    bs_kwargs={"parse_only": bs4.SoupStrainer(class_=content_classes)},
)
docs = loader.load()

In [None]:
# Initiate the text splitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,  # chunk size (characters)
    chunk_overlap=200,  # chunk overlap (characters)
    add_start_index=True,  # track index in original document
)

total_documents = len(docs)

# Labels for the three positional buckets a chunk can fall into.
section_labels = ("beginning", "middle", "end")

# Split each document into chunks, then tag every chunk with the document's
# source and with the third of the document it belongs to.
all_splits = []
for doc in docs:
    splits = text_splitter.split_documents([doc])
    num_splits = len(splits)
    source = doc.metadata.get("source", "Unknown")

    for i, split in enumerate(splits):
        split.metadata["source"] = source
        # i * 3 // num_splits maps chunk positions onto 0, 1, 2 in balanced
        # thirds. Thresholds based on num_splits // 3 pushed the remainder
        # (and every chunk of a document with fewer than 3 chunks) into "end".
        split.metadata["section"] = section_labels[i * 3 // num_splits]
    all_splits.extend(splits)

print(f"Split blog post into {len(all_splits)} sub-documents.")

In [None]:
# Create embeddings for all chunks with the batch API. Calling embed_query()
# once per chunk issues a separate request for every chunk.
embeddings_list = embeddings.embed_documents([doc.page_content for doc in all_splits])

# FAISS stores and searches float32 vectors; NumPy would default to float64.
embeddings_array = np.asarray(embeddings_list, dtype="float32")
if embeddings_array.ndim != 2 or embeddings_array.shape[0] == 0:
    # Fail with a clear message instead of an IndexError on shape[1] below.
    raise ValueError("No chunks to index: all_splits is empty.")

# Create a FAISS index (exact L2 search) sized to the embedding dimension
dimension = embeddings_array.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings_array)

### Multiple documents extract

In [10]:
# Blog overview page used as the starting point for collecting article links.
main_url = "https://petguide.dk/bloggen/"
print(f'Getting article links from {main_url}...')
# get_article_links / load_and_chunk_documents are not defined in this
# notebook; presumably they come from `from utils import *` — verify.
article_links = get_article_links(main_url)
print(f'Found {len(article_links)} article links...')
# Only the first 5 links are processed. This rebinds `all_splits` and `index`,
# replacing the ones built by the single-document cells above.
all_splits, index = load_and_chunk_documents(article_links[:5])

### Initialize RAG

In [19]:
# NOTE(review): `prompt_template` is not defined in this notebook; presumably it
# is exported by `from utils import *` — confirm. `generate` fills the resulting
# prompt with the keys "question" and "context".
custom_rag_prompt = PromptTemplate.from_template(prompt_template)

In [20]:
class AnswerWithSources(TypedDict):
    """An answer to the question, with sources."""
    # Answer text; this class is the schema passed to
    # llm.with_structured_output() in `generate`.
    answer: str
    # NOTE(review): the description below asks for "author + year", but
    # `generate` overwrites this field with the source URLs of the retrieved
    # chunks — confirm which of the two is intended.
    sources: Annotated[
        List[str],
        ...,
        "List of sources (author + year) used to answer the question",
    ]

In [21]:
# Define state for application
class State(TypedDict):
    """Graph state shared by the `retrieve` and `generate` nodes."""
    # The user's question, supplied when the graph is invoked.
    question: str
    # Chunks selected by `retrieve` and read by `generate`.
    context: List[Document]
    # Structured answer written by `generate`.
    answer: AnswerWithSources


In [None]:
def retrieve(state: State):
    """Fetch the chunks most similar to the question from the FAISS index.

    Args:
        state: Graph state; only ``state["question"]`` is read.

    Returns:
        A partial state update ``{"context": [...]}`` holding at most five
        chunks from ``all_splits``, in the order returned by the index search.
    """
    # FAISS expects a 2-D float32 array with one row per query vector.
    query_embedding = np.asarray(
        [embeddings.embed_query(state["question"])], dtype="float32"
    )

    # Never request more neighbours than the index holds: FAISS pads missing
    # results with -1, and all_splits[-1] would silently return the last chunk.
    k = min(5, index.ntotal)
    retrieved_docs = []
    if k > 0:
        _distances, indices = index.search(query_embedding, k=k)
        # Drop padding / out-of-range ids instead of letting them wrap around.
        retrieved_docs = [
            all_splits[i] for i in indices[0] if 0 <= i < len(all_splits)
        ]
    print(f"Retrieved {len(retrieved_docs)} documents for the question: {state['question']}")
    return {"context": retrieved_docs}

In [49]:
def generate(state: State):
    """Answer the question from the retrieved context and attach source URLs.

    Args:
        state: Graph state; reads ``state["question"]`` and ``state["context"]``.

    Returns:
        A partial state update ``{"answer": response}``, where ``response`` is
        the structured LLM output with ``sources`` replaced by the de-duplicated
        ``source`` metadata of the context chunks.
    """
    docs_content = "\n\n".join(doc.page_content for doc in state["context"])
    messages = custom_rag_prompt.invoke({"question": state["question"], "context": docs_content})
    structured_llm = llm.with_structured_output(AnswerWithSources)
    response = structured_llm.invoke(messages)

    # De-duplicate while keeping retrieval order (a set would shuffle it), and
    # tolerate chunks that carry no "source" metadata instead of raising KeyError.
    unique_urls = list(
        dict.fromkeys(doc.metadata.get("source", "Unknown") for doc in state["context"])
    )

    # The model's own source list is replaced by the URLs actually retrieved.
    response["sources"] = unique_urls

    # Return only the key this node produces rather than mutating the incoming
    # state, matching how `retrieve` reports its update.
    return {"answer": response}


In [50]:
# Assemble the pipeline START -> retrieve -> generate and compile it.
graph_builder = StateGraph(State)
graph_builder.add_sequence([retrieve, generate])
graph_builder.add_edge(START, "retrieve")
graph = graph_builder.compile()

In [None]:
# Smoke test: run one question through the compiled graph and show both the
# retrieved context and the structured answer.
question = "Hvilket fodermærke er rig på protein, fedt, er kornfrit og produceres i Canada?"
result = graph.invoke({"question": question})

print(f'Context: {result["context"]}\n\n')
print(f'Answer: {result["answer"]}')