In [None]:
!pip3 install --upgrade --quiet langchain langchain-community langchain-openai chromadb
!pip3 install --upgrade --quiet pypdf pandas streamlit python-dotenv

In [34]:

from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.vectorstores import Chroma
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field

import os
import tempfile
import streamlit as st
import pandas as pd
from dotenv import load_dotenv



In [None]:

load_dotenv()


In [36]:

OPENAI_API_KEY=os.environ.get("OPENAI_API_KEY")


In [None]:
#Define our LLM
llm = ChatOpenAI(model="gpt-4o-mini", api_key=OPENAI_API_KEY)
llm.invoke("Tell me a joke about cats please")

In [None]:
## Process the PDF Document
loader = PyPDFLoader('data/Annie Jacobsen - Operation Paperclip. The Secret Intelligence Program that Brought Nazi Scientists to America - 2014.pdf')
pages= loader.load()
pages

In [39]:
## Split Document
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500,
                                               chunk_overlap=200,
                                               length_function=len,
                                               separators= ["\n\n", '\n', " "])
chunks = text_splitter.split_documents(pages)

In [40]:
## Create Embeddings

def get_embedding_function():
    embeddings = OpenAIEmbeddings(
        model= "text-embedding-ada-002", openai_api_key=OPENAI_API_KEY
    )
    return embeddings

embedding_function = get_embedding_function()
test_vector= embedding_function.embed_query('cat')


In [None]:
from langchain.evaluation import load_evaluator

evaluator = load_evaluator(evaluator="embedding_distance",
                           embeddings = embedding_function)

evaluator.evaluate_strings(prediction = "Nazi", reference = "German")

In [None]:
evaluator.evaluate_strings(prediction = "Nazi", reference = "American")

In [43]:
## Create A Vector database

import uuid

def create_vectorstore(chunks, embedding_function, vectorstore_path):

    #create a list if unique ids for each document bsed on the content
    ids = [str(uuid.uuid5(uuid.NAMESPACE_DNS, doc.page_content)) for doc in chunks]

    #ensure that only unique docs with unique ids are kept
    unique_ids = set()
    unique_chunks = []

    unique_chunks = []
    for chunk, id in zip(chunks.ids):
         if id not in unique_ids:
              unique_ids.add(id)
              unique_chunks.append(chunk)

    
    # Create a New Chroma Database for the documents
    vectorstore = Chroma.from_documents(documents=unique_chunks,
                                        ids = list(unique_ids),
                                        embedding=embedding_function,
                                        persist_directory=vectorstore_path)
    
    vectorstore.persist()

    return vectorstore




In [47]:
#create a vecotr store
vectorstore = create_vectorstore(chunks=chunks,
                                  embedding_function=embedding_function,
                                  vectorstore_path="vectorstore_chroma")



AttributeError: 'list' object has no attribute 'ids'

In [45]:
# Load vectorestore
vectorstore= Chroma(persist_directory="vectorstore_chroma", embedding_function=embedding_function)

In [48]:
# Create retirever and get relevant chunks
retriever = vectorstore.as_retriever(search_type='similarity')
relevant_chunks = retriever.invoke('What is the title of the book?')
relevant_chunks

[]

In [55]:
# Prompt Template
PROMPT_TEMPLATE = """
You are an assistant for question-answering tasks. 
Use the following pieces of retireved context to answer
the quesion. If you don't know the answer, say that you don't
know. DON'T MAKE UP ANYTHING.

{context}

---

Answer the question based on the context above: {question}
"""

In [54]:
#concatenate context text
context_text = "\n\n---\n\n".join([doc.page_content for doc in relevant_chunks])

#create prompt 
prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
prompt = prompt_template.format(context=context_text,
                                question= "What is the title of the book")
print(prompt)

Human: 
You are an assistant for question-answering tasks. 
Use the following pieces of retireved context to answer
the quesion. If you don't know the answer, say that you don't
know. DON'T MAKE UP ANYTHING.



---

Answer the question based on the context above: What is the title of the book



In [56]:
#generate rsponses
llm.invoke(prompt)

AIMessage(content="I don't know.", additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 5, 'prompt_tokens': 74, 'total_tokens': 79, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_d02d531b47', 'finish_reason': 'stop', 'logprobs': None}, id='run-75e9bfc4-6bee-4696-bd80-2191018ffdf5-0', usage_metadata={'input_tokens': 74, 'output_tokens': 5, 'total_tokens': 79, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}})

In [None]:
#uing langchain expression lnaguage
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt_template
    | llm
)
rag_chain.invoke("Give me a paragraph of information about this book please")


AIMessage(content="I'm sorry, but I don't have any specific information about the book you are referring to. Could you provide more details or context?", additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 27, 'prompt_tokens': 75, 'total_tokens': 102, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_0aa8d3e20b', 'finish_reason': 'stop', 'logprobs': None}, id='run-7ff64c93-626f-4bf7-a251-1fcbc649a320-0', usage_metadata={'input_tokens': 75, 'output_tokens': 27, 'total_tokens': 102, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}})

In [60]:
## Generate structured response
class ExtractedInfo(BaseModel):
    """Extracted information about the research article"""
    paper_title: str = Field(description="Title of the paper")
    paper_summary: str = Field(description = "Summary of the paper")
    publication_year: int = Field(description="Year of the publication")
    paper_authors: str = Field(description= "Names of the authors of this paper")

In [61]:
rag_chain = (
        {"context": retriever | format_docs, "question": RunnablePassthrough()}
        | prompt_template
        | llm.with_structured_output(ExtractedInfo, strict=True)
    )
rag_chain.invoke("Give me the title, summary, publication date, authors of the paper please.")

ExtractedInfo(paper_title='Not provided', paper_summary='Not provided', publication_year=0, paper_authors='Not provided')