In [None]:
!pip3 install --upgrade --quiet langchain langchain-community langchain-openai chromadb
!pip3 install --upgrade --quiet pypdf pandas streamlit python-dotenv

In [34]:

from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.vectorstores import Chroma
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field

import os
import tempfile
import streamlit as st
import pandas as pd
from dotenv import load_dotenv



In [71]:

load_dotenv()


True

In [70]:

OPENAI_API_KEY=os.environ.get("OPENAI_API_KEY")


In [69]:
#Define our LLM
llm = ChatOpenAI(model="gpt-4o-mini", api_key=OPENAI_API_KEY)
llm.invoke("Tell me a joke about cats please")

AIMessage(content='Why did the cat sit on the computer?\n\nBecause it wanted to keep an eye on the mouse!', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 21, 'prompt_tokens': 14, 'total_tokens': 35, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_0aa8d3e20b', 'finish_reason': 'stop', 'logprobs': None}, id='run-cb93c0fc-ce0e-472e-b874-6485352c512b-0', usage_metadata={'input_tokens': 14, 'output_tokens': 21, 'total_tokens': 35, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}})

In [68]:
## Process the PDF Document
loader = PyPDFLoader('data/Annie Jacobsen - Operation Paperclip. The Secret Intelligence Program that Brought Nazi Scientists to America - 2014.pdf')
pages= loader.load()
pages

[Document(metadata={'source': 'data/Annie Jacobsen - Operation Paperclip. The Secret Intelligence Program that Brought Nazi Scientists to America - 2014.pdf', 'page': 0}, page_content=''),
 Document(metadata={'source': 'data/Annie Jacobsen - Operation Paperclip. The Secret Intelligence Program that Brought Nazi Scientists to America - 2014.pdf', 'page': 1}, page_content=''),
 Document(metadata={'source': 'data/Annie Jacobsen - Operation Paperclip. The Secret Intelligence Program that Brought Nazi Scientists to America - 2014.pdf', 'page': 2}, page_content=''),
 Document(metadata={'source': 'data/Annie Jacobsen - Operation Paperclip. The Secret Intelligence Program that Brought Nazi Scientists to America - 2014.pdf', 'page': 3}, page_content='Begin Reading\nTable of Contents\nPhotos\nNewsletters\nCopyright Page\nIn accordance with the U.S. Copyright Act of 1976, the scanning, uploading,\nand electronic sharing of any part of this book without the permission of the\npublisher constitute 

In [67]:
## Split Document
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500,
                                               chunk_overlap=200,
                                               length_function=len,
                                               separators= ["\n\n", '\n', " "])
chunks = text_splitter.split_documents(pages)

In [66]:
## Create Embeddings

def get_embedding_function():
    embeddings = OpenAIEmbeddings(
        model= "text-embedding-ada-002", openai_api_key=OPENAI_API_KEY
    )
    return embeddings

embedding_function = get_embedding_function()
test_vector= embedding_function.embed_query('cat')


In [65]:
from langchain.evaluation import load_evaluator

evaluator = load_evaluator(evaluator="embedding_distance",
                           embeddings = embedding_function)

evaluator.evaluate_strings(prediction = "Nazi", reference = "German")

{'score': 0.14685544198944933}

In [64]:
evaluator.evaluate_strings(prediction = "Nazi", reference = "American")

{'score': 0.19374403764037063}

In [62]:
## Create A Vector database

import uuid

def create_vectorstore(chunks, embedding_function, vectorstore_path):

    #create a list if unique ids for each document bsed on the content
    ids = [str(uuid.uuid5(uuid.NAMESPACE_DNS, doc.page_content)) for doc in chunks]

    #ensure that only unique docs with unique ids are kept
    unique_ids = set()
    unique_chunks = []

    unique_chunks = []
    for chunk, id in zip(chunks.ids):
         if id not in unique_ids:
              unique_ids.add(id)
              unique_chunks.append(chunk)

    
    # Create a New Chroma Database for the documents
    vectorstore = Chroma.from_documents(documents=unique_chunks,
                                        ids = list(unique_ids),
                                        embedding=embedding_function,
                                        persist_directory=vectorstore_path)
    
    vectorstore.persist()

    return vectorstore




In [63]:
#create a vecotr store
vectorstore = create_vectorstore(chunks=chunks,
                                  embedding_function=embedding_function,
                                  vectorstore_path="vectorstore_chroma")



AttributeError: 'list' object has no attribute 'ids'

In [45]:
# Load vectorestore
vectorstore= Chroma(persist_directory="vectorstore_chroma", embedding_function=embedding_function)

In [48]:
# Create retirever and get relevant chunks
retriever = vectorstore.as_retriever(search_type='similarity')
relevant_chunks = retriever.invoke('What is the title of the book?')
relevant_chunks

[]

In [55]:
# Prompt Template
PROMPT_TEMPLATE = """
You are an assistant for question-answering tasks. 
Use the following pieces of retireved context to answer
the quesion. If you don't know the answer, say that you don't
know. DON'T MAKE UP ANYTHING.

{context}

---

Answer the question based on the context above: {question}
"""

In [54]:
#concatenate context text
context_text = "\n\n---\n\n".join([doc.page_content for doc in relevant_chunks])

#create prompt 
prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
prompt = prompt_template.format(context=context_text,
                                question= "What is the title of the book")
print(prompt)

Human: 
You are an assistant for question-answering tasks. 
Use the following pieces of retireved context to answer
the quesion. If you don't know the answer, say that you don't
know. DON'T MAKE UP ANYTHING.



---

Answer the question based on the context above: What is the title of the book



In [56]:
#generate rsponses
llm.invoke(prompt)

AIMessage(content="I don't know.", additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 5, 'prompt_tokens': 74, 'total_tokens': 79, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_d02d531b47', 'finish_reason': 'stop', 'logprobs': None}, id='run-75e9bfc4-6bee-4696-bd80-2191018ffdf5-0', usage_metadata={'input_tokens': 74, 'output_tokens': 5, 'total_tokens': 79, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}})

In [None]:
#uing langchain expression lnaguage
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt_template
    | llm
)
rag_chain.invoke("Give me a paragraph of information about this book please")


AIMessage(content="I'm sorry, but I don't have any specific information about the book you are referring to. Could you provide more details or context?", additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 27, 'prompt_tokens': 75, 'total_tokens': 102, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_0aa8d3e20b', 'finish_reason': 'stop', 'logprobs': None}, id='run-7ff64c93-626f-4bf7-a251-1fcbc649a320-0', usage_metadata={'input_tokens': 75, 'output_tokens': 27, 'total_tokens': 102, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}})

In [72]:
## Generate structured response
class AnswerWithSources(BaseModel):
    """An answer to the question with sources and reasoning"""
    answer: str=Field(description="answer to question")
    sources: str = Field(description = "full direct tect chunk from the context used to answer the question")
    reasoning: str = Field(description="explain the reasoning of the asnwer based on the sources")

class ExtractedInfo(BaseModel):
    """Extracted information about the research article"""
    paper_title: AnswerWithSources
    paper_summary: AnswerWithSources
    publication_year: AnswerWithSources
    paper_authors: AnswerWithSources

In [73]:
rag_chain = (
        {"context": retriever | format_docs, "question": RunnablePassthrough()}
        | prompt_template
        | llm.with_structured_output(ExtractedInfo, strict=True)
    )
rag_chain.invoke("Give me the title, summary, publication date, authors of the paper please.")

ExtractedInfo(paper_title=AnswerWithSources(answer='Title of the Paper', sources="The title of the paper is mentioned as 'Title of the Paper'.", reasoning='The title is explicitly stated in the retrieved context.'), paper_summary=AnswerWithSources(answer='This paper explores the effects of X on Y.', sources="The summary states, 'This paper explores the effects of X on Y.'", reasoning='The summary is a concise description provided in the context.'), publication_year=AnswerWithSources(answer='2023', sources='The publication year is cited as 2023.', reasoning='The year is clearly indicated in the retrieved context.'), paper_authors=AnswerWithSources(answer='John Doe, Jane Smith', sources='The authors are listed as John Doe and Jane Smith.', reasoning="The authors' names are provided in the context."))

In [75]:
#transform response into a dataframe

structured_response = rag_chain.invoke("Give me the title, summary, publication date, authors of the research paper.")
df = pd.DataFrame([structured_response.dict()])

answer_row = []
source_row =[]
reasoning_row = []

for col in df.columns:
    answer_row.append(df[col][0]['answer'])
    source_row.append(df[col][0]['sources'])
    reasoning_row.append(df[col][0]['reasoning'])

#create new dataframe with 2 rows: answers and source_row
structured_response_df = pd.DataFrame([answer_row, source_row, reasoning_row], columns=df.columns, index=['answer','source','reasoning'])
structured_response_df

Unnamed: 0,paper_title,paper_summary,publication_year,paper_authors
answer,The Impact of Climate Change on Marine Biodive...,This research paper explores the various ways ...,2022,"John Doe, Jane Smith, and Emily Johnson"
source,The Impact of Climate Change on Marine Biodive...,This research paper explores the various ways ...,Published in 2022,"John Doe, Jane Smith, and Emily Johnson"
reasoning,The title is explicitly mentioned in the context.,The summary is directly provided in the context.,The publication date is mentioned in the context.,The authors are listed in the context.
