In [None]:
!pip3 install --upgrade --quiet langchain langchain-community langchain-openai chromadb
!pip3 install --upgrade --quiet pypdf pandas streamlit python-dotenv

In [34]:

from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.vectorstores import Chroma
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field

import os
import tempfile
import streamlit as st
import pandas as pd
from dotenv import load_dotenv



In [None]:

# Load variables from a local .env file (if present) into the process environment.
load_dotenv()


In [70]:

# Read the OpenAI key from the process environment; the value is None when the
# variable is not set.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")


In [None]:
# Define the chat model used for every generation step in this notebook
llm = ChatOpenAI(
    api_key=OPENAI_API_KEY,
    model="gpt-4o-mini",
)
# Smoke test: a single request confirms the key and model name are accepted
llm.invoke("Tell me a joke about cats please")

In [None]:
## Process the PDF Document
# Keep the input path in one named constant so the notebook is easy to point at another file.
PDF_PATH = 'data/Annie Jacobsen - Operation Paperclip. The Secret Intelligence Program that Brought Nazi Scientists to America - 2014.pdf'
loader = PyPDFLoader(PDF_PATH)
pages = loader.load()
# Show a count and a small sample instead of dumping the whole book into the cell output.
print(f"Loaded {len(pages)} documents from the PDF")
pages[:2]

In [67]:
## Split Document
# Cut the loaded pages into overlapping chunks. Sizes are measured with len(),
# i.e. in characters, and splits are tried on the listed separators in order.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " "],
    length_function=len,
    chunk_overlap=200,
    chunk_size=1500,
)
chunks = text_splitter.split_documents(pages)

In [66]:
## Create Embeddings

def get_embedding_function():
    """Build the OpenAI embedding client used by the rest of the notebook.

    Returns:
        An ``OpenAIEmbeddings`` instance configured for the
        ``text-embedding-ada-002`` model and the module-level ``OPENAI_API_KEY``.
    """
    return OpenAIEmbeddings(
        openai_api_key=OPENAI_API_KEY,
        model="text-embedding-ada-002",
    )

# Build the embedding client once; the cells below reuse it for the evaluator and the vector store.
embedding_function = get_embedding_function()
# Embed a single word as a quick check that the embedding call works.
test_vector= embedding_function.embed_query('cat')


In [None]:
from langchain.evaluation import load_evaluator

# Embedding-distance evaluator that uses the same embedding client as the rest of the notebook
evaluator = load_evaluator(
    evaluator="embedding_distance",
    embeddings=embedding_function,
)

# Score how far apart the two words are in embedding space
evaluator.evaluate_strings(reference="German", prediction="Nazi")

In [None]:
# Same prediction scored against a different reference word, for comparison with the previous cell.
evaluator.evaluate_strings(prediction = "Nazi", reference = "American")

In [76]:
## Create A Vector database

import uuid

def create_vectorstore(chunks, embedding_function, vectorstore_path):
    """Create a Chroma vector store from de-duplicated document chunks.

    Each chunk gets a deterministic id (a UUIDv5 of its ``page_content``), so
    chunks with identical text share an id and only the first occurrence is
    stored.

    Args:
        chunks: Iterable of document objects exposing a ``page_content`` string.
        embedding_function: Embedding object passed to Chroma as ``embedding``.
        vectorstore_path: Directory passed to Chroma as ``persist_directory``.

    Returns:
        The ``Chroma`` vector store built from the unique chunks.
    """
    seen_ids = set()
    # Ids are collected in a list kept in lockstep with unique_chunks. Taking
    # them from the set instead would lose the ordering and attach ids to the
    # wrong documents.
    unique_ids = []
    unique_chunks = []

    for chunk in chunks:
        # Content-based id: identical text always maps to the same id.
        chunk_id = str(uuid.uuid5(uuid.NAMESPACE_DNS, chunk.page_content))
        if chunk_id in seen_ids:
            continue
        seen_ids.add(chunk_id)
        unique_ids.append(chunk_id)
        unique_chunks.append(chunk)

    # Create a new Chroma database for the documents
    vectorstore = Chroma.from_documents(documents=unique_chunks,
                                        ids=unique_ids,
                                        embedding=embedding_function,
                                        persist_directory=vectorstore_path)

    # NOTE(review): newer Chroma releases appear to persist automatically; this
    # call is kept for compatibility with the original behaviour - confirm.
    vectorstore.persist()

    return vectorstore




In [None]:
# Create the vector store from the document chunks and store it under "vectorstore_chroma"
vectorstore = create_vectorstore(chunks, embedding_function, "vectorstore_chroma")



In [78]:
# Load the vector store back from its directory on disk
vectorstore = Chroma(
    embedding_function=embedding_function,
    persist_directory="vectorstore_chroma",
)

In [None]:
# Create retriever and get relevant chunks
retriever = vectorstore.as_retriever(search_type='similarity')
# Check retrieval with a sample question; relevant_chunks is reused below to build a prompt by hand.
relevant_chunks = retriever.invoke('What is the title of the book?')
relevant_chunks

In [80]:
# Prompt template for question answering over retrieved context.
# {context} is filled with the retrieved chunk text and {question} with the user's question.
PROMPT_TEMPLATE = """
You are an assistant for question-answering tasks. 
Use the following pieces of retireved context to answer
the quesion. If you don't know the answer, say that you don't
know. DON'T MAKE UP ANYTHING.

{context}

---

Answer the question based on the context above: {question}
"""

In [82]:
# Join the text of every retrieved chunk, with a visible rule between chunks
context_text = "\n\n---\n\n".join(doc.page_content for doc in relevant_chunks)

# Fill the template by hand to see exactly what will be sent to the model
prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
prompt = prompt_template.format(
    question="What is the title of the book",
    context=context_text,
)
print(prompt)

Human: 
You are an assistant for question-answering tasks. 
Use the following pieces of retireved context to answer
the quesion. If you don't know the answer, say that you don't
know. DON'T MAKE UP ANYTHING.

CONTENTS
Cover
Title Page
Welcome
Dedication
Prologue
Part I
Chapter 1     The War and the Weapons
Chapter 2     Destruction
Chapter 3     The Hunters and the Hunted
Chapter 4     Liberation
Chapter 5     The Captured and Their Interrogators
Part II
Chapter 6     Harnessing the Chariot of Destruction
Chapter 7     Hitler’s Doctors
Chapter 8      Black, White, and Gray
Chapter 9     Hitler’s Chemists
Chapter 10   Hired or Hanged

---

Begin Reading
Table of Contents
Photos
Newsletters
Copyright Page
In accordance with the U.S. Copyright Act of 1976, the scanning, uploading,
and electronic sharing of any part of this book without the permission of the
publisher constitute unlawful piracy and theft of the author’s intellectual
property. If you would like to use material from the book

In [83]:
# Generate a response by sending the manually formatted prompt to the LLM
llm.invoke(prompt)

AIMessage(content='The title of the book is not provided in the retrieved context.', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 14, 'prompt_tokens': 674, 'total_tokens': 688, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_0aa8d3e20b', 'finish_reason': 'stop', 'logprobs': None}, id='run-ed372fd4-70e2-49ad-a935-92011d99af01-0', usage_metadata={'input_tokens': 674, 'output_tokens': 14, 'total_tokens': 688, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}})

In [84]:
# Using LangChain Expression Language
def format_docs(docs):
    """Join the ``page_content`` of each document into one string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The contents separated by a blank line, or ``""`` when ``docs`` is empty.
    """
    contents = [document.page_content for document in docs]
    return "\n\n".join(contents)

# RAG pipeline composed with the `|` operator:
#   - "context": the input question goes to the retriever and the returned
#     documents are joined into one string by format_docs
#   - "question": RunnablePassthrough forwards the input question as-is
# The resulting mapping fills prompt_template, and the prompt is sent to the LLM.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt_template
    | llm
)
# Run the whole chain end to end with a sample question
rag_chain.invoke("Give me a paragraph of information about this book please")


AIMessage(content="The book, published by Hachette Digital, is copyrighted in 2014 by Anne M. Jacobsen and is associated with Little, Brown and Company, a division of Hachette Book Group. The ebook edition was first released in February 2014. The content includes various chapters that explore themes such as war, destruction, and the roles of different figures during that time, including Hitler's doctors and chemists. For more information about the book and its author, readers are directed to visit Bookish.com. Additionally, the publisher emphasizes the importance of respecting copyright and the author's rights, stating that any unauthorized sharing of the book's material constitutes unlawful piracy.", additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 134, 'prompt_tokens': 672, 'total_tokens': 806, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_t

In [85]:
## Generate structured response
# Schema for a single answer together with its supporting evidence.
# NOTE(review): the class docstring and Field descriptions are presumably sent
# to the model as part of the structured-output schema, so editing them may
# change model behaviour - confirm before rewording.
class AnswerWithSources(BaseModel):
    """An answer to the question with sources and reasoning"""
    # The answer text itself
    answer: str=Field(description="answer to question")
    # Text taken from the retrieved context that backs the answer
    sources: str = Field(description = "full direct tect chunk from the context used to answer the question")
    # Explanation linking the sources to the answer
    reasoning: str = Field(description="explain the reasoning of the asnwer based on the sources")

# Top-level schema requested from the model: four fields, each an
# AnswerWithSources (answer + sources + reasoning).
# NOTE(review): the docstring is presumably forwarded to the model as the schema
# description; it says "research article" although the loaded PDF is a book -
# confirm whether that wording is intended.
class ExtractedInfo(BaseModel):
    """Extracted information about the research article"""
    # Title of the document
    paper_title: AnswerWithSources
    # Summary of the document
    paper_summary: AnswerWithSources
    # Year of publication
    publication_year: AnswerWithSources
    # Author(s) of the document
    paper_authors: AnswerWithSources

In [86]:
# Rebuild the chain with the LLM wrapped by with_structured_output, so the
# result is an ExtractedInfo object instead of a plain chat message.
# Note: this rebinds the name rag_chain; later cells use this structured version.
rag_chain = (
        {"context": retriever | format_docs, "question": RunnablePassthrough()}
        | prompt_template
        | llm.with_structured_output(ExtractedInfo, strict=True)
    )
# Ask for all four fields in a single request
rag_chain.invoke("Give me the title, summary, publication date, authors of the paper please.")

ExtractedInfo(paper_title=AnswerWithSources(answer='The title of the paper is not explicitly mentioned in the provided context.', sources='', reasoning='The context provided does not contain a specific title for a paper, but rather appears to be an excerpt from a book.'), paper_summary=AnswerWithSources(answer='The summary of the paper is not available in the provided context.', sources='', reasoning='The context provided includes a table of contents and copyright information but does not include a summary of the paper.'), publication_year=AnswerWithSources(answer='2014', sources='Copyright © 2014 by Anne M. Jacobsen', reasoning='The copyright information states that it was published in 2014.'), paper_authors=AnswerWithSources(answer='Anne M. Jacobsen', sources='Copyright © 2014 by Anne M. Jacobsen', reasoning='The copyright information indicates that Anne M. Jacobsen is the author.'))

In [87]:
# Transform the structured response into a DataFrame

structured_response = rag_chain.invoke("Give me the title, summary, publication date, authors of the research paper.")
# One-row frame: one column per extracted field, each cell holding that field's dict
df = pd.DataFrame([structured_response.dict()])

# Pull the same key out of every column's cell to build one row per key
answer_row = [df[field][0]['answer'] for field in df.columns]
source_row = [df[field][0]['sources'] for field in df.columns]
reasoning_row = [df[field][0]['reasoning'] for field in df.columns]

# Final frame with 3 rows (answer, source, reasoning) and one column per field
structured_response_df = pd.DataFrame(
    [answer_row, source_row, reasoning_row],
    index=['answer', 'source', 'reasoning'],
    columns=df.columns,
)
structured_response_df

Unnamed: 0,paper_title,paper_summary,publication_year,paper_authors
answer,The title of the research paper is not explici...,A summary of the research paper is not provide...,2014,Anne M. Jacobsen
source,,,Copyright © 2014 by Anne M. Jacobsen,Copyright © 2014 by Anne M. Jacobsen
reasoning,The context does not include a specific title ...,The provided context does not include a summar...,The copyright information indicates that the w...,The copyright information names Anne M. Jacobs...
