In [2]:
# PDF reader
from PyPDF2 import PdfReader
# langchain
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chat_models import AzureChatOpenAI
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.chains.question_answering import load_qa_chain
# Azure authentication
from azure.identity import ClientSecretCredential
import configs.constants as const
import configs.deployments as dep
# FAISS
from langchain.vectorstores import FAISS

In [13]:
# Authenticate to Azure AD as a service principal; the resulting bearer token is
# passed to both clients below as their API key (api type "azure_ad").
# NOTE(review): the token is fetched once here and never refreshed. Presumably it
# has a limited lifetime, so re-run this cell if later calls start failing with
# authentication errors -- confirm against the tenant's token policy.
credentials = ClientSecretCredential(const.IQVIA_TENANT_ID, const.SERVICE_PRINCIPAL, const.SERVICE_PRINCIPAL_SECRET)
token = credentials.get_token(const.SCOPE_NON_INTERACTIVE)

# The chat model and the embedding model are served from the same endpoint;
# build the URL once so the two clients cannot drift apart.
azure_openai_base = f"{const.OPENAI_API_BASE}/{const.OPENAI_API_TYPE}/{const.OPENAI_ACCOUNT_NAME}"

# Chat model (gpt-35-turbo deployment) used to answer questions.
llm = AzureChatOpenAI(
    openai_api_base=azure_openai_base,
    openai_api_version=const.OPENAI_API_VERSION,
    openai_api_key=token.token,
    deployment_name=dep.GPT_35_TURBO,
    openai_api_type="azure_ad")

# Embedding model (text-embedding-ada-002 deployment) used to vectorise text chunks.
# chunk_size=1 limits each embedding request to a single text
# (presumably required by the Azure deployment's batch limit -- TODO confirm).
embeddings = OpenAIEmbeddings(
    openai_api_base=azure_openai_base,
    openai_api_version=const.OPENAI_API_VERSION,
    openai_api_key=token.token,
    openai_api_type="azure_ad",
    deployment=dep.TEXT_EMBEDDING_ADA_002,
    chunk_size=1)

# Splits raw text into chunks of at most 1000 characters, with no overlap
# between consecutive chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)

# Question-answering chain of type 'stuff' over the chat model; verbose=True makes
# the chain print the formatted prompt when it runs.
chain = load_qa_chain(llm, chain_type='stuff', verbose=True)


In [14]:
# NOTE(review): absolute, user-specific path -- this cell only runs on a machine
# that has the file at exactly this location. Consider a path relative to a
# configurable data directory.
pdf_path = "C:\\Users\\u1128714\\Desktop\\prompt engineering\\MEASURING AND NARROWING.pdf"
reader = PdfReader(pdf_path)

# Concatenate the extracted text of every page, in page order. Pages for which
# extract_text() returns nothing (None or an empty string) contribute nothing.
raw_text = "".join(page.extract_text() or "" for page in reader.pages)

# Break the full document text into chunks with the splitter configured earlier.
texts = text_splitter.split_text(raw_text)

# An empty chunk list leaves nothing to index or search; fail here with a clear
# message instead of a less obvious error in a later cell.
if not texts:
    raise ValueError(f"No extractable text found in PDF: {pdf_path!r}")

In [15]:
# Build a FAISS vector store from the text chunks, using the embeddings client
# to turn each chunk into a vector.
docsearch = FAISS.from_texts(
    texts,
    embeddings,
)

In [16]:
query = "What is the purpose of this document?"

# Retrieve the chunks from the vector store that are most similar to the question.
docs = docsearch.similarity_search(query)

# Answer the question from the retrieved chunks.
# `verbose` is deliberately not passed here: keyword arguments to run() are
# treated as chain inputs, not options, and verbosity is already configured
# where the chain is created.
answer = chain.run(input_documents=docs, question=query)

# Print (rather than echo) so multi-line answers render as plain text.
print(answer)



[1m> Entering new  chain...[0m


[1m> Entering new  chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Use the following pieces of context to answer the users question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
deduce previously unseen knowledge. For example, a model should be able to answer “How long
was Queen Elizabeth’s reign?” even if the answer did not explicitly appear in the training data, by
recalling her coronation and death dates and reasoning over these facts. While language models
(LMs) have shown strong question answering performance, it remains unclear how much is due to
memorization of huge corpora vs how much is due to reasoning.
First, we quantify the reasoning abilities of LMs using multi-hop question answering. We present
a new, automatically generated dataset, Compositional Celebrities (CC), of 8.6k 2-hop questions; it
combines frequently stated facts in improbable ways (e.g., “Who won 