In [1]:
!pip install -q cassio datasets langchain openai tiktoken
!pip install -q pycryptodome
!pip install -q PyPDF2

# PDF Query

![alternatvie text](https://miro.medium.com/v2/resize:fit:828/format:webp/1*jMAGouB3s_LA1YoslX5Z_A.png)

In [3]:
# libraris to connect to Cassendra db and perform tasks like text embeddings creating and storing vectors
from langchain.vectorstores.cassandra import Cassandra
# to wrap all thoes vectores in one specif package
from langchain.indexes.vectorstore import VectorStoreIndexWrapper
from langchain.llms import OpenAI
from langchain.embeddings import OpenAIEmbeddings

# Support for dataset retrieval from Hugging Face
from datasets import load_dataset

# To integrate Astra Db in LAngchain, and initialize the DB  connection
import cassio

from PyPDF2 import PdfReader

  from .autonotebook import tqdm as notebook_tqdm


# Setup

In [4]:
# These two first ids are used to connect to ASTRA DB, which has a cassandra db hosted over there in the cloud
ASTRA_DB_APPLICATION_TOKEN = "AstraCS:UX ..."
ASTRA_DB_ID = "..."

OPENAI_API_KEY = "sk-..."

In [5]:
pdfreader = PdfReader("BudgetSpeech_2017.pdf")

The next step is to devide all our pdf contents into specific chunks.

In [6]:
from typing_extensions import Concatenate

raw_text = ""
for i, page in enumerate(pdfreader.pages):
    content = page.extract_text()
    if content:
        raw_text += content

In [7]:
raw_text

'BUDGET SPEECH 2017\nMuch of what I am going to relate to the House this afternoon was written at the height of the raging snowstorm \nthat descended upon B.C. earlier this month. From my desk on the second floor of my old Matsqui Prairie farmhouse, I watched the blizzard unfold. \nI watched the snowdrifts grow slowly, imperceptibly at first, until eventually my barn, my garage, and my parents’ \nhouse all disappeared behind a 12-foot-high wall of snow. Doors were blocked shut. We were boxed in and cut off.\nIt occurred to me that budgets and finances can take on a similar dynamic. Without careful attention to spending \nlevels and responsible revenue forecasting, the combined weight of the deficits and debt can accumulate like a prairie\xa0snowdrift and eventually isolate an entire society from the choices it would like to make for its citizens. \nDigging out from beneath the storm isn’t easy and, unlike snowdrifts, budget deficits don’t eventually melt on \ntheir\xa0own. \nThis is th

Now we need to initialize the connection to database

In [8]:
cassio.init(token=ASTRA_DB_APPLICATION_TOKEN, database_id=ASTRA_DB_ID)

Create the ALngchain embeddings and LLM objects

In [9]:
llm = OpenAI(openai_api_key=OPENAI_API_KEY)
embedding = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

  warn_deprecated(
  warn_deprecated(


# Create LangChain Vector Store

Now we need to create our vectore database. For that we need to initialize Cassandra. It converts all of the text using the embedding (that we initialized at the top cell) into vectors.

In [10]:
astra_vector_store = Cassandra(
    embedding=embedding,
    table_name="qa_demo",
    session=None,
    keyspace=None
)

Still we haven't covert text to vectors. We need to push data inside DB.

In [11]:
from langchain.text_splitter import CharacterTextSplitter

# LLM models have a maximum token limit for the text they can process in one go. 
# We need to Split the text such that it should not increase the token limit.
text_splitter = CharacterTextSplitter(
    separator= "\n",
    chunk_size = 800,
    chunk_overlap = 200,
    length_function = len
)

texts = text_splitter.split_text(raw_text)

In [13]:
texts[:1]

['BUDGET SPEECH 2017\nMuch of what I am going to relate to the House this afternoon was written at the height of the raging snowstorm \nthat descended upon B.C. earlier this month. From my desk on the second floor of my old Matsqui Prairie farmhouse, I watched the blizzard unfold. \nI watched the snowdrifts grow slowly, imperceptibly at first, until eventually my barn, my garage, and my parents’ \nhouse all disappeared behind a 12-foot-high wall of snow. Doors were blocked shut. We were boxed in and cut off.\nIt occurred to me that budgets and finances can take on a similar dynamic. Without careful attention to spending']

# Load dataset into the vector store

Adding text, generating embeddings and inserting them inot Astra DB.

In [15]:
astra_vector_store.add_texts(texts[:50])

print("Inserted %i headlines." %len(texts[:50]))

Inserted 50 headlines.


wrap the entries inside the wrapper -> index w.r.t. text

In [19]:

astra_vector_index = VectorStoreIndexWrapper(vectorstore=astra_vector_store)

# Run QA

Two question examples from the pdf "BudgetSpeech_2017.pdf":
\
Question1: what is the rank of B.C. among other provinces in terms of economic groth?\
Question2: what does Grand Chief Ed John recommended?

In [24]:
first_question = True
while True:
    if first_question:
        query_text = input("\nEnter your question or type 'quit' to exit: ").strip()
    else:
        query_text = input("\nEnter your next question or type 'quit' to exit: ").strip()

    if query_text.lower() == "quit":
        break
    if query_text == "":
        continue

    first_question = False

    print("\nQUESTION: \"%s\"" % query_text)
    answer = astra_vector_index.query(query_text, llm = llm).strip()
    print("\nANSWER: \"%s\"\n" % answer)

    print("First Documents by Relevance:")
    for doc, score in astra_vector_store.similarity_search_with_score(query_text, k=4):
        print("    [%0.4f] \"%s ...\"" % (score, doc.page_content[:100]))   #100 is number of characters we want to print out as answer




QUESTION: "what does Grand Chief Ed John recommended?"

ANSWER: "He recommended that we continue and expand the "Parents Legal Centre" pilot project delivered by the Legal Services Society to provide early intervention services for families facing child protection concerns."

First Documents by Relevance:
    [0.9098] "In his report, Grand Chief Ed John also recommended that we continue and expand the 
“Parents Legal  ..."
    [0.9098] "In his report, Grand Chief Ed John also recommended that we continue and expand the 
“Parents Legal  ..."
    [0.9045] "a move that will assist approximately 600 families and 1,000 children per year. As well, funding for ..."
    [0.9045] "a move that will assist approximately 600 families and 1,000 children per year. As well, funding for ..."


: 