In [1]:
from langchain.vectorstores.cassandra import Cassandra
from langchain.indexes.vectorstore import VectorStoreIndexWrapper
from langchain.llms import OpenAI
from langchain.embeddings import OpenAIEmbeddings
from datasets import load_dataset
import cassio
from PyPDF2 import PdfReader
import os

### Setup

In [2]:
# Read the Astra DB and OpenAI credentials from the process environment so that
# no secret is written into the notebook. A variable that is not set yields None.
ASTRA_DB_APPLICATION_TOKEN, ASTRA_DB_ID, OPENAI_API_KEY = (
    os.environ.get(variable)
    for variable in (
        'ASTRA_DB_APPLICATION_TOKEN',
        'ASTRA_DB_ID',
        'OPENAI_API_KEY',
    )
)

In [4]:
# Path of the PDF file to ingest, relative to the notebook's working directory.
# The path is assembled with os.path.join instead of a hand-written backslash:
# inside a normal string literal a backslash followed by "U" starts a
# \UXXXXXXXX unicode escape, which Python 3 rejects as a syntax error, and a
# literal backslash separator would only work on Windows anyway.
pdfreader = PdfReader(os.path.join("pdf_data", "USHC_combined.pdf"))

In [5]:
from typing_extensions import Concatenate  # NOTE(review): not referenced in the visible cells — confirm before removing
# Read the text of every page of the PDF and merge it into one string.
# Pages for which extract_text() returns nothing (None or '') are skipped.
# A single join replaces the repeated `+=`, and the unused page index is gone.
raw_text = ''.join(
    page_text
    for page_text in (page.extract_text() for page in pdfreader.pages)
    if page_text
)

In [6]:
# Show only the beginning of the extracted text; displaying the whole document
# bloats the saved notebook and buries the narrative.
raw_text[:1000]

'Insurance\nhow insurance  \nworksHealth insurance is one of the best ways you can protect yourself \nand your family in case you get sick or injured and need medical \ncare. It also helps you get the regular medical and dental care \nyou need to stay healthy. With health insurance, you don’t have \nto put off checkups, use the emergency room for everyday \nhealth needs, or go to community health clinics with long wait \ntimes. You can take care of your health today, instead of waiting \nuntil you have a health emergency. Thanks to the Affordable Care \nAct (ACA), millions of people can now afford health insurance.\nThe ACA also rules that everyone in the United States must have \nhealth insurance. If you don’t buy health insurance, you may \nhave to pay extra in your taxes as a penalty.\nwhy health insurance  \nis important1\nHealth insurance is a signed contract with a \nhealth insurance company that requires the \ncompany to pay for some of your health care \ncosts. \nThat doesn’t m

In [7]:
# Initialise cassio with the Astra DB database id and application token that
# were read from the environment earlier in the notebook.
cassio.init(
    database_id=ASTRA_DB_ID,
    token=ASTRA_DB_APPLICATION_TOKEN,
)

Create the LangChain embedding and LLM objects for later usage:

In [8]:
# Both OpenAI-backed objects are constructed with the same API key:
# `embedding` is handed to the vector store below, `llm` is kept for later use.
embedding = OpenAIEmbeddings(
    openai_api_key=OPENAI_API_KEY,
)
llm = OpenAI(
    openai_api_key=OPENAI_API_KEY,
)

  warn_deprecated(
  warn_deprecated(


Create your LangChain vector store using Astra DB!

In [9]:
# Vector store for the PDF chunks, using the OpenAI embedding object.
# NOTE(review): session and keyspace are passed as None — presumably so the
# store picks up the connection set up by cassio.init(); confirm against the
# LangChain Cassandra documentation.
astra_vector_store = Cassandra(
    table_name="usa_health_care_DB",
    embedding=embedding,
    keyspace=None,
    session=None,
)

In [10]:
from langchain.text_splitter import CharacterTextSplitter

# Split the extracted text on newlines into chunks of at most 800 characters
# (measured with len), with 200 characters of overlap between neighbouring
# chunks, so that each piece sent for embedding stays small.
text_splitter = CharacterTextSplitter(
    length_function=len,
    chunk_overlap=200,
    chunk_size=800,
    separator="\n",
)
texts = text_splitter.split_text(raw_text)

In [11]:
# Preview only the first few chunks; a cap of 500 is larger than the number of
# chunks produced, so it printed every chunk into the notebook output.
texts[:5]

['Insurance\nhow insurance  \nworksHealth insurance is one of the best ways you can protect yourself \nand your family in case you get sick or injured and need medical \ncare. It also helps you get the regular medical and dental care \nyou need to stay healthy. With health insurance, you don’t have \nto put off checkups, use the emergency room for everyday \nhealth needs, or go to community health clinics with long wait \ntimes. You can take care of your health today, instead of waiting \nuntil you have a health emergency. Thanks to the Affordable Care \nAct (ACA), millions of people can now afford health insurance.\nThe ACA also rules that everyone in the United States must have \nhealth insurance. If you don’t buy health insurance, you may \nhave to pay extra in your taxes as a penalty.',
 'The ACA also rules that everyone in the United States must have \nhealth insurance. If you don’t buy health insurance, you may \nhave to pay extra in your taxes as a penalty.\nwhy health insurance

### Create vector store

In [12]:

# Upper bound on how many chunks are added to the vector store in one run.
MAX_CHUNKS_TO_INSERT = 800

# Slice once so the inserted data and the reported count cannot drift apart.
chunks_to_insert = texts[:MAX_CHUNKS_TO_INSERT]
astra_vector_store.add_texts(chunks_to_insert)

# The items are chunks of PDF text, not headlines, so the message says so.
print("Inserted %i text chunks." % len(chunks_to_insert))

# Wrap the populated store in the index helper kept for later use.
astra_vector_index = VectorStoreIndexWrapper(vectorstore=astra_vector_store)

Inserted 244 headlines.
