In [None]:
# Install the third-party packages used by this notebook into the kernel's
# environment. NOTE(review): no versions are pinned here — consider pinning
# them so the notebook stays reproducible.
%pip install cassio datasets langchain openai tiktoken langchain-community PyPDF2

In [None]:
import os

import cassio
from datasets import load_dataset
from langchain.embeddings import OpenAIEmbeddings
from langchain.indexes.vectorstore import VectorStoreIndexWrapper
from langchain.llms import OpenAI
from langchain.vectorstores.cassandra import Cassandra
from langchain_text_splitters import CharacterTextSplitter
from PyPDF2 import PdfReader
from typing_extensions import Concatenate

In [6]:
# Credentials are read from environment variables so real secrets never have
# to be pasted into (and saved with) the notebook. When a variable is not set,
# the original placeholder string is kept, so the cell still runs as before.
ASTRA_DB_APPLICATION_TOKEN = os.environ.get(
    "ASTRA_DB_APPLICATION_TOKEN", "ASTRA_DB_APPLICATION_TOKEN"
)
ASTRA_DB_ID = os.environ.get("ASTRA_DB_ID", "ASTRA_DB_ID")

OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "OPENAI_API_KEY")

In my example the PDF is `SOURCES_AND_SCOPE_OF_EUROPEAN_UNION_LAW.pdf`; replace `PDF_FILE.pdf` in the next cell with the path to your own document.

In [7]:
# Path of the PDF to index; point this at your own document.
pdf_path = 'PDF_FILE.pdf'
# Reader whose pages are iterated further down to extract the text.
pdfreader = PdfReader(pdf_path)

In [9]:
# Starts empty; the next cell fills it with the text extracted from the PDF.
raw_text: str = ""

In [10]:
# Rebuild raw_text from scratch on every run, so re-executing this cell cannot
# append the whole document a second time. Pages whose extract_text() result
# is falsy (None or "") are skipped, exactly as before.
page_texts = (page.extract_text() for page in pdfreader.pages)
raw_text = "".join(text for text in page_texts if text)

In [12]:
# Initialise cassio with the Astra DB id and application token defined above.
cassio.init(
    database_id=ASTRA_DB_ID,
    token=ASTRA_DB_APPLICATION_TOKEN,
)

In [None]:
# Both OpenAI objects are configured with the same API key.
# `embedding` is handed to the vector store when it is created.
embedding = OpenAIEmbeddings(
    openai_api_key=OPENAI_API_KEY,
)
# `llm` is handed to the index wrapper when a question is answered.
llm = OpenAI(
    openai_api_key=OPENAI_API_KEY,
)

In [None]:
# Vector store backed by the "qa_mini_demo" table, using the embedding object
# created above. Session and keyspace are both passed as None — presumably the
# values registered by cassio.init() are picked up instead; verify against the
# LangChain documentation.
astra_vector_store = Cassandra(
    table_name="qa_mini_demo",
    embedding=embedding,
    keyspace=None,
    session=None,
)

In [17]:
# Splitter configured with a newline separator, chunk_size=800 and
# chunk_overlap=200, with lengths measured by the built-in len().
# NOTE(review): the variable name is misspelled ("spliiter"); it is kept as is
# because the following cell refers to it by this name.
text_spliiter = CharacterTextSplitter(
    length_function=len,
    separator="\n",
    chunk_size=800,
    chunk_overlap=200,
)

In [18]:
# Split the extracted PDF text into chunks with the splitter configured above;
# these chunks are what gets added to the vector store later on.
texts = text_spliiter.split_text(raw_text)

In [None]:
# Preview only the first few chunks; displaying the whole list floods the cell
# output and bloats the saved notebook.
texts[:5]

In [None]:
# Add every text chunk to the vector store.
# NOTE(review): re-running this cell calls add_texts() again with the same
# chunks — presumably that stores them a second time; confirm before re-running.
astra_vector_store.add_texts(texts)

# The items are chunks of the PDF text, not headlines, so report them as such.
print("Inserted %i text chunks." % len(texts))

# Wrap the store so it can be queried together with an LLM in the loop below.
astra_vector_index = VectorStoreIndexWrapper(vectorstore=astra_vector_store)

Example question to try: "Involvement of independent research needs to be?"

In [None]:
# Interactive question-answering loop.
# Each round reads a question from the user, asks the index wrapper to answer
# it with the LLM, and then prints the four best-matching stored chunks with
# their similarity scores. Typing "quit" (any letter case) ends the loop;
# empty input is ignored and the user is prompted again.
first_question = True

while True:
    # Only the wording of the prompt differs between the first and later rounds.
    if first_question:
        prompt = "\n Enter your question (or type 'quit' to exit): "
    else:
        prompt = "\n What's your next question (or type 'quit' to exit): "
    query_text = input(prompt).strip()

    if query_text.lower() == "quit":
        break

    if query_text == "":
        continue

    # A real question was entered, so later rounds use the follow-up prompt.
    first_question = False

    print("\n QUESTION: \"%s\"" % query_text)
    answer = astra_vector_index.query(query_text, llm=llm).strip()
    print("ANSWER: \"%s\"\n" % answer)

    print("FIRST DOCUMENTS BY RELEVANCE:")
    for doc, score in astra_vector_store.similarity_search_with_score(query_text, k=4):
        # Show the score and only the first 84 characters of each chunk.
        print("     [%0.4f] \"%s ...\"" % (score, doc.page_content[:84]))