# Question Answering

##### run "pip install -r requirements.txt"

# Setup

In [2]:
from langchain.vectorstores.cassandra import Cassandra
from langchain.indexes.vectorstore import VectorStoreIndexWrapper
from langchain.llms import OpenAI
from langchain.embeddings import OpenAIEmbeddings

from datasets import load_dataset  # support for dataset retrieval with Hugging Face
import os
import cassio  # to integrate with Astra DB
from PyPDF2 import PdfReader

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
ASTRA_DB_APPLICATION_TOKEN = os.environ["ASTRA_DB_APPLICATION_TOKEN"]
ASTRA_DB_ID = os.environ["ASTRA_DB_ID"]

In [4]:
pdfreader = PdfReader("./data.pdf")

In [5]:
from typing_extensions import Concatenate

# read text from pdf
raw_text = ""
for i, page in enumerate(pdfreader.pages):
    content = page.extract_text()
    if content:
        raw_text += content

raw_text

' \nData science is an interdisciplinary academic ﬁeld[1] that uses statistics, scientiﬁc computing, \nscientiﬁc methods, processing, scientiﬁc visualization, algorithms and systems to extract or \nextrapolate knowledge and insights from potentially noisy, structured, or unstructured data.[2]  \n \nData science also integrates domain knowledge from the underlying application domain (e.g., \nnatural sciences, information technology, and medicine).[3]  \nData science is multifaceted and can be described as a science, a research paradigm, a research \nmethod, a discipline, a workﬂow, and a profession.[4]  \n \nData science is "a concept to unify statistics, data analysis, informatics, and their related methods" \nto "understand and analyze actual phenomena"  \nwith data.[5] It uses techniques and theories drawn from many ﬁelds within the context of \nmathematics, statistics, computer science, information science,  \nand domain knowledge.[6] However, data science is different from computer

In [6]:
cassio.init(token=ASTRA_DB_APPLICATION_TOKEN, database_id=ASTRA_DB_ID)

### Create the langchain embedding and LLM objects : - these embeddings are needed for the model to understand the text

# Testing embedding

In [7]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings
import os

GOOGLE_API_KEY = os.environ["GOOGLE_API_KEY"]

embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
vector = embeddings.embed_query("hello, world!")
vector[:5]

[0.05168594419956207,
 -0.030764883384108543,
 -0.03062233328819275,
 -0.02802734263241291,
 0.01813093200325966]

In [8]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_google_genai import GoogleGenerativeAI

import os

llm = GoogleGenerativeAI(model="gemini-1.5-flash", google_api_key=GOOGLE_API_KEY)
embedding = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

# print(
#     llm.invoke(
#         "What are some of the pros and cons of Python as a programming language?"
#     )
# )
len(embedding.embed_query("hello, world!")), embedding.embed_query("hello, world!")

(768,
 [0.05168594419956207,
  -0.030764883384108543,
  -0.03062233328819275,
  -0.02802734263241291,
  0.01813093200325966,
  -0.0018945280462503433,
  0.028477225452661514,
  -0.007562300190329552,
  0.011064725928008556,
  -0.005353901535272598,
  0.03884560614824295,
  0.05086332932114601,
  -0.008304029703140259,
  -0.06084785610437393,
  -0.006586700677871704,
  -0.022183919325470924,
  0.013262641616165638,
  -0.008496989496052265,
  -0.0006867735064588487,
  -0.003296870505437255,
  -0.0034330044873058796,
  0.00962158851325512,
  -0.02791912667453289,
  -0.030381329357624054,
  0.021061237901449203,
  0.011688419617712498,
  -0.0009537383448332548,
  -0.07332165539264679,
  0.012589256279170513,
  0.05902666226029396,
  -0.03533710539340973,
  0.017094852402806282,
  -0.05426131561398506,
  0.006611739285290241,
  0.03894328698515892,
  -0.05709062144160271,
  0.03994591161608696,
  0.009693781845271587,
  -0.001833937130868435,
  -4.905453897663392e-05,
  0.024423865601420403

### Langchain vector store: - to store the document

In [9]:
astra_vector_store = Cassandra(
    embedding=embedding,
    table_name="qa_mini_demo",
    session=None,
    keyspace=None,
)

In [10]:
from langchain.text_splitter import CharacterTextSplitter

text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=800,
    chunk_overlap=10,
    length_function=len,
)

texts = text_splitter.split_text(raw_text)
texts[:5]

['Data science is an interdisciplinary academic ﬁeld[1] that uses statistics, scientiﬁc computing, \nscientiﬁc methods, processing, scientiﬁc visualization, algorithms and systems to extract or \nextrapolate knowledge and insights from potentially noisy, structured, or unstructured data.[2]  \n \nData science also integrates domain knowledge from the underlying application domain (e.g., \nnatural sciences, information technology, and medicine).[3]  \nData science is multifaceted and can be described as a science, a research paradigm, a research \nmethod, a discipline, a workﬂow, and a profession.[4]  \n \nData science is "a concept to unify statistics, data analysis, informatics, and their related methods" \nto "understand and analyze actual phenomena"',
 'with data.[5] It uses techniques and theories drawn from many ﬁelds within the context of \nmathematics, statistics, computer science, information science,  \nand domain knowledge.[6] However, data science is different from computer 

In [12]:
astra_vector_store.add_texts(texts)
print("Inserted %i headlines." % len(texts))
astra_vector_index = VectorStoreIndexWrapper(vectorstore=astra_vector_store)

Inserted 5 headlines.


# Run the QA cycle

In [13]:
first_question = True

while True:
    if first_question:
        query_text = input("\nEnter your question (or type 'quit' to exit): ").strip()
    else:
        query_text = input(
            "\nWhat is your next question? (or type 'quit' to exit): "
        ).strip()

    if query_text.lower() == "quit":
        break

    if query_text == "":
        continue

    first_question = False

    print('\nQUESTION: "%s"' % query_text)
    answer = astra_vector_index.query(query_text, llm=llm).strip()
    print('ANSWER: \%s ..."\n' % answer)

    print("FIRST DOCUMENT BY RELEVANCE: ")
    for doc, score in astra_vector_store.similarity_search_with_score(query_text, k=4):
        print('     [%0.4f] "%s .."' % (score, doc.page_content[:84]))


QUESTION: "what is data science"
ANSWER: \Data science is an interdisciplinary field focused on extracting knowledge from large datasets and applying that knowledge to solve problems in various application domains.  It involves preparing data for analysis, formulating problems, analyzing data, developing data-driven solutions, and presenting findings to inform high-level decisions.  The field incorporates skills from computer science, statistics, information science, mathematics, and various visualization techniques. ..."

FIRST DOCUMENT BY RELEVANCE: 
     [0.8548] "for a lot less money and in a lot less time. Smaller firms now have simpler
access t .."
     [0.8548] "for a lot less money and in a lot less time. Smaller firms now have simpler
access t .."
     [0.8543] "Data science is an interdisciplinary ﬁeld[10] focused on extracting knowledge from t .."
     [0.8543] "Data science is an interdisciplinary ﬁeld[10] focused on extracting knowledge from t .."


# Experimental

![image.png](attachment:image.png)
![image-2.png](attachment:image-2.png)