In [1]:
import os

import cassio
from datasets import load_dataset
from langchain.embeddings import OpenAIEmbeddings  # Switch if using a different embedding model
from langchain.indexes.vectorstore import VectorStoreIndexWrapper
from langchain.llms import Groq
from langchain.vectorstores.cassandra import Cassandra
from PyPDF2 import PdfReader

# Credentials for Astra DB and Groq are read from the process environment so
# that no secret is stored in the notebook. A variable that is not set yields
# None.
# NOTE(review): any token or API key that was ever committed as a literal in
# this cell must be treated as leaked and should be revoked and reissued.
ASTRA_DB_APPLICATION_TOKEN = os.getenv("ASTRA_DB_APPLICATION_TOKEN")
ASTRA_DB_ID = os.getenv("ASTRA_DB_ID")
GROQ_API_KEY = os.getenv("GROQ_API_KEY")

# Read temp.pdf and concatenate the text of every page that yields any;
# pages whose extract_text() result is empty/falsy are skipped.
pdfreader = PdfReader('temp.pdf')
page_texts = (pdf_page.extract_text() for pdf_page in pdfreader.pages)
raw_text = ''.join(extracted for extracted in page_texts if extracted)

# Initialize connection to Astra DB
cassio.init(token=ASTRA_DB_APPLICATION_TOKEN, database_id=ASTRA_DB_ID)

# Create LangChain LLM and embeddings
# NOTE(review): the traceback recorded for this cell reports that `Groq` cannot
# be imported from `langchain.llms`, so this line is never reached as written.
# The import has to be replaced with a working Groq integration before this
# cell can run.
llm = Groq(api_key=GROQ_API_KEY)  # Groq LLM

# OpenAIEmbeddings authenticates against OpenAI, so it needs an OpenAI key;
# the Groq key is not a valid credential for it.
embedding = OpenAIEmbeddings(openai_api_key=os.getenv("OPENAI_API_KEY"))  # Change if switching embedding models

# Create LangChain vector store backed by Astra DB
# session/keyspace are passed as None -- presumably so the store falls back to
# the connection set up by cassio.init() above; TODO confirm against cassio docs.
astra_vector_store = Cassandra(
    embedding=embedding,
    table_name="qa_mini_demo",
    session=None,
    keyspace=None,
)

# Break the extracted text into chunks on newline boundaries
from langchain.text_splitter import CharacterTextSplitter

# Splitter settings gathered in one place: chunk length is measured with len()
splitter_options = {
    "separator": "\n",
    "chunk_size": 800,
    "chunk_overlap": 200,
    "length_function": len,
}
text_splitter = CharacterTextSplitter(**splitter_options)
texts = text_splitter.split_text(raw_text)

# Store only the leading 50 chunks in the vector store
chunks_to_index = texts[:50]
astra_vector_store.add_texts(chunks_to_index)
print(f"Inserted {len(chunks_to_index)} chunks of text.")

# Expose the vector store through LangChain's index wrapper for querying
astra_vector_index = VectorStoreIndexWrapper(vectorstore=astra_vector_store)

# Interactive QA loop: runs until the user types "quit" (case-insensitive).
# The prompt wording changes once the first non-empty question has been asked.
first_question = True
while True:
    prompt = (
        "\nEnter your question (or type 'quit' to exit): "
        if first_question
        else "\nWhat's your next question (or type 'quit' to exit): "
    )
    query_text = input(prompt).strip()

    if query_text.lower() == "quit":
        break
    if not query_text:
        # Blank input: ask again without touching the LLM or the flag
        continue

    first_question = False

    # Answer the question through the vector index using the configured LLM
    print(f"\nQUESTION: \"{query_text}\"")
    answer = astra_vector_index.query(query_text, llm=llm).strip()
    print(f"ANSWER: \"{answer}\"\n")

    # List the 4 best-matching chunks with their scores
    print("FIRST DOCUMENTS BY RELEVANCE:")
    ranked_hits = astra_vector_store.similarity_search_with_score(query_text, k=4)
    for doc, score in ranked_hits:
        print(f"    [{score:.4f}] \"{doc.page_content[:84]}...\"")


ImportError: cannot import name 'Groq' from 'langchain.llms' (c:\Users\harsh\Projects\Python\AIML\.conda\Lib\site-packages\langchain\llms\__init__.py)

In [None]:
import os
from langchain.vectorstores.cassandra import Cassandra
from langchain.indexes.vectorstore import VectorStoreIndexWrapper
from langchain.embeddings import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.text_splitter import CharacterTextSplitter
from dotenv import load_dotenv
from PyPDF2 import PdfReader
import cassio

# Bring the variables from the .env file into the environment
load_dotenv()

# Resolve each credential by name; a variable that is not set yields None
ASTRA_DB_APPLICATION_TOKEN, ASTRA_DB_ID, OPENAI_API_KEY = (
    os.environ.get(variable_name)
    for variable_name in (
        'ASTRA_DB_APPLICATION_TOKEN',
        'ASTRA_DB_ID',
        'OPENAI_API_KEY',
    )
)

# Connect cassio to the Astra DB instance identified by the credentials above
cassio.init(database_id=ASTRA_DB_ID, token=ASTRA_DB_APPLICATION_TOKEN)

# The LLM and the embedding model authenticate with the same OpenAI key
openai_credentials = {"openai_api_key": OPENAI_API_KEY}
llm = OpenAI(**openai_credentials)
embedding = OpenAIEmbeddings(**openai_credentials)

# Vector store on the "qa_mini_demo" table.
# session/keyspace are passed as None -- presumably so the store falls back to
# the connection set up by cassio.init() above; TODO confirm against cassio docs.
astra_vector_store = Cassandra(
    table_name="qa_mini_demo",
    embedding=embedding,
    keyspace=None,
    session=None,
)

# Collect the text of each page of temp.pdf, skipping pages that yield nothing,
# then concatenate the pieces in page order.
pdfreader = PdfReader('temp.pdf')
text_parts = []
for pdf_page in pdfreader.pages:
    page_text = pdf_page.extract_text()
    if page_text:
        text_parts.append(page_text)
raw_text = ''.join(text_parts)

# Chunk the raw text on newlines for indexing; chunk length is measured with len()
text_splitter = CharacterTextSplitter(separator="\n", chunk_size=800, chunk_overlap=200, length_function=len)
texts = text_splitter.split_text(raw_text)

# Only the first 50 chunks are written to the vector store
leading_chunks = texts[:50]
astra_vector_store.add_texts(leading_chunks)
inserted_count = len(leading_chunks)
print(f"Inserted {inserted_count} chunks of text.")

# Index wrapper used by the question-answering loop
astra_vector_index = VectorStoreIndexWrapper(vectorstore=astra_vector_store)

# Question answering loop: keeps prompting until the user types "quit"
# (case-insensitive). The prompt text is the same on every iteration, so no
# "first question" flag is tracked.
while True:
    query_text = input("\nEnter your question (or type 'quit' to exit): ").strip()

    if query_text.lower() == "quit":
        break

    # Blank input: prompt again without calling the LLM
    if not query_text:
        continue

    # Answer the question through the vector index using the configured LLM
    print(f"\nQUESTION: \"{query_text}\"")
    answer = astra_vector_index.query(query_text, llm=llm).strip()
    print(f"ANSWER: \"{answer}\"\n")

    # Show the 4 best-matching chunks with their scores (first 84 characters each)
    print("FIRST DOCUMENTS BY RELEVANCE:")
    for doc, score in astra_vector_store.similarity_search_with_score(query_text, k=4):
        print(f"    [{score:.4f}] \"{doc.page_content[:84]}...\"")
