## This is a sample streamlit app

In [5]:
%pip install -q cassio datasets langchain langchain_community openai tiktoken

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m42.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.2/45.2 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.9/50.9 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [28]:
!pip install PyPDF2



In [27]:
from PyPDF2 import PdfReader

In [18]:
# !wget https://redis.io/wp-content/uploads/2021/12/caching-at-scale-with-redis-updated-2021-12-04.pdf

In [19]:
# Langchain components to use
from langchain.vectorstores.cassandra import Cassandra
from langchain.indexes.vectorstore import VectorStoreIndexWrapper
from langchain.llms import OpenAI
from langchain.embeddings import OpenAIEmbeddings


# Support dataset retrieval from huggingface
from datasets import load_dataset

# Initialize DB connection to langchain
import cassio

In [None]:

from google.colab import userdata
ASTRA_DB_APPLICATION_TOKEN = userdata.get('ASTRA_DB_APPLICATION_TOKEN')
ASTRA_DB_ID = userdata.get('ASTRA_DB_ID')
OPENAI_API_KEY = userdata.get('OPENAI_API_KEY')



In [48]:
pdfreader = PdfReader('/content/redis.pdf')

In [49]:
from typing_extensions import Concatenate
# read text from pdf
raw_text = ''
for i, page in enumerate(pdfreader.pages):
    content = page.extract_text()
    if content:
        raw_text += content

In [None]:
raw_text

In [51]:
llm = OpenAI(openai_api_key=OPENAI_API_KEY)
embedding = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

In [52]:
cassio.init(token=ASTRA_DB_APPLICATION_TOKEN, database_id=ASTRA_DB_ID)

In [53]:
astra_vector_store = Cassandra(
    embedding=embedding,
    table_name="qa_mini_demo",
    session=None,
    keyspace=None,
)

In [55]:
from langchain.text_splitter import CharacterTextSplitter
# We need to split the text using Character Text Split such that it sshould not increse token size
text_splitter = CharacterTextSplitter(
    separator = "\n",
    chunk_size = 800,
    chunk_overlap  = 200,
    length_function = len,
)
texts = text_splitter.split_text(raw_text)

In [56]:
# texts[:50]

In [57]:
astra_vector_store.add_texts(texts)

print("Inserted %i headlines." % len(texts))

astra_vector_index = VectorStoreIndexWrapper(vectorstore=astra_vector_store)

Inserted 197 headlines.


In [58]:
first_question = True
while True:
    if first_question:
        query_text = input("\nEnter your question (or type 'quit' to exit): ").strip()
    else:
        query_text = input("\nWhat's your next question (or type 'quit' to exit): ").strip()

    if query_text.lower() == "quit":
        break

    if query_text == "":
        continue

    first_question = False

    print("\nQUESTION: \"%s\"" % query_text)
    answer = astra_vector_index.query(query_text, llm=llm).strip()
    print("ANSWER: \"%s\"\n" % answer)

    print("FIRST DOCUMENTS BY RELEVANCE:")
    for doc, score in astra_vector_store.similarity_search_with_score(query_text, k=4):
        print("    [%0.4f] \"%s ...\"" % (score, doc.page_content[:84]))


Enter your question (or type 'quit' to exit): what is redis

QUESTION: "what is redis"




ANSWER: "Redis is an in-memory database and Database-as-a-Service (DBaaS) that is consistently ranked as a leader in top analyst reports on NoSQL, operational databases, and Database-as-a-Service (DBaaS). It is trusted by thousands of enterprise customers and is used for high-speed transactions, job and queue management, user session stores, and real-time data ingest. It is available in public and private clouds, as downloadable software, in containers, and for hybrid cloud/on-premises deployments."

FIRST DOCUMENTS BY RELEVANCE:




    [0.9109] "Redis, consistently ranked as a leader in top analyst reports on NoSQL, in-
memory d ..."
    [0.9109] "Redis, consistently ranked as a leader in top analyst reports on NoSQL, in-
memory d ..."
    [0.8995] "means it is fast. Redis is often used as a cache frontend for some other, slower but ..."
    [0.8993] "means it is fast. Redis is often used as a cache frontend for some other, slower but ..."

What's your next question (or type 'quit' to exit): quit
