In [1]:
#Installing the necessary libraries
%pip install cassio datasets langchain openai tiktoken flask

Note: you may need to restart the kernel to use updated packages.


In [2]:
from langchain.vectorstores.cassandra import Cassandra
from langchain.indexes.vectorstore import VectorStoreIndexWrapper
from langchain.llms import OpenAI
from langchain.embeddings import OpenAIEmbeddings

from cassandra.cluster import Cluster
from cassandra.auth import PlainTextAuthProvider
from cassandra.query import SimpleStatement

from datasets import load_dataset

import os

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
#Defining the different tokens

ASTRA_DB_SECURE_BUNDLE_PATH = "D:\\Users\\Julien\\Documents\\developpement\\python\\embedding-knowledge\\secure-connect-vector-database.zip"
ASTRA_DB_APPLICATION_TOKEN = "AstraCS:QMnMDNSekBwcZCCHRmaeqZFI:b2e2fa8e9ca207fcb4e6c35cbbf0be3975c208eb8d20dbfc45259a61bbd25e36"
ASTRA_DB_CLIENT_ID = "QMnMDNSekBwcZCCHRmaeqZFI"
ASTRA_DC_CLIENT_SECRET = "_SPYpZuo2ZY5zaANNScG62eWgUBZBtoeX.+2tkAmuCo,k730O5nAWL1PxhE-GMCFY2APc,T.NBReKAl7sZ_XGl7YZOtRNRe+_PwvKAzjmDRgdA7I0mY0Qgd,GeABI8,_"
ASTRA_DB_KEYSPACE = "search"
OPENAI_API_KEY = os.getenv('OPENAI_KEY')

In [4]:
#Creating the Vector Store

cloud_config={
    'secure_connect_bundle': ASTRA_DB_SECURE_BUNDLE_PATH
}
auth_provider = PlainTextAuthProvider(ASTRA_DB_CLIENT_ID,ASTRA_DC_CLIENT_SECRET)
cluster = Cluster(cloud=cloud_config, auth_provider=auth_provider)
astraSession = cluster.connect()

llm = OpenAI(openai_api_key = OPENAI_API_KEY)
myEmbedding = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

MyCassandraVStore = Cassandra(
    embedding=myEmbedding,
    session=astraSession,
    keyspace=ASTRA_DB_KEYSPACE,
    table_name="embedding_demo",
)

In [5]:
#Deleting everything from astraDB

query = SimpleStatement("TRUNCATE {}.{}".format(ASTRA_DB_KEYSPACE, "embedding_demo"))
astraSession.execute(query)

<cassandra.cluster.ResultSet at 0x1612acd6b00>

In [6]:
#Loading the data

print("Loading data from huggingface")
mydataset = load_dataset("Biddls/Onion_news", split="train")
headlines = mydataset["text"][:50]

Loading data from huggingface


In [7]:
#Generating the Embedding in AstraDB

print("\nGenerating Embedding and storing in AstraDB")
MyCassandraVStore.add_texts(headlines)

print("Insert %i headlines.\n" % len(headlines))

vectorIndex = VectorStoreIndexWrapper(vectorstore=MyCassandraVStore)


Generating Embedding and storing in AstraDB
Insert 50 headlines.



In [8]:
#View the content of the Vector store

query = SimpleStatement("SELECT * FROM {}.{}".format(ASTRA_DB_KEYSPACE, "embedding_demo"))
rows = astraSession.execute(query)

for row in rows:
    print(row)

Row(row_id='662313f9c5f643448889bad0bed603d5', attributes_blob=None, body_blob='Everything Jair Bolsonaro Has Done While In Exile In Orlando #~# Far-right Brazilian president Jair Bolsonaro fled to the United States after leading an unsuccessful campaign to overturn the results of the presidential last election. Here is everything Bolsonaro has done while in exile in Orlando, FL.', metadata_s=None, vector=[-0.03847752884030342, -0.01399422436952591, 0.014995695091784, 0.000758514681365341, -0.0036764489486813545, 0.007570322137326002, -0.021360300481319427, -0.022177288308739662, 0.009665503166615963, 0.005656328517943621, 0.02867366559803486, -0.010970049537718296, -0.03404998034238815, 0.01547007542103529, 0.013012520968914032, -0.0051193563267588615, 0.027883032336831093, -0.026235876604914665, 0.005363135132938623, -0.016128938645124435, -0.01133901160210371, 0.008677210658788681, -0.020319297909736633, 0.025287115946412086, 0.017789268866181374, -0.02063555270433426, 0.00131772365

In [9]:
"""#Question loop on the embedded llm

while True:
    query_text = input("\nEnter a question (or type 'quit' to exit):")

    if query_text.lower() == "quit":
        break

    print("QUESTION: \"%s\"\n" % query_text)

    # Answer without embeddings
    answer_without_embeddings = llm.generate([query_text])
    print("ANSWER WITHOUT EMBEDDINGS: \"%s\"\n" % answer_without_embeddings)

    # Answer with embeddings
    answer_with_embeddings  = vectorIndex.query(query_text, llm=llm).strip()
    print("ANSWER WITH EMBEDDINGS: \"%s\"\n" % answer_with_embeddings)

    #Mots relevant documents
    print("DOCUMENTS BY RELEVANCE:")
    for doc, score in MyCassandraVStore.similarity_search_with_score(query_text, k=4):
        print("  %0.4f \"%s ...\"" % (score, doc.page_content[:60]))
        """

'#Question loop on the embedded llm\n\nwhile True:\n    query_text = input("\nEnter a question (or type \'quit\' to exit):")\n\n    if query_text.lower() == "quit":\n        break\n\n    print("QUESTION: "%s"\n" % query_text)\n\n    # Answer without embeddings\n    answer_without_embeddings = llm.generate([query_text])\n    print("ANSWER WITHOUT EMBEDDINGS: "%s"\n" % answer_without_embeddings)\n\n    # Answer with embeddings\n    answer_with_embeddings  = vectorIndex.query(query_text, llm=llm).strip()\n    print("ANSWER WITH EMBEDDINGS: "%s"\n" % answer_with_embeddings)\n\n    #Mots relevant documents\n    print("DOCUMENTS BY RELEVANCE:")\n    for doc, score in MyCassandraVStore.similarity_search_with_score(query_text, k=4):\n        print("  %0.4f "%s ..."" % (score, doc.page_content[:60]))\n        '

In [10]:
#expose a webservice instead

import threading
from flask import Flask, request, jsonify

app = Flask(__name__)

@app.route('/ask', methods=['POST'])
def ask():
    query_text = request.json['question']

    # Answer without embeddings
    answer_without_embeddings = llm.generate([query_text])
    answer_without_embeddings = answer_without_embeddings.generations[0][0].text.strip()

    # Answer with embeddings
    answer_with_embeddings  = vectorIndex.query(query_text, llm=llm).strip()

    # Documents by relevance
    documents_by_relevance = [
        {"score": score, "content": doc.page_content[:60]}
        for doc, score in MyCassandraVStore.similarity_search_with_score(query_text, k=4)
    ]

    return jsonify({
        "question": query_text,
        "answer_without_embeddings": answer_without_embeddings,
        "answer_with_embeddings": answer_with_embeddings,
        "documents_by_relevance": documents_by_relevance,
    })

def run_flask():
    app.run(host='192.168.0.10', port=5000)

threading.Thread(target=run_flask).start()

[33m * Tip: There are .env or .flaskenv files present. Do "pip install python-dotenv" to use them.[0m


 * Serving Flask app '__main__'


 * Debug mode: off


 * Running on http://192.168.0.10:5000
Press CTRL+C to quit
93.27.52.130 - - [04/Jan/2024 15:58:06] "POST /ask HTTP/1.1" 400 -
93.27.52.130 - - [04/Jan/2024 16:00:44] "POST /ask HTTP/1.1" 200 -
93.27.52.130 - - [04/Jan/2024 16:07:20] "POST / HTTP/1.1" 404 -
93.27.52.130 - - [04/Jan/2024 16:08:15] "POST /?"question"="do%20you%20have%20information%20about%20Ukraine%20and%20psychomarines%20?" HTTP/1.1" 404 -
93.27.52.130 - - [04/Jan/2024 16:08:47] "POST /?{\\"question\\":\\"do%20you%20have%20information%20about%20Ukraine%20and%20psychomarines%20?\\"} HTTP/1.1" 404 -
93.27.52.130 - - [04/Jan/2024 16:13:16] "POST /ask HTTP/1.1" 415 -
93.27.52.130 - - [04/Jan/2024 16:19:18] "POST /ask HTTP/1.1" 200 -
93.27.52.130 - - [04/Jan/2024 16:22:09] "POST /ask HTTP/1.1" 200 -
