In [1]:
from dotenv import load_dotenv

load_dotenv('../../../.env')

True

# Download Text

In [2]:
!mkdir -p 'data/'
!curl 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/examples/data/paul_graham/paul_graham_essay.txt' -o 'data/paul_graham_essay.txt'

The syntax of the command is incorrect.
curl: (3) URL using bad/illegal format or missing URL


# Create Llama-index nodes/chunks

In [3]:
from llama_index.core.node_parser import SimpleNodeParser
from llama_index.core import SimpleDirectoryReader

documents = SimpleDirectoryReader("data/").load_data()
node_parser = SimpleNodeParser.from_defaults(chunk_size=512)
nodes = node_parser.get_nodes_from_documents(documents)

# By default, the node/chunks ids are set to random uuids. To ensure same id's per run, we manually set them.
for idx, node in enumerate(nodes):
    node.id_ = f"node_{idx}"

print(f"Number of Documents: {len(documents)}")
print(f"Number of nodes: {len(nodes)} with the current chunk size of {node_parser.chunk_size}")

Number of Documents: 1
Number of nodes: 57 with the current chunk size of 512


# Create local Deep Lake Vector store

In [4]:
from llama_index.core import VectorStoreIndex, ServiceContext, StorageContext
from llama_index.vector_stores.deeplake import DeepLakeVectorStore
from llama_index.llms.huggingface import HuggingFaceInferenceAPI
from llama_index.core import set_global_tokenizer
from transformers import AutoTokenizer

llm = HuggingFaceInferenceAPI(
    model_name='mistralai/Mistral-7B-Instruct-v0.2',
    generate_kwargs={'temperature':0.5,"max_length": 64,"max_new_tokens":512}
)

set_global_tokenizer(
    AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-alpha").encode
)




In [4]:

# Create a DeepLakeVectorStore locally to store the vectors
dataset_path = "data/deep_lake_db"
vector_store = DeepLakeVectorStore(dataset_path=dataset_path, overwrite=True)

service_context = ServiceContext.from_defaults(embed_model="local", llm=llm,)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

vector_index = VectorStoreIndex(nodes, service_context=service_context, storage_context=storage_context, show_progress=True)

  service_context = ServiceContext.from_defaults(embed_model="local", llm=llm,)


Generating embeddings:   0%|          | 0/57 [00:00<?, ?it/s]

Uploading data to deeplake dataset.


100%|██████████| 57/57 [00:00<00:00, 383.65it/s]


Dataset(path='data/deep_lake_db', tensors=['text', 'metadata', 'embedding', 'id'])

  tensor      htype      shape     dtype  compression
  -------    -------    -------   -------  ------- 
   text       text      (57, 1)     str     None   
 metadata     json      (57, 1)     str     None   
 embedding  embedding  (57, 384)  float32   None   
    id        text      (57, 1)     str     None   




# Upload local Vector Database to ActiveLoop's Platform and convert to managed database

In [5]:
import deeplake
local = "./data/deep_lake_db"
hub_path = "hub://thapabibek1129/optimization_paul_graham"
hub_managed_path = "hub://thapabibek1129/optimization_paul_graham_managed"

# First upload our local vector store
deeplake.deepcopy(local, hub_path, overwrite=True)
# Create a managed vector store under a different name
deeplake.deepcopy(hub_path, hub_managed_path, overwrite=True, runtime={"tensor_db": True})

Copying dataset: 96%|█████████▋| 27/28 [00:19<00:00


This dataset can be visualized in Jupyter Notebook by ds.visualize() or at https://app.activeloop.ai/thapabibek1129/optimization_paul_graham
Your Deep Lake dataset has been successfully created!


Copying dataset: 96%|█████████▋| 27/28 [01:06<00:02


This dataset can be visualized in Jupyter Notebook by ds.visualize() or at https://app.activeloop.ai/thapabibek1129/optimization_paul_graham_managed
Your Deep Lake dataset has been successfully created!


Dataset(path='hub://thapabibek1129/optimization_paul_graham_managed', tensors=['embedding', 'id', 'metadata', 'text'])

# Instantiate a Vector Store

In [6]:
db = DeepLakeVectorStore(dataset_path=hub_managed_path, overwrite=False, read_only=True,)

Deep Lake Dataset in hub://thapabibek1129/optimization_paul_graham_managed already exists, loading from the storage


In [9]:
# Fetch dataset docs and ids 
docs = db.vectorstore.dataset.text.data(fetch_chunks=True, aslist=True)['value']
ids = db.vectorstore.dataset.id.data(fetch_chunks=True, aslist=True)['value']
print(len(docs))

AttributeError: 'DeepLakeVectorStore' object has no attribute 'vectorstore'

# Fetch Dataset docs and ids

In [7]:
ds = deeplake.load('hub://thapabibek1129/optimization_paul_graham_managed')

|

This dataset can be visualized in Jupyter Notebook by ds.visualize() or at https://app.activeloop.ai/thapabibek1129/optimization_paul_graham_managed



|

hub://thapabibek1129/optimization_paul_graham_managed loaded successfully.



 

In [8]:
from langchain.vectorstores import DeepLake
vector_store = DeepLake(dataset_path='hub://thapabibek1129/optimization_paul_graham_managed', read_only=True, verbose=False)

Deep Lake Dataset in hub://thapabibek1129/optimization_paul_graham_managed already exists, loading from the storage


In [None]:
docs = vector_store.vectorstore.text.data()['value']
ids = vector_store.vectorstore.id.data()['value']

AttributeError: 'VectorStore' object has no attribute 'text'

In [None]:
vector_store.vectorstore.__dict__

{'dataset_handler': <deeplake.core.vectorstore.dataset_handlers.client_side_dataset_handler.ClientSideDH at 0x27b0b7ca9d0>,
 'deep_memory': <deeplake.core.vectorstore.deep_memory.deep_memory.DeepMemory at 0x27b0b2a17c0>}

# Generating Synthetic training dataset

In [None]:
from huggingface_hub import H
client = HfApi()
def generate_question(text):
    try:
        response = client.chat.completions.create(
            model_name='mistralai/Mistral-7B-Instruct-v0.2',
            messages=[
                {"role": "system", "content": "You are a world class expert for generating questions based on provided context. \
                        You make sure the question can be answered by the text."},
                {
                    "role": "user",
                    "content": text,
                },
            ],
        )
        return response.choices[0].message.content
    except Exception as e:
        print(e)
        question_string = "No question generated"
        return question_string

In [None]:
generate_question("hi")

'HfApi' object has no attribute 'chat'


'No question generated'

In [None]:
import random
from tqdm import tqdm

def generate_queries(docs: list[str], ids: list[str], n: int):

    questions = []
    relevances = []
    pbar = tqdm(total=n)
    while len(questions) < n:
        # 1. randomly draw a piece of text and relevance id
        r = random.randint(0, len(docs)-1)
        text, label = docs[r], ids[r]

        # 2. generate queries and assign and relevance id
        generated_qs = [generate_question(text)]
        if generated_qs == ["No question generated"]:
            print("No question generated")
            continue

        questions.extend(generated_qs)
        relevances.extend([[(label, 1)] for _ in generated_qs])
        pbar.update(len(generated_qs))

    return questions[:n], relevances[:n]

DeepLakeVectorStore(stores_text=True, is_embedding_query=True, flat_metadata=True, ingestion_batch_size=1024, num_workers=4, token=None, read_only=True, dataset_path='hub://thapabibek1129/optimization_paul_graham_managed')

# Launch Query Generation Process

In [None]:
questions, relevances = generate_queries(docs, ids, n=40)
print(len(questions)) #40
print(questions[0])

# Launch Deep Memory Training

In [None]:
from langchain.embeddings.openai import OpenAIEmbeddings
openai_embeddings = OpenAIEmbeddings()

job_id = db.vectorstore.deep_memory.train(
    queries=questions,
    relevance=relevances,
    embedding_function=openai_embeddings.embed_documents,
)

# Start Deep Memory Training Job

In [None]:
# During training you can check the status of the training run
db.vectorstore.deep_memory.status(job_id="657b3083d528b0fd224173c6")

# Run Deep Memory-enabled inference by setting deep_memory=True.

In [None]:
from llama_index.llms import OpenAI
query = "What are the main things Paul worked on before college?"

llm = OpenAI(model="gpt-3.5-turbo-1106")
embed_model = OpenAIEmbedding()

service_context = ServiceContext.from_defaults(embed_model=embed_model, llm=llm,)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

db = DeepLakeVectorStore(dataset_path=hub_managed_path, overwrite=False, read_only=True,)
vector_index = VectorStoreIndex.from_vector_store(db, service_context=service_context, storage_context=storage_context, show_progress=True)

query_engine = vector_index.as_query_engine(similarity_top_k=3, vector_store_kwargs={"deep_memory": True})
response_vector = query_engine.query(query)
print(response_vector.response)

# Now, let's run a quantitative evaluation on another set of synthetically generated test queries.

In [None]:
# Generate validation queries
validation_questions, validation_relevances = generate_queries(docs, ids, n=40)

# Launch the evaluation function
recalls = db.vectorstore.deep_memory.evaluate(
    queries=validation_questions,
    relevance=validation_relevances,
    embedding_function=openai_embeddings.embed_documents,
)