In [1]:
%%writefile requirements.txt
llama-index
llama-index-llms-huggingface
llama-index-embeddings-huggingface
chromadb
llama-index-vector-stores-chroma
llama-index-llms-groq
einops
accelerate
sentence-transformers
llama-index-llms-mistralai
llama-index-embeddings-mistralai
chainlit


Writing requirements.txt


In [2]:
!pip install -r requirements.txt

Collecting llama-index (from -r requirements.txt (line 1))
  Downloading llama_index-0.10.53-py3-none-any.whl (6.8 kB)
Collecting llama-index-llms-huggingface (from -r requirements.txt (line 2))
  Downloading llama_index_llms_huggingface-0.2.4-py3-none-any.whl (11 kB)
Collecting llama-index-embeddings-huggingface (from -r requirements.txt (line 3))
  Downloading llama_index_embeddings_huggingface-0.2.2-py3-none-any.whl (7.2 kB)
Collecting chromadb (from -r requirements.txt (line 4))
  Downloading chromadb-0.5.3-py3-none-any.whl (559 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m559.5/559.5 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting llama-index-vector-stores-chroma (from -r requirements.txt (line 5))
  Downloading llama_index_vector_stores_chroma-0.1.10-py3-none-any.whl (5.0 kB)
Collecting llama-index-llms-groq (from -r requirements.txt (line 6))
  Downloading llama_index_llms_groq-0.1.4-py3-none-any.whl (2.9 kB)
Collecting einops (from -r requ

In [3]:
!pip install --upgrade huggingface_hub



In [5]:
!pip install numpy==1.23.5

Collecting numpy==1.23.5
  Downloading numpy-1.23.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.1/17.1 MB[0m [31m24.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.26.4
    Uninstalling numpy-1.26.4:
      Successfully uninstalled numpy-1.26.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
chainlit 1.1.306 requires numpy<2.0,>=1.26; python_version >= "3.9", but you have numpy 1.23.5 which is incompatible.
chex 0.1.86 requires numpy>=1.24.1, but you have numpy 1.23.5 which is incompatible.
pandas-stubs 2.0.3.230814 requires numpy>=1.25.0; python_version >= "3.9", but you have numpy 1.23.5 which is incompatible.[0m[31m
[0mSuccessfully installed numpy-1.23.5


In [1]:
from llama_index.core import SimpleDirectoryReader,VectorStoreIndex,SummaryIndex
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.tools import FunctionTool,QueryEngineTool
from llama_index.core.vector_stores import MetadataFilters,FilterCondition
from typing import List,Optional

#Define embedding model and LLM


In [9]:
import json
#from llama_index.embeddings.huggingface import HuggingFaceEmbedding
#from llama_index.core import Settings
from llama_index.llms.mistralai import MistralAI
from llama_index.embeddings.mistralai import MistralAIEmbedding
from transformers import AutoModel, AutoTokenizer
import torch

class HuggingFaceEmbedding:
    def __init__(self, model_name: str):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)

    def get_text_embedding(self, text: str) -> torch.Tensor:
        inputs = self.tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(self.device)
        with torch.no_grad():
            outputs = self.model(**inputs)
        return outputs.last_hidden_state.mean(dim=1).squeeze()

    def get_embeddings(self, texts: list[str], batch_size: int = 16) -> list[torch.Tensor]:
        embeddings = []
        for i in range(0, len(texts), batch_size):
            batch_texts = texts[i:i + batch_size]
            inputs = self.tokenizer(batch_texts, return_tensors="pt", truncation=True, padding=True, max_length=512).to(self.device)
            with torch.no_grad():
                outputs = self.model(**inputs)
            batch_embeddings = outputs.last_hidden_state.mean(dim=1)
            embeddings.extend(batch_embeddings.cpu())
        return embeddings

embed_model =  HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

llm = MistralAI(model = 'mistral-large-latest', api_key = "H1rVHKqTjonaHi4vOZ8jozgwtrhRiSjZ")

#Read file

In [5]:
documents = SimpleDirectoryReader(input_files=['Harry Potter The Complete Collection.pdf']).load_data(num_workers=4)




#Chunk the document

In [6]:
splitter = SentenceSplitter(chunk_size=1024,chunk_overlap=100)
nodes = splitter.get_nodes_from_documents(documents)
print(f"Length of nodes : {len(nodes)}")
print(f"get the content for node 0 :{nodes[0].get_content(metadata_mode='all')}")

Length of nodes : 3623
get the content for node 0 :page_label: 1
file_name: Harry Potter The Complete Collection.pdf
file_path: Harry Potter The Complete Collection.pdf
file_type: application/pdf
file_size: 17338552
creation_date: 2024-07-09
last_modified_date: 2024-07-09


#Define relevant contexts


In [10]:
from llama_index.core import ServiceContext, StorageContext

service_context = ServiceContext.from_defaults(llm = llm,
                                               embed_model = embed_model)
                                               #callback_manager = CallbackManager([cl.LlamaIndexCallbackHandler()])


  service_context = ServiceContext.from_defaults(llm = llm,


#Alternative 1: Black-box retriever

##Create Vector Store Index

In [11]:
vector_index = VectorStoreIndex.from_documents(documents,
                                               #storage_context=storage_context
                                               service_context=service_context,
                                               node_parser=nodes,
                                               show_progress=True)

vector_index.storage_context.persist(persist_dir="./storage")

Parsing nodes:   0%|          | 0/3623 [00:00<?, ?it/s]

AttributeError: 'HuggingFaceEmbedding' object has no attribute 'get_text_embedding_batch'

##Create query engine

In [None]:
query_engine = vector_index.as_query_engine(similarity_top_k = 3,
                                            service_context=service_context)

response = query_engine.query("Who is the main character?")
print(str(response))

The main character is a boy named Harry Potter, although he is not directly mentioned in the provided context. The context refers to the Dursleys, who have a secret related to the Potters, and it is mentioned that the Potters have a small son. The file path "Harry Potter The Complete Collection.pdf" also suggests that the main character is Harry Potter.


##Deploy the engine on Chainlit

In [31]:
%%writefile app.py
from llama_index.core.callbacks.base import CallbackManager
from llama_index.core import load_index_from_storage, StorageContext, ServiceContext
from llama_index.llms.mistralai import MistralAI
from llama_index.embeddings.mistralai import MistralAIEmbedding
import chainlit as cl

@cl.on_chat_start
async def start_chat():
  embed_model = MistralAIEmbedding(model_name="mistral-embed", api_key="H1rVHKqTjonaHi4vOZ8jozgwtrhRiSjZ")

  llm = MistralAI(model = 'mistral-large-latest', api_key = "H1rVHKqTjonaHi4vOZ8jozgwtrhRiSjZ")
  storage_context = StorageContext.from_defaults(persist_dir="./storage")

  service_context = ServiceContext.from_defaults(llm = llm,
                                                 embed_model = embed_model,
                                                 callback_manager = CallbackManager([cl.LlamaIndexCallbackHandler()]))

  index = load_index_from_storage(
      service_context = service_context,
      storage_context = storage_context,
  )

  query_engine = index.as_query_engine(similarity_top_k = 3,
                                       service_context = service_context)

  cl.user_session.set("chatbot", query_engine)

  await cl.Message(
      author="Assistant", content="Hello! What do you want to know about Harry Potter?"
  ).send()

@cl.on_message
async def main(message: cl.Message):
  query_engine = cl.user_session.get("chatbot")

  response = await cl.make_async(query_engine.query)(message.content)

  response_message = cl.Message(content="", author="Assistant")

  for token in response.response:
    await response_message.stream_token(token=token)

  await response_message.send()

Writing app.py


In [32]:
!fuser -n tcp -k 8000


from google.colab.output import eval_js
print(eval_js("google.colab.kernel.proxyPort(8000)"))

https://z9ohyrwfn0d-496ff2e9c6d22116-8000-colab.googleusercontent.com/


In [33]:
!chainlit run app.py

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/pydantic/_internal/_generate_schema.py", line 1134, in _common_field_schema
    schema = self._apply_annotations(
  File "/usr/local/lib/python3.10/dist-packages/pydantic/_internal/_generate_schema.py", line 1890, in _apply_annotations
    schema = get_inner_schema(source_type)
  File "/usr/local/lib/python3.10/dist-packages/pydantic/_internal/_schema_generation_shared.py", line 83, in __call__
    schema = self._handler(source_type)
  File "/usr/local/lib/python3.10/dist-packages/pydantic/_internal/_generate_schema.py", line 1871, in inner_handler
    schema = self._generate_schema_inner(obj)
  File "/usr/local/lib/python3.10/dist-packages/pydantic/_internal/_generate_schema.py", line 789, in _generate_schema_inner
    return self.match_type(obj)
  File "/usr/local/lib/python3.10/dist-packages/pydantic/_internal/_generate_schema.py", line 871, in match_type
    return self._match_generic_type(obj, origi

#Alternative 2: Custom Retriever


In [3]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

##Define top-k embedding retriever

In [12]:
def top_k_embed_retriever(query: str, k: int, embed_model, manual_embed_np, texts):
    query_embed = embed_model.get_text_embedding(query)
    query_embed_np = query_embed.cpu().numpy().reshape(1, -1)

    # Calculate cosine similarity with all the other embeddings
    similarities = cosine_similarity(query_embed_np, manual_embed_np).flatten()

    # Find the top k indices
    top_k_indices = np.argpartition(similarities, -k)[-k:]
    top_k_indices = top_k_indices[np.argsort(similarities[top_k_indices])[::-1]]

    # Retrieve the top k texts
    top_k_texts = [texts[i] for i in top_k_indices]

    return top_k_texts

##Create embeddings for each node

In [14]:
!pip install tqdm



In [15]:
from tqdm import tqdm

node_embeddings = []
node_texts = [node.get_text() for node in nodes]
for text in tqdm(node_texts):
    embed = embed_model.get_text_embedding(text)
    node_embeddings.append(embed.cpu().numpy())

node_embeddings_np = np.array(node_embeddings)


100%|██████████| 3623/3623 [30:23<00:00,  1.99it/s]


##Save embeddings for later use

In [16]:
np.save('node_embeddings.npy', node_embeddings_np)
with open('node_texts.json', 'w') as f:
    json.dump(node_texts, f)

##Find the relative passages to a query with the retriever



In [42]:
def custom_query_engine(query, k=3):
    results = top_k_embed_retriever(query, k, embed_model, node_embeddings_np, node_texts)
    return results

In [35]:
query = "Who is the main character?"
retrieved = custom_query_engine(query)

combined_text = ". ".join(retrieved)

##Define Prompt Format

In [36]:
prompt = f"""
Context information is below.
---------------------
{combined_text}
---------------------
Given the context information and not prior knowledge, answer the query.
Query: {query}
Answer:
"""


##Deploy Mistral Client

In [37]:
from mistralai.models.chat_completion import ChatMessage
from mistralai.client import MistralClient

model = 'mistral-large-latest'
client = MistralClient(api_key="H1rVHKqTjonaHi4vOZ8jozgwtrhRiSjZ")

def generate_response(prompt):
  messages = [
      ChatMessage(role="user", content=prompt)
      ]
  chat_response = client.chat(
      model=model,
      messages=messages
  )

  return (chat_response.choices[0].message.content)

print(generate_response(prompt))


The main character in the provided text appears to be Harry. He is the one asking questions, interacting with various characters, and expressing his thoughts and feelings. The text also mentions other characters such as Hermione, Bathilda, Malfoy, Snape, Cedric, Cho, Fleur, and Krum, but Harry seems to be the central figure.


##Deploy model on chainlit using mistral client

In [49]:
%%writefile app_custom.py
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from mistralai.models.chat_completion import ChatMessage
from mistralai.client import MistralClient
import chainlit as cl
import json
#from llama_index.embeddings.huggingface import HuggingFaceEmbedding
#from llama_index.core import Settings
from llama_index.llms.mistralai import MistralAI
from llama_index.embeddings.mistralai import MistralAIEmbedding
from transformers import AutoModel, AutoTokenizer
import torch

class HuggingFaceEmbedding:
    def __init__(self, model_name: str):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)

    def get_text_embedding(self, text: str) -> torch.Tensor:
        inputs = self.tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(self.device)
        with torch.no_grad():
            outputs = self.model(**inputs)
        return outputs.last_hidden_state.mean(dim=1).squeeze()

    def get_embeddings(self, texts: list[str], batch_size: int = 16) -> list[torch.Tensor]:
        embeddings = []
        for i in range(0, len(texts), batch_size):
            batch_texts = texts[i:i + batch_size]
            inputs = self.tokenizer(batch_texts, return_tensors="pt", truncation=True, padding=True, max_length=512).to(self.device)
            with torch.no_grad():
                outputs = self.model(**inputs)
            batch_embeddings = outputs.last_hidden_state.mean(dim=1)
            embeddings.extend(batch_embeddings.cpu())
        return embeddings

embed_model =  HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

model = 'mistral-large-latest'
client = MistralClient(api_key="H1rVHKqTjonaHi4vOZ8jozgwtrhRiSjZ")

def top_k_embed_retriever(query: str, k: int, embed_model, manual_embed_np, texts):
    query_embed = embed_model.get_text_embedding(query)
    query_embed_np = query_embed.cpu().numpy().reshape(1, -1)

    # Calculate cosine similarity with all the other embeddings
    similarities = cosine_similarity(query_embed_np, manual_embed_np).flatten()

    # Find the top k indices
    top_k_indices = np.argpartition(similarities, -k)[-k:]
    top_k_indices = top_k_indices[np.argsort(similarities[top_k_indices])[::-1]]

    # Retrieve the top k texts
    top_k_texts = [texts[i] for i in top_k_indices]

    return top_k_texts

def custom_query_engine(query, k=3):
    node_embeddings_np = np.load('node_embeddings.npy')
    with open('node_texts.json', 'r') as f:
        node_texts = json.load(f)
    results = top_k_embed_retriever(query, k, embed_model, node_embeddings_np, node_texts)
    return results

def find_text(query):
  retrieved = custom_query_engine(query)

  combined_text = ". ".join(retrieved)

  return combined_text

def create_prompt(combined_text, query):
  prompt = f"""
    Context information is below.
    ---------------------
    {combined_text}
    ---------------------
    Given the context information and not prior knowledge, answer the query.
    Query: {query}
    Answer:
    """
  return prompt

@cl.on_chat_start
async def start_chat():
  welcome_message = cl.Message(content="Starting the chatbot...")
  await welcome_message.send()
  welcome_message.content = (
      "Hi, what do you want to know about Harry Potter?"
  )
  await welcome_message.update()
    #cl.user_session.set("chain", chatbot)

@cl.on_message
async def main(message: cl.Message):
  #chatbot = cl.user_session.get("chain")

  combined_text = find_text(message.content)
  prompt = create_prompt(combined_text, message.content)

  messages = [
    ChatMessage(role="user",
                content=prompt)
  ]
  chat_response = client.chat(
    model=model,
    messages=messages
  )

  await cl.Message(chat_response.choices[0].message.content).send()

Overwriting app_custom.py


In [47]:
!fuser -n tcp -k 8000


from google.colab.output import eval_js
print(eval_js("google.colab.kernel.proxyPort(8000)"))

https://z9ohyrwfn0d-496ff2e9c6d22116-8000-colab.googleusercontent.com/


In [50]:
!chainlit run app_custom.py

2024-07-09 16:00:21 - Your app is available at http://localhost:8000
2024-07-09 16:00:28 - Translated markdown file for en-US not found. Defaulting to chainlit.md.
2024-07-09 16:00:40 - HTTP Request: POST https://api.mistral.ai/v1/chat/completions "HTTP/1.1 200 OK"
