In [2]:
!pip install chromadb llama-index llama-index-vector-stores-chroma fastembed transformers huggingface_hub torch gradio

Collecting chromadb
  Downloading chromadb-0.6.3-py3-none-any.whl.metadata (6.8 kB)
Collecting llama-index
  Downloading llama_index-0.12.25-py3-none-any.whl.metadata (12 kB)
Collecting llama-index-vector-stores-chroma
  Downloading llama_index_vector_stores_chroma-0.4.1-py3-none-any.whl.metadata (696 bytes)
Collecting fastembed
  Downloading fastembed-0.6.0-py3-none-any.whl.metadata (9.9 kB)
Collecting gradio
  Downloading gradio-5.22.0-py3-none-any.whl.metadata (16 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.2.2.post1-py3-none-any.whl.metadata (6.5 kB)
Collecting chroma-hnswlib==0.7.6 (from chromadb)
  Downloading chroma_hnswlib-0.7.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (252 bytes)
Collecting fastapi>=0.95.2 (from chromadb)
  Downloading fastapi-0.115.11-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Downloading uvicorn-0.34.0-py3-none-any.whl.metadata (6.5 kB)
Collecting 

In [4]:
!pip install llama-index-embeddings-fastembed

Collecting llama-index-embeddings-fastembed
  Downloading llama_index_embeddings_fastembed-0.3.0-py3-none-any.whl.metadata (697 bytes)
Downloading llama_index_embeddings_fastembed-0.3.0-py3-none-any.whl (2.7 kB)
Installing collected packages: llama-index-embeddings-fastembed
Successfully installed llama-index-embeddings-fastembed-0.3.0


In [7]:
!pip install chromadb \
    llama-index \
    llama-index-vector-stores-chroma \
    llama-index-embeddings-fastembed \
    llama-index-llms-huggingface \
    fastembed \
    transformers \
    huggingface_hub \
    torch \
    gradio

Collecting llama-index-llms-huggingface
  Downloading llama_index_llms_huggingface-0.4.2-py3-none-any.whl.metadata (2.9 kB)
Collecting text-generation<0.8.0,>=0.7.0 (from llama-index-llms-huggingface)
  Downloading text_generation-0.7.0-py3-none-any.whl.metadata (8.5 kB)
Downloading llama_index_llms_huggingface-0.4.2-py3-none-any.whl (11 kB)
Downloading text_generation-0.7.0-py3-none-any.whl (12 kB)
Installing collected packages: text-generation, llama-index-llms-huggingface
Successfully installed llama-index-llms-huggingface-0.4.2 text-generation-0.7.0


In [10]:
import logging
import sys
import torch
import chromadb
from huggingface_hub import notebook_login
from transformers import AutoTokenizer

from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings
from llama_index.embeddings.fastembed import FastEmbedEmbedding
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.core import PromptTemplate, StorageContext
from llama_index.vector_stores.chroma import ChromaVectorStore

In [11]:
# Set up logging
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

In [12]:
# Load documents from a directory
documents = SimpleDirectoryReader("/content/Data").load_data()

# Initialize embedding model (FastEmbed)
embed_model = FastEmbedEmbedding(model_name="BAAI/bge-small-en-v1.5")
Settings.embed_model = embed_model
Settings.chunk_size = 512  # Chunking for retrieval efficiency

# Define system prompt & query wrapper
system_prompt = "You are a Q&A assistant. Your goal is to answer questions as accurately as possible based on the instructions and context provided."
query_wrapper_prompt = PromptTemplate("<|USER|>{query_str}<|ASSISTANT|>")

In [13]:
# Authenticate Hugging Face (if needed)
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [14]:
# Load tokenizer for Meta-Llama-3-8B-Instruct
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
stopping_ids = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>"),
]

# Configure LLM
llm = HuggingFaceLLM(
    context_window=8192,
    max_new_tokens=256,
    generate_kwargs={"temperature": 0.7, "do_sample": False},
    system_prompt=system_prompt,
    query_wrapper_prompt=query_wrapper_prompt,
    tokenizer_name="meta-llama/Meta-Llama-3-8B-Instruct",
    model_name="meta-llama/Meta-Llama-3-8B-Instruct",
    device_map="auto",
    stopping_ids=stopping_ids,
    tokenizer_kwargs={"max_length": 4096},
    model_kwargs={"torch_dtype": torch.float16}  # Use float16 for efficiency
)
Settings.llm = llm

# Set up ChromaDB for persistent vector storage
chroma_client = chromadb.PersistentClient(path="./chroma_db")  # Store vectors persistently
vector_store = ChromaVectorStore(chroma_collection=chroma_client.get_or_create_collection("my_collection"))
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# Build vector index with ChromaDB
index = VectorStoreIndex.from_documents(documents, storage_context=storage_context)

# Create query engine
query_engine = index.as_query_engine()

# Define chatbot function
def predict(input, history):
    response = query_engine.query(input)
    return str(response)

# Deploy chatbot with Gradio
import gradio as gr
gr.ChatInterface(predict).launch(share=True)


tokenizer_config.json:   0%|          | 0.00/51.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]



Some parameters are on the meta device because they were offloaded to the cpu.
Some parameters are on the meta device because they were offloaded to the cpu.
Some parameters are on the meta device because they were offloaded to the cpu.


  self.chatbot = Chatbot(


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://469961061b0e2e9907.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


