In [1]:
!pip install chromadb langchain tqdm llama-cpp-python transformers huggingface_hub numpy langchain-community

Collecting chromadb
  Downloading chromadb-0.6.3-py3-none-any.whl.metadata (6.8 kB)
Collecting llama-cpp-python
  Downloading llama_cpp_python-0.3.7.tar.gz (66.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.7/66.7 MB[0m [31m26.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting langchain-community
  Downloading langchain_community-0.3.19-py3-none-any.whl.metadata (2.4 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.2.2.post1-py3-none-any.whl.metadata (6.5 kB)
Collecting chroma-hnswlib==0.7.6 (from chromadb)
  Downloading chroma_hnswlib-0.7.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (252 bytes)
Collecting fastapi>=0.95.2 (from chromadb)
  Downloading fastapi-0.115.11-py3-none-a

In [3]:
import chromadb
from tqdm import tqdm  
from langchain.vectorstores import Chroma
from langchain.docstore.document import Document
from langchain.embeddings import HuggingFaceEmbeddings
from llama_cpp import Llama
import json
from huggingface_hub import hf_hub_download

# Model: fetch the quantized GGUF weights from the Hugging Face Hub into
# ./downloaded_model and load them with llama.cpp.
repo_id = "ranaweerahk/Neural_Navigators-qwen2.5-q4-gguf"
gguf_file = "qwen_finetuned_merged_q4.gguf"
downloaded_path = hf_hub_download(repo_id=repo_id, filename=gguf_file, local_dir="./downloaded_model")

# NOTE(review): model_path is not used anywhere below; the model is loaded from
# downloaded_path. Presumably a Kaggle-dataset copy of the same file -- confirm
# before removing.
model_path = "/kaggle/input/qwen-model/gguf/default/1/qwen_finetuned_merged_q4.gguf"
# n_ctx bounds prompt + completion tokens for every call made through `llm`.
llm = Llama(model_path=downloaded_path, n_ctx=4096, verbose=False)

# Chroma setup
chroma_client = chromadb.PersistentClient(path="./chroma_db")
# NOTE(review): this collection is not passed to the vector store created below,
# which is only given persist_directory -- confirm whether the two were meant to
# be the same collection.
collection = chroma_client.get_or_create_collection("rag_ai_research")

# Load the pre-chunked documents. JSON is UTF-8 by specification, so the
# encoding is stated explicitly instead of relying on the platform default.
with open("/kaggle/input/langchain-docs/langchain_docs.json", "r", encoding="utf-8") as f:
    loaded_docs_dict = json.load(f)
# Only the text of each record is kept; any other keys are ignored.
loaded_docs = [Document(page_content=d["page_content"]) for d in loaded_docs_dict]

# Create vector store
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
# Deterministic ids make this cell idempotent: the store is persisted on disk,
# and without explicit ids every re-run would add the whole corpus again under
# new random ids, so retrieval would return duplicate chunks. Entries written
# by earlier runs under random ids are not removed; delete ./chroma_db once to
# clear them.
vector_store = Chroma.from_documents(
    documents=loaded_docs,
    embedding=embeddings,
    ids=[f"doc-{i}" for i in range(len(loaded_docs))],
    persist_directory="./chroma_db"
)

def truncate_context(context, max_chars=1000):
    """Cap `context` at `max_chars` characters.

    Text that already fits is returned unchanged. Longer text is cut to its
    first `max_chars` characters and "..." is appended, so a truncated result
    is `max_chars + 3` characters long.
    """
    if len(context) <= max_chars:
        return context
    return context[:max_chars] + "..."

def rag_generate(query, k=3, max_context_chars=3000, max_tokens=200, temperature=0.7):
    """Answer `query` using retrieved documents as context for the LLM.

    Runs a similarity search against the module-level `vector_store`, joins
    the retrieved page contents into one context string, truncates it, and
    asks the module-level `llm` to complete a Context/Question/Answer prompt.
    Progress messages and tqdm bars are printed along the way.

    Args:
        query: The question to answer.
        k: Number of documents to request from the vector store.
        max_context_chars: Character budget for the joined context before
            truncation via `truncate_context`.
        max_tokens: Maximum number of tokens to generate. Generation is cut
            off at this limit, so long answers can end mid-sentence.
        temperature: Sampling temperature passed to the model.

    Returns:
        The generated completion text of the first choice, unmodified.
    """
    print("Retrieving documents...")
    with tqdm(total=k, desc="Retrieving", ncols=80) as pbar:
        retrieved_docs = vector_store.similarity_search(query, k=k)
        # Report what actually came back; the store may hold fewer than k documents.
        pbar.update(len(retrieved_docs))

    context = " ".join(doc.page_content for doc in retrieved_docs)
    context = truncate_context(context, max_chars=max_context_chars)

    print("Generating response...")
    with tqdm(total=1, desc="Generating", ncols=80) as pbar:
        prompt = f"Context: {context}\nQuestion: {query}\nAnswer:"
        outputs = llm(prompt, max_tokens=max_tokens, temperature=temperature)
        pbar.update(1)

    return outputs["choices"][0]["text"]

# Test: run one question end-to-end through retrieval and generation and print the answer.
query = "What is DualPipe’s overlap strategy?"
print(rag_generate(query))

qwen_finetuned_merged_q4.gguf:   0%|          | 0.00/1.82G [00:00<?, ?B/s]

llama_init_from_model: n_ctx_per_seq (4096) < n_ctx_train (32768) -- the full capacity of the model will not be utilized
  embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Retrieving documents...


Retrieving: 100%|█████████████████████████████████| 3/3 [00:00<00:00, 99.95it/s]


Generating response...


Generating: 100%|█████████████████████████████████| 1/1 [00:57<00:00, 57.30s/it]

 DualPipe employs a bidirectional pipeline parallelism algorithm that achieves full overlap between forward and backward computation and communication phases. This overlap strategy helps reduce pipeline bubbles, improving the efficiency of training and inference processes. Specifically, the overlap strategy involves parallel execution of forward and backward chunks to ensure that computation and communication phases are fully overlapped, thereby maximizing resource utilization and reducing unnecessary delays or "bubbles" in the pipeline. In the context of the document, the overlap strategy is detailed in terms of forward (F) and backward (B) chunks, where the forward and backward computations are executed simultaneously within the pipeline. The document also notes that the overlap strategy is further optimized for inferences by employing a microbatch strategy for computation-alltoall communication, ensuring efficient parallelism and load balancing, especially during decoding phases. Du


