In [None]:
# Install the C/C++ build toolchain (cmake, build-essential) used by the build cells below.
!apt-get install -y cmake build-essential

In [None]:
# Fetch the llama.cpp sources into ./llama.cpp (unpinned: whatever the default branch is at run time).
!git clone https://github.com/ggerganov/llama.cpp

In [None]:
# %cd (unlike !cd) changes the kernel's working directory, so all following cells run inside llama.cpp/.
%cd llama.cpp

In [None]:
# Sanity check: list the checked-out repository contents.
!ls -lh

In [None]:
!make -j

In [None]:
from huggingface_hub import snapshot_download

# Download the TinyLlama chat checkpoint from the Hugging Face Hub into
# ./tinyllama-hf; `model_path` receives the value returned by snapshot_download.
# `local_dir_use_symlinks` is intentionally not passed: it is deprecated in
# recent huggingface_hub releases, where files are always written directly
# into `local_dir`.
model_path = snapshot_download(
    repo_id="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    local_dir="tinyllama-hf",
)

In [None]:
!pip install mistral-common

In [None]:
# Convert the downloaded Hugging Face checkpoint directory into a single GGUF file
# (./tinyllama-1.1b-chat.gguf) using llama.cpp's conversion script.
!python3 convert_hf_to_gguf.py ./tinyllama-hf \
    --outfile ./tinyllama-1.1b-chat.gguf

In [None]:
# Quantization step (currently disabled): would write a q4_0 copy of the converted GGUF.
# NOTE(review): cells that reference tinyllama-1.1b-chat-q4_0.gguf presumably need this
# step to have been run first - confirm the binary name/path for your build before enabling.
# !./bin/quantize ./tinyllama-1.1b-chat.gguf ./tinyllama-1.1b-chat-q4_0.gguf q4_0

In [None]:
# Run a 100-token generation with the q4_0 model.
# NOTE(review): this cell expects a ./main binary and a q4_0 GGUF file. The only quantize
# command visible in this notebook is commented out, and a later cell runs ./bin/llama-cli
# against the unquantized ../tinyllama-1.1b-chat.gguf instead - confirm that both the
# binary and the model file exist before running.
!./main -m ./tinyllama-1.1b-chat-q4_0.gguf -p "Explain quantization in LLMs" -n 100

In [None]:
import os
# Display the kernel's current working directory (relative paths in later cells depend on it).
os.getcwd()

In [None]:
# Create the out-of-tree build directory; -p makes this safe to re-run.
!mkdir -p build

In [None]:
# Switch the kernel's working directory to build/; later relative paths (./bin/..., ../*.gguf) assume this.
%cd build

!cmake .. → generates build instructions (Makefile).
Think of it as creating a blueprint for how the code should be compiled.

!make → follows those instructions and compiles the binaries.
This is the actual construction step — turning source code into executables.

Output = ./main, ./quantize, etc. → now you can run them.

In [None]:
# Configure the project: generate the build files in the current (build/) directory from the sources one level up.
!cmake ..

In [None]:
!make

cmake + make → build the inference engine (./bin/llama-cli).

quantize → prepare .gguf quantized model.

./bin/llama-cli → run inference with your prompt.

In [None]:
# Inspect the build directory contents.
!ls

In [None]:
# Confirm the working directory before using relative paths (relies on `os` imported in an earlier cell).
os.getcwd()

In [None]:
# List the repository root by absolute path, e.g. to check that the converted .gguf file is present.
# NOTE(review): /content is an environment-specific absolute path - adjust outside that environment.
!ls -lh /content/llama.cpp/

In [None]:
# Run the CLI from build/bin against the unquantized GGUF one directory up, with a fixed prompt and -n 100.
!./bin/llama-cli -m ../tinyllama-1.1b-chat.gguf -p "What is quantization in LLMs?" -n 100


✅ llama-cli (official main runner)

✅ llama-run (multi-prompt / batch)

In [None]:
import os
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings

# ==== Step 1: Load your document ====
def load_documents(file_path):
    """Load a text file as LangChain documents.

    Args:
        file_path: Path handed to ``TextLoader``.

    Returns:
        The result of ``TextLoader(file_path).load()``.
    """
    print(f"Loading file: {file_path}")
    return TextLoader(file_path).load()

# ==== Step 2: Chunk the text ====
def chunk_documents(documents, chunk_size=500, chunk_overlap=100):
    """Split documents into overlapping chunks.

    Args:
        documents: Documents to split (as produced by ``load_documents``).
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter`` as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        The result of the splitter's ``split_documents(documents)``.
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    return RecursiveCharacterTextSplitter(**splitter_options).split_documents(documents)

# ==== Step 3: Create or load FAISS VectorStore ====
def create_or_load_faiss(chunks, embedding_model, index_path="faiss_index"):
    """Return a FAISS vector store, reusing a saved index when one exists.

    If ``index_path`` exists on disk the store is loaded from it and ``chunks``
    is not used; otherwise a new store is built from ``chunks`` and saved to
    ``index_path``.

    Args:
        chunks: Documents to index when no saved index is present.
        embedding_model: Embedding object passed to FAISS for building/loading.
        index_path: Location of the persisted index.

    Returns:
        The loaded or newly created FAISS store.
    """
    if not os.path.exists(index_path):
        print("Creating new FAISS index...")
        store = FAISS.from_documents(chunks, embedding_model)
        store.save_local(index_path)
        return store

    print("Loading existing FAISS index...")
    # NOTE(review): newer LangChain releases reportedly require
    # allow_dangerous_deserialization=True here - confirm against the installed version.
    return FAISS.load_local(index_path, embedding_model)

# ==== Step 4: Retrieve relevant chunks ====
def get_context_from_query(query, retriever):
    """Fetch the documents relevant to ``query`` and merge their text.

    Args:
        query: Query passed to ``retriever.get_relevant_documents``.
        retriever: Object exposing ``get_relevant_documents(query)`` whose
            results each have a ``page_content`` attribute.

    Returns:
        The ``page_content`` of every returned document, joined by a blank line.
    """
    matches = retriever.get_relevant_documents(query)
    passages = (match.page_content for match in matches)
    return "\n\n".join(passages)

# ==== Step 5: Build prompt & write to file ====
def build_prompt_file(context, query, prompt_file="prompt.txt"):
    """Render the RAG prompt and write it to ``prompt_file``.

    The prompt wraps a fixed system instruction, the retrieved ``context`` and
    the user's ``query`` in an ``[INST] ... [/INST]`` template.

    Args:
        context: Retrieved text inserted under ``Context:``.
        query: User question inserted after ``Question:``.
        prompt_file: Destination path; an existing file is overwritten.

    Returns:
        None. The prompt is written to disk and a confirmation is printed.
    """
    prompt = f"""[INST] <<SYS>>
You are a helpful AI assistant. Use the context to answer the question.
<</SYS>>

Context:
{context}

Question: {query}
Answer: [/INST]
"""
    # Write UTF-8 explicitly: retrieved context may contain non-ASCII text, and
    # the platform default encoding is not guaranteed to be able to encode it.
    with open(prompt_file, "w", encoding="utf-8") as f:
        f.write(prompt)
    print(f"Prompt written to {prompt_file}")

# ==== Step 6: Run llama.cpp with GGUF ====
def run_llama_cli(gguf_path, prompt_file="prompt.txt", n_predict=200):
    """Run ``./bin/llama-cli`` on a prompt file.

    The command is passed as an argument list (no shell), so paths containing
    spaces or shell metacharacters are handed to the binary unchanged.

    Args:
        gguf_path: Path to the model file, passed via ``-m``.
        prompt_file: Path to the prompt file, passed via ``-f``.
        n_predict: Value passed via ``--n-predict``.

    Returns:
        None. Failures to start the binary and non-zero exit codes are
        reported with a printed message instead of being silently ignored.
    """
    # Local import keeps this cell self-contained without touching the import cell.
    import subprocess

    print("Running inference with llama.cpp...")
    command = [
        "./bin/llama-cli",
        "-m", str(gguf_path),
        "-f", str(prompt_file),
        "--n-predict", str(n_predict),
    ]
    try:
        completed = subprocess.run(command, check=False)
    except OSError as exc:
        # e.g. the binary is missing or not executable relative to the current directory
        print(f"Could not start {command[0]}: {exc}")
        return
    if completed.returncode != 0:
        print(f"llama-cli exited with status {completed.returncode}")

# ==== === MAIN PIPELINE === ===
def rag_pipeline(
    doc_path="my_notes.txt",
    gguf_model_path="../tinyllama-1.1b-chat-q4_0.gguf",
    user_query="What is quantization in LLMs?"
):
    """Run the full retrieve-then-generate flow for a single question.

    Steps, in order: load and chunk ``doc_path``; build the MiniLM embedding
    model; create or load the FAISS store and obtain a retriever; fetch context
    for ``user_query``; write the prompt file; invoke llama.cpp on it.

    Args:
        doc_path: Text file to index.
        gguf_model_path: Model file handed to ``run_llama_cli``.
            NOTE(review): the default points at a q4_0 file - confirm it exists.
        user_query: Question used both for retrieval and in the prompt.

    Returns:
        None.
    """
    # Ingest: read the source file and cut it into chunks
    chunks = chunk_documents(load_documents(doc_path))

    # Vector store + retriever backed by sentence-transformer embeddings
    embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    retriever = create_or_load_faiss(chunks, embedding_model).as_retriever()

    # Retrieve context and persist the prompt for the CLI
    build_prompt_file(get_context_from_query(user_query, retriever), user_query)

    # Generate
    run_llama_cli(gguf_model_path)

# ==== Entry Point ====
if __name__ == "__main__":
    # Inside a notebook kernel __name__ is "__main__", so executing this cell
    # runs the pipeline immediately with its default arguments.
    rag_pipeline()


GGML (Georgi Gerganov Machine Learning) is a C-based runtime and tensor library that:

is a low-level, CPU/GPU-optimized inference engine

is used mainly by projects such as llama.cpp, whisper.cpp, and stable-diffusion.cpp

was the original model file format before GGUF was introduced

has no Python dependency – it is pure C/C++

📌 GGUF = file format

📌 GGML = inference engine + tensor library

Practical: Run a GGML-Format LLM Model (like ggml-model-q4.bin)

🧰 Tools:

✅ llama.cpp (same as GGUF)

✅ Prequantized GGML model (e.g., from TheBloke)

✅ main binary from llama.cpp

Step-by-Step GGML Inference

🔹 Step 1: Clone & Build llama.cpp

In [None]:
!git clone https://github.com/ggerganov/llama.cpp
!cd llama.cpp
!make

Step 2: Download a GGML Model

Use any of the prequantized .bin models:

In [None]:
# Download a pre-quantized GGML checkpoint and save it as ggml-model-q4_0.bin in the current directory.
# NOTE(review): confirm that this exact file name still exists in the referenced repository.
!wget https://huggingface.co/TheBloke/LLaMa-7B-GGML/resolve/main/ggml-model-q4_0.bin -O ggml-model-q4_0.bin

Step 3: Run Inference

In [None]:
./main -m ggml-model-q4_0.bin -p "What is quantization in machine learning?" -n 100

Prompt: What is quantization in machine learning?

Output: Quantization is the process of reducing the precision of the weights and activations of a neural network. It is commonly used for...

In [None]:
!pip -q install llama-cpp-python

from llama_cpp import Llama
# Use a small GGUF to demo; supply path to your .gguf
llm = Llama(model_path="/content/model-q4_0.gguf", n_gpu_layers=35)  # set 0 for CPU-only
out = llm("Explain GPTQ vs AWQ in 2 lines.", max_tokens=80)
print(out["choices"][0]["text"])
