# Runtime and packages

In [1]:
import os
from getpass import getpass

# SECURITY: never paste an access token into a notebook cell. The token that
# used to be hardcoded here must be treated as leaked and revoked at
# https://huggingface.co/settings/tokens.
# The token is taken from the HF_TOKEN environment variable when it is already
# set; otherwise it is requested interactively without echoing it to the output.
if not os.environ.get("HF_TOKEN", "").strip():
    entered_token = getpass("Hugging Face token (input hidden): ").strip()
    if not entered_token:
        raise ValueError("No Hugging Face token provided.")
    os.environ["HF_TOKEN"] = entered_token

# Mirror the token under the second variable name this notebook also exports.
os.environ["HUGGINGFACE_TOKEN"] = os.environ["HF_TOKEN"]
print("HF token loaded (value not shown).")



# Quick Verification

In [2]:
from huggingface_hub import HfApi
# Sanity check: ask the Hub which account the HF_TOKEN environment variable
# belongs to. Relies on `os` having been imported by an earlier cell.
api = HfApi()
me = api.whoami(token=os.environ["HF_TOKEN"])
# Print the "name" field of the response, or "unknown" when it is absent.
print("Logged in as:", me.get("name", "unknown"))

Logged in as: GfDeg


In [3]:
#@title Setup: GPU, packages, and utilities (Colab-safe)
import os, sys, subprocess, importlib, json, gc
from datetime import datetime

def pip_install(pkgs):
    """Install the requirement specs in ``pkgs`` that are not installed yet.

    Presence is decided by looking up the *distribution* name in the installed
    package metadata rather than by importing a module. Import names often
    differ from distribution names (``faiss-cpu`` -> ``faiss``,
    ``beautifulsoup4`` -> ``bs4``), so an import-based check reinstalls such
    packages on every run and also imports heavy libraries as a side effect.

    Args:
        pkgs: iterable of pip requirement strings, e.g. ``"requests>=2.32.0"``.

    Raises:
        subprocess.CalledProcessError: if the ``pip install`` command fails.

    Note:
        Only presence is checked; an installed version that is older than the
        requested minimum is left untouched.
    """
    import re
    from importlib import metadata

    to_install = []
    for spec in pkgs:
        # Distribution name = everything before the first extras bracket,
        # version operator, environment marker, URL marker or whitespace.
        name = re.split(r"[\s\[<>=!~;@]", spec.strip(), maxsplit=1)[0]
        try:
            metadata.version(name)
        except metadata.PackageNotFoundError:
            to_install.append(spec)
    if to_install:
        print("Installing:", to_install)
        subprocess.check_call([sys.executable, "-m", "pip", "install", "--quiet"] + to_install)

# Keep Colab's numpy/scipy; install only what we need
REQUIREMENTS = [
    # Core LLM stack
    "transformers>=4.45.0",
    "accelerate>=0.33.0",
    "sentence-transformers>=3.0.0",
    "bitsandbytes>=0.43.1",  # used by the optional 4-bit loading path
    # Small vector index, CPU-friendly
    "faiss-cpu>=1.8.0",
    # Simple web/app UI
    "gradio>=4.44.0",
    # Optional: request fetching for RAG sources
    "beautifulsoup4>=4.12.3",
    "requests>=2.32.0",
    # Safe tensor loading
    "safetensors>=0.4.3",
]

# Hand the list to pip_install, which decides what actually needs installing.
pip_install(REQUIREMENTS)

# torch is not part of REQUIREMENTS; it is imported here as-is and used to
# report whether a CUDA device is visible to this runtime.
import torch
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))
# Timestamp of when this setup cell finished (local time, ISO 8601).
print("Setup time:", datetime.now().isoformat())

Installing: ['bitsandbytes>=0.43.1', 'faiss-cpu>=1.8.0', 'beautifulsoup4>=4.12.3']
CUDA available: True
GPU: Tesla T4
Setup time: 2025-08-12T21:44:36.302524


## Model Loading

Uses a lightweight instruction model.

Can load in 4-bit with bitsandbytes (set `USE_4BIT = True`) for minimal VRAM; otherwise the model is loaded in FP16 on GPU or FP32 on CPU.

In [4]:
!pip install -U bitsandbytes



In [5]:
# Load the model
#@title Config: small, fast instruction model + quantized loading
import os, torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# Checkpoint to load; a good balance for a Colab T4.
MODEL_ID = "Qwen/Qwen2.5-1.5B-Instruct"
# Drop-in alternatives:
# MODEL_ID = "microsoft/Phi-3.5-mini-instruct"
# MODEL_ID = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# 4-bit loading is currently switched off because of import errors;
# set to True to try the quantized path again.
USE_4BIT = False
# Device index: 0 (first CUDA GPU) when CUDA is available, otherwise -1 (CPU).
DEVICE = -1 if not torch.cuda.is_available() else 0

def load_model(model_id=MODEL_ID, use_4bit=USE_4BIT):
    """Load the tokenizer, the causal LM and a text-generation pipeline.

    Args:
        model_id: Hugging Face Hub id of the checkpoint to load.
        use_4bit: when True, first try a 4-bit NF4 load through bitsandbytes;
            if that load fails, fall back to an unquantized load.

    Returns:
        tuple ``(tokenizer, model, pipeline)``.

    Weights are loaded as float16 when CUDA is available and float32
    otherwise. The 4-bit compute dtype is kept equal to that dtype so the
    quantized path does not mix bfloat16 compute with float16 weights.
    """
    dtype = torch.float16 if torch.cuda.is_available() else torch.float32
    model_kwargs = dict(torch_dtype=dtype)

    tok = AutoTokenizer.from_pretrained(model_id, use_fast=True)

    # NOTE(review): trust_remote_code=True executes Python shipped in the model
    # repository; keep it only for checkpoints you trust.
    mdl = None
    if use_4bit:
        # The whole quantized load sits inside the try block: a missing or
        # unusable bitsandbytes surfaces in from_pretrained(), not when the
        # config object is built.
        try:
            from transformers import BitsAndBytesConfig
            quant_config = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_quant_type="nf4",
                bnb_4bit_use_double_quant=True,
                bnb_4bit_compute_dtype=dtype,
            )
            mdl = AutoModelForCausalLM.from_pretrained(
                model_id,
                trust_remote_code=True,
                quantization_config=quant_config,
                device_map="auto" if DEVICE == -1 else {"": DEVICE},
                **model_kwargs,
            )
            print("Using 4-bit quantization")
        except (ImportError, RuntimeError, ValueError) as e:
            print("4-bit not available, falling back to an unquantized FP16/FP32 load:", e)
            mdl = None

    if mdl is None:
        mdl = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True, **model_kwargs)

    # No dtype or device is passed to the pipeline; it is built around the
    # already-loaded model and tokenizer.
    gen = pipeline("text-generation", model=mdl, tokenizer=tok)

    return tok, mdl, gen

# Module-level handles used by the chat helper and UI cells below.
tokenizer, model, generator = load_model(MODEL_ID, USE_4BIT)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/660 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

Device set to use cuda:0


# Efficient chat wrapper

Stateless generation with a system prompt and a max-token limit; each reply is returned whole (not streamed) to the Gradio UI.

Uses low max_new_tokens and top_k/top_p for speed and coherence.


In [6]:
#@title Chat helper with small, efficient defaults
# System message placed at the start of every conversation.
SYSTEM_PROMPT = "You are a helpful, concise assistant."

# Keyword arguments forwarded to the text-generation pipeline on every call.
# The tokenizer's EOS id doubles as the padding id.
GEN_KW = dict(
    max_new_tokens=256,
    temperature=0.6,
    top_p=0.9,
    top_k=50,
    do_sample=True,
    repetition_penalty=1.05,
    pad_token_id=tokenizer.eos_token_id,
)

def format_messages(history, user_msg):
    """Build the chat-message list for one generation call.

    Args:
        history: iterable of past exchanges; element 0 of each is the user
            text and element 1 the assistant text.
        user_msg: the new user message, appended last.

    Returns:
        list of ``{"role", "content"}`` dicts: the system prompt, then the
        past turns in order, then the new user message.
    """
    past_turns = [
        {"role": role, "content": text}
        for exchange in history
        for role, text in (("user", exchange[0]), ("assistant", exchange[1]))
    ]
    return [
        {"role": "system", "content": SYSTEM_PROMPT},
        *past_turns,
        {"role": "user", "content": user_msg},
    ]

def chat_once(history, user_msg):
    """Generate one assistant reply for ``user_msg`` given ``history``.

    Args:
        history: past exchanges, in the form accepted by ``format_messages``.
        user_msg: the new user message (may already contain RAG context).

    Returns:
        The assistant's reply as a stripped string.
    """
    messages = format_messages(history, user_msg)
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    # Ask the pipeline for the completion only. This replaces cutting the
    # prompt off by hand or splitting on a model-specific role marker, which
    # breaks when the marker differs per model or appears in the text itself.
    call_kwargs = {**GEN_KW, "return_full_text": False}
    out = generator(prompt, **call_kwargs)[0]  # first (and only) candidate
    if isinstance(out, dict) and "generated_text" in out:
        return out["generated_text"].strip()
    # Unexpected output shape: fall back to its string representation.
    return str(out)

# Optional RAG: tiny embedding + FAISS index

Keeps it minimal with a small embedding model.

Works fine on CPU; moves only generation to GPU.

In [7]:
#@title Optional: Lightweight RAG (sentence-transformers + FAISS)
import faiss, numpy as np
from sentence_transformers import SentenceTransformer
import requests
from bs4 import BeautifulSoup

# Sentence-embedding checkpoint used for retrieval (compact, fast).
EMB_MODEL_ID = "sentence-transformers/all-MiniLM-L6-v2"  # compact, fast
# Shared encoder instance used by MiniRAG; explicitly placed on the CPU.
embedder = SentenceTransformer(EMB_MODEL_ID, device="cpu")  # embeddings on CPU

class MiniRAG:
    """Minimal retriever: sentence embeddings stored in a flat FAISS index.

    Embeddings are L2-normalised and searched with an inner-product index.
    Uses the module-level ``embedder`` for encoding.
    """

    def __init__(self, k=3):
        """Create an empty retriever that returns up to ``k`` documents."""
        self.k = k
        self.docs = []
        self.index = None

    def build(self, texts):
        """(Re)build the index from ``texts``.

        An empty corpus resets the retriever to its empty state instead of
        failing on the missing embedding dimension.
        """
        # Copy so later changes to the caller's list cannot desynchronise
        # the stored documents from the index rows.
        self.docs = list(texts)
        if not self.docs:
            self.index = None
            return
        embs = embedder.encode(
            self.docs,
            convert_to_numpy=True,
            normalize_embeddings=True,
            show_progress_bar=False,
        )
        self.index = faiss.IndexFlatIP(embs.shape[1])
        self.index.add(embs.astype(np.float32))

    def retrieve(self, query):
        """Return up to ``k`` stored documents most similar to ``query``.

        Returns an empty list when nothing has been indexed.
        """
        if self.index is None or not self.docs:
            return []
        # Never ask for more neighbours than there are documents.
        k = min(self.k, len(self.docs))
        if k <= 0:
            return []
        q = embedder.encode(
            [query],
            convert_to_numpy=True,
            normalize_embeddings=True,
            show_progress_bar=False,
        )
        _, idx = self.index.search(q.astype(np.float32), k)
        # The search result is 2-D (one row per query); negative ids mark
        # unfilled slots and are skipped.
        return [self.docs[int(i)] for i in idx.flatten() if i >= 0]


rag = MiniRAG(k=3)

# Page whose text becomes the RAG corpus.
url = "https://www.rbi.org.in/commonman/english/Scripts/FAQs.aspx?Id=3782"

# Single-document corpus used whenever the page cannot be fetched or parsed.
FALLBACK_CORPUS = ["Could not load content from the provided URL."]


def fetch_corpus(page_url, timeout=15):
    """Download ``page_url`` and return its non-empty text chunks.

    Text is taken from ``<p>`` elements; if none carry text, ``<div>``
    elements are used instead. This is a basic extraction, so the selectors
    may need adjusting to the page structure.

    Args:
        page_url: address of the page to download.
        timeout: seconds to wait for the server, so an unresponsive host
            cannot hang the cell indefinitely.

    Returns:
        list of strings; a copy of ``FALLBACK_CORPUS`` on any failure.
    """
    try:
        response = requests.get(page_url, timeout=timeout)
        response.raise_for_status()  # turn HTTP error codes into exceptions
        soup = BeautifulSoup(response.content, 'html.parser')

        for tag in ('p', 'div'):
            stripped = (el.get_text().strip() for el in soup.find_all(tag))
            texts = [t for t in stripped if t]
            if texts:
                return texts

        print("Could not extract meaningful text from the URL.")
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the URL: {e}")
    except Exception as e:
        print(f"Error parsing the content: {e}")
    return list(FALLBACK_CORPUS)


corpus = fetch_corpus(url)

# Build the RAG index with the fetched corpus
rag.build(corpus)
print(f"RAG index built with {len(corpus)} documents from {url}")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

RAG index built with 121 documents from https://www.rbi.org.in/commonman/english/Scripts/FAQs.aspx?Id=3782


# Gradio UI

Returns each reply as a single message for fast interactivity (responses are not streamed token by token).

Toggle RAG context to improve answers on domain data.

In [8]:
#@title Gradio Chat UI (streaming)
import gradio as gr

def respond(user_msg, history, use_rag):
    """Produce a reply to ``user_msg`` and the updated chat history.

    Args:
        user_msg: text typed by the user.
        history: list of ``(user, assistant)`` pairs, or None for a new chat.
            The list passed in is not modified.
        use_rag: when True, retrieved context is prepended to the message
            sent to the model (the stored history keeps the plain message).

    Returns:
        ``(reply, history)``. On success the returned history has one more
        pair than the input. On failure the reply is an error message and the
        history is unchanged. Blank input yields ``("", history)`` without
        calling the model.
    """
    # Work on a copy so the caller's list is never mutated.
    history = list(history or [])
    if not user_msg or not user_msg.strip():
        return "", history
    try:
        ctx = ""
        if use_rag:
            top_docs = rag.retrieve(user_msg)
            if top_docs:
                ctx = "Relevant context:\n" + "\n".join(f"- {t}" for t in top_docs) + "\n\n"
        # Context goes to the model only; history keeps the original message.
        reply = chat_once(history, ctx + user_msg)
        history.append((user_msg, reply))
        return reply, history
    except Exception as e:
        # Print the error and return an error message to the user
        print(f"Error in chat: {e}")
        return f"An error occurred: {e}", history
    finally:
        # Memory management, also on the error path: collect garbage first so
        # the CUDA cache release can return the blocks that were just freed.
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

with gr.Blocks(title="BotProject — AI Chatbot") as demo:
    gr.Markdown("### BotProject — Efficient AI Chatbot (Colab GPU)")
    with gr.Row():
        use_rag = gr.Checkbox(label="Use RAG context", value=False)
    chat = gr.Chatbot(height=420)
    msg = gr.Textbox(placeholder="Type your question...", scale=4)
    clear = gr.Button("Clear")

    def on_submit(user_msg, chat_history, use_rag_flag):
        """Handle a textbox submit.

        Returns ``(textbox_value, chat_history)``. The textbox is cleared
        after a successful turn. When no turn was added, the reply from
        ``respond`` is shown as a warning and the typed text is kept so the
        user can retry.
        """
        chat_history = chat_history or []
        # Ignore blank submissions instead of sending them to the model.
        if not user_msg or not user_msg.strip():
            return "", chat_history

        # Measured before the call because respond may extend the list in place.
        turns_before = len(chat_history)
        reply, new_history = respond(user_msg, chat_history, use_rag_flag)

        if len(new_history) <= turns_before:
            # No turn was added, so `reply` carries the error text. Surface it
            # as a toast rather than as a chat message, which keeps it out of
            # the history later sent to the model.
            gr.Warning(reply)
            return user_msg, new_history
        return "", new_history

    msg.submit(on_submit, [msg, chat, use_rag], [msg, chat])
    # Returning None resets the Chatbot component.
    clear.click(lambda: None, None, chat, queue=False)

demo.launch(share=False, debug=False)

  chat = gr.Chatbot(height=420)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Note: opening Chrome Inspector may crash demo inside Colab notebooks.
* To create a public link, set `share=True` in `launch()`.


<IPython.core.display.Javascript object>

