In [1]:
# ========== 0. Install dependencies ==========
!pip install -q bitsandbytes transformers accelerate peft datasets torch sentencepiece \
    sentence-transformers faiss-cpu evaluate rouge_score
!pip install langdetect



In [2]:
# ========== 1. Mount Drive ==========
from google.colab import drive
drive.mount('/content/drive', force_remount=True)


Mounted at /content/drive


In [3]:
# ========== 2. Paths ==========
import os
BASE_DIR = "/content/drive/MyDrive/guvi_chatbot_project"
DATA_PATH = os.path.join(BASE_DIR, "data", "guvi_course_info_io.jsonl")
ADAPTER_DIR = os.path.join(BASE_DIR, "adapters/final")
CHECKPOINT_DIR = os.path.join(BASE_DIR, "checkpoints")
HF_CACHE = os.path.join(BASE_DIR, "base_model", "hf_cache")

os.makedirs(ADAPTER_DIR, exist_ok=True)
os.makedirs(CHECKPOINT_DIR, exist_ok=True)
os.makedirs(HF_CACHE, exist_ok=True)

In [4]:
# ========== 3. Load base model & tokenizer ==========
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, BitsAndBytesConfig, pipeline as hf_pipeline
MODEL_ID = "MBZUAI/LaMini-Flan-T5-783M"
bnb_config = BitsAndBytesConfig(load_in_8bit=True)

print("Loading model (8-bit)...")
model = AutoModelForSeq2SeqLM.from_pretrained(
    MODEL_ID, device_map="auto", quantization_config=bnb_config, cache_dir=HF_CACHE
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, cache_dir=HF_CACHE)
print("Model + tokenizer loaded.")

Loading model (8-bit)...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Model + tokenizer loaded.


In [7]:
# ========== 4. Translator Model ==========
NLLB = "facebook/nllb-200-distilled-600M"
print("Loading NLLB translation model and tokenizer...")
nllb_tokenizer = AutoTokenizer.from_pretrained(NLLB, cache_dir=HF_CACHE)
nllb_model = AutoModelForSeq2SeqLM.from_pretrained(NLLB, device_map="auto", cache_dir=HF_CACHE)

# build a pipeline wrapper (we will pass src_lang / tgt_lang per call)
translator = hf_pipeline("translation", model=nllb_model, tokenizer=nllb_tokenizer)
print("Translator ready.")

# Simple language detection + mapping to NLLB codes:
from langdetect import detect, DetectorFactory
DetectorFactory.seed = 0 # make detection deterministic
LANG_MAP = {
    "en": "eng_Latn",
    "hi": "hin_Deva",
    "bn": "ben_Beng",
    "te": "tel_Telu",
    "ta": "tam_Taml",
    "mr": "mar_Deva",
    "gu": "guj_Gujr",
    "pa": "pan_Guru",
    "or": "ori_Orya",
    "kn": "kan_Knda",
    "ml": "mal_Mlym",
    "ur": "urd_Arab",
    "fr": "fra_Latn",
    "es": "spa_Latn",
    "de": "deu_Latn",
    "ru": "rus_Cyrl",
    "zh-cn": "zho_Hans",
    "zh": "zho_Hans",
    "ar": "ara_Arab",
    "pt": "por_Latn" }

def detect_lang(text):
  try:
    code = detect(text.lower().strip())
    return code if code in LANG_MAP else None
  except Exception:
    return None

def langdetect_to_nllb(code):
  if not code: return None
  return LANG_MAP.get(code, LANG_MAP.get("en"))

def clean_text(text):
  """Normalize text before translation for better fidelity."""
  text = text.strip()
  text = " ".join(text.split())
  return text

def translate_text(text, src, tgt, max_length=1024):
  """Robust translation using NLLB with batching and normalization."""
  text = clean_text(text)
  if src == tgt or not src or not tgt:
      return text
  try:
      result = translator(text, src_lang=src, tgt_lang=tgt, max_length=max_length)
      if isinstance(result, list) and len(result) > 0:
        return result[0].get("translation_text", text)
      return str(result)
  except Exception as e:
      print(f"[WARN] Translation failed ({src}->{tgt}):", e)
      return text

Loading NLLB translation model and tokenizer...


Device set to use cuda:0


Translator ready.


In [8]:
# ========== 5. Load dataset and split ==========
from datasets import load_dataset, DatasetDict
print("Loading dataset:", DATA_PATH)
raw = load_dataset("json", data_files=DATA_PATH, split="train")
print("Dataset size:", len(raw))

# Shuffle and split into train/validation (90/10)
raw = raw.shuffle(seed=42)
split = raw.train_test_split(test_size=0.10, seed=42)
train_ds = split["train"]
eval_ds = split["test"]
print("Train / Eval sizes:", len(train_ds), len(eval_ds))

# Quick peek (ensure your jsonl has 'input' and 'output' keys)
print("Columns:", train_ds.column_names)
print("Example:", train_ds[0])

Loading dataset: /content/drive/MyDrive/guvi_chatbot_project/data/guvi_course_info_io.jsonl
Dataset size: 119
Train / Eval sizes: 107 12
Columns: ['input', 'output']
Example: {'input': 'Answer as a GUVI course assistant: What topics are in the free C course?', 'output': 'It covers Operators, Loops, File operations, and Preprocessor directives. Free to learn; fee for GUVI Certification. Visit guvi.in for more details.'}


In [9]:
# ========== 6. Prompt format used for training ==========

def make_training_prompt(question, context=""):
    # keep format consistent for both train and inference
    if context:
        return f"Context: {context}\nQuestion: {question}\nAnswer:"
    return f"Question: {question}\nAnswer:"

In [10]:
# ========== 7. Tokenization (use collator for dynamic padding) ==========
MAX_SOURCE_LENGTH = 256
MAX_TARGET_LENGTH = 128

def preprocess_fn(batch):
    questions = batch["input"]
    answers = batch["output"]

    inputs = [make_training_prompt(q, "") for q in questions]
    model_inputs = tokenizer(
        inputs,
        truncation=True,
        max_length=MAX_SOURCE_LENGTH,
        padding="max_length",
        return_tensors="pt"
    )

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            answers,
            truncation=True,
            padding="max_length",
            max_length=MAX_TARGET_LENGTH,
            return_tensors="pt"
        )["input_ids"]

    # Replace padding token ids with -100
    labels_masked = [
        [(l if l != tokenizer.pad_token_id else -100) for l in label]
        for label in labels.tolist()
    ]
    model_inputs["labels"] = labels_masked

    # Safely convert all tensors to lists
    processed = {}
    for k, v in model_inputs.items():
        if hasattr(v, "tolist"):
            processed[k] = v.tolist()
        else:
            processed[k] = v

    return processed

# Apply tokenization
train_tok = train_ds.map(preprocess_fn, batched=True, remove_columns=train_ds.column_names)
eval_tok = eval_ds.map(preprocess_fn, batched=True, remove_columns=eval_ds.column_names)
print("Tokenized shapes example:", {k: len(train_tok[0][k]) for k in train_tok.column_names})


Tokenized shapes example: {'input_ids': 256, 'attention_mask': 256, 'labels': 128}


In [11]:
# ========== 8. LoRA / PEFT setup (optional but you had it) ==========
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
model = prepare_model_for_kbit_training(model)

peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q", "k", "v", "o"],
    lora_dropout=0.1,
    bias="none",
    task_type="SEQ_2_SEQ_LM"
)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 4,718,592 || all params: 787,868,672 || trainable%: 0.5989


In [12]:
# ========== 9. Training args & trainer ==========
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
import evaluate
rouge = evaluate.load("rouge")

training_args = Seq2SeqTrainingArguments(
    output_dir=CHECKPOINT_DIR,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    eval_accumulation_steps=2,
    eval_strategy="steps",
    eval_steps=200,
    save_strategy="steps",
    save_steps=200,
    logging_steps=20,
    num_train_epochs=3,
    learning_rate=2e-5,
    fp16=True,
    predict_with_generate=True,
    load_best_model_at_end=True,
    metric_for_best_model="rougeL",
    greater_is_better=True,
    save_total_limit=3,
    push_to_hub=False,
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, padding="longest", return_tensors="pt")

def postprocess_text(preds, labels):
    preds = [p.strip() for p in preds]
    labels = [l.strip() for l in labels]
    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    # labels come as token ids with -100 for pad
    labels = [[(l if l != -100 else tokenizer.pad_token_id) for l in label] for label in labels]
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    # convert to percentages
    result = {k: float(v.mid.fmeasure * 100) for k, v in result.items()}
    # add avg length
    prediction_lens = [len(tokenizer.encode(p)) for p in decoded_preds]
    result["gen_len"] = sum(prediction_lens) / max(1, len(prediction_lens))
    return result

from transformers import set_seed
set_seed(42)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_tok,
    eval_dataset=eval_tok,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Turn off WANDB in colab to keep logs local
import os
os.environ["WANDB_MODE"] = "disabled"

  trainer = Seq2SeqTrainer(


In [13]:
# ========== 10. Train ==========
trainer.train()


  return fn(*args, **kwargs)
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss,Validation Loss


TrainOutput(global_step=42, training_loss=0.0, metrics={'train_runtime': 201.0592, 'train_samples_per_second': 1.597, 'train_steps_per_second': 0.209, 'total_flos': 372241786208256.0, 'train_loss': 0.0, 'epoch': 3.0})

In [14]:
# ========== 11. Save adapter + tokenizer ==========
model.save_pretrained(ADAPTER_DIR)
tokenizer.save_pretrained(ADAPTER_DIR)
print("Saved adapter & tokenizer to:", ADAPTER_DIR)


Saved adapter & tokenizer to: /content/drive/MyDrive/guvi_chatbot_project/adapters/final


In [15]:
# ========== 12. Build retrieval (FAISS) safely ==========
USE_RETRIEVAL = True
if USE_RETRIEVAL:
    from sentence_transformers import SentenceTransformer
    import faiss, numpy as np
    embed_model = SentenceTransformer("all-MiniLM-L6-v2")
    # corpus: preferably use a cleaned short context field, not full pair
    corpus_texts = [d["input"] + " || " + d.get("output", "") for d in train_ds]
    embeddings = embed_model.encode(corpus_texts, convert_to_numpy=True, show_progress_bar=True)
    faiss.normalize_L2(embeddings)
    d = embeddings.shape[1]
    index = faiss.IndexFlatIP(d)
    index.add(embeddings)
    faiss.write_index(index, os.path.join(BASE_DIR, "faiss_index.idx"))
    print("Built FAISS index with", index.ntotal, "vectors.")


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Built FAISS index with 107 vectors.


In [16]:
# ========== 13. Reload model + adapter (inference-ready) ==========
from peft import PeftModel
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_ID, quantization_config=bnb_config, device_map="auto", cache_dir=HF_CACHE)
model = PeftModel.from_pretrained(model, ADAPTER_DIR)  # loads LoRA adapter
tokenizer = AutoTokenizer.from_pretrained(ADAPTER_DIR, cache_dir=HF_CACHE)
model.eval()

PeftModelForSeq2SeqLM(
  (base_model): LoraModel(
    (model): T5ForConditionalGeneration(
      (shared): Embedding(32128, 1024)
      (encoder): T5Stack(
        (embed_tokens): Embedding(32128, 1024)
        (block): ModuleList(
          (0): T5Block(
            (layer): ModuleList(
              (0): T5LayerSelfAttention(
                (SelfAttention): T5Attention(
                  (q): lora.Linear8bitLt(
                    (base_layer): Linear8bitLt(in_features=1024, out_features=1024, bias=False)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=1024, out_features=8, bias=False)
                    )
                    (lora_B): ModuleDict(
                      (default): Linear(in_features=8, out_features=1024, bias=False)
                    )
                    (lora_embedding_A): ParameterDict

In [17]:
# ========== 14. Inference function with retrieval guard + multilingual wrapper ==========
import numpy as np

def generate_response_english(prompt, use_retrieval=True, top_k=2, max_new_tokens=120, num_beams=4, temperature=0.1, top_p=0.95):

    context_prefix = ""
    if USE_RETRIEVAL and use_retrieval:
        q_emb = embed_model.encode([prompt], convert_to_numpy=True)
        faiss.normalize_L2(q_emb)
        D, I = index.search(q_emb, top_k)
        retrieved = []
        for score, idx in zip(D[0], I[0]):
            if idx < len(corpus_texts) and score > 0.2:
                retrieved.append(corpus_texts[idx])
        if retrieved:
            context_prefix = "Context: " + " || ".join(retrieved[:2]) + "\n\n"
    full_prompt = (context_prefix + "Question: " + prompt + "\nAnswer:").strip()
    inputs = tokenizer(full_prompt, return_tensors="pt", truncation=True, max_length=MAX_SOURCE_LENGTH).to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        num_beams=num_beams,
        early_stopping=True,
        length_penalty=1.0,
        no_repeat_ngram_size=3,
        temperature=temperature,
        top_p=top_p,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id
    )
    resp = tokenizer.decode(outputs[0], skip_special_tokens=True)
    if "Answer:" in resp:
        resp = resp.split("Answer:")[-1].strip()
    return resp


def generate_multilingual_response(user_text, user_specified_lang=None, use_retrieval=True, **kwargs):

    # Step 1: Detect language
    detected = user_specified_lang or detect_lang(user_text)
    src_nllb = langdetect_to_nllb(detected)
    if not src_nllb:
      src_nllb = "eng_Latn"
      detected = "en"

    # Step 2: Translate input → English if not already English
    if src_nllb != "eng_Latn":
      english_prompt = translate_text(user_text, src_nllb, "eng_Latn")
    else:
      english_prompt = user_text

    # Step 3: Generate English answer
    english_reply = generate_response_english(
        english_prompt,
        use_retrieval=use_retrieval,
        num_beams=4,
        temperature=0.1, # small creative boost improves translation back
        **kwargs )

    # Step 4: Translate English answer → user language
    if src_nllb != "eng_Latn":
      final_reply = translate_text(english_reply, "eng_Latn", src_nllb)
    else:
      final_reply = english_reply

    return final_reply.strip(), detected



In [16]:
# ========== 15. Quick local checks on validation set (multilingual) ==========
for i in range(5):
    sample = eval_ds[i]["input"]
    print("Q:", sample)
    print("GT:", eval_ds[i].get("output", ""))
    pred, det = generate_multilingual_response(sample)
    print("PRED (detected_lang={}):".format(det), pred)
    print("-" * 60)

Q: Answer as a GUVI course assistant: What free courses does GUVI offer?
GT: GUVI offers free courses including Python Programming, C Programming, Data Science & Analytics, Ethical Hacking, and Introduction to Cloud Computing. There are many more free courses available. Visit guvi.in for more details.


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


PRED (detected_lang=en): No, this question is not answerable as it is asking about the free courses offered by GUVI, while the given context is about the paid courses offered.
------------------------------------------------------------
Q: Answer as a GUVI course assistant: How do I contact GUVI support?
GT: You can write to cs@guvi.in or call +91 97360 97320. You can also visit the 'Contact Us' page on guvi.in. Visit guvi.in for more details.
PRED (detected_lang=en): To contact GUVI support, visit guvi.in for full syllabus, pricing, tools, and placement support details.
------------------------------------------------------------
Q: Answer as a GUVI course assistant: Tell me about the Java Full-stack Development program.
GT: This is a 3-month (weekdays) or 5-month (weekends) professional full-stack development program with placement support. It’s available in English. You’ll learn Java, HTML, CSS, JavaScript, Spring, MySQL, MongoDB, AWS, Git, Maven, JUnit, Bootstrap, and Eclipse. You’

In [24]:
tests = [
    "ಡೇಟಾ ಸೈನ್ಸ್ ಕೋರ್ಸ್‌ನ ಅವಧಿ ಎಷ್ಟು?"
]

for t in tests:
    resp, det = generate_multilingual_response(t)
    print(f"\nPrompt ({det}): {t}")
    print("Response:", resp)


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset



Prompt (kn): ಡೇಟಾ ಸೈನ್ಸ್ ಕೋರ್ಸ್‌ನ ಅವಧಿ ಎಷ್ಟು?
Response: ಮಾಸ್ಟರ್ ಡಾಟಾ ಸೈನ್ಸ್ ಕೋರ್ಸ್ನ ಅವಧಿಯು 3 ತಿಂಗಳು (ವಾರದ ದಿನಗಳು) ಅಥವಾ 5 ತಿಂಗಳು (ವಾರಾಂತ್ಯಗಳು).
