In [None]:
!pip install -q "transformers>=4.42.0" "datasets>=2.20.0" "peft>=0.12.0" "accelerate>=0.30.0" einops

import torch
print("Using device:", "cuda" if torch.cuda.is_available() else "cpu")


Using device: cuda


In [None]:
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
)
from datasets import load_dataset
from peft import LoraConfig, get_peft_model

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)


Device: cuda


In [None]:
# Load full train split
dataset = load_dataset("Abhishekcr448/Hinglish-Everyday-Conversations-1M", split="train")
print("Total rows in original dataset:", len(dataset))
print(dataset[0])

# OPTIONAL: use a subset to keep training manageable
max_samples = 50000   # change if you want more/less
if len(dataset) > max_samples:
    dataset = dataset.shuffle(seed=42).select(range(max_samples))

print("Rows used for training:", len(dataset))
print(dataset.column_names)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

hinglish_conversations.csv:   0%|          | 0.00/185M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1001323 [00:00<?, ? examples/s]

Total rows in original dataset: 1001323
{'input': 'kya yaar,traffic mein stuck ho gaya.', 'output': 'arre, mere ko bhi late hoga ab!'}
Rows used for training: 50000
['input', 'output']


In [None]:
def format_example(example):
    user = example["input"].strip()
    bot = example["output"].strip()
    # Simple conversation format; you can later change persona while generating
    text = f"<user>: {user}\n<assistant>: {bot}"
    return {"text": text}

formatted_ds = dataset.map(format_example, remove_columns=dataset.column_names)
print(formatted_ds[0])

# Train / eval split
splits = formatted_ds.train_test_split(test_size=0.01, seed=42)
train_raw = splits["train"]
eval_raw = splits["test"]

print("Train size:", len(train_raw), "Eval size:", len(eval_raw))


Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

{'text': '<user>: Humare budget mein ek achhi SUV lena thoda mushkil ho sakta hai, kya lagta hai?\n<assistant>: Par SUV ka space aur comfort toh family trips ke liye perfect hoga, sochna padega.'}
Train size: 49500 Eval size: 500


In [None]:
model_name = "Qwen/Qwen2.5-1.5B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load model (no quantization, no bitsandbytes)
torch_dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch_dtype,
)
model.resize_token_embeddings(len(tokenizer))

model.to(device)
model.gradient_checkpointing_enable()
model.config.use_cache = False  # needed for gradient checkpointing

print("Model & tokenizer loaded.")


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/660 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

Model & tokenizer loaded.


In [None]:
lora_cfg = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    # Common Qwen target modules
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
)

model = get_peft_model(model, lora_cfg)
model.print_trainable_parameters()


trainable params: 18,464,768 || all params: 1,561,762,816 || trainable%: 1.1823


In [None]:
max_length = 256  # adjust if you want longer context

def tokenize_function(batch):
    tokens = tokenizer(
        batch["text"],
        truncation=True,
        max_length=max_length,
        padding="max_length",
    )
    # For causal LM, labels are just input_ids shifted internally
    tokens["labels"] = tokens["input_ids"].copy()
    return tokens

tokenized_train = train_raw.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"],
)

tokenized_eval = eval_raw.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"],
)

tokenized_train.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
tokenized_eval.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

print("Tokenization complete!")
print(tokenized_train[0])


Map:   0%|          | 0/49500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Tokenization complete!
{'input_ids': tensor([    27,    872,  26818,   4697,    773,    331,    435,  51106,    305,
          9009,    342,  12982,  15236,  26857,   4128,   2906,  41235,   3873,
           278,    653,    263,     11,  19408,   9325,  15459,    274,  30503,
          3362,   4894,     27,  77091,  26818,  11434,   2906,  33261,   4223,
         46051,     11,   1346,  26839,    383,    311,     71,  26857,  20567,
         48627,    850,    521,  51106,   9011,     11,   6708,  15236,  78430,
           595,  39215,  46051,      0, 151643, 151643, 151643, 151643, 151643,
        151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643,
        151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643,
        151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643,
        151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643,
        151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643,
   

In [None]:
from transformers import TrainingArguments

batch_size = 2 if torch.cuda.is_available() else 1

training_args = TrainingArguments(
    output_dir="qwen_hinglish_lora",
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=4,
    # num_train_epochs=1,
    learning_rate = 2e-4
    num_train_epochs = 2
    rank = 32
    lora_alpha = 32
    lora_dropout = 0.05
    # learning_rate=2e-5,
    logging_steps=50,
    save_strategy="steps",
    warmup_ratio=0.05,
    fp16=torch.cuda.is_available(),  # Only use fp16 if GPU available
    push_to_hub=False,
    report_to="none"
)

print("TrainingArguments loaded successfully!")


trainer = Trainer(
    model=model,
    args=training_asrgs,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
)

print("Trainer initialized.")


TrainingArguments loaded successfully!
Trainer initialized.


In [None]:
print("\n🚀 Training started...\n")
trainer.train()
print("\n✅ Training completed!\n")



🚀 Training started...



Step,Training Loss
50,7.3869
100,3.8823
150,0.994
200,0.7734
250,0.7109
300,0.6579
350,0.639
400,0.6108
450,0.5954
500,0.5841





✅ Training completed!



In [None]:
save_dir = "qwen_hinglish_whatsapp_lora"

model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

print("Model & tokenizer saved at:", save_dir)




Model & tokenizer saved at: qwen_hinglish_whatsapp_lora


In [None]:
def chat(prompt, max_new_tokens=64):
    model.eval()
    # Simple persona on top of fine-tuned behavior
    system_prefix = (
        "You are a flirty, casual Hinglish WhatsApp buddy. "
        "Reply in 1-2 short sentences, fun and natural.\n"
    )
    full_prompt = system_prefix + f"<user>: {prompt}\n<assistant>:"

    inputs = tokenizer(full_prompt, return_tensors="pt").to(device)

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            top_p=0.9,
            temperature=0.8,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Cut off the prompt part
    gen_ids = output_ids[0][inputs["input_ids"].shape[1]:]
    text = tokenizer.decode(gen_ids, skip_special_tokens=True)
    return text.strip()


# Quick sanity check
tests = [
    "Kya kar rahe ho?",
    "Kal coffee peene chale?",
    "Aaj mood thoda off hai...",
]

for t in tests:
    print("User :", t)
    print("Bot  :", chat(t))
    print("-" * 60)


User : Kya kar rahe ho?
Bot  : Chalo, let's have dinner!
------------------------------------------------------------
User : Kal coffee peene chale?
Bot  : Bilkul! Phir thoda chai bhi order karte hain, yaar.
------------------------------------------------------------
User : Aaj mood thoda off hai...
Bot  : Lekin tumhara chachi ka woh dance toh badiya hota tha!
------------------------------------------------------------


In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Zip everything inside /content
!zip -r /content/.zip /content

# Download the zip file
from google.colab import files
files.download('/content/content_backup.zip')


  adding: content/ (stored 0%)
  adding: content/.config/ (stored 0%)
  adding: content/.config/hidden_gcloud_config_universe_descriptor_data_cache_configs.db (deflated 97%)
  adding: content/.config/default_configs.db (deflated 98%)
  adding: content/.config/.last_opt_in_prompt.yaml (stored 0%)
  adding: content/.config/.last_survey_prompt.yaml (stored 0%)
  adding: content/.config/config_sentinel (stored 0%)
  adding: content/.config/configurations/ (stored 0%)
  adding: content/.config/configurations/config_default (deflated 15%)
  adding: content/.config/gce (stored 0%)
  adding: content/.config/logs/ (stored 0%)
  adding: content/.config/logs/2025.11.20/ (stored 0%)
  adding: content/.config/logs/2025.11.20/14.30.36.623222.log (deflated 58%)
  adding: content/.config/logs/2025.11.20/14.30.45.937471.log (deflated 56%)
  adding: content/.config/logs/2025.11.20/14.30.04.285207.log (deflated 93%)
  adding: content/.config/logs/2025.11.20/14.30.35.382199.log (deflated 87%)
  adding: co

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [1]:
!ls -lh /content


total 4.0K
drwxr-xr-x 1 root root 4.0K Nov 20 14:30 sample_data
