In [None]:
!pip install -q "transformers>=4.42.0" "datasets>=2.20.0" "peft>=0.12.0" "accelerate>=0.30.0" einops

import torch
print("Using device:", "cuda" if torch.cuda.is_available() else "cpu")


In [None]:
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
)
from datasets import load_dataset
from peft import LoraConfig, get_peft_model

# Default to CPU and upgrade to the GPU only when CUDA is actually usable.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
print(f"Device: {device}")


In [None]:
# Pull the complete train split and show its size plus one sample row.
dataset = load_dataset(
    "Abhishekcr448/Hinglish-Everyday-Conversations-1M",
    split="train",
)
print(f"Total rows in original dataset: {len(dataset)}")
print(dataset[0])

# OPTIONAL: cap the number of rows so training stays manageable.
# The shuffle is seeded, so the chosen subset is the same on every run.
max_samples = 50000   # change if you want more/less
if max_samples < len(dataset):
    dataset = dataset.shuffle(seed=42).select(range(max_samples))

print(f"Rows used for training: {len(dataset)}")
print(dataset.column_names)


In [None]:
def format_example(example):
    """Render one dataset row as a single-turn training string.

    Args:
        example: Mapping with an "input" field (the user message) and an
            "output" field (the reply). A field whose value is None is
            treated as empty text instead of raising AttributeError, so one
            null row cannot abort a whole `dataset.map` pass.

    Returns:
        A dict with a single key, "text": a "<user>: ..." line followed by an
        "<assistant>: ..." line, each message stripped of surrounding
        whitespace.
    """
    user = (example["input"] or "").strip()
    bot = (example["output"] or "").strip()
    # Simple conversation format; you can later change persona while generating
    text = f"<user>: {user}\n<assistant>: {bot}"
    return {"text": text}

# Collapse every row into a single "text" column; the raw columns are dropped.
raw_columns = dataset.column_names
formatted_ds = dataset.map(format_example, remove_columns=raw_columns)
print(formatted_ds[0])

# Hold out 1% of the rows for evaluation (seeded, so the split is repeatable).
splits = formatted_ds.train_test_split(test_size=0.01, seed=42)
train_raw, eval_raw = splits["train"], splits["test"]

print(f"Train size: {len(train_raw)} Eval size: {len(eval_raw)}")


In [None]:
model_name = "Qwen/Qwen2.5-1.5B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Padding to a fixed length needs a pad token; fall back to EOS if none is set.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load model (no quantization, no bitsandbytes): bf16 on GPU, fp32 on CPU.
torch_dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch_dtype,
)

# Only grow the embedding matrix when the tokenizer really has more entries
# than the model. Checkpoints may ship an embedding matrix padded beyond the
# tokenizer's vocabulary; resizing unconditionally would *shrink* it, making
# the fine-tuned adapter mismatch the stock base model when it is reloaded.
if len(tokenizer) > model.config.vocab_size:
    model.resize_token_embeddings(len(tokenizer))

model.to(device)
# Enabled before the LoRA wrap in the next cell.
model.gradient_checkpointing_enable()
model.config.use_cache = False  # needed for gradient checkpointing

print("Model & tokenizer loaded.")


In [None]:
# Common Qwen target modules that receive LoRA adapters.
lora_target_modules = [
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
    "gate_proj",
    "up_proj",
    "down_proj",
]

# Rank-16 adapters, alpha 32, light dropout, bias terms left untouched.
lora_cfg = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=lora_target_modules,
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# Wrap the base model with the adapters and report the trainable share.
model = get_peft_model(model, lora_cfg)
model.print_trainable_parameters()


In [None]:
max_length = 256  # adjust if you want longer context

def tokenize_function(batch):
    """Tokenize a batch of formatted conversations for causal-LM training.

    Args:
        batch: Mapping with a "text" key holding a list of strings.

    Returns:
        The tokenizer output (fixed-length "input_ids" and "attention_mask")
        plus a "labels" entry. Labels equal the input ids on real tokens and
        -100 on padding positions, so padding does not contribute to the loss.
    """
    # Terminate every conversation with EOS so the model still sees an explicit
    # end-of-reply token now that padding positions are excluded from the loss.
    eos = tokenizer.eos_token or ""
    texts = [text + eos for text in batch["text"]]

    tokens = tokenizer(
        texts,
        truncation=True,
        max_length=max_length,
        padding="max_length",
    )
    # For causal LM the shift happens inside the model; here we only decide
    # which positions count. Mask via attention_mask (not by token id) so a
    # genuine EOS is kept even when pad and EOS are the same token.
    tokens["labels"] = [
        [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
        for ids, mask in zip(tokens["input_ids"], tokens["attention_mask"])
    ]
    return tokens

# Tokenize both splits with identical settings; the raw "text" column is
# dropped once it has been converted to token ids.
map_options = {"batched": True, "remove_columns": ["text"]}
tokenized_train = train_raw.map(tokenize_function, **map_options)
tokenized_eval = eval_raw.map(tokenize_function, **map_options)

# Have both datasets return torch tensors for the model-facing columns.
tensor_columns = ["input_ids", "attention_mask", "labels"]
for tokenized_split in (tokenized_train, tokenized_eval):
    tokenized_split.set_format(type="torch", columns=tensor_columns)

print("Tokenization complete!")
print(tokenized_train[0])


In [None]:
from transformers import TrainingArguments

batch_size = 2 if torch.cuda.is_available() else 1

# Mixed precision must agree with the dtype the base model was loaded in.
# The model is loaded in bfloat16 on GPU, so train with bf16 autocast when the
# hardware supports it; otherwise fall back to fp16 on GPU. On CPU both flags
# stay False and training runs in full precision.
use_bf16 = (
    torch.cuda.is_available()
    and torch_dtype == torch.bfloat16
    and torch.cuda.is_bf16_supported()
)
use_fp16 = torch.cuda.is_available() and not use_bf16

training_args = TrainingArguments(
    output_dir="qwen_hinglish_lora",
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=4,  # effective batch = batch_size * 4 per device
    num_train_epochs=1,
    learning_rate=2e-5,
    logging_steps=50,
    save_strategy="steps",
    warmup_ratio=0.05,
    bf16=use_bf16,
    fp16=use_fp16,
    push_to_hub=False,
    report_to="none"
)

print("TrainingArguments loaded successfully!")


# Bundle the LoRA-wrapped model, the arguments and both tokenized splits.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
)

print("Trainer initialized.")


In [None]:
# Run the fine-tuning loop configured on `trainer`. The status messages had
# their emoji garbled by a UTF-8 -> cp1252 mis-decoding; restored here.
print("\n🚀 Training started...\n")
trainer.train()
print("\n✅ Training completed!\n")


In [None]:
# Persist the fine-tuned (PEFT-wrapped) model and its tokenizer side by side
# in one directory so they can be reloaded together.
save_dir = "qwen_hinglish_whatsapp_lora"

for artifact in (model, tokenizer):
    artifact.save_pretrained(save_dir)

print(f"Model & tokenizer saved at: {save_dir}")


In [None]:
def chat(prompt, max_new_tokens=64):
    """Generate one assistant reply to `prompt` with the fine-tuned model.

    Args:
        prompt: The user's message. Surrounding whitespace is stripped so the
            prompt matches the "<user>: ...\\n<assistant>: ..." training format.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded reply with special tokens removed, cut at the first
        "<user>:" marker (if the model starts inventing a next turn) and
        stripped of surrounding whitespace. Sampling is enabled, so repeated
        calls may return different replies.
    """
    model.eval()
    # Simple persona on top of fine-tuned behavior
    system_prefix = (
        "You are a flirty, casual Hinglish WhatsApp buddy. "
        "Reply in 1-2 short sentences, fun and natural.\n"
    )
    full_prompt = system_prefix + f"<user>: {prompt.strip()}\n<assistant>:"

    inputs = tokenizer(full_prompt, return_tensors="pt").to(device)
    prompt_len = inputs["input_ids"].shape[1]

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            top_p=0.9,
            temperature=0.8,
            pad_token_id=tokenizer.eos_token_id,
            # config.use_cache was switched off for training; ask for the
            # KV cache explicitly so decoding does not recompute the prefix.
            use_cache=True,
        )

    # Cut off the prompt part
    gen_ids = output_ids[0][prompt_len:]
    text = tokenizer.decode(gen_ids, skip_special_tokens=True)
    # Keep only the assistant's own turn if generation ran into a new one.
    reply = text.split("<user>:", 1)[0]
    return reply.strip()


# Quick sanity check: send a few everyday prompts through `chat` and print
# each exchange, separated by a divider line.
tests = [
    "Kya kar rahe ho?",
    "Kal coffee peene chale?",
    "Aaj mood thoda off hai...",
]

divider = "-" * 60
for user_message in tests:
    print(f"User : {user_message}")
    print(f"Bot  : {chat(user_message)}")
    print(divider)
