<a href="https://colab.research.google.com/github/HartinderSingh/IETE_GitTutorial/blob/master/Untitled16.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import torch
import json
import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.utils.class_weight import compute_class_weight
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback
)
import torch.nn as nn

def preprocess_text(text):
    """Clean and normalize Punjabi text"""
    if not isinstance(text, str):
        return ""
    text = re.sub(r'\s+', ' ', text.strip())
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'[!?]{2,}', lambda x: x.group()[0], text)
    return text

def load_data(file_path="punjabi_songs_with_mood.json"):
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    df = pd.DataFrame(data)
    df = df.dropna(subset=['lyrics', 'mood'])
    df['lyrics'] = df['lyrics'].apply(preprocess_text)
    df = df[df['lyrics'].str.strip() != '']
    df = df[df['lyrics'].str.split().str.len() >= 4]

    print(f"Loaded {len(df)} samples")
    print("Class distribution:\n", df['mood'].value_counts())
    return df

def prepare_dataset(df):
    label_encoder = LabelEncoder()
    df['labels'] = label_encoder.fit_transform(df['mood'])

    train_df, val_df = train_test_split(df, test_size=0.2, stratify=df['labels'], random_state=42)
    train_dataset = Dataset.from_pandas(train_df[['lyrics', 'labels']].reset_index(drop=True))
    val_dataset = Dataset.from_pandas(val_df[['lyrics', 'labels']].reset_index(drop=True))

    return {'train': train_dataset, 'validation': val_dataset}, label_encoder

def tokenize_dataset(dataset, model_name):
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    def tokenize_fn(examples):
        return tokenizer(
            examples["lyrics"],
            truncation=True,
            padding="max_length",
            max_length=192,
            return_attention_mask=True
        )

    return {k: v.map(tokenize_fn, batched=True) for k, v in dataset.items()}, tokenizer

class WeightedTrainer(Trainer):
    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.get("labels")
        outputs = model(**inputs)

        loss_fct = nn.CrossEntropyLoss(
            weight=self.class_weights.to(outputs.logits.device) if self.class_weights is not None else None,
            label_smoothing=0.05
        )
        loss = loss_fct(outputs.logits.view(-1, model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

def train_model(tokenized_dataset, tokenizer, label_encoder, model_name, output_dir):
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=len(label_encoder.classes_),
        hidden_dropout_prob=0.25,
        attention_probs_dropout_prob=0.25,
        classifier_dropout=0.15
    )

    class_weights = torch.FloatTensor(
        compute_class_weight(
            class_weight='balanced',
            classes=np.unique(tokenized_dataset["train"]["labels"]),
            y=tokenized_dataset["train"]["labels"]
        )
    ).clamp(min=0.5, max=4.0)

    print("Class weights:", class_weights)

    training_args = TrainingArguments(
        output_dir=output_dir,
        eval_strategy="epoch",  # <- `eval_strategy` instead of `evaluation_strategy`
        save_strategy="epoch",
        learning_rate=2.5e-5,
        per_device_train_batch_size=12,
        per_device_eval_batch_size=24,
        num_train_epochs=10,
        weight_decay=0.035,
        warmup_ratio=0.06,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        logging_steps=30,
        fp16=torch.cuda.is_available(),
        lr_scheduler_type='cosine',
        optim="adamw_torch_fused",
        max_grad_norm=0.85,
        report_to="none",
        seed=42,
        dataloader_pin_memory=False  # Avoid warning on CPU
    )

    trainer = WeightedTrainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["validation"],
        compute_metrics=lambda p: {
            "accuracy": accuracy_score(p.label_ids, np.argmax(p.predictions, axis=1)),
            "f1": precision_recall_fscore_support(
                p.label_ids, np.argmax(p.predictions, axis=1), average='weighted')[2]
        },
        callbacks=[
            EarlyStoppingCallback(
                early_stopping_patience=3,
                early_stopping_threshold=0.008
            )
        ],
        class_weights=class_weights
    )

    print("\nStarting training with loss convergence monitoring...")
    trainer.train()

    history = trainer.state.log_history
    train_losses = [x['loss'] for x in history if 'loss' in x]
    eval_losses = [x['eval_loss'] for x in history if 'eval_loss' in x]

    print("\nTraining Report:")
    print(f"Final Train Loss: {train_losses[-1]:.4f}")
    print(f"Final Eval Loss: {eval_losses[-1]:.4f}")
    print(f"Loss Difference: {abs(train_losses[-1] - eval_losses[-1]):.4f}")

    eval_results = trainer.evaluate()
    print(f"\nValidation Accuracy: {eval_results['eval_accuracy']:.2%}")
    print(f"Validation F1: {eval_results['eval_f1']:.2%}")

    if 0.70 <= eval_results['eval_accuracy'] <= 0.80:
        print("✅ Target accuracy achieved with good loss convergence!")
    elif eval_results['eval_accuracy'] < 0.70:
        print("⚠️ Try: Increase epochs to 12 or reduce dropout slightly")
    else:
        print("⚠️ Try: Add more dropout or increase weight decay")

    return trainer

def save_model(trainer, tokenizer, label_encoder, output_dir):
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)

    with open(f"{output_dir}/label_mapping.json", "w") as f:
        json.dump({
            "id2label": {i: label for i, label in enumerate(label_encoder.classes_)},
            "label2id": {label: i for i, label in enumerate(label_encoder.classes_)}
        }, f)

def main():
    DATA_FILE = "punjabi_songs_with_mood.json"
    MODEL_NAME = "ai4bharat/IndicBERTv2-MLM-only"
    OUTPUT_DIR = "./punjabi_mood_model"

    print("Loading data...")
    df = load_data(DATA_FILE)

    print("\nPreparing dataset...")
    dataset, label_encoder = prepare_dataset(df)

    print("\nTokenizing data...")
    tokenized_dataset, tokenizer = tokenize_dataset(dataset, MODEL_NAME)

    print("\nTraining model...")
    trainer = train_model(
        tokenized_dataset,
        tokenizer,
        label_encoder,
        MODEL_NAME,
        OUTPUT_DIR
    )

    print("\nSaving model...")
    save_model(trainer, tokenizer, label_encoder, OUTPUT_DIR)
    print(f"Model saved to {OUTPUT_DIR}")

if __name__ == "__main__":
    main()


Loading data...
Loaded 4455 samples
Class distribution:
 mood
romantic     2042
energetic     860
sad           745
nostalgic     345
happy         314
spiritual      94
angry          55
Name: count, dtype: int64

Preparing dataset...

Tokenizing data...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/51.0 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.75M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Map:   0%|          | 0/3564 [00:00<?, ? examples/s]

Map:   0%|          | 0/891 [00:00<?, ? examples/s]


Training model...


config.json:   0%|          | 0.00/639 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ai4bharat/IndicBERTv2-MLM-only and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Class weights: tensor([4.0000, 0.7400, 2.0285, 1.8447, 0.5000, 0.8543, 4.0000])

Starting training with loss convergence monitoring...


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,2.0369,2.072306,0.088664,0.043568
2,1.8821,1.849827,0.448934,0.436702
3,1.7626,1.682108,0.563412,0.548996
4,1.6548,1.670294,0.500561,0.514496
5,1.5822,1.596048,0.570146,0.568155
6,1.411,1.638725,0.609428,0.591618
7,1.2996,1.584307,0.600449,0.586447
8,1.1845,1.606554,0.588103,0.580946
9,1.2677,1.608759,0.589226,0.581083
10,1.2024,1.606987,0.586981,0.579723


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Training Report:
Final Train Loss: 1.2024
Final Eval Loss: 1.6070
Loss Difference: 0.4046



Validation Accuracy: 60.04%
Validation F1: 58.64%
⚠️ Try: Increase epochs to 12 or reduce dropout slightly

Saving model...
Model saved to ./punjabi_mood_model
