In [1]:
import numpy as np
import torch
import pandas as pd
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

In [2]:
df = pd.read_csv("F:/Projects/Machine and Deep Learning/Depression_Severity/Datasets/train_lemma.csv") 
df

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,lemmas
0,0000997932d777bf,explanationwhy the edits made under my usernam...,0,0,0,0,0,0,"['explanationwhy', 'the', 'edit', 'make', 'und..."
1,000103f0d9cfb60f,d'aww! he matches this background colour i'm s...,0,0,0,0,0,0,"[""d'aww"", 'he', 'match', 'this', 'background',..."
2,000113f07ec002fd,"hey man, i'm really not trying to edit war. it...",0,0,0,0,0,0,"['hey', 'man', 'I', 'be', 'really', 'not', 'tr..."
3,0001b41b1c6bb37e,morei can't make any real suggestions on impro...,0,0,0,0,0,0,"['morei', 'can', 'not', 'make', 'any', 'real',..."
4,0001d958c54c6e35,"you, sir, are my hero. any chance you remember...",0,0,0,0,0,0,"['you', 'sir', 'be', 'my', 'hero', 'any', 'cha..."
...,...,...,...,...,...,...,...,...,...
137136,ffe987279560d7ff,"and for the second time of asking, when your v...",0,0,0,0,0,0,"['and', 'for', 'the', 'second', 'time', 'of', ..."
137137,ffea4adeee384e90,you should be ashamed of yourself that is a ho...,0,0,0,0,0,0,"['you', 'should', 'be', 'ashamed', 'of', 'your..."
137138,ffee36eab5c267c9,"spitzer umm, theres no actual article for pros...",0,0,0,0,0,0,"['spitzer', 'umm', 'there', 's', 'no', 'actual..."
137139,fff125370e4aaaf3,and it looks like it was actually you who put ...,0,0,0,0,0,0,"['and', 'it', 'look', 'like', 'it', 'be', 'act..."


In [3]:
print(f"Dataset shape: {df.shape}")

Dataset shape: (137141, 9)


In [4]:
label_cols = [col for col in df.columns if col not in ['id', 'comment_text', 'lemmas']]
print(f"Label columns: {label_cols}")

Label columns: ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']


In [5]:
print("\nLabel distribution:")
for col in label_cols:
    pos_count = df[col].sum()
    total_count = len(df)
    print(f"  {col}: {pos_count}/{total_count} ({pos_count/total_count:.3f})")


Label distribution:
  toxic: 14084/137141 (0.103)
  severe_toxic: 1420/137141 (0.010)
  obscene: 7828/137141 (0.057)
  threat: 450/137141 (0.003)
  insult: 7354/137141 (0.054)
  identity_hate: 1289/137141 (0.009)


In [6]:
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)
print(f"Train shape: {train_df.shape}, Validation shape: {val_df.shape}")

Train shape: (109712, 9), Validation shape: (27429, 9)


In [7]:
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

In [8]:
tokenizer = AutoTokenizer.from_pretrained("prajjwal1/bert-mini")
model = AutoModelForSequenceClassification.from_pretrained(
    "prajjwal1/bert-mini",
    num_labels=len(label_cols),
    problem_type="multi_label_classification",
    use_safetensors=True
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-mini and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
def tokenize_function(batch):
    encoding = tokenizer(
        batch["lemmas"],
        padding="max_length",
        truncation=True,
        max_length=128
    )
    
    labels = []
    for i in range(len(batch["lemmas"])):
        label_row = [float(batch[col][i]) for col in label_cols]  # cast here
        labels.append(label_row)
    
    encoding["labels"] = labels
    return encoding

In [10]:
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/109712 [00:00<?, ? examples/s]

Map:   0%|          | 0/27429 [00:00<?, ? examples/s]

In [11]:
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
val_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

In [18]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    
    # Convert to numpy
    if isinstance(logits, torch.Tensor):
        logits = logits.cpu().numpy()
    if isinstance(labels, torch.Tensor):
        labels = labels.cpu().numpy()
    
    # Get probabilities using sigmoid
    probs = torch.sigmoid(torch.tensor(logits)).numpy()
    
    # Use simple 0.5 threshold for all labels
    preds = (probs >= 0.4).astype(int)
    
    print(f"Predictions shape: {preds.shape}")
    print(f"Labels shape: {labels.shape}")
    print(f"Positive predictions per label: {np.sum(preds, axis=0)}")
    print(f"Positive labels per label: {np.sum(labels, axis=0)}")
    
    # Compute metrics
    f1 = f1_score(labels, preds, average="micro", zero_division=0)
    precision = precision_score(labels, preds, average="micro", zero_division=0)
    recall = recall_score(labels, preds, average="micro", zero_division=0)
    
    # Per-label F1 for debugging
    f1_per_label = f1_score(labels, preds, average=None, zero_division=0)
    print(f"Per-label F1: {dict(zip(label_cols, f1_per_label))}")
    
    try:
        auc = roc_auc_score(labels, probs, average="micro")
    except ValueError:
        auc = 0.0
    
    return {
        "f1": f1,
        "precision": precision,
        "recall": recall,
        "auc": auc
    }

In [19]:
training_args = TrainingArguments(
    output_dir="./temp_results",
    eval_strategy="epoch",
    save_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=10,
    weight_decay=0.01,
    logging_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    report_to="none",
    remove_unused_columns=True,
    fp16=torch.cuda.is_available(),  
    dataloader_num_workers=8, 
)

In [20]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


In [21]:
print(torch.cuda.is_available())

True


In [22]:
print("Starting training...")
trainer.train()

# Final evaluation
print("\nFinal evaluation:")
final_metrics = trainer.evaluate()
print(f"Final F1: {final_metrics['eval_f1']:.4f}")
print(f"Final Precision: {final_metrics['eval_precision']:.4f}")
print(f"Final Recall: {final_metrics['eval_recall']:.4f}")
print(f"Final AUC: {final_metrics['eval_auc']:.4f}")

Starting training...


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Auc
1,0.0462,0.054274,0.740842,0.672934,0.823993,0.984476
2,0.0422,0.052211,0.766634,0.766228,0.76704,0.983677
3,0.0388,0.05346,0.763774,0.779618,0.748561,0.982947
4,0.0392,0.053301,0.761616,0.743331,0.780824,0.983439
5,0.0371,0.052912,0.761954,0.74956,0.774765,0.983855
6,0.0353,0.055346,0.755356,0.723316,0.790367,0.983504
7,0.0339,0.054785,0.75805,0.746258,0.770221,0.983103
8,0.0324,0.056547,0.756649,0.740467,0.773553,0.982629
9,0.0316,0.055928,0.753305,0.723758,0.785368,0.983328
10,0.0309,0.056602,0.753693,0.728648,0.780521,0.98285


Predictions shape: (27429, 6)
Labels shape: (27429, 6)
Positive predictions per label: [3362  384 1844   39 2085  370]
Positive labels per label: [2811.  304. 1607.   97. 1508.  275.]
Per-label F1: {'toxic': 0.7685080187915114, 'severe_toxic': 0.502906976744186, 'obscene': 0.8148362793393219, 'threat': 0.17647058823529413, 'insult': 0.7275257445032006, 'identity_hate': 0.5271317829457365}
Predictions shape: (27429, 6)
Labels shape: (27429, 6)
Positive predictions per label: [2736  344 1686   15 1610  218]
Positive labels per label: [2811.  304. 1607.   97. 1508.  275.]
Per-label F1: {'toxic': 0.8000721110510186, 'severe_toxic': 0.4876543209876543, 'obscene': 0.8296386273914363, 'threat': 0.21428571428571427, 'insult': 0.7543296985246953, 'identity_hate': 0.539553752535497}
Predictions shape: (27429, 6)
Labels shape: (27429, 6)
Positive predictions per label: [2627  308 1579   43 1523  259]
Positive labels per label: [2811.  304. 1607.   97. 1508.  275.]
Per-label F1: {'toxic': 0.796984

Predictions shape: (27429, 6)
Labels shape: (27429, 6)
Positive predictions per label: [2736  344 1686   15 1610  218]
Positive labels per label: [2811.  304. 1607.   97. 1508.  275.]
Per-label F1: {'toxic': 0.8000721110510186, 'severe_toxic': 0.4876543209876543, 'obscene': 0.8296386273914363, 'threat': 0.21428571428571427, 'insult': 0.7543296985246953, 'identity_hate': 0.539553752535497}
Final F1: 0.7666
Final Precision: 0.7662
Final Recall: 0.7670
Final AUC: 0.9837


In [17]:
save_dir = "prajwal_bert"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

('prajwal_bert\\tokenizer_config.json',
 'prajwal_bert\\special_tokens_map.json',
 'prajwal_bert\\vocab.txt',
 'prajwal_bert\\added_tokens.json',
 'prajwal_bert\\tokenizer.json')