In [3]:
import torch
import pandas as pd
import numpy as np
from torch.utils.data import Dataset
from transformers import (
    DebertaV2Tokenizer, 
    DebertaV2ForSequenceClassification, 
    Trainer, 
    TrainingArguments, 
    get_polynomial_decay_schedule_with_warmup
)
from sklearn.metrics import f1_score, precision_score, recall_score
from transformers import DebertaV2Tokenizer, DebertaV2ForSequenceClassification, Trainer, TrainingArguments
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, f1_score
from scipy.special import expit  # for sigmoid in compute_metrics function
from sklearn.metrics import precision_recall_fscore_support
from torch.utils.data import Dataset, DataLoader
from torch import nn

# Names of the label dimensions, one per line. The position of a name in this
# list is the index used for that label everywhere else in the script.
DIMS = [
    'Inappropriateness',
    'Toxic Emotions',
    'Excessive Intensity',
    'Emotional Deception',
    'Missing Commitment',
    'Missing Seriousness',
    'Missing Openness',
    'Missing Intelligibility',
    'Unclear Meaning',
    'Missing Relevance',
    'Confusing Reasoning',
    'Other Reasons',
    'Detrimental Orthography',
    'Reason Unclassified',
]

# Pretrained DeBERTa-v3-large tokenizer, plus the sequence-classification
# model configured with one output per entry in DIMS.
tokenizer = DebertaV2Tokenizer.from_pretrained('microsoft/deberta-v3-large')
model = DebertaV2ForSequenceClassification.from_pretrained(
    'microsoft/deberta-v3-large',
    num_labels=len(DIMS),
)

# Read the train / validation / test splits from their CSV files.
train_df, valid_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('/Dataset/train.csv', '/Dataset/valid.csv', '/Dataset/test.csv')
)

# Pull the label columns out of a split's DataFrame
def preprocess_labels(df):
    """Return the ``DIMS`` columns of *df* as an array.

    The values are returned as stored in the frame; no thresholding or type
    conversion is applied here. Columns appear in the order given by ``DIMS``.
    """
    label_columns = df.loc[:, DIMS]
    return label_columns.values

# Label matrices for the three splits (columns ordered as in DIMS).
train_labels, valid_labels, test_labels = (
    preprocess_labels(frame) for frame in (train_df, valid_df, test_df)
)

# Post texts for the three splits, as plain Python lists.
train_texts, valid_texts, test_texts = (
    frame['post_text'].tolist() for frame in (train_df, valid_df, test_df)
)

# Calculate class weights to handle label imbalance:
#   weight[label] = n_samples / (n_labels * n_positive[label])
label_counts = train_labels.sum(axis=0)
total_counts = len(train_labels)
# A label with no positive example in the training split would otherwise
# trigger a divide-by-zero and produce an infinite weight. Only that exact-zero
# case is replaced (by a count of 1); every other weight is unchanged.
nonzero_counts = np.where(label_counts > 0, label_counts, 1)
class_weights = torch.tensor(total_counts / (len(DIMS) * nonzero_counts), dtype=torch.float)

# Dataset class
class ArgumentDataset(Dataset):
    """Map-style dataset pairing each text with its label vector.

    Tokenization happens lazily in ``__getitem__``: every access tokenizes one
    text, truncating and padding it to exactly ``max_len`` tokens.

    Args:
        texts: Indexable collection of input strings.
        labels: Indexable collection of label vectors, aligned with ``texts``.
        tokenizer: Callable invoked as
            ``tokenizer(text, truncation=..., padding=..., max_length=...,
            return_tensors='pt')`` that returns a mapping containing
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_len: Fixed sequence length passed to the tokenizer.
    """

    # Keys copied from the tokenizer output into each sample.
    _ENCODED_KEYS = ('input_ids', 'attention_mask')

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, item):
        """Return the sample at index *item*.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'`` (the tokenizer
            tensors flattened to 1-D) and ``'labels'`` (a float tensor built
            from the stored label vector).
        """
        encoded = self.tokenizer(
            self.texts[item],
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt',
        )

        # return_tensors='pt' yields a leading batch axis; flatten drops it.
        sample = {key: encoded[key].flatten() for key in self._ENCODED_KEYS}
        sample['labels'] = torch.tensor(self.labels[item], dtype=torch.float)
        return sample

# Wrap each split in an ArgumentDataset; every sample is padded/truncated to
# max_len tokens.
max_len = 256
train_dataset, valid_dataset, test_dataset = (
    ArgumentDataset(split_texts, split_labels, tokenizer, max_len)
    for split_texts, split_labels in (
        (train_texts, train_labels),
        (valid_texts, valid_labels),
        (test_texts, test_labels),
    )
)

# Define evaluation metrics
# Define the compute metrics function for evaluation
from sklearn.metrics import accuracy_score, f1_score
from scipy.special import expit  # sigmoid function

# Define the new compute_metrics function
def compute_metrics(eval_pred):
    """Compute per-label and averaged precision / recall / F1.

    Args:
        eval_pred: Pair ``(logits, labels)`` of NumPy arrays; column ``i`` of
            each corresponds to ``DIMS[i]``.

    Returns:
        dict with ``'<dim>_precision'``, ``'<dim>_recall'`` and
        ``'<dim>_macroF1'`` for every dimension in ``DIMS``, followed by
        ``'mean_precision'``, ``'mean_recall'`` and ``'mean_F1'`` (the
        unweighted averages over the dimensions).
    """
    logits, labels = eval_pred

    # Sigmoid turns logits into probabilities; rounding binarises them.
    probabilities = torch.sigmoid(torch.from_numpy(logits))
    predictions = torch.round(probabilities).numpy()

    out_dict = {}
    precisions, recalls, f1_scores = [], [], []

    for column, dim in enumerate(DIMS):
        # Each label is scored as its own binary problem; average='macro'
        # averages the scores of its two classes.
        precision, recall, f1, _ = precision_recall_fscore_support(
            labels[:, column],
            predictions[:, column],
            average='macro',
            zero_division=0,
        )
        precisions.append(precision)
        recalls.append(recall)
        f1_scores.append(f1)
        out_dict[f'{dim}_precision'] = precision
        out_dict[f'{dim}_recall'] = recall
        out_dict[f'{dim}_macroF1'] = f1

    # Unweighted means across all label dimensions.
    n_dims = len(DIMS)
    out_dict['mean_precision'] = sum(precisions) / n_dims
    out_dict['mean_recall'] = sum(recalls) / n_dims
    out_dict['mean_F1'] = sum(f1_scores) / n_dims

    return out_dict

# Training configuration, grouped by concern.
training_args = TrainingArguments(
    # Where checkpoints and logs are written
    output_dir='./fine_tuned_results',
    logging_dir='./fine_tuned_logs',
    logging_steps=10,
    report_to="none",
    # Schedule length and batch sizing
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=4,  # 8 x 4 = 32 samples per optimizer step per device
    # Optimisation
    learning_rate=2e-5,
    # NOTE(review): 500 warmup steps may exceed the total number of optimizer
    # steps if the training split is small - confirm against the dataset size.
    warmup_steps=500,
    weight_decay=0.02,
    fp16=True,  # mixed-precision training
    # Evaluate and checkpoint once per epoch; keep a single checkpoint on disk
    # and reload the best one when training finishes
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
)

# Initialize Trainer with class weights in the loss function
from torch.nn import BCEWithLogitsLoss

# Focal Loss Implementation
class FocalLoss(nn.Module):
    """Focal loss for multi-label classification on raw logits.

    The element-wise binary cross-entropy is scaled by ``(1 - pt) ** gamma``,
    where ``pt = exp(-bce)``, and the result is averaged over all elements.

    Args:
        gamma: Exponent of the modulating factor.
        alpha: Optional balancing factor; positives are weighted by ``alpha``
            and negatives by ``1 - alpha``. ``None`` disables the weighting.
    """

    def __init__(self, gamma=2, alpha=None):
        super().__init__()
        self.gamma = gamma
        self.alpha = alpha

    def forward(self, logits, labels):
        """Return the mean focal loss of *logits* against *labels*."""
        per_element = nn.functional.binary_cross_entropy_with_logits(
            logits, labels, reduction='none'
        )
        # exp(-bce) recovers the probability assigned to the true outcome.
        prob_true = torch.exp(-per_element)
        modulated = (1 - prob_true) ** self.gamma * per_element

        if self.alpha is None:
            return modulated.mean()

        balance = self.alpha * labels + (1 - self.alpha) * (1 - labels)
        modulated.mul_(balance)
        return modulated.mean()

# Asymmetric Loss Implementation
class AsymmetricLoss(nn.Module):
    """Asymmetric loss for multi-label classification on raw logits.

    Positive and negative targets get separate focusing exponents:

        pos = (1 - p) ** gamma_pos * y       * log(p + 1e-8)
        neg = p ** gamma_neg       * (1 - y) * log(1 - p + 1e-8)
        loss = -mean(pos + neg)

    where ``p = sigmoid(logits)`` and ``y`` is the target.

    Args:
        gamma_neg: Focusing exponent applied to negative targets.
        gamma_pos: Focusing exponent applied to positive targets.
    """

    def __init__(self, gamma_neg=4, gamma_pos=1):
        super().__init__()
        self.gamma_neg = gamma_neg
        self.gamma_pos = gamma_pos

    def forward(self, logits, labels):
        """Return the mean asymmetric loss of *logits* against *labels*.

        Half-precision logits are upcast to float32 first. In float16 the
        1e-8 epsilon is below the smallest representable step and the sigmoid
        saturates to exactly 0 or 1, so the masked term becomes
        ``0 * log(0)`` and the loss turns into NaN.
        """
        if logits.dtype in (torch.float16, torch.bfloat16):
            logits = logits.float()
        labels = labels.to(logits.dtype)

        probas = torch.sigmoid(logits)
        pos_loss = (1 - probas) ** self.gamma_pos * labels * torch.log(probas + 1e-8)
        neg_loss = probas ** self.gamma_neg * (1 - labels) * torch.log(1 - probas + 1e-8)
        return -torch.mean(pos_loss + neg_loss)

# Weighted Trainer Class
class WeightedTrainer(Trainer):
    """``Trainer`` that computes the loss with a selectable criterion.

    Args:
        *args: Forwarded to ``Trainer.__init__``.
        loss_fn: ``"focal"`` selects ``FocalLoss(gamma=2)``, ``"asl"`` selects
            ``AsymmetricLoss(gamma_neg=4, gamma_pos=1)``; any other value
            falls back to ``nn.BCEWithLogitsLoss`` with the module-level
            ``class_weights`` as ``pos_weight``.
        **kwargs: Forwarded to ``Trainer.__init__``.
    """

    def __init__(self, *args, loss_fn="focal", **kwargs):
        super().__init__(*args, **kwargs)
        self.loss_fn = loss_fn
        if loss_fn == "focal":
            self.criterion = FocalLoss(gamma=2)  # Adjust gamma as needed
        elif loss_fn == "asl":
            self.criterion = AsymmetricLoss(gamma_neg=4, gamma_pos=1)  # Adjust gammas as needed
        else:
            # Default to BCEWithLogitsLoss with class weights
            self.criterion = nn.BCEWithLogitsLoss(pos_weight=class_weights.to(self.model.device))

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Run the model without labels and score its logits with ``self.criterion``.

        Args:
            model: Model to call with the remaining ``inputs``.
            inputs: Batch mapping; its ``"labels"`` entry is removed before
                the forward pass so the model does not compute its own loss.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                just the loss.
            num_items_in_batch: Accepted because recent Transformers releases
                pass it to ``compute_loss``; without it the call raises
                ``TypeError``. It is not used here.

        Returns:
            The loss tensor, or ``(loss, outputs)`` if ``return_outputs``.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss = self.criterion(logits, labels)
        return (loss, outputs) if return_outputs else loss

# Build the trainer; loss_fn="asl" selects the asymmetric loss criterion.
trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    loss_fn="asl",
)

# Run fine-tuning.
trainer.train()

# Report metrics on the validation split.
results = trainer.evaluate(eval_dataset=valid_dataset)
print(f"Validation results: {results}")

# Report metrics on the held-out test split.
test_results = trainer.evaluate(eval_dataset=test_dataset)
print(f"Test results: {test_results}")

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Inappropriateness Precision,Inappropriateness Recall,Inappropriateness Macrof1,Toxic emotions Precision,Toxic emotions Recall,Toxic emotions Macrof1,Excessive intensity Precision,Excessive intensity Recall,Excessive intensity Macrof1,Emotional deception Precision,Emotional deception Recall,Emotional deception Macrof1,Missing commitment Precision,Missing commitment Recall,Missing commitment Macrof1,Missing seriousness Precision,Missing seriousness Recall,Missing seriousness Macrof1,Missing openness Precision,Missing openness Recall,Missing openness Macrof1,Missing intelligibility Precision,Missing intelligibility Recall,Missing intelligibility Macrof1,Unclear meaning Precision,Unclear meaning Recall,Unclear meaning Macrof1,Missing relevance Precision,Missing relevance Recall,Missing relevance Macrof1,Confusing reasoning Precision,Confusing reasoning Recall,Confusing reasoning Macrof1,Other reasons Precision,Other reasons Recall,Other reasons Macrof1,Detrimental orthography Precision,Detrimental orthography Recall,Detrimental orthography Macrof1,Reason unclassified Precision,Reason unclassified Recall,Reason unclassified Macrof1,Mean Precision,Mean Recall,Mean F1
1,0.109,0.110965,0.218182,0.5,0.303797,0.361364,0.5,0.419525,0.090909,0.5,0.153846,0.45246,0.463068,0.261439,0.334091,0.5,0.400545,0.459091,0.5,0.478673,0.35,0.5,0.411765,0.550194,0.547941,0.548421,0.395455,0.5,0.441624,0.885845,0.509804,0.454798,0.480641,0.432483,0.353491,0.596061,0.61244,0.603365,0.521277,0.575472,0.171964,0.493182,0.5,0.496568,0.442054,0.510086,0.392844
2,0.0911,0.090886,0.279817,0.491935,0.356725,0.658095,0.684452,0.6633,0.397751,0.368056,0.263786,0.476165,0.465909,0.376005,0.677273,0.699888,0.667879,0.867442,0.608636,0.655927,0.6375,0.651515,0.641226,0.600818,0.548935,0.405664,0.664678,0.667291,0.665954,0.587655,0.605117,0.591089,0.461187,0.497537,0.478673,0.475,0.5,0.487179,0.481818,0.5,0.490741,0.493182,0.5,0.496568,0.55417,0.556376,0.517194
3,0.0857,0.085223,0.447773,0.497144,0.367872,0.709761,0.725333,0.716129,0.696078,0.668056,0.679743,0.714821,0.707386,0.710945,0.70812,0.726913,0.71252,0.974178,0.694444,0.766747,0.664861,0.679654,0.66967,0.642857,0.656013,0.636779,0.606393,0.614568,0.60987,0.61561,0.632672,0.621139,0.842593,0.585772,0.624957,0.475,0.5,0.487179,0.481818,0.5,0.490741,0.493182,0.5,0.496568,0.648075,0.620568,0.613633
4,0.0809,0.082181,0.659722,0.511593,0.391765,0.735681,0.75116,0.742267,0.713482,0.755556,0.72984,0.671245,0.693182,0.680233,0.692454,0.716988,0.682405,0.806288,0.790704,0.798236,0.671419,0.703463,0.664488,0.626074,0.612586,0.538765,0.62,0.67991,0.600224,0.580833,0.612542,0.561401,0.567674,0.614894,0.579082,0.977169,0.545455,0.571651,0.984018,0.5625,0.60299,0.493182,0.5,0.496568,0.699946,0.646467,0.617137
5,0.0723,0.08157,0.725287,0.705813,0.708255,0.75049,0.75049,0.75049,0.711242,0.745833,0.725343,0.686418,0.701705,0.693223,0.723232,0.723232,0.723232,0.849593,0.795655,0.81982,0.697949,0.701299,0.699542,0.638659,0.649964,0.619258,0.665057,0.724263,0.675121,0.6125,0.646189,0.614121,0.617502,0.617502,0.617502,0.781395,0.631579,0.675708,0.650538,0.557783,0.58042,0.493182,0.5,0.496568,0.685932,0.675093,0.671329


Validation results: {'eval_loss': 0.08157048374414444, 'eval_Inappropriateness_precision': 0.725287356321839, 'eval_Inappropriateness_recall': 0.7058131720430108, 'eval_Inappropriateness_macroF1': 0.708254527272332, 'eval_Toxic Emotions_precision': 0.7504897412104341, 'eval_Toxic Emotions_recall': 0.7504897412104341, 'eval_Toxic Emotions_macroF1': 0.7504897412104341, 'eval_Excessive Intensity_precision': 0.7112423916935195, 'eval_Excessive Intensity_recall': 0.7458333333333333, 'eval_Excessive Intensity_macroF1': 0.7253433208489388, 'eval_Emotional Deception_precision': 0.6864184270199307, 'eval_Emotional Deception_recall': 0.7017045454545454, 'eval_Emotional Deception_macroF1': 0.6932230175699545, 'eval_Missing Commitment_precision': 0.7232317584568073, 'eval_Missing Commitment_recall': 0.7232317584568073, 'eval_Missing Commitment_macroF1': 0.7232317584568073, 'eval_Missing Seriousness_precision': 0.8495934959349594, 'eval_Missing Seriousness_recall': 0.7956545654565457, 'eval_Missing