In [1]:
import numpy as np
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AdamW, get_linear_schedule_with_warmup
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, TensorDataset
from collections import Counter
import pandas as pd
import re
import string
import emoji
# import contractions
from nltk.corpus import stopwords
import nltk
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# # Clean text function
# def clean_text(text):
#     text = text.lower()
#     text = contractions.fix(text)
#     text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)
#     text = re.sub(r'@\w+', '<USER>', text)
#     text = re.sub(r'#(\w+)', r'\1', text)
#     text = emoji.demojize(text)
#     text = re.sub(r"([a-z])\1{2,}", r"\1\1", text)
#     text = text.translate(str.maketrans('', '', string.punctuation))
#     text = re.sub(r'\s+', ' ', text).strip()
#     text = " ".join([word for word in text.split() if word not in stop_words])
#     return text

# Load and preprocess data
# train_data = pd.read_csv('/kaggle/input/offensive-semeval-task-1/train.tsv', sep='\t' )
# dev_data = pd.read_csv('/kaggle/input/offensive-semeval-task-1/dev.tsv', sep='\t')
# test_data = pd.read_csv('/kaggle/input/offensive-semeval-task-1/test.tsv', sep='\t', names = ['Data', 'Label'])

# Load senti append data
train_data = pd.read_csv('/kaggle/input/sentiment-append-off-set/mrm_senti_append_train.tsv', sep='\t')
dev_data = pd.read_csv('/kaggle/input/sentiment-append-off-set/mrm_senti_append_dev.tsv', sep='\t')
test_data = pd.read_csv('/kaggle/input/sentiment-append-off-set/mrm_senti_append_test.tsv', sep='\t')

X_train, y_train = train_data.iloc[:, 0], train_data.iloc[:, 1]
X_dev, y_dev = dev_data.iloc[:, 0], dev_data.iloc[:, 1]
X_test, y_test = test_data.iloc[:, 0], test_data.iloc[:, 1]

label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_dev = label_encoder.transform(y_dev)
y_test = label_encoder.transform(y_test)

# Load DeBERTa model and tokenizer
# lm = "microsoft/deberta-v3-large"
lm = "microsoft/deberta-base"
tokenizer = AutoTokenizer.from_pretrained(lm)

def tokenize_inputs(texts):
    return tokenizer(texts, padding=True, truncation=True, max_length=77, return_tensors="pt")

train_encodings = tokenize_inputs(X_train.tolist())
dev_encodings = tokenize_inputs(X_dev.tolist())
test_encodings = tokenize_inputs(X_test.tolist())

y_train_tensor = torch.tensor(y_train)
y_dev_tensor = torch.tensor(y_dev)
y_test_tensor = torch.tensor(y_test)

batch_size = 16
train_dataset = TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], y_train_tensor)
dev_dataset = TensorDataset(dev_encodings['input_ids'], dev_encodings['attention_mask'], y_dev_tensor)
test_dataset = TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'], y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# Focal loss function for handling class imbalance
class FocalLoss(torch.nn.Module):
    def __init__(self, alpha=1, gamma=2, reduction='mean'):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, inputs, targets):
        BCE_loss = torch.nn.functional.cross_entropy(inputs, targets, reduction='none')
        pt = torch.exp(-BCE_loss)
        F_loss = self.alpha * (1 - pt) ** self.gamma * BCE_loss
        return F_loss.mean() if self.reduction == 'mean' else F_loss.sum()

    
# Multi-task learning model setup
# Multi-task learning model setup
class DeBERTaModel(torch.nn.Module):
    def __init__(self, model_name, num_main_labels):
        super(DeBERTaModel, self).__init__()
        # Load the DeBERTa model with hidden states enabled
        self.bert = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=num_main_labels,
            output_hidden_states=False
#             output_hidden_states=True  # Enable hidden states
        )
        
    def forward(self, input_ids, attention_mask, labels=None, aux_labels=None):
        # Pass inputs through the DeBERTa model
        outputs = self.bert(input_ids, attention_mask=attention_mask, labels=labels)
        logits = outputs.logits
        #hidden_states = outputs.hidden_states  # Retrieve the hidden states

        ## Use the last layer's hidden states and average across tokens
        #aux_logits = self.aux_classifier(hidden_states[-1].mean(dim=1))

        #return logits, aux_logits
        return logits


num_labels = len(label_encoder.classes_)
model = DeBERTaModel(model_name=lm, num_main_labels=num_labels)
model.to(device)

optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.3)
num_epochs = 5
total_steps = len(train_loader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training with Focal Loss and Early Stopping
loss_fn = FocalLoss(alpha=1.0, gamma=2.0).to(device)
early_stop_tolerance = 2
no_improvement_epochs = 0
best_f1 = 0

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids, attention_mask, labels = [b.to(device) for b in batch]
        #logits, aux_logits = model(input_ids, attention_mask=attention_mask, labels=labels)
        logits = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = loss_fn(logits, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()
        total_loss += loss.item()

    print(f"Epoch {epoch + 1} - Training Loss: {total_loss / len(train_loader)}")

    # Validation phase
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in dev_loader:
            input_ids, attention_mask, labels = [b.to(device) for b in batch]
            #logits, aux_logits = model(input_ids, attention_mask=attention_mask)
            logits = model(input_ids, attention_mask=attention_mask)
            preds = torch.argmax(logits, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    accuracy = accuracy_score(all_labels, all_preds)
    recall = recall_score(all_labels, all_preds, average='macro')
    precision = precision_score(all_labels, all_preds, average='macro')
    f1 = f1_score(all_labels, all_preds, average='macro')
    print(f'Validation - Accuracy: {accuracy:.4f} | Recall: {recall:.4f} | Precision: {precision:.4f} | F1 Score: {f1:.4f}')

    if f1 > best_f1:
        best_f1 = f1
        no_improvement_epochs = 0
    else:
        no_improvement_epochs += 1
        if no_improvement_epochs >= early_stop_tolerance:
            print("Early stopping triggered")
            break

# Test Evaluation
model.eval()
all_preds, all_labels = [], []
with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask, labels = [b.to(device) for b in batch]
        #logits, aux_logits = model(input_ids, attention_mask=attention_mask)
        logits = model(input_ids, attention_mask=attention_mask)
        preds = torch.argmax(logits, dim=1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

accuracy = accuracy_score(all_labels, all_preds)
recall = recall_score(all_labels, all_preds, average='macro')
precision = precision_score(all_labels, all_preds, average='macro')
f1 = f1_score(all_labels, all_preds, average='macro')

test_data['actual_label'] = all_labels
test_data['predicted_label'] = all_preds

test_data.to_csv( 'pps_olid_test_outputs.tsv', sep='\t', index=False)

print(f'Test - Accuracy: {accuracy:.4f} | Recall: {recall:.4f} | Precision: {precision:.4f} | F1 Score: {f1:.4f}')


[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/474 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/559M [00:00<?, ?B/s]

Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1 - Training Loss: 0.1507037171010488
Validation - Accuracy: 0.7470 | Recall: 0.7710 | Precision: 0.7473 | F1 Score: 0.7414
Epoch 2 - Training Loss: 0.11539890218550473
Validation - Accuracy: 0.8180 | Recall: 0.7953 | Precision: 0.8019 | F1 Score: 0.7984
Epoch 3 - Training Loss: 0.0967629640539682
Validation - Accuracy: 0.7970 | Recall: 0.7921 | Precision: 0.7783 | F1 Score: 0.7834
Epoch 4 - Training Loss: 0.07914789376256902
Validation - Accuracy: 0.7960 | Recall: 0.7784 | Precision: 0.7762 | F1 Score: 0.7773
Early stopping triggered
Test - Accuracy: 0.8442 | Recall: 0.8013 | Precision: 0.8077 | F1 Score: 0.8044
