In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split

new_columns = [
    "par_id",      # 1 (integer ID)
    "art_id",      # @@24942188 (article identifier)
    "topic",       # hopeless (PCL category)
    "country",     # ph (country code)
    "text",        # Full text content
    "label"        # 0 (binary label)
]

# Read main dataset - skip 4 disclaimer rows
df = pd.read_csv(
    "data/dontpatronizeme_pcl.tsv",
    sep="\t",
    header=None,
    skiprows=4,
    names=new_columns,
    on_bad_lines='warn'
)

# Read train/dev splits
train_val_labels = pd.read_csv("data/train_semeval_parids-labels.csv")
test_labels = pd.read_csv("data/dev_semeval_parids-labels.csv")

# Convert string labels to lists
def parse_labels(label_str: str) -> list[int]:
    return [int(x) for x in label_str.strip("[]").replace(" ", "").split(",")]

# Process labels dataframes
for labels_df in [train_val_labels, test_labels]:
    labels_df['labels'] = labels_df['label'].apply(parse_labels)
    labels_df.drop('label', axis=1, inplace=True)

# Join with main data
train_val_df = df.merge(train_val_labels, on="par_id", how="inner")
test_df = df.merge(test_labels, on="par_id", how="inner")

# Add PCL positivity column to both dataframes
train_val_df['pcl_label'] = train_val_df['label'].apply(
    lambda x: 0 if x in {0, 1} else 1)
test_df['pcl_label'] = test_df['label'].apply(
    lambda x: 0 if x in {0, 1} else 1)

train_df, val_df = train_test_split(train_val_df, test_size=0.2, random_state=42, shuffle=True)

### Hyperparameters

In [None]:
batch_size = 32
lr = 8e-5
betas = (0.9, 0.98)
n_epochs = 2
eps = 1e-6
wd = 8e-6

In [4]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer



class PCLDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length=512, balance_method='oversample'):
        self.tokenizer = tokenizer
        self.max_length = max_length

         # Split into positive and negative classes
        pos_df = dataframe[dataframe['pcl_label'] == 1]
        neg_df = dataframe[dataframe['pcl_label'] == 0]
        
        # Balance classes
        if balance_method == 'oversample':
            # Repeat minority class samples
            if len(pos_df) > len(neg_df):
                pos_df, neg_df = neg_df, pos_df
            n_samples = max(len(pos_df), len(neg_df))
            pos_df = pos_df.sample(n_samples, replace=True, random_state=42)
        elif balance_method == 'undersample':
            # Take minimum number of samples
            n_samples = min(len(pos_df), len(neg_df))
            pos_df = pos_df.sample(n_samples, random_state=42)
            neg_df = neg_df.sample(n_samples, random_state=42)
        
        # Combine and shuffle
        balanced_df = pd.concat([pos_df, neg_df]).sample(frac=1, random_state=42)
        self.texts = balanced_df['text'].tolist()
        self.labels = balanced_df['pcl_label'].tolist()

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        
        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Initialize tokenizer and datasets
tokenizer = AutoTokenizer.from_pretrained("answerdotai/ModernBERT-base")

# Create datasets
train_dataset = PCLDataset(train_df, tokenizer)
val_dataset = PCLDataset(val_df, tokenizer)
test_dataset = PCLDataset(test_df, tokenizer)

# Create dataloaders
batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# Example usage:
if __name__ == "__main__":
    sample_batch = next(iter(train_loader))
    print("Batch keys:", sample_batch.keys())
    print("Input shape:", sample_batch['input_ids'].shape)
    print("Label shape:", sample_batch['labels'].shape)

Batch keys: dict_keys(['input_ids', 'attention_mask', 'labels'])
Input shape: torch.Size([16, 512])
Label shape: torch.Size([16])


In [6]:
# Add after the dataloader creation cell
from transformers import AutoModelForSequenceClassification, Adafactor
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm import tqdm
import torch.nn as nn

# Initialize model with classification head
model = AutoModelForSequenceClassification.from_pretrained(
    "answerdotai/ModernBERT-base",
    num_labels=2  # Binary classification
)

# Training setup
device = torch.device("mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)
model.to(device)
optimizer = Adafactor(model.parameters())
loss_fn = nn.CrossEntropyLoss()

# Training loop
def train_epoch(model, dataloader, optimizer, device):
    model.train()
    total_loss = 0
    
    for batch in dataloader:
        optimizer.zero_grad()
        
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        
        loss = loss_fn(logits, labels)
        total_loss += loss.item()
        
        loss.backward()
        optimizer.step()
        
    return total_loss / len(dataloader)

# Validation function
def evaluate(model, dataloader, device):
    model.eval()
    predictions, true_labels = [], []
    total_loss = 0
    
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            
            loss = loss_fn(logits, labels)
            total_loss += loss.item()
            
            batch_preds = torch.argmax(logits, dim=1)
            predictions.extend(batch_preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())
    
    accuracy = accuracy_score(true_labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(
        true_labels, predictions, average='binary'
    )
    
    return {
        'loss': total_loss / len(dataloader),
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

# Training parameters
epochs = 100
best_f1 = -1

# Main training loop
for epoch in tqdm(range(epochs)):
    train_loss = train_epoch(model, train_loader, optimizer, device)
    val_metrics = evaluate(model, val_loader, device)
    
    print(f"Epoch {epoch + 1}/{epochs}")
    print(f"Train Loss: {train_loss:.4f}")
    print(f"Val Loss: {val_metrics['loss']:.4f} | Accuracy: {val_metrics['accuracy']:.4f}")
    print(f"Precision: {val_metrics['precision']:.4f} | Recall: {val_metrics['recall']:.4f} | F1: {val_metrics['f1']:.4f}")
    print("--------------------------------")
    
    # Save best model
    if val_metrics['f1'] > best_f1:
        best_f1 = val_metrics['f1']
        torch.save(model.state_dict(), "best_model.pth")

# Final test evaluation
model.load_state_dict(torch.load("best_model.pth"))
test_metrics = evaluate(model, test_loader, device)

print("\\nFinal Test Results:")
print(f"Accuracy: {test_metrics['accuracy']:.4f}")
print(f"Precision: {test_metrics['precision']:.4f}")
print(f"Recall: {test_metrics['recall']:.4f}")
print(f"F1: {test_metrics['f1']:.4f}")

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using device: cuda


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch 1/100
Train Loss: 0.3601
Val Loss: 2.1405 | Accuracy: 0.5830
Precision: 0.8720 | Recall: 0.1946 | F1: 0.3181
--------------------------------


  1%|          | 1/100 [35:04<57:52:44, 2104.69s/it]

Epoch 2/100
Train Loss: 0.0346
Val Loss: 1.7565 | Accuracy: 0.6232
Precision: 0.8429 | Recall: 0.3028 | F1: 0.4455
--------------------------------


  3%|▎         | 3/100 [1:51:28<60:34:19, 2248.03s/it]

Epoch 3/100
Train Loss: 0.0223
Val Loss: 2.9144 | Accuracy: 0.5704
Precision: 0.8955 | Recall: 0.1594 | F1: 0.2706
--------------------------------


  4%|▍         | 4/100 [2:29:30<60:18:14, 2261.40s/it]

Epoch 4/100
Train Loss: 0.0101
Val Loss: 3.0011 | Accuracy: 0.5906
Precision: 0.8273 | Recall: 0.2291 | F1: 0.3588
--------------------------------


  5%|▌         | 5/100 [3:07:54<60:04:50, 2276.74s/it]

Epoch 5/100
Train Loss: 0.0091
Val Loss: 4.6347 | Accuracy: 0.5677
Precision: 0.8592 | Recall: 0.1620 | F1: 0.2726
--------------------------------


  6%|▌         | 6/100 [3:46:44<59:55:30, 2295.00s/it]

Epoch 6/100
Train Loss: 0.0102
Val Loss: 4.6411 | Accuracy: 0.5704
Precision: 0.8926 | Recall: 0.1600 | F1: 0.2714
--------------------------------


  7%|▋         | 7/100 [4:25:22<59:29:00, 2302.58s/it]

Epoch 7/100
Train Loss: 0.0025
Val Loss: 5.7060 | Accuracy: 0.5764
Precision: 0.8783 | Recall: 0.1773 | F1: 0.2950
--------------------------------


  8%|▊         | 8/100 [5:08:16<61:03:05, 2388.98s/it]

Epoch 8/100
Train Loss: 0.0058
Val Loss: 6.4807 | Accuracy: 0.5717
Precision: 0.8600 | Recall: 0.1713 | F1: 0.2857
--------------------------------


  9%|▉         | 9/100 [5:48:18<60:29:28, 2393.06s/it]

Epoch 9/100
Train Loss: 0.0062
Val Loss: 6.3390 | Accuracy: 0.5631
Precision: 0.8598 | Recall: 0.1507 | F1: 0.2565
--------------------------------


 10%|█         | 10/100 [6:27:49<59:39:20, 2386.23s/it]

Epoch 10/100
Train Loss: 0.0038
Val Loss: 7.7128 | Accuracy: 0.5578
Precision: 0.8199 | Recall: 0.1481 | F1: 0.2508
--------------------------------


 11%|█         | 11/100 [7:04:06<57:24:23, 2322.06s/it]

Epoch 11/100
Train Loss: 0.0026
Val Loss: 9.0117 | Accuracy: 0.5568
Precision: 0.8327 | Recall: 0.1421 | F1: 0.2428
--------------------------------


 12%|█▏        | 12/100 [7:40:17<55:38:13, 2276.07s/it]

Epoch 12/100
Train Loss: 0.0030
Val Loss: 7.0742 | Accuracy: 0.5976
Precision: 0.8281 | Recall: 0.2463 | F1: 0.3797
--------------------------------


 13%|█▎        | 13/100 [8:17:00<54:28:29, 2254.14s/it]

Epoch 13/100
Train Loss: 0.0041
Val Loss: 7.3406 | Accuracy: 0.5857
Precision: 0.8162 | Recall: 0.2211 | F1: 0.3480
--------------------------------


 13%|█▎        | 13/100 [8:21:22<55:55:23, 2314.07s/it]


KeyboardInterrupt: 

In [20]:
import torch.nn.functional as F

model = AutoModelForSequenceClassification.from_pretrained(
    "answerdotai/ModernBERT-base",
    num_labels=2  # Binary classification
)

model.load_state_dict(torch.load("best_model.pth"))

# Evaluation on a single example
def predict_single(text: str, model, tokenizer, device):
    model.to(device)
    model.eval()
    
    encoding = tokenizer(
        text,
        max_length=512,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )
    
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)
    
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
    
    return F.softmax(logits).cpu().numpy()

test_input = df[df['label'] == 1]['text'].iloc[3]
print(test_input)
predict_single(test_input, model, tokenizer, device)

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Comrade David Kerigbo Ugondo was born to the family of Late Pa Akerigbo Adikpo and wife , Mrs. Pam Akerigbo Adikpo on the 25th day of October 1950 in Achagh Mbaduku in Vandeikya Local Government of Benue State . His 67th birthday would be 25th of October . A well secured Nigeria gave birth and nurtured Comrade David into 27 years of meritorious service to industry and unionism . It is tragic that an unsecured Nigeria in 2017 made him vulnerable to day-light gunshots on Sunday September 10 , 2017 by criminal armed robbers who attacked around Birnin Gwari town of Kaduna State . May God grant him and other victims which included an Army Captain and infant baby eternal rest in paradise .


  return F.softmax(logits).cpu().numpy()


array([[0.9708754 , 0.02912457]], dtype=float32)