In [None]:
import json
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel, AdamW, AutoTokenizer, AutoModel
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score
import numpy as np
import os

In [None]:
# Mount Google Drive (Colab specific): google.colab is only importable inside a
# Colab runtime, so this cell will fail anywhere else.
from google.colab import drive
drive.mount('/content/drive')
# Directory the dataset files are read from by the loading cell below.
data_dir = "/content/drive/My Drive/SEMEVAL/data/"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load data
def load_data(file_path):
    """Read the UTF-8 encoded JSON document at ``file_path`` and return the parsed object."""
    with open(file_path, mode="r", encoding="utf-8") as handle:
        raw_json = handle.read()
    return json.loads(raw_json)

# Read both splits from data_dir (training file first, then the dev file)
train_data, dev_data = (
    load_data(os.path.join(data_dir, split_file))
    for split_file in ("training_set_task1.txt", "dev_set_task1.txt")
)

# Preprocess data: pull the "text" and "labels" fields out of every record
train_texts = [record["text"] for record in train_data]
train_labels = [record["labels"] for record in train_data]
dev_texts = [record["text"] for record in dev_data]
dev_labels = [record["labels"] for record in dev_data]

# Encode labels: the label vocabulary is the sorted union of every label seen
# in either split, so the dev split can never contain an unknown class.
all_labels = sorted(
    {label for label_list in (*train_labels, *dev_labels) for label in label_list}
)
mlb = MultiLabelBinarizer(classes=all_labels)
train_labels_enc = mlb.fit_transform(train_labels)
dev_labels_enc = mlb.transform(dev_labels)

# Initialize the tokenizer (the model itself is built in a later cell)
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")


def tokenize_texts(texts, max_length=128):
    """Tokenize ``texts`` with the notebook-level ``tokenizer``.

    Sequences are truncated to at most ``max_length`` tokens, padded to a
    common length, and returned as PyTorch tensors.
    """
    return tokenizer(
        texts,
        max_length=max_length,
        truncation=True,
        padding=True,
        return_tensors="pt",
    )


train_encodings, dev_encodings = tokenize_texts(train_texts), tokenize_texts(dev_texts)

In [None]:
# Dataset class
class PropagandaDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with float label vectors."""

    # Encoding fields that are forwarded with every sample.
    _FEATURE_KEYS = ('input_ids', 'attention_mask')

    def __init__(self, encodings, labels):
        """Keep a reference to ``encodings`` and store ``labels`` as a float tensor."""
        self.labels = torch.tensor(labels, dtype=torch.float)
        self.encodings = encodings

    def __len__(self):
        """Number of samples, taken from the first dimension of the label tensor."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the ``input_ids``, ``attention_mask`` and ``labels`` entries at ``idx``."""
        sample = {key: self.encodings[key][idx] for key in self._FEATURE_KEYS}
        sample['labels'] = self.labels[idx]
        return sample

# Wrap each split's tokenizer output together with its binarized label matrix.
train_dataset = PropagandaDataset(train_encodings, train_labels_enc)
dev_dataset = PropagandaDataset(dev_encodings, dev_labels_enc)

# Model architecture (as per paper)
class BertForMultilabelClassification(nn.Module):
    """``bert-base-cased`` encoder followed by a small multi-label head.

    The head is dropout(0.1) -> Linear(hidden_size, 256) -> Tanh ->
    Linear(256, num_labels). ``forward`` returns the head's raw outputs
    (no sigmoid is applied inside the module).
    """

    def __init__(self, num_labels):
        """Load the pretrained encoder and build a head with ``num_labels`` outputs."""
        super().__init__()
        self.bert = BertModel.from_pretrained("bert-base-cased")
        self.dropout = nn.Dropout(0.1)

        # Layers are created in the same order as they are applied; they are only
        # registered through the Sequential container so parameter names stay
        # ``classifier.0.*`` / ``classifier.2.*``.
        hidden_layer = nn.Linear(self.bert.config.hidden_size, 256)
        activation = nn.Tanh()
        output_layer = nn.Linear(256, num_labels)
        self.classifier = nn.Sequential(hidden_layer, activation, output_layer)

    def forward(self, input_ids, attention_mask):
        """Encode the batch and map the encoder's pooled output to label logits."""
        encoder_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled = encoder_out.pooler_output
        return self.classifier(self.dropout(pooled))

In [None]:
# Initialize model
num_labels = len(all_labels)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BertForMultilabelClassification(num_labels).to(device)

# Class weights for imbalance: pos_weight[i] = (#negatives / #positives) of label i
# in the training split.
# The label vocabulary is built from train + dev, so a label may have *zero*
# positives in the training data. Dividing by that count would produce an
# infinite weight and a non-finite loss, so such labels keep a neutral weight
# of 1.0 instead.
pos_counts = train_labels_enc.sum(axis=0).astype(np.float64)
neg_counts = len(train_labels) - pos_counts
pos_weight_values = np.ones_like(pos_counts)
np.divide(neg_counts, pos_counts, out=pos_weight_values, where=pos_counts > 0)
pos_weights = torch.tensor(pos_weight_values, dtype=torch.float).to(device)
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weights)

# Optimizer (paper settings)
optimizer = AdamW(model.parameters(), lr=1e-5, weight_decay=0.01)

# Data loaders: only the training split is shuffled so that dev metrics are
# computed over a fixed sample order.
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=8, shuffle=False)



In [None]:
def train_epoch(model, train_loader, criterion, optimizer, device):
    """Run one optimisation pass over ``train_loader``.

    Each batch must be a mapping with ``'input_ids'``, ``'attention_mask'`` and
    ``'labels'`` entries. Returns the sum of the per-batch losses divided by
    ``len(train_loader)``.
    """
    model.train()
    batch_losses = []

    for batch in train_loader:
        # Move the tensors consumed by the model and the criterion to `device`.
        input_ids, attention_mask, labels = (
            batch[key].to(device) for key in ('input_ids', 'attention_mask', 'labels')
        )

        optimizer.zero_grad()
        logits = model(input_ids, attention_mask)
        batch_loss = criterion(logits, labels)
        batch_loss.backward()
        optimizer.step()

        batch_losses.append(batch_loss.item())

    return sum(batch_losses) / len(train_loader)





In [None]:
def evaluate_model(model, dev_loader, criterion, device):
    """Score ``model`` on ``dev_loader`` without tracking gradients.

    Returns ``(mean_loss, f1_micro)``: the summed per-batch loss divided by
    ``len(dev_loader)``, and the micro-averaged F1 of the sigmoid outputs
    thresholded at 0.5.
    """
    model.eval()
    running_loss = 0
    prob_chunks, target_chunks = [], []

    with torch.no_grad():
        for batch in dev_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            targets = batch['labels'].to(device)

            logits = model(input_ids, attention_mask)
            running_loss += criterion(logits, targets).item()

            # Collect on CPU so the concatenated results can be handed to numpy.
            prob_chunks.append(torch.sigmoid(logits).cpu())
            target_chunks.append(targets.cpu())

    mean_loss = running_loss / len(dev_loader)

    y_prob = torch.cat(prob_chunks).numpy()
    y_true = torch.cat(target_chunks).numpy()
    y_pred = (y_prob > 0.5).astype(int)

    return mean_loss, f1_score(y_true, y_pred, average='micro')

In [None]:
def train_model(model, train_loader, dev_loader, criterion, optimizer, device,
                epochs=20, patience=10, checkpoint_path="best_model.pth",
                restore_best=True):
    """Main training loop with early stopping on validation micro-F1.

    Args:
        model, train_loader, criterion, optimizer, device: forwarded to
            ``train_epoch``.
        dev_loader: validation batches, forwarded to ``evaluate_model``.
        epochs: maximum number of epochs to run.
        patience: stop once this many consecutive epochs fail to improve on
            the best validation F1.
        checkpoint_path: file the best ``state_dict`` is written to.
        restore_best: when true, reload the best checkpoint into ``model``
            before returning, so the caller is left with the best-scoring
            weights rather than those of the last epoch.

    Returns:
        The best validation micro-F1 observed, or ``None`` if no epoch ran.
    """
    # Start below any reachable score so the first epoch always writes a
    # checkpoint, even when its F1 is exactly 0.0.
    best_f1 = float("-inf")
    wait = 0
    checkpoint_written = False

    for epoch in range(epochs):
        # Training phase
        train_loss = train_epoch(model, train_loader, criterion, optimizer, device)

        # Evaluation phase
        val_loss, val_f1 = evaluate_model(model, dev_loader, criterion, device)

        print(f"Epoch {epoch+1}")
        print(f"Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")
        print(f"Val F1 Micro: {val_f1:.4f}")

        # Early stopping logic
        if val_f1 > best_f1:
            best_f1 = val_f1
            wait = 0
            torch.save(model.state_dict(), checkpoint_path)
            checkpoint_written = True
            print("New best model saved!")
        else:
            wait += 1
            if wait >= patience:
                print(f"Early stopping at epoch {epoch+1}")
                break

    # Without this, the in-memory model keeps the weights of the final epoch,
    # which by construction scored no better than the saved checkpoint.
    if restore_best and checkpoint_written:
        model.load_state_dict(torch.load(checkpoint_path, map_location=device))

    return best_f1 if checkpoint_written else None

# Start training. epochs and patience are not passed, so train_model's defaults
# apply; the call returns once the epoch budget is used up or early stopping
# triggers.
train_model(
    model=model,
    train_loader=train_loader,
    dev_loader=dev_loader,
    criterion=criterion,
    optimizer=optimizer,
    device=device
)

print("Training complete!")

Epoch 1
Train Loss: 1.2884 | Val Loss: 1.7855
Val F1 Micro: 0.2509
New best model saved!
Epoch 2
Train Loss: 1.2751 | Val Loss: 1.7292
Val F1 Micro: 0.2614
New best model saved!
Epoch 3
Train Loss: 1.2179 | Val Loss: 1.7241
Val F1 Micro: 0.2372
Epoch 4
Train Loss: 1.1703 | Val Loss: 1.6091
Val F1 Micro: 0.2148
Epoch 5
Train Loss: 1.1504 | Val Loss: 1.9270
Val F1 Micro: 0.1377
Epoch 6
Train Loss: 1.1280 | Val Loss: 1.4450
Val F1 Micro: 0.2016
Epoch 7
Train Loss: 1.0440 | Val Loss: 1.4803
Val F1 Micro: 0.2273
Early stopping at epoch 7
Training complete!
