In [None]:
import json
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BitsAndBytesConfig,
    AdamW
)
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np
from typing import Dict
from sklearn.preprocessing import MultiLabelBinarizer

In [None]:


# Per-model training settings, keyed by a short alias. Each entry is the
# `model_config` dict that train_model() and prepare_datasets() read from.
MODEL_CONFIGS = {
    "phi-3-mini": {
        # Hugging Face Hub id passed to from_pretrained() for both model and tokenizer
        "name": "microsoft/Phi-3-mini-4k-instruct",
        # When True, train_model() builds a 4-bit BitsAndBytesConfig for loading
        "quantize": True,
        # Truncation length handed to the tokenizer
        "max_length": 512,
        # Batch size used for both the train and dev DataLoaders
        "batch_size": 2
    }
}

class PropagandaDataset(Dataset):
    """Map-style dataset that pairs tokenizer encodings with label vectors.

    Args:
        encodings: Mapping that can be indexed by ``'input_ids'`` and
            ``'attention_mask'``; each value must itself be indexable by
            sample position.
        labels: Array-like of per-sample label vectors; converted once to a
            ``torch.float`` tensor.
    """

    def __init__(self, encodings, labels):
        self.labels = torch.tensor(labels, dtype=torch.float)
        self.encodings = encodings

    def __len__(self):
        # One label row per sample, so the label tensor defines the length.
        return len(self.labels)

    def __getitem__(self, idx):
        # Insertion order matches the model inputs first, targets last.
        sample = {
            field: self.encodings[field][idx]
            for field in ('input_ids', 'attention_mask')
        }
        sample['labels'] = self.labels[idx]
        return sample

class FocalLoss(nn.Module):
    """Focal loss on top of element-wise binary cross-entropy with logits.

    Each element's BCE term is scaled by ``(1 - pt) ** gamma`` where
    ``pt = exp(-BCE)``, and optionally by a per-class weight ``alpha``.
    The result is averaged over all elements.

    Args:
        alpha: Optional tensor of shape ``[num_classes]`` with per-class weights.
        gamma: Exponent of the modulating factor.
    """

    def __init__(self, alpha=None, gamma=2):
        super().__init__()
        self.gamma = gamma
        self.alpha = alpha

    def forward(self, inputs, targets):
        """Return the mean focal loss for ``inputs`` (logits) against ``targets``."""
        elementwise_bce = F.binary_cross_entropy_with_logits(
            inputs, targets, reduction='none'
        )
        pt = torch.exp(-elementwise_bce)
        weighted = (1 - pt) ** self.gamma * elementwise_bce

        if self.alpha is None:
            return weighted.mean()

        # Bring the class weights onto the logits' device and add a leading
        # axis so they broadcast across the batch dimension.
        per_class = self.alpha.to(inputs.device).unsqueeze(0)
        return (per_class * weighted).mean()

def load_data(file_path: str) -> list:
    """Parse the UTF-8 encoded JSON file at ``file_path`` and return its content."""
    with open(file_path, mode="r", encoding="utf-8") as handle:
        parsed = json.load(handle)
    return parsed

def prepare_datasets(config: Dict, tokenizer, train_texts, train_labels, dev_texts, dev_labels):
    """Tokenize both splits and wrap them as ``PropagandaDataset`` objects.

    Args:
        config: Model settings; only ``config["max_length"]`` is read here.
        tokenizer: Callable tokenizer invoked once per split.
        train_texts, dev_texts: Texts for each split.
        train_labels, dev_labels: Label vectors aligned with the texts.

    Returns:
        ``(train_dataset, dev_dataset)``.
    """
    # Each split is padded to its own longest sequence and truncated at
    # max_length; tensors come back as PyTorch tensors.
    tokenizer_options = {
        "padding": True,
        "truncation": True,
        "max_length": config["max_length"],
        "return_tensors": "pt",
    }

    # Tokenize train first, then dev, before building either dataset.
    train_encodings, dev_encodings = [
        tokenizer(split_texts, **tokenizer_options)
        for split_texts in (train_texts, dev_texts)
    ]

    train_dataset = PropagandaDataset(train_encodings, train_labels)
    dev_dataset = PropagandaDataset(dev_encodings, dev_labels)
    return train_dataset, dev_dataset

def _inverse_frequency_weights(label_matrix, num_classes):
    """Return inverse-frequency class weights normalised to sum to ``num_classes``.

    Counts are clamped to at least 1 before inversion. Without the clamp a
    class with zero positives would get a weight of ~1/epsilon and, after
    normalisation, absorb essentially all of the weight mass, driving the
    weights of every real class towards zero.
    """
    class_counts = np.sum(label_matrix, axis=0).astype(np.float64)
    safe_counts = np.maximum(class_counts, 1.0)
    alpha = 1.0 / safe_counts
    alpha = alpha / alpha.sum() * num_classes
    return torch.tensor(alpha, dtype=torch.float32)


def train_model(model_config: Dict, train_data, dev_data, all_labels, num_epochs=None):
    """Fine-tune a sequence-classification model for multi-label prediction.

    Args:
        model_config: Dict with ``"name"`` (Hub model id), ``"quantize"``
            (load in 4-bit when truthy), ``"max_length"`` and ``"batch_size"``.
        train_data: ``(texts, label_matrix)`` for training; ``label_matrix`` is
            summed over axis 0 to obtain per-class counts.
        dev_data: ``(texts, label_matrix)`` for evaluation.
        all_labels: Collection of label names; only its length is used, as the
            number of output classes.
        num_epochs: Number of epochs. When ``None`` the notebook-level
            ``NUM_EPOCHS`` is used, which keeps existing call sites working.

    Returns:
        The best micro-averaged F1 seen on the dev split.

    Side effects:
        Prints per-epoch metrics and writes the state dict of the best epoch
        to ``best_<model name with '/' replaced by '_'>.pth``.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    num_labels = len(all_labels)
    if num_epochs is None:
        num_epochs = NUM_EPOCHS

    # Load model
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=model_config["quantize"],
        bnb_4bit_compute_dtype=torch.bfloat16
    ) if model_config["quantize"] else None

    model = AutoModelForSequenceClassification.from_pretrained(
        model_config["name"],
        num_labels=num_labels,
        quantization_config=bnb_config,
        device_map="auto"
    )

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_config["name"])
    if not tokenizer.pad_token:
        tokenizer.pad_token = tokenizer.eos_token
    # Decoder-style classification heads locate the last non-pad token through
    # config.pad_token_id and reject batches larger than 1 when it is unset,
    # so mirror the tokenizer's choice when the config has none.
    if getattr(model.config, "pad_token_id", None) is None:
        model.config.pad_token_id = tokenizer.pad_token_id

    # Prepare data
    train_dataset, dev_dataset = prepare_datasets(
        model_config, tokenizer, *train_data, *dev_data
    )

    # Class weights for the focal loss (inverse frequency, zero-count safe)
    class_weights = _inverse_frequency_weights(train_data[1], num_labels)

    train_loader = DataLoader(
        train_dataset,
        batch_size=model_config["batch_size"],
        shuffle=True
    )

    dev_loader = DataLoader(
        dev_dataset,
        batch_size=model_config["batch_size"],
        shuffle=False
    )

    # Initialize training components.
    # torch.optim.AdamW replaces the deprecated transformers.AdamW; eps and
    # weight_decay are set explicitly to the values the old class defaulted to.
    trainable_params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.AdamW(trainable_params, lr=1e-5, eps=1e-6, weight_decay=0.0)
    criterion = FocalLoss(alpha=class_weights, gamma=2)

    # Training loop
    best_f1 = 0
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        for batch in train_loader:
            optimizer.zero_grad()
            inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}

            # Compute the loss in float32 regardless of the model's output dtype.
            logits = model(**inputs).logits.float()
            labels = batch['labels'].to(logits.device)
            loss = criterion(logits, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        # Evaluation
        model.eval()
        # Distinct names: the original reused `all_labels` here and clobbered
        # the function argument.
        epoch_preds, epoch_labels = [], []
        with torch.no_grad():
            for batch in dev_loader:
                inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}

                # NumPy has no bfloat16, so cast before leaving torch.
                logits = model(**inputs).logits.float()
                epoch_preds.append(torch.sigmoid(logits).cpu().numpy())
                epoch_labels.append(batch['labels'].cpu().numpy())

        # Calculate metrics
        preds = np.concatenate(epoch_preds)
        labels = np.concatenate(epoch_labels)
        preds_binary = (preds > 0.5).astype(int)

        # zero_division=0 yields the same scores as the default but without
        # emitting an UndefinedMetricWarning every epoch.
        f1_micro = f1_score(labels, preds_binary, average='micro', zero_division=0)
        f1_macro = f1_score(labels, preds_binary, average='macro', zero_division=0)
        precision = precision_score(labels, preds_binary, average='micro', zero_division=0)
        recall = recall_score(labels, preds_binary, average='micro', zero_division=0)

        # Print metrics (guard the average against an empty training loader)
        print(f"Epoch {epoch+1} | Loss: {total_loss/max(len(train_loader), 1):.4f}")
        print(f"  F1-Micro: {f1_micro:.4f} | F1-Macro: {f1_macro:.4f}")
        print(f"  Precision: {precision:.4f} | Recall: {recall:.4f}")

        # Save best model
        if f1_micro > best_f1:
            best_f1 = f1_micro
            torch.save(model.state_dict(), f"best_{model_config['name'].replace('/', '_')}.pth")

    return best_f1



In [None]:
# Colab-only: mount Google Drive into the runtime's filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# Folder on the mounted drive that holds the task files; load_data() in this
# cell resolves the file names it is given against this directory.
data_dir = "/content/drive/My Drive/SEMEVAL/data/"

# Load data
def load_data(file_path: str, base_dir=None) -> list:
    """Load a UTF-8 JSON file located under the dataset directory.

    Args:
        file_path: File name (or relative path) of the JSON file.
        base_dir: Directory to resolve ``file_path`` against. Defaults to the
            notebook-level ``data_dir`` when omitted.

    Returns:
        The parsed JSON content.
    """
    import os  # local import so this cell does not depend on the import cell

    root = data_dir if base_dir is None else base_dir
    # os.path.join inserts the separator when needed, so the result no longer
    # depends on the directory string ending with a slash.
    full_path = os.path.join(root, file_path)
    with open(full_path, "r", encoding="utf-8") as f:
        return json.load(f)


# Number of training epochs; read as a global by train_model()'s epoch loop.
NUM_EPOCHS = 10


# === Main Execution ===
# Which model to train: a key of MODEL_CONFIGS, or "all" to train every
# configured model in turn.
MODEL_TO_TRAIN = "phi-3-mini"

# Load both splits; load_data() resolves these names against data_dir.
train_data = load_data("training_set_task1.txt")
dev_data = load_data("dev_set_task1.txt")

# Each record is indexed by "text" and "labels".
train_texts = [item["text"] for item in train_data]
train_labels = [item["labels"] for item in train_data]
dev_texts = [item["text"] for item in dev_data]
dev_labels = [item["labels"] for item in dev_data]

# Encode labels as multi-hot rows. The vocabulary is the sorted union of the
# labels in both splits and is passed as `classes`, so the column order is
# fixed and identical for train and dev.
# NOTE: a label that occurs only in the dev split gets an all-zero column in
# train_labels_enc.
all_labels = sorted({label for labels in train_labels + dev_labels for label in labels})
mlb = MultiLabelBinarizer(classes=all_labels)
train_labels_enc = mlb.fit_transform(train_labels)
dev_labels_enc = mlb.transform(dev_labels)

# Run training: either every configured model, or the single selected one.
if MODEL_TO_TRAIN == 'all':
    # Maps model alias -> best dev micro-F1 returned by train_model().
    results = {}
    for model_name, config in MODEL_CONFIGS.items():
        print(f"\n=== Training {model_name} ===")
        score = train_model(config, (train_texts, train_labels_enc), (dev_texts, dev_labels_enc), all_labels)
        results[model_name] = score
    print("\nTraining results:", results)
else:
    # Reject unknown aliases before any model is loaded.
    if MODEL_TO_TRAIN not in MODEL_CONFIGS:
        raise ValueError(f"Unknown model: {MODEL_TO_TRAIN}. Available: {list(MODEL_CONFIGS.keys())}")
    print(f"\n=== Training {MODEL_TO_TRAIN} ===")
    score = train_model(MODEL_CONFIGS[MODEL_TO_TRAIN], (train_texts, train_labels_enc), (dev_texts, dev_labels_enc), all_labels)
    print(f"\nFinal {MODEL_TO_TRAIN} F1: {score:.4f}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

=== Training phi-3-mini ===


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


model.safetensors.index.json:   0%|          | 0.00/16.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of Phi3ForSequenceClassification were not initialized from the model checkpoint at microsoft/Phi-3-mini-4k-instruct and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/3.44k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.94M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/599 [00:00<?, ?B/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 1 | Loss: 0.0516
  F1-Micro: 0.3551 | F1-Macro: 0.1011
  Precision: 0.4176 | Recall: 0.3089


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 2 | Loss: 0.0187
  F1-Micro: 0.3828 | F1-Macro: 0.0725
  Precision: 0.4651 | Recall: 0.3252


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 3 | Loss: 0.0126
  F1-Micro: 0.3459 | F1-Macro: 0.0903
  Precision: 0.5161 | Recall: 0.2602


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 4 | Loss: 0.0108
  F1-Micro: 0.4163 | F1-Macro: 0.1031
  Precision: 0.4694 | Recall: 0.3740


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 5 | Loss: 0.0077
  F1-Micro: 0.3636 | F1-Macro: 0.1028
  Precision: 0.4800 | Recall: 0.2927


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 6 | Loss: 0.0044
  F1-Micro: 0.4454 | F1-Macro: 0.1184
  Precision: 0.4811 | Recall: 0.4146


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 7 | Loss: 0.0033
  F1-Micro: 0.3529 | F1-Macro: 0.0918
  Precision: 0.5156 | Recall: 0.2683


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 8 | Loss: 0.0026
  F1-Micro: 0.4511 | F1-Macro: 0.1137
  Precision: 0.4732 | Recall: 0.4309
