<a href="https://colab.research.google.com/github/Hasasasaki/semeval_2022_task_4/blob/main/model_finetune.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [144]:
import os
os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8'


In [145]:
# !pip install "flash_attn==2.6.3" --no-build-isolation
!pip install deep_translator



## Data processing

In [146]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from transformers import (
    AutoConfig,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
    TrainerCallback,
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, f1_score
from tqdm import tqdm
import random
import torch
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
import torch.nn as nn
import torch.nn.functional as F
from collections import Counter

seed = 42

def reset_seeds(seed=seed):
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    import random
    random.seed(seed)
    import numpy as np
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8'
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    torch.use_deterministic_algorithms(True)

    print(f"All random states have been reset with seed {seed}")

reset_seeds()

new_columns = [
    "par_id",      # 1 (integer ID)
    "art_id",      # @@24942188 (article identifier)
    "topic",       # hopeless (PCL category)
    "country",     # ph (country code)
    "text",        # Full text content
    "label"        # 0 (binary label)
]

# Read main dataset - skip 4 disclaimer rows
df = pd.read_csv(
    "data/dontpatronizeme_pcl.tsv",
    sep="\t",
    header=None,
    skiprows=4,
    names=new_columns,
    on_bad_lines='warn'
)

# Read train/dev splits
train_val_labels = pd.read_csv("data/train_semeval_parids-labels.csv")
test_labels = pd.read_csv("data/dev_semeval_parids-labels.csv")

# Convert string labels to lists
def parse_labels(label_str: str) -> list[int]:
    return [int(x) for x in label_str.strip("[]").replace(" ", "").split(",")]

# Process labels dataframes
for labels_df in [train_val_labels, test_labels]:
    labels_df['labels'] = labels_df['label'].apply(parse_labels)
    labels_df.drop('label', axis=1, inplace=True)

# Join with main data
train_val_df = df.merge(train_val_labels, on="par_id", how="inner")
test_df = df.merge(test_labels, on="par_id", how="inner")

# Add PCL positivity column to both dataframes
train_val_df['pcl_label'] = train_val_df['label'].apply(
    lambda x: 0 if x in {0, 1} else 1)
test_df['pcl_label'] = test_df['label'].apply(
    lambda x: 0 if x in {0, 1} else 1)

train_df, val_df = train_test_split(train_val_df, test_size=0.2, random_state=42, shuffle=True)

RuntimeError: CUDA error: an illegal memory access was encountered
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
import nltk
# os.environ["TRANSFORMERS_CACHE"] = "/vol/bitbucket/bj321/.cache"
# nltk.data.path.append("/vol/bitbucket/bj321/nltk_data")  # Your custom path
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')


In [143]:
import gc
torch.cuda.empty_cache()
gc.collect()

RuntimeError: CUDA error: an illegal memory access was encountered
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
from deep_translator import GoogleTranslator
import concurrent.futures
import time
from sentence_transformers import SentenceTransformer
from scipy.spatial.distance import cosine

# Load SentenceBERT model
print("Loading SentenceBERT model...")
model = SentenceTransformer('all-MiniLM-L6-v2')  # Smaller, faster model
# Alternative: model = SentenceTransformer('paraphrase-mpnet-base-v2')  # More accurate but slower

def compute_similarity(original, translated):
    """Compute cosine similarity between original and translated text embeddings"""
    # Get embeddings
    emb1 = model.encode([original])[0]
    emb2 = model.encode([translated])[0]

    # Compute cosine similarity (1 - cosine distance)
    similarity = 1 - cosine(emb1, emb2)
    return similarity

def back_translate_single(item):
    """Process a single text item with similarity filtering"""
    text, label, par_id, source, target, idx, similarity_threshold = item
    try:
        # First translation (source to target)
        translated = GoogleTranslator(source=source, target=target).translate(text)
        time.sleep(0.5)  # Avoid rate limiting

        # Second translation (target back to source)
        back_translated = GoogleTranslator(source=target, target=source).translate(translated)

        # Compute semantic similarity
        similarity = compute_similarity(text, back_translated)

        # Only return translations that maintain semantic similarity
        if similarity >= similarity_threshold:
            return back_translated, label, par_id, idx, similarity, True
        else:
            print(f"Low similarity ({similarity:.3f}) for item {idx}: discarded")
            return text, label, par_id, idx, similarity, False  # Return original text but mark as not augmented

    except Exception as e:
        print(f"Error in item {idx}: {str(e)}")
        return None, None, None, idx, 0.0, False

def back_translate_batch(texts, labels, par_ids, source='en', target='zh-CN', max_workers=5, similarity_threshold=0.75):
    """Process texts in parallel batches with similarity filtering"""
    results = [None] * len(texts)
    labels_out = [None] * len(labels)
    par_ids_out = [None] * len(par_ids)
    similarities = [0.0] * len(texts)
    is_augmented = [False] * len(texts)

    # Create work items
    work_items = [(texts[i], labels[i], par_ids[i], source, target, i, similarity_threshold) for i in range(len(texts))]

    # Process in parallel
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(back_translate_single, item) for item in work_items]

        # Process results as they complete
        for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
            result, label, par_id, idx, similarity, augmented = future.result()
            if result is not None:
                results[idx] = result
                labels_out[idx] = label
                par_ids_out[idx] = par_id
                similarities[idx] = similarity
                is_augmented[idx] = augmented

    # Create DataFrame with results
    result_df = pd.DataFrame({
        'par_id': par_ids_out,
        'original_text': texts,
        'text': results,
        'pcl_label': [int(x) if x is not None else None for x in labels_out],
        'similarity': similarities,
        'is_augmented': is_augmented
    })

    # Filter out None values
    result_df = result_df.dropna(subset=['text'])

    return result_df

# Main processing loop
language_list = ['zh-CN', 'fr', 'ru']
similarity_threshold = 0.75  # Adjust as needed

for language in language_list:
    output_file = f'data/backtrans_data_{language}.csv'

    if not os.path.exists(output_file):
        print(f"Processing language: {language}")

        # Process in chunks to avoid memory issues
        chunk_size = 100
        all_results_df = pd.DataFrame()

        for i in range(0, len(train_df), chunk_size):
            chunk_texts = train_df['text'].iloc[i:i+chunk_size].tolist()
            chunk_labels = train_df['pcl_label'].iloc[i:i+chunk_size].tolist()
            chunk_par_ids = train_df['par_id'].iloc[i:i+chunk_size].tolist()

            print(f"Processing chunk {i//chunk_size + 1}/{len(train_df)//chunk_size + 1}")
            result_df = back_translate_batch(
                chunk_texts,
                chunk_labels,
                chunk_par_ids,
                source='en',
                target=language,
                max_workers=5,
                similarity_threshold=similarity_threshold
            )

            all_results_df = pd.concat([all_results_df, result_df])

            # Save intermediate results
            all_results_df.to_csv(f'data/backtrans_temp_{language}.csv', index=False)

            # Optional: Add a delay between chunks
            time.sleep(2)

        # Save final results
        all_results_df.to_csv(output_file, index=False)

        # Print statistics
        total = len(all_results_df)
        augmented = all_results_df['is_augmented'].sum()
        print(f"Completed {language}: {total} samples processed")
        print(f"Kept {augmented} samples ({augmented/total:.1%}) with similarity ≥ {similarity_threshold}")
        print(f"Average similarity: {all_results_df['similarity'].mean():.3f}")

### Hyperparameters

In [None]:
batch_size = 16
lr = 1e-5
n_epochs = 2
betas = (0.9, 0.98)
eps = 1e-6
wd = 1e-2

In [None]:
class PCLDataset(Dataset):
    def __init__(self, dataframe, tokenizer, balance_method='oversample', seed=seed):
        self.tokenizer = tokenizer

         # Split into positive and negative classes
        pos_df = dataframe[dataframe['pcl_label'] == 1]
        neg_df = dataframe[dataframe['pcl_label'] == 0]

        # Balance classes
        if balance_method == 'oversample':
            # Repeat minority class samples
            if len(pos_df) > len(neg_df):
                pos_df, neg_df = neg_df, pos_df
            n_samples = max(len(pos_df), len(neg_df))
            pos_df = pos_df.sample(n_samples, replace=True, random_state=seed)
        elif balance_method == 'undersample':
            # Take minimum number of samples
            n_samples = min(len(pos_df), len(neg_df))
            pos_df = pos_df.sample(n_samples, random_state=seed)
            neg_df = neg_df.sample(n_samples, random_state=seed)
        elif balance_method == 'None':
            pass

        # Combine and shuffle
        balanced_df = pd.concat([pos_df, neg_df]).sample(frac=1, random_state=seed)
        self.texts = balanced_df['text'].tolist()
        self.labels = balanced_df['pcl_label'].tolist()

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        encoding = self.tokenizer(
            text,
            truncation=True,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Initialize tokenizer and datasets
tokenizer = AutoTokenizer.from_pretrained("answerdotai/ModernBERT-base")

# Load all backtranslation files and combine them
backtrans_files = [
    'data/backtrans_data_de.csv',
    # 'data/backtrans_data_es.csv',
    'data/backtrans_data_fr.csv',
    # 'data/backtrans_data_ru.csv',
    'data/backtrans_data_zh-CN.csv'
]

backtrans_dfs = []
for file in backtrans_files:
    try:
        cur_df = pd.read_csv(file)
        backtrans_dfs.append(cur_df)
        print(f"Loaded {file} with {len(df)} rows")
    except Exception as e:
        print(f"Error loading {file}: {e}")

# Combine all backtranslation dataframes
if backtrans_dfs:
    backtrans_df = pd.concat(backtrans_dfs, ignore_index=True)
    print(f"Combined backtranslation data: {len(backtrans_df)} rows")
else:
    backtrans_df = pd.DataFrame()
    print("No backtranslation data found")

# Create datasets
for col in train_df.columns:
    if col not in backtrans_df.columns:
        backtrans_df[col] = None

backtrans_df = backtrans_df[train_df.columns]
backtrans_df['pcl_label'] = backtrans_df['pcl_label'].astype(int)
augmented_train_df = pd.concat([backtrans_df, train_df], ignore_index=True)
print(augmented_train_df.head())
train_dataset = PCLDataset(augmented_train_df, tokenizer)
val_dataset = PCLDataset(val_df, tokenizer, balance_method='None')
test_dataset = PCLDataset(test_df, tokenizer, balance_method='None')

## Weighted Random Sampler

In [None]:
class WeightedRandomSamplerTrainer(Trainer):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.weights = torch.FloatTensor(self._get_weights())
        self.sampler = WeightedRandomSampler(self.weights, len(self.weights), replacement=True)

    def get_train_dataloader(self):
        return DataLoader(self.train_dataset, batch_size=self.args.train_batch_size, sampler=self.sampler, collate_fn=self.data_collator)

    def _get_weights(self):
        labels = np.array(self.train_dataset.labels)
        class_counts = np.bincount(labels)
        class_weights = 1.0 / np.sqrt(class_counts.astype(np.float32))
        weights = class_weights[labels]
        return weights

train_dataset = PCLDataset(augmented_train_df, tokenizer, 'None')
val_dataset = PCLDataset(val_df, tokenizer, balance_method='None')
test_dataset = PCLDataset(test_df, tokenizer, balance_method='None')

In [None]:
# reset_seeds()
# model_config = AutoConfig.from_pretrained("answerdotai/ModernBERT-base")
# model_config.mlp_dropout = 0.2
# model_config.num_labels = 2

# # Initialize model with classification head
# model = AutoModelForSequenceClassification.from_pretrained(
#     "answerdotai/ModernBERT-base",
#     num_labels=2,
# )
# model.train()
# Training setup
# device = torch.device("mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu")
# print("Using device:", device)
# model.to(device)
# training_args = TrainingArguments(
#     seed=seed,
#     data_seed=seed,
#     dataloader_num_workers=0,
#     output_dir=f"ModernBERT_pcl_ft",
#     learning_rate=lr,
#     per_device_train_batch_size=batch_size,
#     per_device_eval_batch_size=batch_size,
#     num_train_epochs=n_epochs,
#     lr_scheduler_type="cosine",
#     optim="adamw_torch_fused",
#     adam_beta1=betas[0],
#     adam_beta2=betas[1],
#     adam_epsilon=eps,
#     # weight_decay=wd,
#     logging_strategy="epoch",
#     eval_strategy="epoch",
#     save_strategy="epoch",
#     load_best_model_at_end=True,
#     bf16=True,
#     bf16_full_eval=True,
#     push_to_hub=False,
#     warmup_ratio=0.1,
#     full_determinism=True

# )

def compute_metrics(eval_pred):
    """Calculate classification metrics for Hugging Face Trainer"""
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)

    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average='binary'
    )
    acc = accuracy_score(labels, predictions)

    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# class MetricsCallback(TrainerCallback):
#     def __init__(self):
#         self.training_history = {"train": [], "eval": []}

#     def on_log(self, args, state, control, logs=None, **kwargs):
#         if logs is not None:
#             if "loss" in logs:  # Training logs
#                 self.training_history["train"].append(logs)
#             elif "eval_loss" in logs:  # Evaluation logs
#                 self.training_history["eval"].append(logs)



class MetricsCallback(TrainerCallback):
    def __init__(self):
        self.training_history = {"train": [], "eval": []}
        self.best_f1 = 0.0
        self.best_epoch = 0
        self.best_step = 0

    def on_evaluate(self, args, state, control, metrics, **kwargs):
        self.training_history["eval"].append(metrics)
        # Track best F1 score
        if metrics.get("eval_f1", 0) > self.best_f1:
            self.best_f1 = metrics.get("eval_f1", 0)
            self.best_epoch = state.epoch
            self.best_step = state.global_step

    def on_log(self, args, state, control, logs, **kwargs):
        if "loss" in logs:
            self.training_history["train"].append(logs)



def train_model_with_seed(seed, model_name, train_dataset, val_dataset, output_dir):
    reset_seeds(seed)

    # Create a unique output directory for this seed
    seed_output_dir = f"{output_dir}/seed_{seed}"
    os.makedirs(seed_output_dir, exist_ok=True)

    # Load model with seed-specific configuration
    config = AutoConfig.from_pretrained(model_name, num_labels=2)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, config=config)

    # Training arguments
    training_args = TrainingArguments(
      seed=seed,
      data_seed=seed,
      dataloader_num_workers=0,
      output_dir=seed_output_dir,
      learning_rate=lr,
      per_device_train_batch_size=batch_size,
      per_device_eval_batch_size=batch_size,
      num_train_epochs=n_epochs,
      lr_scheduler_type="cosine",
      optim="adamw_torch_fused",
      adam_beta1=betas[0],
      adam_beta2=betas[1],
      adam_epsilon=eps,
      # weight_decay=wd,
      logging_strategy="epoch",
      eval_strategy="epoch",
      save_strategy="epoch",
      load_best_model_at_end=True,
      bf16=True,
      bf16_full_eval=True,
      push_to_hub=False,
      warmup_ratio=0.1,
      # full_determinism=True
    )

    # Initialize metrics callback
    metrics_callback = MetricsCallback()

    # Initialize trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        processing_class=tokenizer,
        data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
        compute_metrics=compute_metrics,
        callbacks=[metrics_callback],
    )

    # Train the model
    print(f"Training model with seed {seed}...")
    trainer.train()

    # Evaluate on validation set
    eval_results = trainer.evaluate()

    # Save the best model path and its metrics
    best_checkpoint = os.path.join(seed_output_dir, f"checkpoint-{metrics_callback.best_step}")

    return {
        "seed": seed,
        "f1_score": metrics_callback.best_f1,
        "best_checkpoint": best_checkpoint,
        "eval_results": eval_results,
        "best_epoch": metrics_callback.best_epoch
    }

def predict_with_model(model_path, test_dataset, tokenizer=None):
    """Use Trainer.predict() to get predictions from a model checkpoint"""
    # Load model and tokenizer from checkpoint
    model = AutoModelForSequenceClassification.from_pretrained(model_path)
    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained(model_path)

    # Create a temporary trainer for prediction
    training_args = TrainingArguments(
        output_dir="./tmp_predict",
        per_device_eval_batch_size=32,
        report_to="none",
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        processing_class=tokenizer,
        data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
        compute_metrics=compute_metrics,
    )

    # Get predictions
    print(f"Getting predictions from model at {model_path}")
    predictions = trainer.predict(test_dataset)

    # Extract logits and convert to probabilities
    logits = predictions.predictions
    probs = F.softmax(torch.tensor(logits), dim=1).numpy()

    # Get class predictions (0 or 1)
    preds = np.argmax(logits, axis=1)

    return preds, probs

def ensemble_predict(model_paths, test_dataset, tokenizer=None):
    """Combine predictions from multiple models using majority voting"""
    all_predictions = []
    all_probabilities = []

    # Get predictions from each model
    for model_path in model_paths:
        preds, probs = predict_with_model(model_path, test_dataset, tokenizer)
        all_predictions.append(preds)
        all_probabilities.append(probs)

    # Stack predictions for voting
    stacked_preds = np.stack(all_predictions)

    # Majority voting (mode of predictions)
    voted_predictions = []
    for i in range(len(test_dataset)):
        votes = stacked_preds[:, i]
        # Find most common prediction (0 or 1)
        voted_pred = Counter(votes).most_common(1)[0][0]
        voted_predictions.append(voted_pred)

    # Average probabilities
    avg_probs = np.mean(np.stack(all_probabilities), axis=0)

    return np.array(voted_predictions), avg_probs

def train_ensemble(model_name, train_dataset, val_dataset, test_dataset, num_models=10, output_dir=f"ModernBERT_pcl_ft"):
    """Train multiple models with different seeds and create an ensemble"""
    os.makedirs(output_dir, exist_ok=True)

    # Train multiple models with different seeds
    model_results = []
    seeds = list(range(42, 42 + num_models))  # Use seeds 42 to 51

    for seed in seeds:
        result = train_model_with_seed(seed, model_name, train_dataset, val_dataset, output_dir)
        model_results.append(result)
        print(f"Seed {seed} - F1 Score: {result['f1_score']:.4f}")

    # Sort models by validation F1 score and select top 3
    model_results.sort(key=lambda x: x['f1_score'], reverse=True)
    top_models = model_results[:3]

    print("\nTop 3 models:")
    for i, model in enumerate(top_models):
        print(f"{i+1}. Seed {model['seed']} - F1 Score: {model['f1_score']:.4f}")

    # Get the checkpoint paths for the top models
    top_model_paths = [model["best_checkpoint"] for model in top_models]

    # Save ensemble metadata
    ensemble_info = {
        "models": [{"seed": m["seed"], "checkpoint": m["best_checkpoint"], "f1_score": m["f1_score"]} for m in top_models],
        "creation_date": pd.Timestamp.now().strftime("%Y-%m-%d %H:%M:%S")
    }

    pd.DataFrame(ensemble_info["models"]).to_csv(f"{output_dir}/ensemble_models.csv", index=False)

    # Evaluate ensemble on test set
    print("\nEvaluating ensemble on test set...")
    ensemble_preds, ensemble_probs = ensemble_predict(top_model_paths, test_dataset)

    # Get true labels from test dataset
    true_labels = [item['labels'].item() for item in test_dataset]

    # Calculate metrics
    accuracy = accuracy_score(true_labels, ensemble_preds)
    precision, recall, f1, _ = precision_recall_fscore_support(true_labels, ensemble_preds, average='binary')

    print(f"Ensemble Test Results:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")

    # Compare with individual model performance
    print("\nComparing with individual model performance:")
    for i, model_path in enumerate(top_model_paths):
        model_preds, _ = predict_with_model(model_path, test_dataset)
        model_accuracy = accuracy_score(true_labels, model_preds)
        model_precision, model_recall, model_f1, _ = precision_recall_fscore_support(true_labels, model_preds, average='binary')

        print(f"Model {i+1} (Seed {top_models[i]['seed']}):")
        print(f"  Accuracy: {model_accuracy:.4f}")
        print(f"  F1 Score: {model_f1:.4f}")
        print(f"  Precision: {model_precision:.4f}")
        print(f"  Recall: {model_recall:.4f}")

    # Create a prediction function for future use
    def predict_with_ensemble(new_dataset):
        """Function to make predictions with the ensemble on new data"""
        return ensemble_predict(top_model_paths, new_dataset)

    return top_model_paths, ensemble_preds, ensemble_probs, predict_with_ensemble














# trainer = WeightedRandomSamplerTrainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_dataset,
#     eval_dataset=val_dataset,
#     processing_class=tokenizer,
#     data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
#     compute_metrics=compute_metrics,
# )

# metrics_callback = MetricsCallback()
# trainer.add_callback(metrics_callback)

# trainer.train()

# train_history_df = pd.DataFrame(metrics_callback.training_history["train"])
# train_history_df = train_history_df.add_prefix("train_")
# eval_history_df = pd.DataFrame(metrics_callback.training_history["eval"])
# train_res_df = pd.concat([train_history_df, eval_history_df], axis=1)

# args_df = pd.DataFrame([training_args.to_dict()])

# display(train_res_df)
# display(args_df)

In [139]:
# Define your model name
model_name = "answerdotai/ModernBERT-base"  # or your preferred model

# Train the ensemble
top_model_paths, ensemble_preds, ensemble_probs, predict_fn = train_ensemble(
    model_name=model_name,
    train_dataset=train_dataset,
    val_dataset=val_dataset,
    test_dataset=test_dataset,
    num_models=10,
)

All random states have been reset with seed 42


Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training model with seed 42...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.2322,0.2108,0.929552,0.593103,0.710744,0.508876
2,0.0827,0.364934,0.922985,0.571429,0.651515,0.508876


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Seed 42 - F1 Score: 0.5931
All random states have been reset with seed 43


Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training model with seed 43...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.2228,0.240387,0.92597,0.495935,0.792208,0.360947
2,0.0861,0.376582,0.922388,0.551724,0.661157,0.473373


Seed 43 - F1 Score: 0.5517
All random states have been reset with seed 44


Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training model with seed 44...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.2224,0.263775,0.924776,0.483607,0.786667,0.349112
2,0.0779,0.363577,0.926567,0.571429,0.694915,0.485207


Seed 44 - F1 Score: 0.5714
All random states have been reset with seed 45


Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training model with seed 45...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.2133,0.312481,0.921194,0.410714,0.836364,0.272189
2,0.0779,0.376597,0.918806,0.524476,0.641026,0.443787


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Seed 45 - F1 Score: 0.5245
All random states have been reset with seed 46


Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training model with seed 46...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.2332,0.264683,0.918806,0.546667,0.625954,0.485207
2,0.0953,0.36071,0.920597,0.552189,0.640625,0.485207


RuntimeError: CUDA error: an illegal memory access was encountered
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


## Inference

In [90]:
import torch.nn.functional as F

checkpoint_path = "./ModernBERT_pcl_ft/checkpoint-1676"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint_path)


# Evaluation on a single example
def predict_single(text: str, model, tokenizer, device='cuda'):
    model.to(device)
    model.eval()

    encoding = tokenizer(
        text,
        max_length=512,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )

    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits

    return F.softmax(logits).cpu().numpy()

print(df.head())

test_input = df[df['label'] == 3]['text'].iloc[3]
predict_single(test_input, model, tokenizer)

   par_id      art_id      topic country  \
0       1  @@24942188   hopeless      ph   
1       2  @@21968160    migrant      gh   
2       3  @@16584954  immigrant      ie   
3       4   @@7811231   disabled      nz   
4       5   @@1494111    refugee      ca   

                                                text  label  
0  We 're living in times of absolute insanity , ...      0  
1  In Libya today , there are countless number of...      0  
2  White House press secretary Sean Spicer said t...      0  
3  Council customers only signs would be displaye...      0  
4  " Just like we received migrants fleeing El Sa...      0  


  return F.softmax(logits).cpu().numpy()


array([[0.00977466, 0.9902254 ]], dtype=float32)

In [87]:
# from huggingface_hub import notebook_login

# notebook_login()

In [88]:
# model_name = "Hasasasaki/modernBERT_pcl_ft"
# model.push_to_hub(model_name)
# tokenizer.push_to_hub(model_name)

In [91]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    processing_class=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)
trainer.evaluate(test_dataset)

{'eval_loss': 0.34968075156211853,
 'eval_model_preparation_time': 0.0058,
 'eval_accuracy': 0.9144768275203058,
 'eval_f1': 0.535064935064935,
 'eval_precision': 0.553763440860215,
 'eval_recall': 0.5175879396984925,
 'eval_runtime': 7.1778,
 'eval_samples_per_second': 291.592,
 'eval_steps_per_second': 18.251}