<a href="https://colab.research.google.com/github/Hasasasaki/semeval_2022_task_4/blob/main/model_finetune.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [47]:
import os
os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8'


In [48]:
# !pip install "flash_attn==2.6.3" --no-build-isolation
!pip install deep_translator



## Data processing

In [49]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from transformers import (
    AutoConfig,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
    TrainerCallback,
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm import tqdm
import random
import torch
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
import torch.nn as nn

seed = 42

def set_seed(seed=seed):
    """Set all seeds to make results reproducible"""
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

set_seed()

new_columns = [
    "par_id",      # 1 (integer ID)
    "art_id",      # @@24942188 (article identifier)
    "topic",       # hopeless (PCL category)
    "country",     # ph (country code)
    "text",        # Full text content
    "label"        # 0 (binary label)
]

# Read main dataset - skip 4 disclaimer rows
df = pd.read_csv(
    "data/dontpatronizeme_pcl.tsv",
    sep="\t",
    header=None,
    skiprows=4,
    names=new_columns,
    on_bad_lines='warn'
)

# Read train/dev splits
train_val_labels = pd.read_csv("data/train_semeval_parids-labels.csv")
test_labels = pd.read_csv("data/dev_semeval_parids-labels.csv")

# Convert string labels to lists
def parse_labels(label_str: str) -> list[int]:
    return [int(x) for x in label_str.strip("[]").replace(" ", "").split(",")]

# Process labels dataframes
for labels_df in [train_val_labels, test_labels]:
    labels_df['labels'] = labels_df['label'].apply(parse_labels)
    labels_df.drop('label', axis=1, inplace=True)

# Join with main data
train_val_df = df.merge(train_val_labels, on="par_id", how="inner")
test_df = df.merge(test_labels, on="par_id", how="inner")

# Add PCL positivity column to both dataframes
train_val_df['pcl_label'] = train_val_df['label'].apply(
    lambda x: 0 if x in {0, 1} else 1)
test_df['pcl_label'] = test_df['label'].apply(
    lambda x: 0 if x in {0, 1} else 1)

train_df, val_df = train_test_split(train_val_df, test_size=0.2, random_state=seed, shuffle=True)

In [50]:
import nltk
# os.environ["TRANSFORMERS_CACHE"] = "/vol/bitbucket/bj321/.cache"
# nltk.data.path.append("/vol/bitbucket/bj321/nltk_data")  # Your custom path
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [51]:
import gc
torch.cuda.empty_cache()
gc.collect()

5779

In [52]:
from deep_translator import GoogleTranslator
import concurrent.futures
import time
from sentence_transformers import SentenceTransformer
from scipy.spatial.distance import cosine

# Load SentenceBERT model
print("Loading SentenceBERT model...")
model = SentenceTransformer('all-MiniLM-L6-v2')  # Smaller, faster model
# Alternative: model = SentenceTransformer('paraphrase-mpnet-base-v2')  # More accurate but slower

def compute_similarity(original, translated):
    """Compute cosine similarity between original and translated text embeddings"""
    # Get embeddings
    emb1 = model.encode([original])[0]
    emb2 = model.encode([translated])[0]

    # Compute cosine similarity (1 - cosine distance)
    similarity = 1 - cosine(emb1, emb2)
    return similarity

def back_translate_single(item):
    """Process a single text item with similarity filtering"""
    text, label, par_id, source, target, idx, similarity_threshold = item
    try:
        # First translation (source to target)
        translated = GoogleTranslator(source=source, target=target).translate(text)
        time.sleep(0.5)  # Avoid rate limiting

        # Second translation (target back to source)
        back_translated = GoogleTranslator(source=target, target=source).translate(translated)

        # Compute semantic similarity
        similarity = compute_similarity(text, back_translated)

        # Only return translations that maintain semantic similarity
        if similarity >= similarity_threshold:
            return back_translated, label, par_id, idx, similarity, True
        else:
            print(f"Low similarity ({similarity:.3f}) for item {idx}: discarded")
            return text, label, par_id, idx, similarity, False  # Return original text but mark as not augmented

    except Exception as e:
        print(f"Error in item {idx}: {str(e)}")
        return None, None, None, idx, 0.0, False

def back_translate_batch(texts, labels, par_ids, source='en', target='zh-CN', max_workers=5, similarity_threshold=0.75):
    """Process texts in parallel batches with similarity filtering"""
    results = [None] * len(texts)
    labels_out = [None] * len(labels)
    par_ids_out = [None] * len(par_ids)
    similarities = [0.0] * len(texts)
    is_augmented = [False] * len(texts)

    # Create work items
    work_items = [(texts[i], labels[i], par_ids[i], source, target, i, similarity_threshold) for i in range(len(texts))]

    # Process in parallel
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(back_translate_single, item) for item in work_items]

        # Process results as they complete
        for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
            result, label, par_id, idx, similarity, augmented = future.result()
            if result is not None:
                results[idx] = result
                labels_out[idx] = label
                par_ids_out[idx] = par_id
                similarities[idx] = similarity
                is_augmented[idx] = augmented

    # Create DataFrame with results
    result_df = pd.DataFrame({
        'par_id': par_ids_out,
        'original_text': texts,
        'text': results,
        'pcl_label': [int(x) for x in labels_out if x is not None else None],
        'similarity': similarities,
        'is_augmented': is_augmented
    })

    # Filter out None values
    result_df = result_df.dropna(subset=['text'])

    return result_df

# Main processing loop
language_list = ['zh-CN', 'fr', 'de', 'es', 'ru']
similarity_threshold = 0.75  # Adjust as needed

for language in language_list:
    output_file = f'data/backtrans_data_{language}.csv'

    if not os.path.exists(output_file):
        print(f"Processing language: {language}")

        # Process in chunks to avoid memory issues
        chunk_size = 100
        all_results_df = pd.DataFrame()

        for i in range(0, len(train_df), chunk_size):
            chunk_texts = train_df['text'].iloc[i:i+chunk_size].tolist()
            chunk_labels = train_df['pcl_label'].iloc[i:i+chunk_size].tolist()
            chunk_par_ids = train_df['par_id'].iloc[i:i+chunk_size].tolist()

            print(f"Processing chunk {i//chunk_size + 1}/{len(train_df)//chunk_size + 1}")
            result_df = back_translate_batch(
                chunk_texts,
                chunk_labels,
                chunk_par_ids,
                source='en',
                target=language,
                max_workers=5,
                similarity_threshold=similarity_threshold
            )

            all_results_df = pd.concat([all_results_df, result_df])

            # Save intermediate results
            all_results_df.to_csv(f'data/backtrans_temp_{language}.csv', index=False)

            # Optional: Add a delay between chunks
            time.sleep(2)

        # Save final results
        all_results_df.to_csv(output_file, index=False)

        # Print statistics
        total = len(all_results_df)
        augmented = all_results_df['is_augmented'].sum()
        print(f"Completed {language}: {total} samples processed")
        print(f"Kept {augmented} samples ({augmented/total:.1%}) with similarity ≥ {similarity_threshold}")
        print(f"Average similarity: {all_results_df['similarity'].mean():.3f}")

Loading SentenceBERT model...


### Hyperparameters

In [53]:

batch_size = 16
lr = 8e-5
n_epochs = 2
betas = (0.9, 0.98)
eps = 1e-6
wd = 1e-2

In [54]:
class PCLDataset(Dataset):
    def __init__(self, dataframe, tokenizer, balance_method='oversample', seed=seed):
        self.tokenizer = tokenizer

         # Split into positive and negative classes
        pos_df = dataframe[dataframe['pcl_label'] == 1]
        neg_df = dataframe[dataframe['pcl_label'] == 0]

        # Balance classes
        if balance_method == 'oversample':
            # Repeat minority class samples
            if len(pos_df) > len(neg_df):
                pos_df, neg_df = neg_df, pos_df
            n_samples = max(len(pos_df), len(neg_df))
            pos_df = pos_df.sample(n_samples, replace=True, random_state=seed)
        elif balance_method == 'undersample':
            # Take minimum number of samples
            n_samples = min(len(pos_df), len(neg_df))
            pos_df = pos_df.sample(n_samples, random_state=seed)
            neg_df = neg_df.sample(n_samples, random_state=seed)
        elif balance_method == 'None':
            pass

        # Combine and shuffle
        balanced_df = pd.concat([pos_df, neg_df]).sample(frac=1, random_state=seed)
        self.texts = balanced_df['text'].tolist()
        self.labels = balanced_df['pcl_label'].tolist()

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        encoding = self.tokenizer(
            text,
            truncation=True,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Initialize tokenizer and datasets
tokenizer = AutoTokenizer.from_pretrained("answerdotai/ModernBERT-base")

# Load all backtranslation files and combine them
backtrans_files = [
    'data/backtrans_data_de.csv',
    'data/backtrans_data_es.csv',
    'data/backtrans_data_fr.csv',
    # 'data/backtrans_data_ru.csv',
    'data/backtrans_data_zh-CN.csv'
]

backtrans_dfs = []
for file in backtrans_files:
    try:
        cur_df = pd.read_csv(file)
        backtrans_dfs.append(cur_df)
        print(f"Loaded {file} with {len(df)} rows")
    except Exception as e:
        print(f"Error loading {file}: {e}")

# Combine all backtranslation dataframes
if backtrans_dfs:
    backtrans_df = pd.concat(backtrans_dfs, ignore_index=True)
    print(f"Combined backtranslation data: {len(backtrans_df)} rows")
else:
    backtrans_df = pd.DataFrame()
    print("No backtranslation data found")

# Create datasets
for col in train_df.columns:
    if col not in backtrans_df.columns:
        backtrans_df[col] = None

backtrans_df = backtrans_df[train_df.columns]
backtrans_df['pcl_label'] = backtrans_df['pcl_label'].astype(int)
augmented_train_df = pd.concat([backtrans_df, train_df], ignore_index=True)
print(augmented_train_df.head())
train_dataset = PCLDataset(augmented_train_df, tokenizer)
val_dataset = PCLDataset(val_df, tokenizer, balance_method='None')
test_dataset = PCLDataset(test_df, tokenizer, balance_method='None')

Loaded data/backtrans_data_de.csv with 10468 rows
Loaded data/backtrans_data_es.csv with 10468 rows
Loaded data/backtrans_data_fr.csv with 10468 rows
Loaded data/backtrans_data_zh-CN.csv with 10468 rows
Combined backtranslation data: 21793 rows
   par_id art_id topic country  \
0     NaN   None  None    None   
1     NaN   None  None    None   
2     NaN   None  None    None   
3     NaN   None  None    None   
4     NaN   None  None    None   

                                                text label labels  pcl_label  
0  It described the local police as under resourc...  None   None          0  
1  The only force that is able to stop it is the ...  None   None          1  
2  The government's plans to return to mass finan...  None   None          0  
3  New figures show that more than 48,000 Rohingy...  None   None          0  
4  He then listed several immigrants, mainly from...  None   None          0  


## Weighted Random Sampler

In [55]:
class WeightedRandomSamplerTrainer(Trainer):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.weights = torch.FloatTensor(self._get_weights())
        self.sampler = WeightedRandomSampler(self.weights, len(self.weights), replacement=True)

    def get_train_dataloader(self):
        return DataLoader(self.train_dataset, batch_size=self.args.train_batch_size, sampler=self.sampler, collate_fn=self.data_collator)

    def _get_weights(self):
        labels = np.array(self.train_dataset.labels)
        class_counts = np.bincount(labels)
        class_weights = 1.0 / np.sqrt(class_counts.astype(np.float32))
        weights = class_weights[labels]
        return weights

train_dataset = PCLDataset(augmented_train_df, tokenizer, 'None')
val_dataset = PCLDataset(val_df, tokenizer, balance_method='None')
test_dataset = PCLDataset(test_df, tokenizer, balance_method='None')

In [56]:
model_config = AutoConfig.from_pretrained("answerdotai/ModernBERT-base")
model_config.mlp_dropout = 0.2
model_config.num_labels = 2

# Initialize model with classification head
model = AutoModelForSequenceClassification.from_pretrained(
    "answerdotai/ModernBERT-base",
    num_labels=2,
)
model.train()
# Training setup
# device = torch.device("mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu")
# print("Using device:", device)
# model.to(device)
training_args = TrainingArguments(
    seed=seed,
    data_seed=seed,
    dataloader_num_workers=0,
    output_dir=f"ModernBERT_pcl_ft",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=n_epochs,
    lr_scheduler_type="cosine",
    optim="adamw_torch_fused",
    adam_beta1=betas[0],
    adam_beta2=betas[1],
    adam_epsilon=eps,
    # weight_decay=wd,
    logging_strategy="epoch",
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    bf16=True,
    bf16_full_eval=True,
    push_to_hub=False,
    warmup_ratio=0.1,
    full_determinism=True

)

def compute_metrics(eval_pred):
    """Calculate classification metrics for Hugging Face Trainer"""
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)

    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average='binary'
    )
    acc = accuracy_score(labels, predictions)

    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

class MetricsCallback(TrainerCallback):
    def __init__(self):
        self.training_history = {"train": [], "eval": []}

    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs is not None:
            if "loss" in logs:  # Training logs
                self.training_history["train"].append(logs)
            elif "eval_loss" in logs:  # Evaluation logs
                self.training_history["eval"].append(logs)

trainer = WeightedRandomSamplerTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    processing_class=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

metrics_callback = MetricsCallback()
trainer.add_callback(metrics_callback)

trainer.train()

train_history_df = pd.DataFrame(metrics_callback.training_history["train"])
train_history_df = train_history_df.add_prefix("train_")
eval_history_df = pd.DataFrame(metrics_callback.training_history["eval"])
train_res_df = pd.concat([train_history_df, eval_history_df], axis=1)

args_df = pd.DataFrame([training_args.to_dict()])

display(train_res_df)
display(args_df)

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.2057,0.469438,0.916418,0.530201,0.612403,0.467456
2,0.0155,0.783807,0.92,0.446281,0.739726,0.319527


Unnamed: 0,train_loss,train_grad_norm,train_learning_rate,train_epoch,eval_loss,eval_accuracy,eval_f1,eval_precision,eval_recall
0,0.2057,0.705695,4.7e-05,1.0,0.469438,0.916418,0.530201,0.612403,0.467456
1,0.0155,0.000129,0.0,2.0,0.783807,0.92,0.446281,0.739726,0.319527


Unnamed: 0,output_dir,overwrite_output_dir,do_train,do_eval,do_predict,eval_strategy,prediction_loss_only,per_device_train_batch_size,per_device_eval_batch_size,per_gpu_train_batch_size,...,split_batches,include_tokens_per_second,include_num_input_tokens_seen,neftune_noise_alpha,optim_target_modules,batch_eval_metrics,eval_on_start,use_liger_kernel,eval_use_gather_object,average_tokens_across_devices
0,ModernBERT_pcl_ft,False,False,True,False,epoch,False,16,16,,...,,False,False,,,False,False,False,False,False


## Inference

In [57]:
import torch.nn.functional as F

checkpoint_path = "./ModernBERT_pcl_ft/checkpoint-1781"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint_path)


# Evaluation on a single example
def predict_single(text: str, model, tokenizer, device='cuda'):
    model.to(device)
    model.eval()

    encoding = tokenizer(
        text,
        max_length=512,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )

    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits

    return F.softmax(logits).cpu().numpy()

print(df.head())

test_input = df[df['label'] == 3]['text'].iloc[3]
predict_single(test_input, model, tokenizer)

   par_id      art_id      topic country  \
0       1  @@24942188   hopeless      ph   
1       2  @@21968160    migrant      gh   
2       3  @@16584954  immigrant      ie   
3       4   @@7811231   disabled      nz   
4       5   @@1494111    refugee      ca   

                                                text  label  
0  We 're living in times of absolute insanity , ...      0  
1  In Libya today , there are countless number of...      0  
2  White House press secretary Sean Spicer said t...      0  
3  Council customers only signs would be displaye...      0  
4  " Just like we received migrants fleeing El Sa...      0  


  return F.softmax(logits).cpu().numpy()


array([[3.7578843e-04, 9.9962413e-01]], dtype=float32)

In [58]:
# from huggingface_hub import notebook_login

# notebook_login()

In [59]:
# model_name = "Hasasasaki/modernBERT_pcl_ft"
# model.push_to_hub(model_name)
# tokenizer.push_to_hub(model_name)

In [60]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    processing_class=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)
trainer.evaluate(test_dataset)

{'eval_loss': 0.5408523082733154,
 'eval_model_preparation_time': 0.0033,
 'eval_accuracy': 0.9120879120879121,
 'eval_f1': 0.5157894736842106,
 'eval_precision': 0.5414364640883977,
 'eval_recall': 0.49246231155778897,
 'eval_runtime': 6.9616,
 'eval_samples_per_second': 300.648,
 'eval_steps_per_second': 18.817}