In [None]:
!pip install transformers datasets wandb scikit-learn imbalanced-learn nlpaug accelerate
# from google.colab import drive
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer,  TrainingArguments, Trainer, EarlyStoppingCallback
from datasets import Dataset

import os
import pandas as pd
import re
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import torch
import wandb
import zipfile

# Seed the global PyTorch and NumPy random number generators with one shared value.
SEED = 42
for seed_rng in (torch.manual_seed, np.random.seed):
    seed_rng(SEED)

In [None]:
# drive.mount('/content/drive')

In [None]:
from datetime import datetime
import os

# Timestamp of this session formatted as YYYY-MM-DD_HH-MM (minute resolution).
CURRENT_DATE = format(datetime.now(), "%Y-%m-%d_%H-%M")


In [None]:
# Authenticate the Weights & Biases CLI; run_model() later reports to W&B via wandb.init().
!wandb login

In [None]:
dataset_path = '/content/drive/MyDrive/polarization_dataset.zip'
extract_path = '/content/drive/MyDrive/polarization_dataset/'

if os.path.exists(extract_path) and len(os.listdir(extract_path)) > 0:
    print("Dataset already exists and is extracted. Skipping download.")
else:
    print("Dataset not found. Downloading...")
    !wget -O "{dataset_path}" "https://www.codabench.org/datasets/download/1c1791a1-a41b-4895-a636-49fb7234cb48/"
    print("Download complete. Extracting...")

    os.makedirs(extract_path, exist_ok=True)

    with zipfile.ZipFile(dataset_path, 'r') as zip_ref:
        zip_ref.extractall(extract_path)

    print(f"Dataset extracted to: {extract_path}")

# SUBTASK 1

In [None]:
print("Contents of subtask1:")
# Load train.csv or similar
train_path = 'dataset/subtask1/train/eng.csv'  # adjust if needed
# train_path = '/content/drive/MyDrive/polarization_dataset/subtask1/train/eng.csv'  # adjust if needed
if os.path.exists(train_path):
    df = pd.read_csv(train_path)
    print(f"Loaded {len(df)} samples from {train_path}")
    print(df.head())
else:
    print("Train file not found, listing all files:")
    !find polarization_dataset/ -name "*.csv"

In [None]:
# Text length analysis: number of characters per sample.
# The vectorised `.str.len()` accessor is used instead of `.apply(len)` so that
# a missing text yields NaN rather than raising a TypeError.
df['text_length'] = df['text'].str.len()

print("\nTrain text length stats:")
print(df['text_length'].describe())


In [None]:
# Visualization: histogram (with KDE) of the per-sample character counts.
# A single axes is sufficient; a 1x2 grid would leave an empty second panel.
fig, ax = plt.subplots(figsize=(6, 5))
sns.histplot(df['text_length'], ax=ax, kde=True)
ax.set_title('Train Text Length Distribution')
ax.set_xlabel('Characters per text')
ax.set_ylabel('Number of samples')
plt.show()


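The distribution above informs the choice of the `max_length` parameter passed to the tokenizer (`number_of_tokens` in the configuration cell). Note that `text_length` counts characters, not words or tokens. If most sentences are around 50 words, using `max_length=128` is safe and usually more than enough for a Transformer model.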

In [None]:
def compare_polarized_vs_unpolarized(dataframe = None, language = "eng", subtask_id = 1):
  """Plot the class balance of the ``polarization`` column as a bar chart.

  Args:
    dataframe: Optional DataFrame containing a ``polarization`` column. When it
      is given, ``language`` and ``subtask_id`` are ignored and no file is read.
    language: Language of the training CSV to load when ``dataframe`` is
      ``None``; must be ``'eng'`` or ``'swa'``.
    subtask_id: Subtask whose training CSV is loaded when ``dataframe`` is
      ``None``; must be 1, 2 or 3.

  Returns:
    None. The function prints summary information and shows a matplotlib
    figure. If the training CSV does not exist, a message is printed and
    nothing is plotted.

  Raises:
    ValueError: If ``language`` or ``subtask_id`` is not one of the supported
      values (only checked when ``dataframe`` is ``None``).
  """
  if dataframe is None:
    if language not in ['eng', 'swa']:
      raise ValueError(f"Unsupported language {language!r}; expected 'eng' or 'swa'.")
    if subtask_id not in [1, 2, 3]:
      raise ValueError(f"Unsupported subtask_id {subtask_id!r}; expected 1, 2 or 3.")

    # The file read here is the *training* split of the requested subtask.
    train_path = f"dataset/subtask{subtask_id}/train/{language}" + ".csv"
    if not os.path.exists(train_path):
      # Stop here: without a frame there is nothing to summarise or plot.
      print(f"Train file not found: {train_path}")
      return

    df = pd.read_csv(train_path)
    print(f"Train set: {len(df)} samples")
    print(df.head())

    # Class distribution (relative frequencies)
    print("\nTrain class distribution:")
    print(df['polarization'].value_counts(normalize=True))
    title = 'Train Polarization Distribution'
  else:
    # The caller's frame may be any split, so it is labelled neutrally.
    df = dataframe
    print(f"Dataset: {len(df)} samples")
    title = 'Polarization Distribution'

  # Class balance plot (absolute counts per label)
  plt.figure(figsize=(6, 4))
  df['polarization'].value_counts().plot(kind='bar')
  plt.title(title)
  plt.show()

In [None]:
# Summarise and plot the class balance of the English training CSV (default subtask_id=1).
compare_polarized_vs_unpolarized(language="eng")

In [None]:
!pip install emoji

In [None]:
# NOTE(review): run_count is not read anywhere in the visible part of this
# notebook — confirm it is still needed.
run_count = 0


In [None]:
# --- Run configuration -------------------------------------------------------
# Checkpoint used for both the tokenizer (AutoTokenizer) and the classifier
# (AutoModelForSequenceClassification). Previously tried alternatives are kept
# commented out below.
# model_name = 'Twitter/twhin-bert-base'#@param
# model_name = 'distilbert/distilbert-base-multilingual-cased'#@param
# model_name = 'microsoft/mdeberta-v3-base'#@param
# model_name = 'metabloit/swahBERT'#@param
# model_name="castorini/afriberta_large"
model_name="Davlan/afro-xlmr-large"
# Dropout values written onto the model config inside run_model().
hidden_dropout_prob = 0.3
attention_probs_dropout_prob = 0.3
# max_length used when tokenizing the train/validation texts.
number_of_tokens = 128

# Free-text fields interpolated into the W&B `notes` string. In the visible
# notebook `problem_type` is only logged there; it is not set on the model config.
heading = "Training"#@param
problem_type = "binary_classification"#@param
# model_name = "Davlan/afro-xlmr-base"#@param
desc=""#@param


# Optimisation hyperparameters; run_model() forwards them to TrainingArguments.
learning_rate= 2e-5 #@param
weight_decay=0.01 #@param
# number_of_epochs = 7#@param
number_of_epochs = 11#@param
per_device_train_batch_size= 16#@param
# True because the tracked metric (macro F1) should be maximised.
greater_is_better=True #@param
gradient_accumulation_steps=2 #@param
warmup_steps=100#@param
max_grad_norm=1.0#@param
label_smoothing_factor=0.1#@param
# Patience passed to EarlyStoppingCallback in run_model().
early_stopping_patience=5#@param
lr_scheduler_type="cosine"#@param


# distilbert/distilbert-base-uncased
# model_name = 'microsoft/deberta-v3-base'#@param


In [None]:
import re
import emoji

def clean_text(text):
    """Normalise one raw text before tokenisation.

    Steps, in order: emojis are replaced by their textual names (delimited by
    spaces instead of colons), ``http...`` links and ``@mentions`` are removed,
    ``#`` characters are dropped (the hashtag word itself is kept), runs of
    whitespace are collapsed to a single space, and the result is lower-cased.

    Args:
        text: The raw text. Non-string values (e.g. ``None`` or the NaN that
            pandas uses for missing cells) are treated as empty text.

    Returns:
        The cleaned, lower-cased string; ``""`` for non-string input.
    """
    if not isinstance(text, str):
        # Missing values would otherwise make emoji.demojize / re.sub raise TypeError.
        return ""

    # Convert emojis to text descriptions (spaces instead of colons)
    text = emoji.demojize(text, delimiters=(" ", " "))
    text = re.sub(r'http\S+', '', text)   # links
    text = re.sub(r'@\w+', '', text)      # user mentions
    text = re.sub(r'#', '', text)         # hashtag marker only
    text = re.sub(r'\s+', ' ', text).strip()
    return text.lower()

# Normalise every training text with clean_text().
df['text'] = df['text'].apply(clean_text)

# Split train into train and val (80/20, stratified on the label, fixed seed).
train_df, val_df = train_test_split(df, test_size=0.2, stratify=df['polarization'], random_state=42)
print(f"Train size: {len(train_df)}, Val size: {len(val_df)}")

# Create datasets. After the shuffled split the pandas index is no longer a
# plain range, and `from_pandas` would store it as an extra
# `__index_level_0__` column; preserve_index=False keeps only the two real columns.
train_dataset = Dataset.from_pandas(train_df[['text', 'polarization']], preserve_index=False)
val_dataset = Dataset.from_pandas(val_df[['text', 'polarization']], preserve_index=False)

In [None]:
# Tokenizer belonging to the checkpoint selected in `model_name`.
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    """Tokenize a batch of examples and attach their labels.

    Args:
        examples: Mapping with a ``'text'`` entry (passed to the notebook-level
            ``tokenizer``) and a ``'polarization'`` entry (copied to ``'labels'``).

    Returns:
        The tokenizer output with an additional ``'labels'`` entry.
    """
    # Every text is truncated/padded to exactly `number_of_tokens` tokens.
    tokenizer_options = dict(
        truncation=True,
        padding='max_length',
        max_length=number_of_tokens,
    )
    encoded = tokenizer(examples['text'], **tokenizer_options)
    encoded['labels'] = examples['polarization']
    return encoded

# Tokenize both splits batch-wise.
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

# Expose only the model inputs and labels, as torch tensors.
torch_columns = ['input_ids', 'attention_mask', 'labels']
for tokenized_split in (train_dataset, val_dataset):
    tokenized_split.set_format(type='torch', columns=torch_columns)

print("Preprocessing complete.")

In [None]:
# Label describing which language data this run covers. In the visible notebook
# it is only used in the W&B run name and notes; it does not select which CSV is loaded.
dataset = ["eng", "swa", "eng + swa"]
dataset_type = dataset[0]

In [None]:
# Free-text description attached to the W&B run by run_model(). It records the
# data/tokenizer setup and every hyperparameter of the configuration cell.
# The wording describes this notebook's binary subtask-1 setup (2 labels,
# `train_dataset` / `val_dataset`).
notes = f'''
====================================
Date reported: {CURRENT_DATE}
====================================

{heading}
{desc}
Tokenized data for {dataset_type} using {model_name} tokenizer:
- Max length: {number_of_tokens} tokens (with truncation and padding to max_length)
- Labels: integer class ids taken from the `polarization` column

Datasets prepared for PyTorch:
- train_dataset: input_ids, attention_mask, labels
- val_dataset: input_ids, attention_mask, labels

Model initialized:
- Base: {model_name}
- num_labels: 2 (binary polarization label)
- learning_rate: {learning_rate}
- weight_decay: {weight_decay}
- epoch: {number_of_epochs}
- hidden_dropout_prob: {hidden_dropout_prob}
- attention_probs_dropout_prob: {attention_probs_dropout_prob}
- problem_type: {problem_type}
- greater_is_better: {greater_is_better}
- gradient_accumulation_steps: {gradient_accumulation_steps}
- warmup_steps: {warmup_steps}
- max_grad_norm: {max_grad_norm}
- early_stopping_patience: {early_stopping_patience}
- label_smoothing_factor: {label_smoothing_factor}
- lr_scheduler_type: {lr_scheduler_type}

Tokenizer and model ready for {dataset_type} subtask 1 training.
Train size: {len(train_df)}, Val size: {len(val_df)} '''


In [None]:
def run_model():
    """Fine-tune ``model_name`` as a 2-class classifier and save the result.

    All settings are read from notebook-level variables (``model_name``, the
    dropout values, the optimisation hyperparameters, ``notes``,
    ``dataset_type``) and the data from ``train_dataset`` / ``val_dataset`` /
    ``train_df``.

    The run is tracked in the W&B project ``subtask1``. Training evaluates and
    checkpoints once per epoch, tracks macro F1 (``eval_f1``), applies early
    stopping, reloads the best checkpoint at the end and saves it to
    ``models/<last path component of model_name>``.

    The loss is a cross-entropy weighted with the 'balanced' class weights of
    the training labels, with ``label_smoothing_factor`` applied.

    Returns:
        None.
    """
    # Timestamp of this particular run; a local name is used so the
    # notebook-level CURRENT_DATE is not shadowed.
    run_timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M")
    wandb.init(project=f'subtask1', name=f'{model_name}{run_timestamp}{dataset_type}', notes=notes)

    # try/finally: the W&B run must be closed even when training fails,
    # otherwise the next wandb.init() continues a dangling run.
    try:
        config = AutoConfig.from_pretrained(model_name)
        config.hidden_dropout_prob = hidden_dropout_prob
        config.attention_probs_dropout_prob = attention_probs_dropout_prob
        config.num_labels = 2
        model = AutoModelForSequenceClassification.from_pretrained(model_name, config=config)

        # Class weights ('balanced': inversely proportional to class frequency).
        class_weights = compute_class_weight(
            'balanced',
            classes=np.unique(train_df['polarization']),
            y=train_df['polarization'],
        )
        class_weights = torch.tensor(class_weights, dtype=torch.float)
        print(f"Class weights: {class_weights}")

        class _ClassWeightedTrainer(Trainer):
            """Trainer whose loss is a class-weighted, label-smoothed cross-entropy."""

            def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
                # **kwargs absorbs extra arguments (e.g. num_items_in_batch)
                # that newer transformers versions pass to compute_loss.
                labels = inputs["labels"]
                # The model is called without labels so it does not compute
                # its own (unweighted) loss.
                features = {key: value for key, value in inputs.items() if key != "labels"}
                outputs = model(**features)
                logits = outputs.logits
                loss_fct = torch.nn.CrossEntropyLoss(
                    weight=class_weights.to(logits.device),
                    label_smoothing=label_smoothing_factor,
                )
                loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))
                return (loss, outputs) if return_outputs else loss

        # Compute metrics
        def compute_metrics(eval_pred):
            """Return the macro-averaged F1 of the argmax predictions."""
            predictions, labels = eval_pred
            predictions = np.argmax(predictions, axis=1)
            f1 = f1_score(labels, predictions, average='macro')
            return {'f1': f1}

        training_args = TrainingArguments(
            output_dir='output',
            num_train_epochs=number_of_epochs,
            per_device_train_batch_size=per_device_train_batch_size,
            per_device_eval_batch_size=8,
            learning_rate=learning_rate,
            weight_decay=weight_decay,
            logging_steps=10,
            eval_strategy='epoch',
            save_strategy='epoch',
            load_best_model_at_end=True,
            metric_for_best_model='eval_f1',
            report_to='wandb',
            seed=42,
            # Taken from the configuration cell instead of being hard-coded.
            greater_is_better=greater_is_better,
            gradient_accumulation_steps=gradient_accumulation_steps,
            warmup_steps=warmup_steps,
            max_grad_norm=max_grad_norm,
            label_smoothing_factor=label_smoothing_factor,
            lr_scheduler_type=lr_scheduler_type,
        )

        trainer = _ClassWeightedTrainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            compute_metrics=compute_metrics,
            callbacks=[EarlyStoppingCallback(early_stopping_patience=early_stopping_patience)],
        )

        trainer.train()
        trainer.save_model(f'models/{model_name.split("/")[-1]}')

        print("training complete.")
    finally:
        wandb.finish()

In [None]:
# Launch fine-tuning; run_model() saves the resulting model under models/<checkpoint name>.
run_model()

In [None]:

# model_name = 'microsoft/deberta-v3-base'
# Tokenizer of the checkpoint selected in `model_name`.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the fine-tuned model that run_model() wrote with trainer.save_model().
# The inner quotes are double quotes: reusing the outer quote character inside
# an f-string is a SyntaxError before Python 3.12.
model = AutoModelForSequenceClassification.from_pretrained(f'models/{model_name.split("/")[-1]}')

# Set model to eval mode (disables dropout). The trailing semicolon stops the
# notebook from printing the full module repr as the cell output.
model.eval();


In [None]:
# Load the English dev split and clean its texts with the same clean_text()
# that was applied to the training data.
dev_path = 'dataset/subtask1/dev/eng.csv'
df_dev = pd.read_csv(dev_path).assign(text=lambda frame: frame['text'].apply(clean_text))
print(df_dev.head())

In [None]:

# Create dataset (texts only; the labels are what gets predicted).
# The dev texts use the same max_length as training (`number_of_tokens`).
dev_dataset = Dataset.from_pandas(df_dev[['text']])
dev_dataset = dev_dataset.map(
    lambda batch: tokenizer(batch['text'], truncation=True, padding='max_length', max_length=number_of_tokens),
    batched=True,
)
dev_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])

# Predict
from torch.utils.data import DataLoader

# Run inference on the GPU when one is available; model and batches must live
# on the same device.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
dataloader = DataLoader(dev_dataset, batch_size=16)

predictions = []
with torch.no_grad():
    for batch in dataloader:
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        outputs = model(**batch)
        preds = torch.argmax(outputs.logits, dim=1)
        predictions.extend(preds.cpu().tolist())

# Create submission
submission_path = 'submission/pred_eng.csv'
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
submission = df_dev[['id']].copy()
submission['polarization'] = predictions
submission.to_csv(submission_path, index=False)

print(f"Submission generated and saved to {submission_path}")
print(f"Predicted class distribution: {pd.Series(predictions).value_counts()}")
print(submission.head())