In [1]:
# ============================================
# Mount Drive and Setup
# ============================================
from google.colab import drive
drive.mount('/content/drive')

!git clone https://github.com/PiotrNawrot/dynamic-pooling.git
%cd dynamic-pooling

!pip install torch transformers datasets numpy scikit-learn matplotlib
!pip install evaluate

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
fatal: destination path 'dynamic-pooling' already exists and is not an empty directory.
/content/dynamic-pooling


In [2]:
# ============================================
# 0. Setup
# ============================================
!pip install --upgrade huggingface_hub
!pip install transformers==4.49.0
!pip install accelerate
!pip install peft==0.5.0
!pip install datasets
!pip install bitsandbytes==0.38.2
!pip install scikit-learn
!pip install matplotlib
!pip install seaborn
!pip install tqdm

import json
import pandas as pd
import numpy as np
import torch
import random
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, precision_recall_fscore_support, accuracy_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset, DatasetDict
from tqdm import tqdm
import evaluate

# Set seed

def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
set_seed()

[31mERROR: Could not find a version that satisfies the requirement bitsandbytes==0.38.2 (from versions: 0.31.8, 0.32.0, 0.32.1, 0.32.2, 0.32.3, 0.33.0, 0.33.1, 0.34.0, 0.35.0, 0.35.1, 0.35.2, 0.35.3, 0.35.4, 0.36.0, 0.36.0.post1, 0.36.0.post2, 0.37.0, 0.37.1, 0.37.2, 0.38.0, 0.38.0.post1, 0.38.0.post2, 0.38.1, 0.39.0, 0.39.1, 0.40.0, 0.40.0.post1, 0.40.0.post2, 0.40.0.post3, 0.40.0.post4, 0.40.1, 0.40.1.post1, 0.40.2, 0.41.0, 0.41.1, 0.41.2, 0.41.2.post1, 0.41.2.post2, 0.41.3, 0.41.3.post1, 0.41.3.post2, 0.42.0, 0.43.0, 0.43.1, 0.43.2, 0.43.3, 0.44.0rc1, 0.44.0, 0.44.1, 0.45.0, 0.45.1, 0.45.2, 0.45.3, 0.45.4)[0m[31m
[0m[31mERROR: No matching distribution found for bitsandbytes==0.38.2[0m[31m


In [3]:
# ============================================
# 1. Load Dataset
# ============================================
def load_jsonl(file_path):
    data = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            data.append(json.loads(line.strip()))
    return data

train_data = load_jsonl("/content/drive/MyDrive/SNLP Group Project/Datasets/AraStance/train.jsonl")
dev_data = load_jsonl("/content/drive/MyDrive/SNLP Group Project/Datasets/AraStance/dev.jsonl")
test_data = load_jsonl("/content/drive/MyDrive/SNLP Group Project/Datasets/AraStance/test.jsonl")

print("Sample train data entry:", train_data[0])

train_df = pd.DataFrame(train_data)
dev_df = pd.DataFrame(dev_data)
test_df = pd.DataFrame(test_data)

print("Train shape:", train_df.shape)

for df in [train_df, dev_df, test_df]:
    df['stance'] = df['stance'].astype(str)

dataset = DatasetDict({
    "train": Dataset.from_pandas(train_df),
    "validation": Dataset.from_pandas(dev_df),
    "test": Dataset.from_pandas(test_df)
})

Sample train data entry: {'filename': 'first_batch/false2_611.json', 'claim': 'ظهور علم مصر على برج خليفة', 'claim_url': 'https://dabegad.com/%d8%ad%d9%82%d9%8a%d9%82%d8%a9-%d8%b8%d9%87%d9%88%d8%b1-%d8%b9%d9%84%d9%85-%d9%85%d8%b5%d8%b1-%d8%b9%d9%84%d9%89-%d8%a8%d8%b1%d8%ac-%d8%ae%d9%84%d9%8a%d9%81%d8%a9/', 'article': ['استمرّت الحضارة المصرية القديمة ثلاثة آلاف عام، حيث ترك ملوكها معالماً تختزل كمّاً هائلاً من المهارة والمعرفة في الهندسة المعمارية والفنون، فما زالت العديد من آثارهم قائمةً حتّى هذا اليوم وبحالة جيدة، فالعديد من الأهرامات والمعابد وتماثيل أبي الهول لا تزال تجذب السياح من جميع أنحاء العالم بعد مرور آلاف السنين على بنائها،[١] كما ارتبطت الأهرامات بالحضارة المصرية في أذهان الناس على الرغم من وجودها في العديد من الحضارات القديمة الأخرى؛ كالحضارة الصينية وحضارة المايا.[٢] بُنيت أهرامات الجيزة الثلاث على هضبة صخرية تقع على الضفة الغربية لنهر النيل بالقرب من الجيزة شمال مصر، وقد كانت هذه الأهرامات إحدى عجائب الدنيا السبعة القديمة، بالإضافة إلى أنّها اختيرت كموقع من مواقع التراث

In [4]:
# ============================================
# 2. Label Encoding
# ============================================
def clean_stance(x):
    if isinstance(x, list):
        return x[0] if len(x) > 0 else "Unknown"
    return x

dataset = dataset.map(lambda x: {"stance": clean_stance(x["stance"])})
LABELS = sorted(list(set(dataset["train"]["stance"])))
label2id = {label: i for i, label in enumerate(LABELS)}
id2label = {v: k for k, v in label2id.items()}
print("Labels:", LABELS)

def encode_labels(example):
    example["label"] = label2id.get(example["stance"], -1)
    return example

dataset = dataset.map(encode_labels).filter(lambda x: x["label"] != -1)

Map:   0%|          | 0/637 [00:00<?, ? examples/s]

Map:   0%|          | 0/136 [00:00<?, ? examples/s]

Map:   0%|          | 0/137 [00:00<?, ? examples/s]

Labels: ["['Agree', 'Agree', 'Agree', 'Agree', 'Agree', 'Agree', 'Agree', 'Agree', 'Discuss']", "['Agree', 'Agree', 'Agree', 'Agree', 'Agree', 'Agree', 'Agree', 'Agree']", "['Agree', 'Agree', 'Agree', 'Agree', 'Agree', 'Agree', 'Agree', 'Discuss']", "['Agree', 'Agree', 'Agree', 'Agree', 'Agree', 'Agree', 'Agree', 'Unrelated', 'Unrelated', 'Unrelated', 'Unrelated', 'Unrelated', 'Unrelated', 'Unrelated', 'Unrelated']", "['Agree', 'Agree', 'Agree', 'Agree', 'Agree', 'Agree', 'Agree', 'Unrelated']", "['Agree', 'Agree', 'Agree', 'Agree', 'Agree', 'Agree', 'Agree']", "['Agree', 'Agree', 'Agree', 'Agree', 'Agree', 'Agree', 'Unrelated', 'Unrelated', 'Unrelated']", "['Agree', 'Agree', 'Agree', 'Agree', 'Agree', 'Agree']", "['Agree', 'Agree', 'Agree', 'Agree', 'Agree', 'Discuss', 'Unrelated']", "['Agree', 'Agree', 'Agree', 'Agree', 'Agree', 'Unrelated', 'Agree', 'Agree', 'Agree']", "['Agree', 'Agree', 'Agree', 'Agree', 'Agree']", "['Agree', 'Agree', 'Agree', 'Agree', 'Discuss', 'Discuss', 'Agree

Map:   0%|          | 0/637 [00:00<?, ? examples/s]

Map:   0%|          | 0/136 [00:00<?, ? examples/s]

Map:   0%|          | 0/137 [00:00<?, ? examples/s]

Filter:   0%|          | 0/637 [00:00<?, ? examples/s]

Filter:   0%|          | 0/136 [00:00<?, ? examples/s]

Filter:   0%|          | 0/137 [00:00<?, ? examples/s]

In [5]:
# ============================================
# 3. Tokenization Function
# ============================================
def tokenize_function(example, tokenizer):
    claim = str(example.get("claim", ""))
    article = str(example.get("article", ""))
    return tokenizer(
        claim,
        article,
        truncation=True,
        padding="max_length",
        max_length=512
    )

In [None]:
# ============================================
# 4. Model Configurations
# ============================================
model_ids = {
    "gemma": "google/gemma-7b",
    "bert": "bert-base-multilingual-cased"
}

results = {}
all_preds = {}
all_labels = {}

for name, model_id in model_ids.items():
    print(f"\n===== Starting model: {name} =====")
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    tokenized_ds = dataset.map(lambda x: tokenize_function(x, tokenizer), batched=True)
    tokenized_ds = tokenized_ds.rename_column("label", "labels")
    tokenized_ds.set_format("torch")

    model = AutoModelForSequenceClassification.from_pretrained(
        model_id,
        num_labels=len(LABELS),
        id2label=id2label,
        label2id=label2id
    )

    args = TrainingArguments(
        output_dir=f"outputs_{name}",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        logging_strategy="epoch",
        per_device_train_batch_size=2,
        per_device_eval_batch_size=4,
        num_train_epochs=3,
        learning_rate=2e-5,
        save_total_limit=1,
        load_best_model_at_end=True,
        metric_for_best_model="f1_macro",
        report_to="none"
    )

    def compute_metrics(eval_pred):
        logits, labels = eval_pred
        preds = np.argmax(logits, axis=1)
        all_preds[name] = preds
        all_labels[name] = labels
        acc = accuracy_score(labels, preds)
        precision, recall, f1_macro, _ = precision_recall_fscore_support(labels, preds, average="macro", zero_division=0)
        precision_micro, recall_micro, f1_micro, _ = precision_recall_fscore_support(labels, preds, average="micro", zero_division=0)
        print("\nClassification Report:\n", classification_report(labels, preds, target_names=LABELS))
        return {
            "accuracy": acc,
            "precision_macro": precision,
            "recall_macro": recall,
            "f1_macro": f1_macro,
            "precision_micro": precision_micro,
            "recall_micro": recall_micro,
            "f1_micro": f1_micro
        }

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=tokenized_ds["train"],
        eval_dataset=tokenized_ds["validation"],
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )

    trainer.train()
    eval_results = trainer.evaluate(tokenized_ds["test"])
    print(f"\n===== Final Test Results for {name} =====")
    print(eval_results)

    logs_df = pd.DataFrame(trainer.state.log_history)
    logs_df.to_csv(f"training_logs_{name}.csv", index=False)
    results[name] = eval_results


===== Starting model: gemma =====


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/637 [00:00<?, ? examples/s]

In [None]:
# ============================================
# 5. Summary + Visualization
# ============================================
summary = pd.DataFrame(results).T
print("\n===== Final Summary Comparison =====")
print(summary)

summary[["accuracy", "f1_macro", "f1_micro"]].plot(kind="bar", figsize=(10,6))
plt.title("Gemma vs BERT on AraStance")
plt.ylabel("Score")
plt.xticks(rotation=0)
plt.grid(axis='y')
plt.legend(loc='lower right')
plt.tight_layout()
plt.show()

In [1]:
# ============================================
# 6. Confusion Matrix and Per-Class F1
# ============================================
for model_name in all_preds:
    y_true = all_labels[model_name]
    y_pred = all_preds[model_name]
    print(f"\nConfusion Matrix for {model_name}:")
    cm = confusion_matrix(y_true, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=LABELS)
    disp.plot(xticks_rotation=45, cmap="Blues")
    plt.title(f"Confusion Matrix - {model_name}")
    plt.tight_layout()
    plt.show()

    print(f"\nPer-class F1 scores for {model_name}:")
    _, _, f1_scores, _ = precision_recall_fscore_support(y_true, y_pred, average=None, zero_division=0)
    plt.figure(figsize=(10, 5))
    sns.barplot(x=LABELS, y=f1_scores)
    plt.title(f"Per-class F1 Score - {model_name}")
    plt.ylabel("F1 Score")
    plt.xlabel("Class")
    plt.ylim(0, 1)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()