In [None]:
# Arabic QA Classification using AAFAQ and AraBERT

In [None]:
import json

import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from datasets import Dataset

In [None]:
# Load Dataset
# The CSV path is relative, so it is resolved against the kernel's working directory.
dataset_path = "AAFAQ_Dataset.csv"
df = pd.read_csv(dataset_path)

In [None]:
# Select Features for Classification
# Names of the annotation columns that are used as prediction targets.
# The order matters: the combined label vector follows this order.
target_columns = [
    "QuestionParticleType",
    "QuestionType",
    "List",
    "AnswerType",
    "Intent",
    "CognitiveLevel",
    "Subjectivity",
    "TemporalContext",
    "PurposeContext",
]

In [None]:
# Encode Labels
# One independent LabelEncoder per target column. Each column of `df` is
# overwritten in place with its integer class indices; the fitted encoders are
# kept so the indices can be mapped back to the original values later.
label_encoders = {col: LabelEncoder() for col in target_columns}
for col, encoder in label_encoders.items():
    df[col] = encoder.fit_transform(df[col])

In [None]:
# Create Combined Label Column
df['labels'] = df[target_columns].values.tolist()

In [None]:
# Train-Test Split
train_df = df.sample(frac=0.8, random_state=42)
test_df = df.drop(train_df.index)

In [None]:
# Tokenization
tokenizer = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabertv02")

def tokenize(batch):
    return tokenizer(batch['QuestionText'], padding="max_length", truncation=True, max_length=128)

In [None]:
# Convert to Hugging Face Dataset
train_ds = Dataset.from_pandas(train_df[['QuestionText', 'labels']])
test_ds = Dataset.from_pandas(test_df[['QuestionText', 'labels']])

train_ds = train_ds.map(tokenize, batched=True)
test_ds = test_ds.map(tokenize, batched=True)

In [None]:
# Prepare Labels
def format_labels(example):
    example['labels'] = torch.tensor(example['labels'], dtype=torch.float)
    return example

train_ds = train_ds.map(format_labels)
test_ds = test_ds.map(format_labels)

In [None]:
# Load Model
model = AutoModelForSequenceClassification.from_pretrained(
    "aubmindlab/bert-base-arabertv02",
    problem_type="multi_label_classification",
    num_labels=len(target_columns),
)

In [None]:
# Training Arguments
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version before upgrading.
training_args = TrainingArguments(
    # Where run artifacts and logs go.
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=10,
    # Evaluate once per epoch; do not write intermediate checkpoints.
    evaluation_strategy="epoch",
    save_strategy="no",
    # Optimization hyperparameters.
    num_train_epochs=5,
    learning_rate=5e-6,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
)

In [None]:
# Metrics
def compute_metrics(pred):
    """Compute the macro-averaged F1 score for one evaluation pass.

    Args:
        pred: Object exposing ``predictions`` (raw model logits) and
            ``label_ids`` (reference labels), as handed over by the Trainer.

    Returns:
        A dict with the single key ``"macro_f1"``.
    """
    logits = torch.tensor(pred.predictions)
    probabilities = torch.sigmoid(logits).numpy()
    # An output counts as predicted when its sigmoid probability exceeds 0.5.
    predicted = probabilities > 0.5
    report = classification_report(
        pred.label_ids,
        predicted,
        output_dict=True,
        zero_division=0,
    )
    macro_f1 = report['macro avg']['f1-score']
    return {"macro_f1": macro_f1}

In [None]:
# Trainer Setup
# NOTE(review): newer transformers releases deprecate the `tokenizer` argument
# in favour of `processing_class` — confirm against the installed version.
trainer = Trainer(
    # What to train and how.
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    # Data: fit on the training split, evaluate on the held-out split.
    train_dataset=train_ds,
    eval_dataset=test_ds,
    # Metric callback invoked on every evaluation pass.
    compute_metrics=compute_metrics,
)

In [None]:
# Train
# Fine-tune the model on train_ds with the settings from training_args.
trainer.train()

In [None]:
# Evaluate
# Run one evaluation pass with the Trainer and print the metrics it returns
# (these include whatever compute_metrics reports).
eval_results = trainer.evaluate()
print("Evaluation Results:", eval_results)

In [None]:
# Save Model
# Store the fine-tuned weights and the tokenizer in the same directory so the
# pair can be reloaded together with from_pretrained().
output_dir = "./arabert-aafaq-model"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

# Also store the class list of every target column. The fitted encoders only
# exist in this kernel; without them the encoded class indices cannot be
# mapped back to the original annotation values after a restart.
label_classes = {
    col: encoder.classes_.tolist() for col, encoder in label_encoders.items()
}
# ensure_ascii=False keeps non-ASCII class names readable in the file.
with open(f"{output_dir}/label_classes.json", "w", encoding="utf-8") as fh:
    json.dump(label_classes, fh, ensure_ascii=False, indent=2)