In [1]:
!pip install pandas numpy torch transformers optuna
!pip install arabert

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curan

# Thiết lập môi trường và thư viện

In [2]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_linear_schedule_with_warmup
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score
import optuna
import os

# Seed the random number generators so that weight initialisation of the new
# classification head and DataLoader shuffling are repeatable between runs.
# NOTE: some GPU kernels are non-deterministic, so results may still differ slightly.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)  # silently ignored when CUDA is unavailable

# Set device: use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Tải và Khám phá dữ liệu

In [3]:
# Read the labelled training split and the unlabelled split that is scored for
# the submission. Note that `validation_df` is read from test.csv.
train_df = pd.read_csv('/kaggle/input/daquaxuly/train.csv')
validation_df = pd.read_csv('/kaggle/input/daquaxuly/test.csv')

# Preview the first rows of each split, then the class balance of the training labels.
print("Training data:", train_df.head(), sep="\n")
print("\nValidation data:", validation_df.head(), sep="\n")
print("\nLabel distribution:", train_df['label'].value_counts(), sep="\n")

Training data:
     id                                               text           label
0  8167                       و لانني احب الاشياء الراقيه   not_applicable
1  1532  أَثِقُ فِي قُدْرَتِي عَلَى التَّعَامُلِ مَعَ ا...            hope
2  4710  وروضة بات طل الغيث ينسجها حتى إذا نسجت أضحى يد...  not_applicable
3  6084  أَشْعُرُ بِقَلْبِي العَنِيدِ وَهُوَ يَحَارُبُ ...  not_applicable
4  8968  @MoaElshamy بتحسسني اني مرتضى ده كان بيدي واحد...  not_applicable

Validation data:
     id                                               text
0  5813  AhmedGamal On CBC Sat11pm AhmedGamal On CBC Sa...
1  5853                قهر اللهم اني اعوذ بك من قهر الرجال
2   251  : رفيق السوء مثل البعوض لا تحسّ به الا بعد الل...
3  7213  أما والهوى العذري يا اخت عامر حفظتك في حصنٍ من...
4  6848  الله يلعن حيطتكم الغبيه صغار الشرقيه وستبغون صغار

Label distribution:
label
not_applicable    3578
hope              1813
hate              1246
Name: count, dtype: int64


# Tiền xử lý và chuẩn bị dữ liệu

In [4]:
from arabert.preprocess import ArabertPreprocessor

MODEL_NAME = "aubmindlab/bert-base-arabertv02-twitter"

# Initialize the preprocessor
arabert_prep = ArabertPreprocessor(model_name=MODEL_NAME)

# Preprocess the text data (the bound method is passed directly; no lambda needed)
train_df['preprocessed_text'] = train_df['text'].apply(arabert_prep.preprocess)
validation_df['preprocessed_text'] = validation_df['text'].apply(arabert_prep.preprocess)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Map labels to integers.
# The mapping overwrites train_df['label'] in place, so it must only run while the
# column still holds the original string labels. Without this guard, re-running the
# cell would rebuild `labels` / `inv_labels` from the already-encoded integers and
# the submission would contain integers instead of label names.
if not pd.api.types.is_numeric_dtype(train_df['label']):
    # Class ids follow the order in which labels first appear in the training data.
    labels = {label: i for i, label in enumerate(train_df['label'].unique())}
    inv_labels = {i: label for label, i in labels.items()}
    train_df['label'] = train_df['label'].map(labels)

class TextDataset(Dataset):
    """Map-style dataset that tokenizes one text per item and pairs it with its label.

    Args:
        texts: sequence of texts (list, NumPy array or pandas Series).
        labels: sequence of numeric class ids, one per text.
        tokenizer: callable tokenizer; it is invoked with the text and the padding /
            truncation keyword arguments used in ``__getitem__`` and must return a
            mapping with ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_len: length every example is padded / truncated to.

    Raises:
        ValueError: if ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_len=256):
        # Materialise as plain lists so __getitem__ always indexes by position.
        # A pandas Series with a non-default index (e.g. a cross-validation slice)
        # would otherwise be indexed by label and raise KeyError.
        self.texts = list(texts)
        self.labels = list(labels)
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(self.texts)} and {len(self.labels)}"
            )
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        """Return the raw text, its token ids, attention mask and label tensor."""
        text = str(self.texts[item])
        label = self.labels[item]

        # Call the tokenizer directly; `encode_plus` is the deprecated spelling
        # of the same operation.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )

        return {
            'text': text,
            # The tokenizer returns a batch of one; flatten to a 1-D tensor per example.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

tokenizer_config.json:   0%|          | 0.00/476 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

# Định nghĩa mô hình phân loại

In [5]:
class AraBERTClassifier(torch.nn.Module):
    """Thin wrapper around the pretrained sequence-classification model.

    The wrapped model is loaded from ``MODEL_NAME`` with ``n_classes`` output
    labels, and ``forward`` exposes only its logits.
    """

    def __init__(self, n_classes):
        super().__init__()
        # The attribute name `bert` is part of the saved state_dict keys.
        self.bert = AutoModelForSequenceClassification.from_pretrained(
            MODEL_NAME,
            num_labels=n_classes,
        )

    def forward(self, input_ids, attention_mask):
        """Return the logits of the wrapped model for the given batch."""
        return self.bert(input_ids=input_ids, attention_mask=attention_mask).logits

# Định nghĩa hàm huấn luyện, hàm đánh giá và siêu tham số tốt nhất

In [6]:
def train_epoch(
    model,
    data_loader,
    loss_fn,
    optimizer,
    device,
    scheduler,
    n_examples
):
    """Train ``model`` for one pass over ``data_loader``.

    Args:
        model: module called with ``input_ids`` and ``attention_mask`` keyword
            arguments and returning a logits tensor (argmax is taken over dim 1).
        data_loader: iterable of dict batches with ``"input_ids"``,
            ``"attention_mask"`` and ``"labels"`` tensors.
        loss_fn: callable ``loss_fn(logits, targets)`` returning a scalar loss tensor.
        optimizer: optimizer stepped once per batch.
        device: device the batch tensors are moved to.
        scheduler: learning-rate scheduler stepped once per batch, or ``None``.
        n_examples: number of examples used as the accuracy denominator.

    Returns:
        ``(accuracy, mean_loss)``: accuracy is a float64 scalar tensor on ``device``
        and mean_loss is the mean of the per-batch losses. For an empty
        ``data_loader`` the result is ``(0.0 tensor, nan)``.
    """
    model = model.train()
    losses = []
    # Tensor accumulator, so the return value is well-defined even with no batches
    # (an int 0 has no `.double()`).
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)

    for d in data_loader:
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        targets = d["labels"].to(device)

        # Clear gradients before the forward/backward pass so that gradients left
        # over from before this call cannot leak into the first update.
        optimizer.zero_grad()

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        _, preds = torch.max(outputs, dim=1)
        loss = loss_fn(outputs, targets)

        correct_predictions += torch.sum(preds == targets)
        losses.append(loss.item())

        loss.backward()
        # Cap the gradient norm at 1.0 before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        if scheduler is not None:
            scheduler.step()

    if not losses:
        # Nothing was processed: report zero accuracy and an undefined loss.
        return correct_predictions.double(), float("nan")

    return correct_predictions.double() / n_examples, np.mean(losses)

def eval_model(
    model,
    data_loader,
    loss_fn,
    device
):
    """Evaluate ``model`` on ``data_loader`` without updating any weights.

    Each batch is a dict with ``"input_ids"``, ``"attention_mask"`` and
    ``"labels"`` tensors, which are moved to ``device`` before the forward pass.

    Returns:
        ``(accuracy, macro_f1, mean_loss)`` computed over all batches.
    """
    model = model.eval()
    batch_losses, predicted, expected = [], [], []

    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for batch in data_loader:
            ids = batch["input_ids"].to(device)
            mask = batch["attention_mask"].to(device)
            targets = batch["labels"].to(device)

            logits = model(input_ids=ids, attention_mask=mask)
            # The predicted class is the index of the largest logit in each row.
            batch_preds = torch.max(logits, dim=1).indices

            batch_losses.append(loss_fn(logits, targets).item())
            predicted.extend(batch_preds.cpu().numpy())
            expected.extend(targets.cpu().numpy())

    return (
        accuracy_score(expected, predicted),
        f1_score(expected, predicted, average='macro'),
        np.mean(batch_losses),
    )

# Use the best hyperparameters found by an earlier Optuna tuning run
best_params = {'learning_rate': 9.742175923406393e-06, 'batch_size': 8, 'epochs': 4}

print(f"Using best params: {best_params}")

# Optuna optimization that produced `best_params` (already run earlier; kept for
# reference only). It is written as `#` comments instead of a bare triple-quoted
# string so that the cell no longer echoes the whole snippet as its output.
# NOTE(review): the sketch uses a StratifiedKFold instance `skf` that is not defined
# in the visible cells, and the per-fold training body is only a placeholder.
#
# def objective(trial):
#     # Hyperparameters to tune
#     learning_rate = trial.suggest_float("learning_rate", 1e-6, 1e-5, log=True)
#     batch_size = trial.suggest_categorical("batch_size", [8, 16, 32])
#     epochs = trial.suggest_int("epochs", 2, 5)
#     patience = 3
#
#     fold_f1_scores = []
#
#     for fold, (train_indices, val_indices) in enumerate(skf.split(train_df['preprocessed_text'], train_df['label'])):
#         # Training code for each fold...
#         pass
#
#     return np.mean(fold_f1_scores)
#
# study = optuna.create_study(direction="maximize")
# study.optimize(objective, n_trials=16)
# best_params = study.best_params

Using best params: {'learning_rate': 9.742175923406393e-06, 'batch_size': 8, 'epochs': 4}


'\ndef objective(trial):\n    # Hyperparameters to tune\n    learning_rate = trial.suggest_float("learning_rate", 1e-6, 1e-5, log=True)\n    batch_size = trial.suggest_categorical("batch_size", [8, 16, 32])\n    epochs = trial.suggest_int("epochs", 2, 5)\n    patience = 3\n\n    fold_f1_scores = []\n\n    for fold, (train_indices, val_indices) in enumerate(skf.split(train_df[\'preprocessed_text\'], train_df[\'label\'])):\n        # Training code for each fold...\n        pass\n\n    return np.mean(fold_f1_scores)\n\nstudy = optuna.create_study(direction="maximize")\nstudy.optimize(objective, n_trials=16)\nbest_params = study.best_params\n'

In [7]:
# Create the output directory. `exist_ok=True` replaces the check-then-create
# pattern and fails early if 'models' exists but is not a directory.
os.makedirs('models', exist_ok=True)

print('Training final model on full dataset')
print('------------------------------------')

# Create dataset from full training data (no hold-out split is kept here)
train_dataset = TextDataset(
    texts=train_df['preprocessed_text'].values,
    labels=train_df['label'].values,
    tokenizer=tokenizer
)

train_data_loader = DataLoader(
    train_dataset,
    batch_size=best_params['batch_size'],
    shuffle=True
)

# Initialize model, optimizer and a linear learning-rate schedule without warmup
model = AraBERTClassifier(len(labels)).to(device)
optimizer = AdamW(model.parameters(), lr=best_params['learning_rate'])
# The scheduler is stepped once per batch, so the schedule spans every batch of every epoch.
total_steps = len(train_data_loader) * best_params['epochs']
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)
loss_fn = torch.nn.CrossEntropyLoss().to(device)

# Training loop
for epoch in range(best_params['epochs']):
    print(f'Epoch {epoch + 1}/{best_params["epochs"]}')
    train_acc, train_loss = train_epoch(
        model,
        train_data_loader,
        loss_fn,
        optimizer,
        device,
        scheduler,
        len(train_dataset)
    )
    print(f'Train loss {train_loss} accuracy {train_acc}')

# Save the final model (weights only, as a state_dict)
torch.save(model.state_dict(), 'models/final_model.bin')
print('Final model saved as models/final_model.bin')

Training final model on full dataset
------------------------------------


config.json:   0%|          | 0.00/667 [00:00<?, ?B/s]

2025-07-25 08:00:05.378501: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753430405.564940      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753430405.621032      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


model.safetensors:   0%|          | 0.00/541M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv02-twitter and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/4
Train loss 0.7418336396116808 accuracy 0.6403495555220732
Epoch 2/4
Train loss 0.6105539901518678 accuracy 0.7226156395962031
Epoch 3/4
Train loss 0.5146599981170821 accuracy 0.779870423384059
Epoch 4/4
Train loss 0.44326165916330845 accuracy 0.8166340213952087
Final model saved as models/final_model.bin


In [8]:
def _cat_or_empty(chunks, dtype):
    """Concatenate per-batch tensors, or return an empty tensor when there are none."""
    return torch.cat(chunks) if chunks else torch.empty(0, dtype=dtype)


def get_predictions(model, data_loader):
    """Run inference over ``data_loader`` and collect the per-example outputs.

    Args:
        model: ``torch.nn.Module`` called with ``input_ids`` and ``attention_mask``
            keyword arguments and returning a logits tensor; the class dimension
            is dim 1.
        data_loader: iterable of dict batches with ``"input_ids"`` and
            ``"attention_mask"`` tensors. A ``"labels"`` tensor is optional and
            is collected when present.

    Returns:
        ``(predictions, prediction_probs, real_values)`` as CPU tensors:
        the predicted class index per example, the softmax probabilities per
        example, and the labels found in the batches (empty when the batches
        carry no labels). All three are empty for an empty ``data_loader``.
    """
    model = model.eval()

    # Feed the inputs to the device the model's weights live on, instead of
    # relying on a module-level `device` variable.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    pred_batches = []
    prob_batches = []
    label_batches = []
    with torch.no_grad():
        for d in data_loader:
            outputs = model(
                input_ids=d["input_ids"].to(run_device),
                attention_mask=d["attention_mask"].to(run_device)
            )
            _, preds = torch.max(outputs, dim=1)
            probs = torch.nn.functional.softmax(outputs, dim=1)

            # Keep whole batches on the CPU and concatenate once at the end,
            # rather than splitting each batch into one tensor per example.
            pred_batches.append(preds.cpu())
            prob_batches.append(probs.cpu())
            if "labels" in d:
                label_batches.append(d["labels"].cpu())

    predictions = _cat_or_empty(pred_batches, torch.long)
    prediction_probs = _cat_or_empty(prob_batches, torch.float32)
    real_values = _cat_or_empty(label_batches, torch.long)
    return predictions, prediction_probs, real_values

# Tải mô hình cuối cùng đã huấn luyện

In [9]:
# Load the final trained model
final_model = AraBERTClassifier(len(labels))
# `map_location=device` lets a checkpoint written during a GPU run be restored on
# whatever device is available now (e.g. a CPU-only session).
# NOTE: torch.load unpickles the file; only load checkpoints from a trusted source.
final_model.load_state_dict(torch.load('models/final_model.bin', map_location=device))
final_model = final_model.to(device)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv02-twitter and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Dự đoán trên tập Test (biến `validation_df`) và tạo file Submission

In [10]:
# Inference inputs: the preprocessed texts of the unlabelled split.
validation_texts = validation_df['preprocessed_text'].values
# TextDataset needs one label per text, so zeros are supplied as placeholders.
validation_labels = np.zeros(len(validation_texts))

validation_dataset = TextDataset(
    texts=validation_texts, labels=validation_labels, tokenizer=tokenizer
)
validation_data_loader = DataLoader(
    validation_dataset, batch_size=best_params['batch_size']
)

# Only the predicted class indices are used; probabilities and the placeholder
# labels returned alongside them are discarded.
predictions, _, _ = get_predictions(final_model, validation_data_loader)

# Translate class indices back to their label names.
predicted_labels = [inv_labels[class_idx] for class_idx in predictions.tolist()]

# Write the submission: one row per id with its predicted label.
submission_df = pd.DataFrame({'id': validation_df['id'], 'label': predicted_labels})
submission_df.to_csv('submission.csv', index=False)

print("Submission file created successfully!", submission_df.head(), sep="\n")

Submission file created successfully!
     id           label
0  5813  not_applicable
1  5853  not_applicable
2   251  not_applicable
3  7213            hope
4  6848  not_applicable
