In [1]:
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
os.environ["TORCH_USE_CUDA_DSA"] = "1"


In [2]:
import logging
loggers = [logging.getLogger(name) for name in logging.root.manager.loggerDict]
for logger in loggers:
    if "transformers" in logger.name.lower():
        logger.setLevel(logging.ERROR)

In [3]:
from models.data import ArabicSocialMediaDataModule

In [9]:
# Initialize the data module
# data_module = HC3TextDataModule()
data_module = ArabicSocialMediaDataModule()

data_module.setup()

In [5]:
# Define the model (you can switch between different models)

from models.models import LitXLMRobertaModel
import pytorch_lightning as pl
from pytorch_lightning.callbacks import (
    ModelCheckpoint,
    EarlyStopping,
)


In [6]:
class CrossModelExperiment:
    def __init__(self, max_epochs=100, fit_model=True):
        self.max_epochs = max_epochs
        self.fit_model = fit_model
        self.results = {}
        self.chcekpoints_path = "trained_detectors/Arabic/ArabicSocialMediaDataset/{train_model}AIDetector/checkpoints"
        
    def _get_callbacks(self, train_model):
        early_stopping = EarlyStopping(
            monitor="val_loss",
            min_delta=0.0,
            patience=3,
            verbose=True,
            mode="min",
        )
        
        checkpoint = ModelCheckpoint(
            monitor="val_loss",
            dirpath=self.chcekpoints_path.format(train_model=train_model.title()),
            filename="best-checkpoint",
            save_top_k=1,
            mode="min",
        )
        
        return [early_stopping, checkpoint]
    
    def _test_on_model(self, trainer, model, test_model, train_model):
        # Load the best checkpoint before testing
        checkpoint_path = self.chcekpoints_path.format(train_model=train_model.title())
        checkpoint_path +='/best-checkpoint.ckpt'
        model = LitXLMRobertaModel.load_from_checkpoint(checkpoint_path)
        
        test_datamodule = ArabicSocialMediaDataModule(models=[test_model])
        test_datamodule.setup()
        
        results = trainer.test(model, test_datamodule.test_dataloader())[0]
        return {
            'accuracy': results['test_acc'],
            'precision': results['test_precision'],
            'recall': results['test_recall'],
            'f1': results['test_f1'],
            'loss': results['test_loss']
        }
    
    def run_experiment(self, train_model, test_models):        
        # Initialize components
        model = LitXLMRobertaModel()
        train_datamodule = ArabicSocialMediaDataModule(models=[train_model])
        trainer = pl.Trainer(
            devices=1,
            max_epochs=self.max_epochs,
            accelerator="auto",
            val_check_interval=0.25,
            check_val_every_n_epoch=1,
            callbacks=self._get_callbacks(train_model),
        )
        
        # Train the model
        if self.fit_model:
            print(f"\nTraining on {train_model} data...")
            trainer.fit(model, train_datamodule)
        
        # Test on all specified models
        results = {}
        for test_model in test_models:
            print(f"\nTesting on {test_model} data...")
            results[test_model] = self._test_on_model(trainer, model, test_model, train_model)
        
        # Store results
        self.results[train_model] = results
        
        # Display results
        self._display_results(train_model, results)
        
        # Print checkpoint location
        checkpoint_dir = self.chcekpoints_path.format(train_model=train_model.title())
        print(f"\nBest model checkpoint saved at: {checkpoint_dir}/best-checkpoint.ckpt")
        
        return results
    
    def _display_results(self, train_model, results):
        print(f"\nResults for model trained on {train_model}:")
        print("-" * 80)
        print(f"{'Test Model':<15} {'Accuracy':<10} {'Precision':<10} {'Recall':<10} {'F1':<10} {'Loss':<10}")
        
        print("-" * 80)
        for test_model, metrics in results.items():
            print(
                f"{test_model:<15}"
                f"{metrics['accuracy']:<10.4f}"
                f"{metrics['precision']:<10.4f}"
                f"{metrics['recall']:<10.4f}"
                f"{metrics['f1']:<10.4f}"
                f"{metrics['loss']:<10.4f}"
            )
        print("-" * 80)

In [8]:
available_models = ["allam", "jais-batched", "llama-batched", "openai"]
experiment = CrossModelExperiment(fit_model=True)

all_results = {}
for train_model in available_models:
    print(f"\n{'='*50}")
    print(f"Training on {train_model}")
    print(f"{'='*50}")
    results = experiment.run_experiment(
        train_model=train_model,
        test_models=available_models
    )
    all_results[train_model] = results


Training on allam


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs



Training on allam data...


You are using a CUDA device ('NVIDIA A100-SXM4-80GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

   | Name            | Type                                | Params | Mode 
---------------------------------------------------------------------------------
0  | val_accuracy    | BinaryAccuracy                      | 0      | train
1  | test_accuracy   | BinaryAccuracy                      | 0      | train
2  | train_accuracy  | BinaryAccuracy                      | 0      | train
3  | xlm_roberta     | XLMRobertaForSequenceClassification | 278 M  | eval 
4  | fc              | Linear                              | 3      | train
5  | activation      | Sigmoid                           

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/home/majed_alshaibani/Projects/ai-content-detection-dataset/venv/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=254` in the `DataLoader` to improve performance.
/home/majed_alshaibani/Projects/ai-content-detection-dataset/venv/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=254` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved. New best score: 0.543


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.055 >= min_delta = 0.0. New best score: 0.487


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.053 >= min_delta = 0.0. New best score: 0.434


Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.042 >= min_delta = 0.0. New best score: 0.392


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.001 >= min_delta = 0.0. New best score: 0.391


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.058 >= min_delta = 0.0. New best score: 0.332


Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.016 >= min_delta = 0.0. New best score: 0.316


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.004 >= min_delta = 0.0. New best score: 0.313


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.046 >= min_delta = 0.0. New best score: 0.266


Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Monitored metric val_loss did not improve in the last 3 records. Best score: 0.266. Signaling Trainer to stop.



Testing on allam data...


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
/home/majed_alshaibani/Projects/ai-content-detection-dataset/venv/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=254` in the `DataLoader` to improve performance.


Testing: |          | 0/? [00:00<?, ?it/s]


Testing on jais-batched data...


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |          | 0/? [00:00<?, ?it/s]


Testing on llama-batched data...


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |          | 0/? [00:00<?, ?it/s]


Testing on openai data...


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |          | 0/? [00:00<?, ?it/s]


Results for model trained on allam:
--------------------------------------------------------------------------------
Test Model      Accuracy   Precision  Recall     F1         Loss      
--------------------------------------------------------------------------------
allam          0.9197    0.9529    0.8708    0.9069    0.2248    
jais-batched   0.9578    0.9549    0.9493    0.9506    0.1527    
llama-batched  0.7620    0.9313    0.5485    0.6831    0.5205    
openai         0.9729    0.9557    0.9821    0.9680    0.1288    
--------------------------------------------------------------------------------

Best model checkpoint saved at: outputs/Arabic/ArabicSocialMediaDataset/AllamAIDetector/checkpoints/best-checkpoint.ckpt

Training on jais-batched


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs



Training on jais-batched data...


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

   | Name            | Type                                | Params | Mode 
---------------------------------------------------------------------------------
0  | val_accuracy    | BinaryAccuracy                      | 0      | train
1  | test_accuracy   | BinaryAccuracy                      | 0      | train
2  | train_accuracy  | BinaryAccuracy                      | 0      | train
3  | xlm_roberta     | XLMRobertaForSequenceClassification | 278 M  | eval 
4  | fc              | Linear                              | 3      | train
5  | activation      | Sigmoid                             | 0      | train
6  | train_precision | BinaryPrecision                     | 0      | train
7  | val_precision   | BinaryPrecision                     | 0      | train
8  | test_precision  | BinaryPrecision                     | 0      | train
9  | train_recall    | BinaryRecall                        | 0      | train
10 | val_recall      | BinaryRecall    

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved. New best score: 0.479


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.104 >= min_delta = 0.0. New best score: 0.375


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.031 >= min_delta = 0.0. New best score: 0.344


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.061 >= min_delta = 0.0. New best score: 0.282


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.052 >= min_delta = 0.0. New best score: 0.231


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.020 >= min_delta = 0.0. New best score: 0.210


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.032 >= min_delta = 0.0. New best score: 0.178


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.030 >= min_delta = 0.0. New best score: 0.148


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.020 >= min_delta = 0.0. New best score: 0.127


Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.022 >= min_delta = 0.0. New best score: 0.105


Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.010 >= min_delta = 0.0. New best score: 0.095


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.010 >= min_delta = 0.0. New best score: 0.084


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.006 >= min_delta = 0.0. New best score: 0.079


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.009 >= min_delta = 0.0. New best score: 0.070


Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.007 >= min_delta = 0.0. New best score: 0.063


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.002 >= min_delta = 0.0. New best score: 0.061


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.002 >= min_delta = 0.0. New best score: 0.059


Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Monitored metric val_loss did not improve in the last 3 records. Best score: 0.059. Signaling Trainer to stop.



Testing on allam data...


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |          | 0/? [00:00<?, ?it/s]


Testing on jais-batched data...


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |          | 0/? [00:00<?, ?it/s]


Testing on llama-batched data...


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |          | 0/? [00:00<?, ?it/s]


Testing on openai data...


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |          | 0/? [00:00<?, ?it/s]


Results for model trained on jais-batched:
--------------------------------------------------------------------------------
Test Model      Accuracy   Precision  Recall     F1         Loss      
--------------------------------------------------------------------------------
allam          0.8835    0.9661    0.7808    0.8585    0.5381    
jais-batched   0.9749    0.9707    0.9667    0.9680    0.0856    
llama-batched  0.7520    0.9480    0.5107    0.6558    0.7817    
openai         0.9458    0.9695    0.9091    0.9363    0.2151    
--------------------------------------------------------------------------------

Best model checkpoint saved at: outputs/Arabic/ArabicSocialMediaDataset/Jais-BatchedAIDetector/checkpoints/best-checkpoint.ckpt

Training on llama-batched


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs



Training on llama-batched data...


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

   | Name            | Type                                | Params | Mode 
---------------------------------------------------------------------------------
0  | val_accuracy    | BinaryAccuracy                      | 0      | train
1  | test_accuracy   | BinaryAccuracy                      | 0      | train
2  | train_accuracy  | BinaryAccuracy                      | 0      | train
3  | xlm_roberta     | XLMRobertaForSequenceClassification | 278 M  | eval 
4  | fc              | Linear                              | 3      | train
5  | activation      | Sigmoid                             | 0      | train
6  | train_precision | BinaryPrecision                     | 0      | train
7  | val_precision   | BinaryPrecision                     | 0      | train
8  | test_precision  | BinaryPrecision                     | 0      | train
9  | train_recall    | BinaryRecall                        | 0      | train
10 | val_recall      | BinaryRecall    

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved. New best score: 0.660


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.103 >= min_delta = 0.0. New best score: 0.557


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.217 >= min_delta = 0.0. New best score: 0.340


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.152 >= min_delta = 0.0. New best score: 0.188


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.080 >= min_delta = 0.0. New best score: 0.107


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.031 >= min_delta = 0.0. New best score: 0.076


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.009 >= min_delta = 0.0. New best score: 0.067


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.004 >= min_delta = 0.0. New best score: 0.063


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.009 >= min_delta = 0.0. New best score: 0.054


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.002 >= min_delta = 0.0. New best score: 0.053


Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.006 >= min_delta = 0.0. New best score: 0.046


Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Monitored metric val_loss did not improve in the last 3 records. Best score: 0.046. Signaling Trainer to stop.



Testing on allam data...


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |          | 0/? [00:00<?, ?it/s]


Testing on jais-batched data...


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |          | 0/? [00:00<?, ?it/s]


Testing on llama-batched data...


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |          | 0/? [00:00<?, ?it/s]


Testing on openai data...


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |          | 0/? [00:00<?, ?it/s]


Results for model trained on llama-batched:
--------------------------------------------------------------------------------
Test Model      Accuracy   Precision  Recall     F1         Loss      
--------------------------------------------------------------------------------
allam          0.6516    0.9574    0.2868    0.4315    1.2646    
jais-batched   0.6355    0.9593    0.2530    0.3909    1.3174    
llama-batched  0.9910    0.9881    0.9856    0.9866    0.0474    
openai         0.6004    0.8862    0.1807    0.2925    1.4417    
--------------------------------------------------------------------------------

Best model checkpoint saved at: outputs/Arabic/ArabicSocialMediaDataset/Llama-BatchedAIDetector/checkpoints/best-checkpoint.ckpt

Training on openai


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs



Training on openai data...


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

   | Name            | Type                                | Params | Mode 
---------------------------------------------------------------------------------
0  | val_accuracy    | BinaryAccuracy                      | 0      | train
1  | test_accuracy   | BinaryAccuracy                      | 0      | train
2  | train_accuracy  | BinaryAccuracy                      | 0      | train
3  | xlm_roberta     | XLMRobertaForSequenceClassification | 278 M  | eval 
4  | fc              | Linear                              | 3      | train
5  | activation      | Sigmoid                             | 0      | train
6  | train_precision | BinaryPrecision                     | 0      | train
7  | val_precision   | BinaryPrecision                     | 0      | train
8  | test_precision  | BinaryPrecision                     | 0      | train
9  | train_recall    | BinaryRecall                        | 0      | train
10 | val_recall      | BinaryRecall    

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved. New best score: 0.382


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.098 >= min_delta = 0.0. New best score: 0.284


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.052 >= min_delta = 0.0. New best score: 0.233


Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.060 >= min_delta = 0.0. New best score: 0.173


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.099 >= min_delta = 0.0. New best score: 0.074


Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.024 >= min_delta = 0.0. New best score: 0.050


Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Monitored metric val_loss did not improve in the last 3 records. Best score: 0.050. Signaling Trainer to stop.



Testing on allam data...


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |          | 0/? [00:00<?, ?it/s]


Testing on jais-batched data...


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |          | 0/? [00:00<?, ?it/s]


Testing on llama-batched data...


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |          | 0/? [00:00<?, ?it/s]


Testing on openai data...


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |          | 0/? [00:00<?, ?it/s]


Results for model trained on openai:
--------------------------------------------------------------------------------
Test Model      Accuracy   Precision  Recall     F1         Loss      
--------------------------------------------------------------------------------
allam          0.8926    0.9825    0.7837    0.8659    0.4095    
jais-batched   0.9438    0.9837    0.8890    0.9324    0.1959    
llama-batched  0.7129    0.9730    0.4154    0.5720    1.0617    
openai         0.9809    0.9846    0.9669    0.9751    0.0664    
--------------------------------------------------------------------------------

Best model checkpoint saved at: outputs/Arabic/ArabicSocialMediaDataset/OpenaiAIDetector/checkpoints/best-checkpoint.ckpt
