In [1]:
import os

# CUDA-related environment switches, applied in one mapping update.
# They are set in the first cell, before any other library is imported
# (presumably so the CUDA runtime / PyTorch see them at initialisation — confirm).
os.environ.update(
    {
        "CUDA_VISIBLE_DEVICES": "0",
        "CUDA_LAUNCH_BLOCKING": "1",
        "TORCH_USE_CUDA_DSA": "1",
    }
)


In [2]:
import logging
# Raise every already-registered logger whose name mentions "transformers"
# to ERROR level.
# NOTE(review): only loggers present in the logging manager at the moment this
# cell runs are affected; no earlier cell visible here imports transformers, so
# on a fresh kernel this may match nothing — confirm.
loggers = list(map(logging.getLogger, logging.root.manager.loggerDict))
for logger in loggers:
    if "transformers" not in logger.name.lower():
        continue
    logger.setLevel(logging.ERROR)

In [3]:
from models.data import ArabicAbstractsDataModule

In [4]:
# Build the Arabic abstracts data module with its default arguments and run its
# setup step.
# Alternative data source, currently disabled: HC3TextDataModule()
# NOTE(review): `data_module` is not referenced by any later cell visible here;
# CrossModelExperiment constructs its own ArabicAbstractsDataModule instances.
data_module = ArabicAbstractsDataModule()

data_module.setup()

In [5]:
# Detector model and PyTorch Lightning training utilities (swap the model import to try a different detector)

from models.models import LitXLMRobertaModel
import pytorch_lightning as pl
from pytorch_lightning.callbacks import (
    ModelCheckpoint,
    EarlyStopping,
)


In [10]:
class CrossModelExperiment:
    def __init__(self, max_epochs=100, fit_model=True):
        self.max_epochs = max_epochs
        self.fit_model = fit_model
        self.results = {}
        self.chcekpoints_path = "trained_detectors/Arabic/ArabicAbstractsDataset/{train_model}AIDetector/checkpoints"
        
    def _get_callbacks(self, train_model):
        early_stopping = EarlyStopping(
            monitor="val_loss",
            min_delta=0.0,
            patience=3,
            verbose=True,
            mode="min",
        )
        
        checkpoint = ModelCheckpoint(
            monitor="val_loss",
            dirpath=self.chcekpoints_path.format(train_model=train_model.title()),
            filename="best-checkpoint",
            save_top_k=1,
            mode="min",
        )
        
        return [early_stopping, checkpoint]
    
    def _test_on_model(self, trainer, model, test_model, train_model):
        # Load the best checkpoint before testing
        checkpoint_path = self.chcekpoints_path.format(train_model=train_model.title())
        checkpoint_path +='/best-checkpoint.ckpt'
        model = LitXLMRobertaModel.load_from_checkpoint(checkpoint_path)
        
        test_datamodule = ArabicAbstractsDataModule(models=[test_model])
        test_datamodule.setup()
        
        results = trainer.test(model, test_datamodule.test_dataloader())[0]
        return {
            'accuracy': results['test_acc'],
            'precision': results['test_precision'],
            'recall': results['test_recall'],
            'f1': results['test_f1'],
            'loss': results['test_loss']
        }
    
    def run_experiment(self, train_model, test_models):        
        # Initialize components
        model = LitXLMRobertaModel()
        train_datamodule = ArabicAbstractsDataModule(models=[train_model])
        trainer = pl.Trainer(
            devices=1,
            max_epochs=self.max_epochs,
            accelerator="auto",
            val_check_interval=0.25,
            check_val_every_n_epoch=1,
            callbacks=self._get_callbacks(train_model),
        )
        
        # Train the model
        if self.fit_model:
            print(f"\nTraining on {train_model} data...")
            trainer.fit(model, train_datamodule)
        
        # Test on all specified models
        results = {}
        for test_model in test_models:
            print(f"\nTesting on {test_model} data...")
            results[test_model] = self._test_on_model(trainer, model, test_model, train_model)
        
        # Store results
        self.results[train_model] = results
        
        # Display results
        self._display_results(train_model, results)
        
        # Print checkpoint location
        checkpoint_dir = self.chcekpoints_path.format(train_model=train_model.title())
        print(f"\nBest model checkpoint saved at: {checkpoint_dir}/best-checkpoint.ckpt")
        
        return results
    
    def _display_results(self, train_model, results):
        print(f"\nResults for model trained on {train_model}:")
        print("-" * 80)
        print(f"{'Test Model':<15} {'Accuracy':<10} {'Precision':<10} {'Recall':<10} {'F1':<10} {'Loss':<10}")
        
        print("-" * 80)
        for test_model, metrics in results.items():
            print(
                f"{test_model:<15}"
                f"{metrics['accuracy']:<10.4f}"
                f"{metrics['precision']:<10.4f}"
                f"{metrics['recall']:<10.4f}"
                f"{metrics['f1']:<10.4f}"
                f"{metrics['loss']:<10.4f}"
            )
        print("-" * 80)

In [None]:
# Generators whose texts the detectors are trained and evaluated on.
available_models = [
    "allam",
    "jais-batched",
    "llama-batched",
    "openai",
]
# fit_model=False: skip fitting and evaluate checkpoints that already exist on disk.
experiment = CrossModelExperiment(fit_model=False)

# One results table per training generator, keyed by that generator's name.
all_results = {}
for train_model in available_models:
    print(f"\n{'='*50}", f"Training on {train_model}", f"{'='*50}", sep="\n")
    results = all_results[train_model] = experiment.run_experiment(
        train_model, available_models
    )


Training on allam


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs



Testing on allam data...


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
/home/majed_alshaibani/Projects/ai-content-detection-dataset/venv/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=254` in the `DataLoader` to improve performance.


Testing: |          | 0/? [00:00<?, ?it/s]


Testing on jais-batched data...


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |          | 0/? [00:00<?, ?it/s]


Testing on llama-batched data...


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |          | 0/? [00:00<?, ?it/s]


Testing on openai data...


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |          | 0/? [00:00<?, ?it/s]


Results for model trained on allam:
--------------------------------------------------------------------------------
Test Model      Accuracy   Precision  Recall     F1         Loss      
--------------------------------------------------------------------------------
allam          0.9994    1.0000    0.9992    0.9996    0.0038    
jais-batched   0.9455    1.0000    0.9266    0.9611    0.1336    
llama-batched  0.9051    1.0000    0.8726    0.9307    0.2955    
openai         0.9619    1.0000    0.9480    0.9728    0.0845    
--------------------------------------------------------------------------------

Best model checkpoint saved at: outputs/Arabic/ArabicAbstractsDataset/AllamAIDetector/checkpoints/best-checkpoint.ckpt

Training on jais-batched


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs



Testing on allam data...


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |          | 0/? [00:00<?, ?it/s]


Testing on jais-batched data...


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |          | 0/? [00:00<?, ?it/s]


Testing on llama-batched data...


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |          | 0/? [00:00<?, ?it/s]


Testing on openai data...


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |          | 0/? [00:00<?, ?it/s]


Results for model trained on jais-batched:
--------------------------------------------------------------------------------
Test Model      Accuracy   Precision  Recall     F1         Loss      
--------------------------------------------------------------------------------
allam          0.9988    0.9983    1.0000    0.9991    0.0132    
jais-batched   0.9953    0.9983    0.9951    0.9966    0.0220    
llama-batched  0.9766    0.9983    0.9702    0.9838    0.1022    
openai         0.9988    0.9983    1.0000    0.9991    0.0171    
--------------------------------------------------------------------------------

Best model checkpoint saved at: outputs/Arabic/ArabicAbstractsDataset/Jais-BatchedAIDetector/checkpoints/best-checkpoint.ckpt

Training on llama-batched


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs



Testing on allam data...


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |          | 0/? [00:00<?, ?it/s]


Testing on jais-batched data...


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |          | 0/? [00:00<?, ?it/s]


Testing on llama-batched data...


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |          | 0/? [00:00<?, ?it/s]


Testing on openai data...


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |          | 0/? [00:00<?, ?it/s]


Results for model trained on llama-batched:
--------------------------------------------------------------------------------
Test Model      Accuracy   Precision  Recall     F1         Loss      
--------------------------------------------------------------------------------
allam          0.9982    0.9983    0.9991    0.9987    0.0235    
jais-batched   0.9900    0.9983    0.9881    0.9930    0.0469    
llama-batched  0.9936    0.9983    0.9929    0.9955    0.0427    
openai         0.9977    0.9983    0.9984    0.9983    0.0243    
--------------------------------------------------------------------------------

Best model checkpoint saved at: outputs/Arabic/ArabicAbstractsDataset/Llama-BatchedAIDetector/checkpoints/best-checkpoint.ckpt

Training on openai


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs



Testing on allam data...


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |          | 0/? [00:00<?, ?it/s]


Testing on jais-batched data...


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |          | 0/? [00:00<?, ?it/s]


Testing on llama-batched data...


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |          | 0/? [00:00<?, ?it/s]


Testing on openai data...


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |          | 0/? [00:00<?, ?it/s]


Results for model trained on openai:
--------------------------------------------------------------------------------
Test Model      Accuracy   Precision  Recall     F1         Loss      
--------------------------------------------------------------------------------
allam          0.9156    1.0000    0.8859    0.9380    0.2170    
jais-batched   0.8266    1.0000    0.7657    0.8639    0.7027    
llama-batched  0.9344    1.0000    0.9114    0.9528    0.2986    
openai         0.9994    1.0000    0.9992    0.9996    0.0026    
--------------------------------------------------------------------------------

Best model checkpoint saved at: outputs/Arabic/ArabicAbstractsDataset/OpenaiAIDetector/checkpoints/best-checkpoint.ckpt


: 