# **Installing the required Packages**

In [None]:
!pip install transformers[torch]
!pip install accelerate -U
!pip install datasets
!pip install transformers
!pip install optuna
!pip install accelerate
!pip install transformers datasets


# **Importing the required Libraries and Tools**

In [None]:
import pandas as pd
import torch
import re
import matplotlib.pyplot as plt
import logging
import transformers
import optuna
import numpy as np
from sklearn.metrics import precision_recall_fscore_support,  classification_report, confusion_matrix, precision_score, recall_score, f1_score, accuracy_score
import seaborn as sns
from torch.utils.data import Subset


from transformers.trainer_utils import set_seed
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainerCallback, TrainingArguments, DataCollatorWithPadding, EarlyStoppingCallback
from datasets import Dataset
from google.colab import drive
from collections import Counter


## **Data Preparation for Modeling**

In [None]:
# Make the Google Drive contents available under /content/drive (remounting if already mounted)
drive.mount("/content/drive", force_remount=True)

# First Scenario: locations of the Train and Test CSV splits in Google Drive
train_data_path = '/content/drive/My Drive/#####'
test_data_path = '/content/drive/My Drive/#####'

# Load both CSV splits into DataFrames (train first, then test)
train_data, test_data = [pd.read_csv(csv_path) for csv_path in (train_data_path, test_data_path)]


In [None]:
# Second Scenario: locations of the Train and Test CSV splits in Google Drive
cx_train_data_path = '/content/drive/My Drive/#####'
cx_test_data_path = '/content/drive/My Drive/#####'

# Load both CSV splits into DataFrames (train first, then test)
cx_train_data, cx_test_data = [pd.read_csv(csv_path) for csv_path in (cx_train_data_path, cx_test_data_path)]



***Utilizing GPU Capabilities***

In [None]:

# Pick the compute device: the GPU when CUDA is available, otherwise the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("CUDA is available. Running on GPU." if device.type == "cuda" else "CUDA is not available. Running on CPU.")


***Adopting ARBERT Model***

In [None]:
# Defining tokenizer
# Loads the pretrained tokenizer published under the "UBC-NLP/ARBERT" model id.
# This single tokenizer object is reused by the review-length analysis, by
# `tokenize_function`, by the data collator and by every Trainer in this notebook.

ARBERT_tokenizer = AutoTokenizer.from_pretrained("UBC-NLP/ARBERT")



### **Preparing the data to apply the fine-tuning for general ARBERT**

In [None]:
# Give both First Scenario splits the column names used for modeling:
# 'Sector' becomes 'labels' and 'Processed_Content' becomes 'text' (renamed in place)
_first_scenario_column_aliases = {'Sector': 'labels', 'Processed_Content': 'text'}

for _split_frame in (train_data, test_data):
    _split_frame.rename(columns=_first_scenario_column_aliases, inplace=True)


In [None]:
# Give both Second Scenario splits the column names used for modeling:
# 'Sector' becomes 'labels' and 'Processed_Content' becomes 'text' (renamed in place)
_second_scenario_column_aliases = {'Sector': 'labels', 'Processed_Content': 'text'}

for _cx_split_frame in (cx_train_data, cx_test_data):
    _cx_split_frame.rename(columns=_second_scenario_column_aliases, inplace=True)

***Some EDA to understand the Token (Review) length distribution in the data which will affect the ARBERT max_length and padding properties***


In [None]:
# Visualizing the Distribution of Review Lengths to decide on the max_length

# Step 1: Calculate Review Lengths
# The train and test splits are combined, and each review is measured as the
# number of tokens the ARBERT tokenizer produces for it.
preprocessed_df = pd.concat([train_data, test_data], ignore_index=True)

review_lengths = [
    len(ARBERT_tokenizer.tokenize(review_text))
    for review_text in preprocessed_df['text']
]

# Step 2: Summary Statistics
mean_length = sum(review_lengths) / len(review_lengths)
# np.median averages the two middle values for an even-sized sample, whereas
# indexing the sorted list at len // 2 would return only the upper one.
median_length = float(np.median(review_lengths))
min_length = min(review_lengths)
max_length = max(review_lengths)

# Step 3: Visualize the Distribution
plt.figure(figsize=(9, 5))

ax = sns.histplot(review_lengths, bins=50, alpha=0.6, color='royalblue', edgecolor='black', kde=True)
ax.axvline(mean_length, color='k', linestyle='--')  # mean (black dashed line)
ax.axvline(median_length, color='b', linestyle='--')  # median (blue dashed line)

plt.title('Distribution of Review Lengths', fontsize=20, fontweight='bold')
plt.xlabel('Length of Review', fontsize=18)
plt.ylabel('Frequency', fontsize=18)
plt.grid(True)
plt.show()

print("Mean Review Length:", mean_length)
print("Median Review Length:", median_length)
print("Minimum Review Length:", min_length)
print("Maximum Review Length:", max_length)


***Applying Label Encoding***

In [None]:
# Label Encoding 'labels' column to be prepared for the modeling in the First Scenario

# A single encoder is fitted on the training labels and reused for the test labels,
# so every class name maps to the same integer id in both splits. Fitting a second
# encoder on the test split alone could assign different ids whenever the two
# splits do not contain exactly the same set of classes.
label_encoder_train = LabelEncoder()
label_encoder_test = label_encoder_train  # alias kept so references to this name keep working

# Fit on train only; the test labels are mapped with the already-fitted encoder.
# transform() raises ValueError if the test split holds a class never seen in training.
train_data['labels'] = label_encoder_train.fit_transform(train_data['labels'])
test_data['labels'] = label_encoder_test.transform(test_data['labels'])

# Saving the label map (class name -> integer id) for later use (e.g., interpreting model predictions)
label_map_train = dict(zip(label_encoder_train.classes_, label_encoder_train.transform(label_encoder_train.classes_)))
label_map_test = dict(label_map_train)  # identical by construction

# Ensure labels are integers
train_data['labels'] = train_data['labels'].astype(int)
test_data['labels'] = test_data['labels'].astype(int)


In [None]:
# Label Encoding 'labels' column to be prepared for the modeling in the Second Scenario

# A single encoder is fitted on the training labels and reused for the test labels,
# so every class name maps to the same integer id in both splits. Fitting a second
# encoder on the test split alone could assign different ids whenever the two
# splits do not contain exactly the same set of classes.
label_encoder_cx_train = LabelEncoder()
label_encoder_cx_test = label_encoder_cx_train  # alias kept so references to this name keep working

# Fit on train only; the test labels are mapped with the already-fitted encoder.
# transform() raises ValueError if the test split holds a class never seen in training.
cx_train_data['labels'] = label_encoder_cx_train.fit_transform(cx_train_data['labels'])
cx_test_data['labels'] = label_encoder_cx_test.transform(cx_test_data['labels'])

# Saving the label map (class name -> integer id) for later use (e.g., interpreting model predictions)
label_map_cx_train = dict(zip(label_encoder_cx_train.classes_, label_encoder_cx_train.transform(label_encoder_cx_train.classes_)))
label_map_cx_test = dict(label_map_cx_train)  # identical by construction

# Ensure labels are integers
cx_train_data['labels'] = cx_train_data['labels'].astype(int)
cx_test_data['labels'] = cx_test_data['labels'].astype(int)


***Preparing Train and Eval Datasets for performing Training, Validation, and Evaluation***

In [None]:
def tokenize_function(examples ):
    """Tokenize a batch of examples with the ARBERT tokenizer.

    Args:
        examples: mapping whose 'text' entry holds the batch of review strings.

    Returns:
        The output of `ARBERT_tokenizer.batch_encode_plus` for that batch: every
        sequence is padded or truncated to 512 tokens, special tokens are added,
        and the attention mask and special-tokens mask are included, all as
        PyTorch tensors.
    """
    encoding_options = dict(
        add_special_tokens=True,          # add the model's special tokens
        padding='max_length',             # pad every sequence up to max_length
        truncation=True,                  # cut sequences longer than max_length
        max_length=512,                   # fixed sequence length
        return_attention_mask=True,
        return_special_tokens_mask=True,
        return_tensors='pt',              # PyTorch tensors
    )
    return ARBERT_tokenizer.batch_encode_plus(examples['text'], **encoding_options)


def prepare_data(train_data_split, test_data_split, columns_to_remove=None):
  """Convert the train/test DataFrames to tokenized `Dataset` objects.

  Args:
    train_data_split: DataFrame with the training rows (needs the 'text' column
      read by `tokenize_function`).
    test_data_split: DataFrame with the evaluation rows (same columns).
    columns_to_remove: optional iterable of extra column names to drop from the
      tokenized datasets. Names that are not present in a dataset are ignored.
      Defaults to dropping nothing besides the pandas index column.

  Returns:
    (tokenized_train_dataset, tokenized_eval_dataset)
  """

  def _tokenize_split(data_split):
    # DataFrame -> Dataset -> tokenized Dataset
    dataset = Dataset.from_pandas(data_split)

    # '__index_level_0__' is the column Dataset.from_pandas adds for a
    # non-default pandas index; it is never needed for modeling.
    requested = list(columns_to_remove or []) + ['__index_level_0__']

    # Dataset.map raises when asked to remove a column that does not exist,
    # so only the requested columns actually present are passed on.
    removable = [name for name in dict.fromkeys(requested) if name in dataset.column_names]

    return dataset.map(tokenize_function, batched=True, remove_columns=removable)

  tokenized_train_dataset_mlm = _tokenize_split(train_data_split)
  tokenized_eval_dataset_mlm = _tokenize_split(test_data_split)

  return tokenized_train_dataset_mlm, tokenized_eval_dataset_mlm



In [None]:
# Data collator that batches tokenized examples for the Trainer.
# NOTE: with padding='max_length' and max_length=512 every batch is padded to a
# fixed length of 512 tokens, i.e. the padding is not dynamic (per-batch) here.
# Batches are returned as PyTorch tensors (return_tensors="pt").
data_collator = DataCollatorWithPadding(tokenizer = ARBERT_tokenizer, padding = 'max_length', max_length = 512, return_tensors="pt")

# **Modeling: Fine-tuning ARBERT to Classify Arabic Government Reviews into their Relevant Government Sector**

In [None]:
# Initializing logging
# The root logger is configured at INFO level; `logger` is the module-level
# logger that CustomCallback writes its progress messages to.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Progress-logging callback
class CustomCallback(transformers.TrainerCallback):
    """Trainer callback that logs a progress line every 100 global steps."""

    def on_step_end(self, args, state, control, **kwargs):
        """Log an INFO message when the global step is a multiple of 100."""
        # Nothing to report between the 100-step marks
        if state.global_step % 100 != 0:
            return
        logger.info(f"Step {state.global_step}: Continuing training...")


In [None]:
# Specify output directory in Google Drive for checkpoints
# This value is used as `output_dir` of the TrainingArguments built in
# `optuna_hyperparams_tuning_and_validation`.
# NOTE(review): '#####' looks like a redacted placeholder — replace with a real folder.
output_dir_checkpoints = "/content/drive/MyDrive/#####"



***Defining the Custom Trainer (Class-Weighted Loss), the Class Weights, and Early Stopping used for Training***

In [None]:

class CustomTrainer(Trainer):
    """Trainer that replaces the model loss with a class-weighted cross-entropy.

    Args:
        class_weights: 1-D float tensor with one weight per class; it is moved to
            the logits' device on every loss computation.
        *args, **kwargs: forwarded unchanged to `Trainer.__init__`.
    """

    def __init__(self, class_weights, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None, **kwargs):
        """Compute the class-weighted cross-entropy for one batch.

        `num_items_in_batch` (and any further keyword) is accepted because newer
        `transformers` releases pass it to `compute_loss`; without it the call
        fails with a TypeError. It is not needed for this loss and is ignored.

        Returns:
            `loss`, or `(loss, outputs)` when `return_outputs` is True.
        """
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        # The weight tensor must live on the same device as the logits
        loss_fct = torch.nn.CrossEntropyLoss(weight=self.class_weights.to(logits.device))
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss




In [None]:
def class_weights_calculator(tokenized_train_dataset_mlm, num_classes=None):
  """Compute normalized inverse-frequency class weights from a training set.

  Args:
    tokenized_train_dataset_mlm: object whose 'labels' entry is an iterable of
      non-negative integer class ids.
    num_classes: optional total number of classes. When given, the returned
      arrays have at least this many entries, so the weight vector can match
      the model's number of labels even if the highest class ids are absent.

  Returns:
    (class_counts, class_weights): `class_counts` is the NumPy array of samples
    per class id; `class_weights` is a float32 tensor proportional to
    1 / count and normalized to sum to 1. Classes with no samples get weight 0
    instead of an infinite weight.
  """
  # 'labels' is the column in the training set
  y_train = np.asarray(list(tokenized_train_dataset_mlm['labels']), dtype=np.int64)

  # Samples per class id (index = class id)
  class_counts = np.bincount(y_train, minlength=num_classes or 0)

  # Inverse frequency, computed only where the count is non-zero so an absent
  # class never causes a division by zero
  inverse_frequency = np.zeros(len(class_counts), dtype=np.float64)
  present = class_counts > 0
  inverse_frequency[present] = 1.0 / class_counts[present]

  # Normalize to sum to 1 (skipped when there are no samples at all)
  total = inverse_frequency.sum()
  if total > 0:
    inverse_frequency = inverse_frequency / total

  class_weights = torch.tensor(inverse_frequency, dtype=torch.float32)

  return class_counts, class_weights


In [None]:
class EarlyStoppingCallback(TrainerCallback):
    """Stop training once `eval_loss` has failed to improve several times in a row.

    NOTE: this class shadows the `EarlyStoppingCallback` imported from
    `transformers` at the top of the notebook.

    Args:
        early_stopping_patience: number of consecutive evaluations without a
            lower `eval_loss` after which training is stopped.
    """

    def __init__(self, early_stopping_patience=3):
        self.early_stopping_patience = early_stopping_patience
        self.best_metric = None
        self.patience_counter = 0

    def on_train_begin(self, args, state, control, **kwargs):
        """Reset the tracking state so a reused instance starts every run fresh."""
        self.best_metric = None
        self.patience_counter = 0

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        """Update the patience counter from `metrics['eval_loss']` and stop if exhausted."""
        if not state.is_world_process_zero:
            return

        current_metric = (metrics or {}).get("eval_loss")
        if current_metric is None:
            # No loss was reported for this evaluation, so there is nothing to compare
            return

        # A loss improves when it gets LOWER
        if self.best_metric is None or current_metric < self.best_metric:
            self.best_metric = current_metric
            self.patience_counter = 0
        else:
            self.patience_counter += 1

        if self.patience_counter >= self.early_stopping_patience:
            control.should_training_stop = True


***Using Optuna to find the Best Hyperparameters and provide Model Validation Results based on the chosen best ones. Optuna is used with 10-Fold Cross Validation***

***This step should be done for each Scenario independently, using its own data splits***


In [None]:
def optuna_hyperparams_tuning_and_validation(tokenized_train_dataset, search_space, data_collator, scenario):
  """Tune hyperparameters with Optuna, scoring each trial by 10-fold cross-validated macro F1.

  Args:
    tokenized_train_dataset: tokenized dataset with a 'labels' column that
      supports `.select(indices)`.
    search_space: dict keyed by parameter name. 'learning_rate' and 'batch_size'
      hold the candidate values passed to `trial.suggest_categorical`;
      'num_epochs' and 'warmup_steps' hold the arguments unpacked into
      `trial.suggest_int` (e.g. a (low, high) pair).
    data_collator: collator handed to every Trainer.
    scenario: 1 selects the "1st Experimental Scenario" wording of the printed
      summary; any other value selects the "2nd" wording.

  Returns:
    (best_trial, best_trial.number, best_trial.value, best_trial.params)
  """

  def compute_metrics(pred):
      # Macro-averaged F1 of the arg-max class predictions
      labels = pred.label_ids
      preds = pred.predictions.argmax(-1)
      f1 = f1_score(labels, preds, average='macro', zero_division=0)
      return {
          'f1': f1
      }


  def objective(trial):
      """Return the mean validation macro F1 over the 10 folds for one trial."""

      # Suggest hyperparameters
      learning_rate = trial.suggest_categorical('learning_rate', search_space['learning_rate'])
      batch_size = trial.suggest_categorical('batch_size', search_space['batch_size'])
      num_epochs = trial.suggest_int('num_epochs', *search_space['num_epochs'])
      warmup_steps = trial.suggest_int('warmup_steps', *search_space['warmup_steps'])

      # Define training arguments
      training_args = TrainingArguments(
          output_dir=output_dir_checkpoints,
          overwrite_output_dir=True,
          eval_strategy='steps',
          learning_rate=learning_rate,
          per_device_train_batch_size= batch_size,
          per_device_eval_batch_size= batch_size,
          num_train_epochs= num_epochs,
          warmup_steps=warmup_steps,
          weight_decay=0.01,
          logging_steps=20,
          logging_dir='./logs',
          save_steps=200,
          save_total_limit=2,
          gradient_accumulation_steps=1,
          fp16=True,
          eval_steps=100,
          load_best_model_at_end=True,
          metric_for_best_model="eval_f1",
          seed=1,
          lr_scheduler_type="linear",
          report_to="none",  # To disable logging to third-party services
      )

      # Stratified 10-fold split over the label column
      labels = tokenized_train_dataset['labels']
      skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)

      f1s = []

      for train_index, val_index in skf.split(np.zeros(len(labels)), labels):
          train_split = tokenized_train_dataset.select(train_index)
          val_split = tokenized_train_dataset.select(val_index)

          # Print the per-class counts of each split to check that the folds stay proportional
          train_labels, train_counts = np.unique(train_split['labels'], return_counts=True)
          val_labels, val_counts = np.unique(val_split['labels'], return_counts=True)

          print(f"Train labels distribution: {dict(zip(train_labels, train_counts))}")
          print(f"Val labels distribution: {dict(zip(val_labels, val_counts))}")

          # Initializing the model inside the loop to reset it for each fold
          model = AutoModelForSequenceClassification.from_pretrained('UBC-NLP/ARBERT', num_labels=10)  # Path to the general ARBERT model

          # A fresh early-stopping callback per fold: the callback keeps its best
          # metric and patience counter on the instance, so sharing one object
          # would carry that state from one fold (and trial) into the next.
          early_stopping_callback = EarlyStoppingCallback(early_stopping_patience=3)  # Adjust patience as needed

          # Defining the Trainer
          trainer = CustomTrainer(
              class_weights = class_weights_calculator(train_split)[1], # extracting the class weights from the function
              model=model,
              args=training_args,
              train_dataset=train_split,
              eval_dataset=val_split,
              data_collator=data_collator,
              tokenizer=ARBERT_tokenizer,  # general ARBERT tokenizer previously initialized
              compute_metrics=compute_metrics,
              callbacks=[early_stopping_callback]  # Add early stopping callback
          )

          # Train the model
          trainer.train()

          # Evaluate the model on this fold's validation split
          eval_result = trainer.evaluate(eval_dataset=val_split)
          f1s.append(eval_result['eval_f1'])

      mean_f1 = sum(f1s) / len(f1s)

      return mean_f1

  # Create the Optuna study and optimize.
  # The sampler is seeded (same seed value as TrainingArguments and StratifiedKFold
  # above) so the sequence of suggested hyperparameters is reproducible.
  study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=1))
  study.optimize(objective, n_trials=30)  # Run 30 trials

  best_trial_header = 'Best trial for the 1st Experimental Scenario:\n:' if scenario == 1 else 'Best trial for the 2nd Experimental Scenario:\n:'
  print(best_trial_header, study.best_trial)

  return study.best_trial, study.best_trial.number, study.best_trial.value, study.best_trial.params


***Validating the ARBERT model again on a separate Validation set in order to assess the model's training behavior by generating the Training and Validation Losses as well as the Validation F1 Scores across Steps***

***This step should be done for each Scenario independently, using its own data splits***

In [None]:
def validate_on_a_separate_validation_set(tokenized_train_dataset, params, data_collator, output_dir_checkpoints, scenario):
  """Train on a stratified 80% of the data and validate on the remaining 20%.

  Args:
    tokenized_train_dataset: tokenized dataset with a 'labels' column that
      supports `.select(indices)`.
    params: dict with the keys 'learning_rate', 'batch_size', 'num_epochs' and
      'warmup_steps' (e.g. the best parameters found by the Optuna search).
    data_collator: collator handed to the Trainer.
    output_dir_checkpoints: directory used as the Trainer `output_dir`.
    scenario: 1 selects the "1st Experimental Scenario" wording of the printed
      summary; any other value selects the "2nd" wording.

  Returns:
    The metrics dict returned by `trainer.evaluate` on the validation subset.
  """
  dataset = tokenized_train_dataset

  # Extract labels for stratified sampling
  labels = dataset['labels']

  # Split the dataset indices into training and validation sets using stratified sampling
  train_idx, val_idx = train_test_split(
      np.arange(len(labels)),  # Indices of the dataset
      test_size=0.2,  # 20% for validation
      stratify=labels,  # Stratify based on labels
      random_state=2
  )

  # Create train and validation subsets with `.select`, as in the cross-validation
  # code, so the subsets still support column access such as subset['labels']
  # (needed by class_weights_calculator). A torch `Subset` wrapper does not.
  train_subset = dataset.select(train_idx)
  val_subset = dataset.select(val_idx)


  def compute_metrics(p):
      # Macro-averaged F1 of the arg-max class predictions
      predictions, true_labels = p
      predictions = np.argmax(predictions, axis=1)

      _, _, f1_macro, _ = precision_recall_fscore_support(true_labels, predictions, average='macro', zero_division=0)

      return {
          'f1_macro': f1_macro
      }

  final_output_dir = output_dir_checkpoints


  # Training arguments built from the supplied hyperparameters
  training_args = TrainingArguments(
      output_dir=final_output_dir,
      overwrite_output_dir=True,
      eval_strategy='steps',
      learning_rate= params['learning_rate'],
      per_device_train_batch_size= params['batch_size'],
      per_device_eval_batch_size= params['batch_size'],
      num_train_epochs= params['num_epochs'],
      weight_decay=0.01,
      warmup_steps= params['warmup_steps'],
      gradient_accumulation_steps=1,
      logging_steps=50,
      logging_dir='./logs',
      save_steps=200,
      save_total_limit=2,
      fp16=True,
      eval_steps=50,
      load_best_model_at_end=True,
      metric_for_best_model = "f1_macro",
      seed=1,
      lr_scheduler_type="linear",
      report_to="none"
  )


  early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

  model = AutoModelForSequenceClassification.from_pretrained('UBC-NLP/ARBERT', num_labels=10)

  # Apply .contiguous() before saving or checkpointing
  for param in model.parameters():
      param.data = param.data.contiguous()

  # Save the freshly loaded (not yet fine-tuned) model to the output directory
  model.save_pretrained(final_output_dir)


  trainer = CustomTrainer(
          class_weights=class_weights_calculator(train_subset)[1],
          model=model,
          args=training_args,
          train_dataset=train_subset,
          eval_dataset=val_subset,
          data_collator=data_collator,
          tokenizer=ARBERT_tokenizer,
          compute_metrics=compute_metrics,
          callbacks=[early_stopping]
      )

  # Train the model on the training subset
  trainer.train()

  # Evaluate the model on the held-out validation subset
  eval_result = trainer.evaluate(eval_dataset=val_subset)


  # Print the validation results
  print("Final Validation Results for the 1st Experimental Scenario:" if scenario == 1 else "Final Validation Results for the 2nd Experimental Scenario:")
  print("--------------------------------\n")
  for metric, value in eval_result.items():
      print(f"{metric}: {value}")
      print("--------------------------------")

  return eval_result


In [None]:
def plot_training_and_validation_curves(logs, scenario):
  """Draw the loss curves and the validation F1 curve recorded during training.

  Args:
    logs: dict of equally indexed sequences with the keys
      "steps"      - the logged step numbers,
      "train_loss" - the logged training loss values,
      "val_loss"   - the logged validation loss values,
      "val_f1"     - the logged validation F1 score values.
    scenario: 1 selects the "1st Experimental Scenario" titles; any other
      value selects the "2nd Experimental Scenario" titles.
  """
  is_first_scenario = scenario == 1

  if is_first_scenario:
    loss_title = "General ARBERT Training and Validation Loss Over Steps (1st Experimental Scenario)"
    f1_title = "General ARBERT Validation F1 Macro Over Steps (1st Experimental Scenario)"
  else:
    loss_title = "General ARBERT Training and Validation Loss Over Steps (2nd Experimental Scenario)"
    f1_title = "General ARBERT Validation F1 Macro Over Steps (2nd Experimental Scenario)"

  # Figure 1: training loss and validation loss against the step number
  plt.figure(figsize=(10, 5))
  for series_key, series_label in (("train_loss", "Training Loss"), ("val_loss", "Validation Loss")):
    plt.plot(logs["steps"], logs[series_key], label=series_label, marker='o')
  plt.xlabel("Steps", fontsize = 14)
  plt.ylabel("Loss", fontsize = 14)
  plt.title(loss_title, fontsize = 16)
  plt.legend()
  plt.grid(True)
  plt.show()

  # Figure 2: validation macro F1 against the step number
  plt.figure(figsize=(10, 5))
  plt.plot(logs["steps"], logs["val_f1"], label="Validation F1 Macro", color="green", marker='o')
  plt.xlabel("Steps",  fontsize = 14)
  plt.ylabel("F1 Score", fontsize = 14)
  plt.title(f1_title, fontsize = 16)
  plt.legend()
  plt.grid(True)
  plt.show()


# **Evaluation**

***This step should be done for each Scenario independently, using its own data splits***

In [None]:
# Hand-written mapping from each Arabic sector name to its English display name.
# NOTE(review): the entry order presumably mirrors the integer ids assigned by the
# label encoder (sorted class names) — confirm before using it for axis labels.
translations = {
    'الاتصالات': 'Communication',
    'البنوك': 'Banking',
    'البيئة': 'Environment',
    'التعليم': 'Education',
    'التموين': 'Supply',
    'الزراعة': 'Agriculture',
    'الصحة': 'Healthcare',
    'القضاء': 'Judiciary',
    'الكهرباء': 'Electricity',
    'المياه والصرف الصحي': 'Water and Sanitation',
}

# English names only, in the dictionary's insertion order
label_names = list(translations.values())


In [None]:
# Google Drive directory for the final training run.
# NOTE(review): presumably the value passed as `final_output_dir` to `evaluate`;
# '#####' looks like a redacted placeholder — replace with a real folder.
final_output_dir = "/content/drive/MyDrive/#####"

def evaluate(tokenized_train_dataset, tokenized_test_dataset, params, label_names, data_collator, final_output_dir, scenario):
  """Train the final model on the full training set and evaluate it on the test set.

  Args:
    tokenized_train_dataset: tokenized training dataset with a 'labels' column.
    tokenized_test_dataset: tokenized test dataset used only for the final evaluation.
    params: dict with the keys 'learning_rate', 'batch_size', 'num_epochs' and
      'warmup_steps' (e.g. the best parameters found by the Optuna search).
    label_names: class display names; entry i is used as the name of class id i.
    data_collator: collator handed to the Trainer.
    final_output_dir: directory for checkpoints and for the saved final model.
    scenario: 1 selects the "1st Experimental Scenario" wording of the printed
      summary; any other value selects the "2nd" wording.

  Returns:
    The metrics dict returned by `trainer.evaluate`, which includes (with the
    'eval_' prefix) the classification report, the confusion matrix and the
    macro / weighted F1 scores.
  """

  # Class ids 0..len(label_names)-1, passed explicitly to the report and the
  # confusion matrix so they keep one row per class even when a class is absent
  # from the evaluated data (otherwise `target_names` would not match and raise).
  # NOTE(review): assumes the encoded labels are exactly these ids — confirm.
  class_ids = list(range(len(label_names)))

  def compute_metrics(p):
      predictions, true_labels = p
      predictions = np.argmax(predictions, axis=1)

      _, _, f1_weighted, _ = precision_recall_fscore_support(true_labels, predictions, average='weighted', zero_division=0)
      _, _, f1_macro, _ = precision_recall_fscore_support(true_labels, predictions, average='macro', zero_division=0)

      report = classification_report(true_labels, predictions, labels=class_ids, target_names=label_names, output_dict=True, zero_division=0)

      # Compute the confusion matrix (rows = true class, columns = predicted class)
      conf_matrix = confusion_matrix(true_labels, predictions, labels=class_ids)

      return {
          'classification_report': report,
          'conf_matrix': conf_matrix,
          'f1_macro': f1_macro,
          'f1_weighted': f1_weighted
              }

  # Final training with the best hyperparameters
  training_args = TrainingArguments(
      output_dir=final_output_dir,
      overwrite_output_dir=True,
      eval_strategy='no',
      learning_rate= params['learning_rate'],
      per_device_train_batch_size= params['batch_size'],
      per_device_eval_batch_size= params['batch_size'],
      num_train_epochs= params['num_epochs'],
      weight_decay=0.01,
      warmup_steps= params['warmup_steps'],
      gradient_accumulation_steps=1,
      logging_steps=50,
      logging_dir='./logs',
      save_strategy='steps',
      save_steps=200,
      save_total_limit=2,
      fp16=True,
      seed=1,
      lr_scheduler_type="linear",
      report_to="none",
  )

  model = AutoModelForSequenceClassification.from_pretrained('UBC-NLP/ARBERT', num_labels=10)

  # Apply .contiguous() before saving or checkpointing
  for param in model.parameters():
      param.data = param.data.contiguous()


  # Save the freshly loaded (not yet fine-tuned) model to the output directory
  model.save_pretrained(final_output_dir)

  # With eval_strategy='no' no evaluation runs during training, so this callback
  # is only invoked by the final trainer.evaluate() call below.
  early_stopping = EarlyStoppingCallback(early_stopping_patience=3)
  trainer = CustomTrainer(
      class_weights=class_weights_calculator(tokenized_train_dataset)[1],
      model=model,
      args=training_args,
      train_dataset= tokenized_train_dataset ,  # Use the entire dataset (Training + Validation) for final training
      data_collator=data_collator,
      tokenizer=ARBERT_tokenizer,
      compute_metrics=compute_metrics,

      # Early stopping callback
      callbacks=[early_stopping]

  )

  trainer.train()

  # Persist the fine-tuned model, replacing the untrained copy written above, so
  # the directory holds the trained weights and not only intermediate checkpoints.
  trainer.save_model(final_output_dir)


  # Evaluate the final model
  final_eval_result = trainer.evaluate(eval_dataset=tokenized_test_dataset)

  print('Final Evaluation Results of the 1st Experimental Scenario:\n' if scenario == 1 else 'Final Evaluation Results of the 2nd Experimental Scenario:\n')


  for metric, value in final_eval_result.items():
    if metric == 'eval_f1_macro':
          print(f"F1 Score (Macro): {value}\n")
    elif metric == 'eval_f1_weighted':
          print(f"F1 Score (Weighted): {value}\n")
    else:
      print(f"{metric}: {value}\n")

  return final_eval_result


***Checking the Classification Report***

In [None]:
def classification_report_heatmap(eval_classification_report, scenario):
  """Show a classification report as a heatmap and print it as a table.

  Args:
    eval_classification_report: classification report in dict form (as produced
      with `output_dict=True`), e.g. the one returned by the evaluate function.
    scenario: 1 selects the "1st Experimental Scenario" title; any other value
      selects the "2nd Experimental Scenario" title.

  Returns:
    The report as a DataFrame with one row per report entry and one column per metric.
  """
  # Convert the report to a DataFrame (rows = classes / averages, columns = metrics)
  df_report = pd.DataFrame(eval_classification_report).transpose()

  # Only the score columns are coloured. 'support' holds sample counts on a
  # different scale, so it is dropped by name (not by position) for the heatmap.
  score_columns = df_report.drop(columns='support', errors='ignore')

  # Display heatmap
  plt.figure(figsize=(4 ,6))
  sns.heatmap(score_columns, annot=True, cmap="vlag_r" ,fmt=".4f", cbar=True, linewidths=0.5)

  # Move the x-axis labels (precision, recall, f1-score) to the top
  plt.tick_params(top=True, bottom=False, labeltop=True, labelbottom=False)

  plt.title('General ARBERT Classification Report Heatmap (1st Experimental Scenario)\n' if scenario == 1 else 'General ARBERT Classification Report Heatmap (2nd Experimental Scenario)\n', fontsize=12)
  plt.show()

  # Print with 4 decimals; the format is applied only inside this block so the
  # notebook-wide pandas display options are left untouched.
  with pd.option_context('display.float_format', '{:.4f}'.format):
    print("\n", df_report)

  return df_report


***Checking the Confusion Matrix***

In [None]:
def confusion_matrix_heatmap(eval_conf_matrix, label_names, scenario):
  """Render a confusion matrix as an annotated heatmap and hand the matrix back.

  Args:
    eval_conf_matrix: the confusion matrix to draw (e.g. taken from the evaluate function).
    label_names: tick labels used on both the x and the y axis.
    scenario: 1 selects the "1st Experimental Scenario" title; any other value
      selects the "2nd Experimental Scenario" title.

  Returns:
    `eval_conf_matrix`, unchanged.
  """
  if scenario == 1:
    heatmap_title = 'General ARBERT Confusion Matrix Heatmap (1st Experimental Scenario)\n'
  else:
    heatmap_title = 'General ARBERT Confusion Matrix Heatmap (2nd Experimental Scenario)\n'

  plt.figure(figsize=(7, 5))

  # Annotated heatmap on the "Blues" colour map, with the class names on both axes
  heatmap_options = dict(annot=True, cmap="Blues", fmt="g", xticklabels=label_names, yticklabels=label_names)
  sns.heatmap(eval_conf_matrix, **heatmap_options)

  # Axis labels and title
  plt.xlabel('Predicted Labels', fontsize=12)
  plt.ylabel('True Labels', fontsize=12)
  plt.title(heatmap_title, fontsize=14)
  plt.show()

  return eval_conf_matrix