# **Installing the required Packages**

In [None]:
!pip install transformers[torch]
!pip install accelerate -U
!pip install datasets
!pip install transformers



# **Importing the Required Libraries and Tools**

In [None]:
import pandas as pd
import torch
import re
import matplotlib.pyplot as plt
import seaborn as sns
import logging
import transformers



from transformers import EarlyStoppingCallback, AutoTokenizer, Trainer, TrainingArguments, DataCollatorWithPadding, DataCollatorForLanguageModeling, AutoModelForMaskedLM

from datasets import Dataset
from google.colab import drive
from collections import Counter
from sklearn.model_selection import train_test_split


## **Data Preparation for Modeling**

In [None]:

# Mount Google Drive at /content/drive; force_remount=True requests a fresh mount on every run
drive.mount("/content/drive", force_remount=True)

# Path of the preprocessed CSV inside Google Drive (placeholder: replace it with the actual file path)
drive_file_path = "/content/drive/MyDrive/######"

# Read the CSV file into a DataFrame
preprocessed_df = pd.read_csv(drive_file_path)

# Bare last expression so the notebook renders the DataFrame as the cell output
preprocessed_df

***Utilizing GPU Capabilities***

In [None]:
import torch

# Pick the compute device: the GPU when CUDA is usable, otherwise the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Report which device was selected
if device.type == "cuda":
    print("CUDA is available. Running on GPU.")
else:
    print("CUDA is not available. Running on CPU.")


## **Preparing the Data for Further Pretraining of ARBERT**

In [None]:

# Rename the raw columns to the lower-case names used by the rest of the notebook.
# The renamed frame is rebound to the same name instead of mutating with inplace=True;
# columns missing from the frame are ignored by `rename`.
preprocessed_df = preprocessed_df.rename(
    columns={'Sector': 'sector', 'Keyword': 'keyword', 'Processed_Content': 'text'}
)



***Preparing Training and Validation Dataset***

In [None]:
# Creating a combined "<sector>_<keyword>" column to be used as the stratification key when creating the validation dataset
preprocessed_df['stratify_col'] = preprocessed_df['sector'] + '_' + preprocessed_df['keyword']

# Loading the pretrained ARBERT checkpoint with a masked-language-modelling head
ARBERT_model = AutoModelForMaskedLM.from_pretrained("UBC-NLP/ARBERT")
# Loading the tokenizer of the same ARBERT checkpoint
ARBERT_tokenizer = AutoTokenizer.from_pretrained("UBC-NLP/ARBERT")



***EDA: examining the distribution of article lengths (in tokens), which informs the choice of the tokenizer's `max_length`***

In [None]:
# Visualizing the distribution of article lengths (in tokens) to decide on max_length

# Step 1: Tokenize every article and record its length in tokens
article_lengths = [
    len(ARBERT_tokenizer.tokenize(article_text))
    for article_text in preprocessed_df['text']
]

# Fail with a clear message instead of a ZeroDivisionError when there is no data
if not article_lengths:
    raise ValueError("preprocessed_df['text'] is empty; there are no article lengths to summarise.")

# Step 2: Summary statistics
# pandas computes the true median (average of the two middle values for an even
# number of articles) rather than picking the upper-middle element.
length_series = pd.Series(article_lengths)
mean_length = float(length_series.mean())
median_length = float(length_series.median())
min_length = int(length_series.min())
max_length = int(length_series.max())

# Step 3: Visualize the distribution
fig, ax = plt.subplots(figsize=(9, 5))

sns.histplot(article_lengths, bins=50, alpha=0.6, color='royalblue', edgecolor='black', kde=True, ax=ax)
ax.axvline(mean_length, color='k', linestyle='--', label='Mean')
ax.axvline(median_length, color='b', linestyle='--', label='Median')

ax.set_title('Distribution of Article Lengths', fontsize=20, fontweight='bold')
ax.set_xlabel('Length of Article', fontsize=18)
ax.set_ylabel('Frequency', fontsize=18)
ax.grid(True)
ax.legend()  # identifies the dashed mean / median lines
plt.show()

print("Mean Article Length:", mean_length)
print("Median Article Length:", median_length)
print("Minimum Article Length:", min_length)
print("Maximum Article Length:", max_length)


In [None]:
def tokenize_function(examples, max_length=400):
    """Tokenize a batch of articles into fixed-length ARBERT inputs.

    Parameters
    ----------
    examples : mapping
        Batch containing a 'text' entry with the article strings.
    max_length : int, default 400
        Length every sequence is padded or truncated to. The default of 400
        was chosen from the article-length distribution examined earlier.

    Returns
    -------
    The tokenizer's batch encoding as PyTorch tensors, including the
    attention mask and the special-tokens mask.
    """
    # Calling the tokenizer directly is the current API for encoding a batch
    # (it replaces the legacy `batch_encode_plus` entry point).
    return ARBERT_tokenizer(
        examples['text'],
        add_special_tokens=True,          # Add the model's special tokens
        padding='max_length',             # Pad every sequence up to max_length
        truncation=True,                  # Cut sequences longer than max_length
        max_length=max_length,
        return_attention_mask=True,
        return_special_tokens_mask=True,
        return_tensors='pt',              # Return PyTorch tensors
    )


def _tokenize_frame(frame):
    """Turn one DataFrame split into a tokenized Dataset holding only the tokenizer outputs."""
    # preserve_index=False keeps the pandas index from being stored as an
    # extra '__index_level_0__' column.
    dataset = Dataset.from_pandas(frame, preserve_index=False)
    # Remove every original column (text, sector, keyword, stratification
    # helper, ...) so that only the fields returned by tokenize_function remain.
    return dataset.map(tokenize_function, batched=True, remove_columns=dataset.column_names)


def prepare_data(data, test_size, stratify_column):
    """Split `data` into stratified train/validation parts and tokenize both.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a 'text' column (read by tokenize_function).
    test_size : float or int
        Size of the validation part, passed to `train_test_split`
        (e.g. 0.1 gives 90% for training and 10% for validation).
    stratify_column : str, array-like or None
        Either the name of a column of `data` or the stratification labels
        themselves (e.g. ``data['stratify_col']``). ``None`` disables
        stratification.

    Returns
    -------
    tuple
        ``(tokenized_train_dataset, tokenized_eval_dataset)``.
    """
    # Accept a column name as well as the label values themselves
    if isinstance(stratify_column, str):
        stratify_values = data[stratify_column]
    else:
        stratify_values = stratify_column

    # Stratified split so each category is proportionally represented in both parts
    train_df_mlm, eval_df_mlm = train_test_split(
        data, test_size=test_size, stratify=stratify_values, random_state=1
    )

    # The stratification helper column needs no explicit drop here: it is
    # removed with all other original columns during tokenization.
    tokenized_train_dataset_final_mlm = _tokenize_frame(train_df_mlm)
    tokenized_eval_dataset_mlm = _tokenize_frame(eval_df_mlm)

    return tokenized_train_dataset_final_mlm, tokenized_eval_dataset_mlm



In [None]:
# Defining the data collator for MLM: built on the ARBERT tokenizer, with masking enabled (mlm=True),
# a masking probability of 0.15, and batches returned as PyTorch tensors
data_collator_mlm = DataCollatorForLanguageModeling(tokenizer=ARBERT_tokenizer, mlm=True, mlm_probability=0.15,  return_tensors='pt')


# **Modeling: Further Pretraining ARBERT for MLM**

In [None]:
# Configure logging at INFO level and create this notebook's logger
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class CustomCallback(transformers.TrainerCallback):
    """Trainer callback that writes a progress message to the log every 100 steps."""

    def on_step_end(self, args, state, control, **kwargs):
        """Log a progress line when the global step is a multiple of 100."""
        is_reporting_step = state.global_step % 100 == 0
        if not is_reporting_step:
            return
        logger.info(f"Step {state.global_step}: Continuing training...")


In [None]:
# Output directory in Google Drive for the checkpoints of the training run with validation
# (placeholder: replace with the actual folder)
output_dir_checkpoints = "/content/drive/MyDrive/#########"

# Output directory in Google Drive for the checkpoints of the final training run without validation
# (placeholder: replace with the actual folder)
final_output_dir_checkpoints_DAPT = "/content/drive/MyDrive/#########"

In [None]:
from transformers import TrainerCallback
class EarlyStoppingCallback(TrainerCallback):
    """Stop training once the validation loss has stopped improving.

    NOTE: this class shadows the `EarlyStoppingCallback` imported from
    `transformers` at the top of the notebook; after this cell runs, the name
    refers to this implementation.

    Parameters
    ----------
    early_stopping_patience : int, default 3
        Number of consecutive evaluations without a lower `eval_loss` after
        which training is asked to stop.
    """

    def __init__(self, early_stopping_patience=3):
        self.early_stopping_patience = early_stopping_patience
        self.best_metric = None      # lowest eval_loss seen so far
        self.patience_counter = 0    # evaluations since the last improvement

    def on_train_begin(self, args, state, control, **kwargs):
        """Reset the tracking state so an instance can be reused across runs."""
        self.best_metric = None
        self.patience_counter = 0

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        """Update the patience counter from `metrics["eval_loss"]` and stop when it runs out."""
        if not state.is_world_process_zero:
            return

        current_metric = (metrics or {}).get("eval_loss")
        if current_metric is None:
            # Without a validation loss there is nothing to compare against;
            # report it instead of failing in the middle of training.
            logging.getLogger(__name__).warning(
                "EarlyStoppingCallback: 'eval_loss' missing from evaluation metrics; skipping check."
            )
            return

        # A loss improves when it gets SMALLER, so a lower value resets the counter.
        if self.best_metric is None or current_metric < self.best_metric:
            self.best_metric = current_metric
            self.patience_counter = 0
        else:
            self.patience_counter += 1

        if self.patience_counter >= self.early_stopping_patience:
            control.should_training_stop = True

***Defining Training Arguments***

In [None]:
import math

# Function to compute Perplexity from Loss
def compute_metrics(eval_preds):
    """Return ``{"perplexity": exp(loss)}`` for an evaluation result.

    Parameters
    ----------
    eval_preds : dict or prediction object
        * a metrics dict containing ``"eval_loss"``, or
        * an object with ``predictions`` (logits, or a tuple whose first item
          is the logits) and ``label_ids``, which is the form in which
          `Trainer` hands predictions to its ``compute_metrics`` hook. The
          loss is then the mean cross-entropy over positions whose label is
          not -100.

    Returns
    -------
    dict
        ``{"perplexity": float}``; ``inf`` when the loss is too large for
        ``math.exp``.

    NOTE: scoring from logits needs the logits of the whole evaluation set in
    memory, which is heavy for a large-vocabulary MLM; deriving perplexity
    from the reported ``eval_loss`` is much cheaper.
    """
    if isinstance(eval_preds, dict):
        loss = float(eval_preds["eval_loss"])
    else:
        logits = eval_preds.predictions
        if isinstance(logits, tuple):
            logits = logits[0]
        logits = torch.as_tensor(logits, dtype=torch.float32)
        labels = torch.as_tensor(eval_preds.label_ids, dtype=torch.long)
        # Flatten to (positions, vocab) / (positions,) and skip labels of -100
        loss = torch.nn.functional.cross_entropy(
            logits.reshape(-1, logits.shape[-1]),
            labels.reshape(-1),
            ignore_index=-100,
        ).item()

    try:
        perplexity = math.exp(loss)
    except OverflowError:
        # exp() overflows for very large losses; report that as infinity
        perplexity = float("inf")
    return {"perplexity": perplexity}

# Hyperparameter settings and seed can be decided as needed

def training_model_with_validation(tokenized_train_dataset_mlm, tokenized_eval_dataset_mlm, learning_rate, num_epochs, batch_size, weight_decay, warmup_proportion, seed=1):
    """Further-pretrain ARBERT with MLM while monitoring a validation set.

    Parameters
    ----------
    tokenized_train_dataset_mlm, tokenized_eval_dataset_mlm
        Tokenized training and validation datasets.
    learning_rate, num_epochs, batch_size, weight_decay
        Optimisation hyperparameters forwarded to `TrainingArguments`.
    warmup_proportion : float
        Fraction of the estimated optimizer steps used for warmup.
    seed : int, default 1
        Random seed forwarded to `TrainingArguments`.

    Returns
    -------
    Trainer
        The trainer after training; ``trainer.state.log_history`` holds the
        logged training/validation losses for plotting.
    """
    gradient_accumulation_steps = 2

    # Estimate the optimizer steps for the warmup length. One optimizer step
    # consumes batch_size * gradient_accumulation_steps examples (single-device
    # estimate), so the accumulation factor has to be part of the calculation.
    examples_per_update = batch_size * gradient_accumulation_steps
    updates_per_epoch = max(1, (len(tokenized_train_dataset_mlm) + examples_per_update - 1) // examples_per_update)
    total_steps = int(updates_per_epoch * num_epochs)
    warmup_steps = int(total_steps * warmup_proportion)

    # `dropout` is not a TrainingArguments field and raised a TypeError, so it
    # is no longer passed (dropout is presumably governed by the model config).
    training_args = TrainingArguments(
        output_dir=output_dir_checkpoints,
        overwrite_output_dir=True,
        resume_from_checkpoint=False,
        num_train_epochs=num_epochs,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        learning_rate=learning_rate,
        weight_decay=weight_decay,
        warmup_steps=warmup_steps,
        logging_steps=1000,
        logging_dir='./logs',
        eval_strategy="steps",
        eval_steps=1000,              # evaluate on the same schedule as saving
        save_strategy='steps',        # Save based on steps
        save_steps=1000,
        save_total_limit=2,
        gradient_accumulation_steps=gradient_accumulation_steps,
        fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present
        seed=seed,
        lr_scheduler_type="linear",   # Use linear learning rate decay
        load_best_model_at_end=True,
        # Perplexity is exp(loss), so the lowest eval_loss is also the lowest
        # perplexity. Selecting on eval_loss with greater_is_better=False
        # keeps the BEST checkpoint rather than the worst.
        metric_for_best_model="eval_loss",
        greater_is_better=False,
    )

    # No compute_metrics hook: with one, the Trainer has to keep the logits of
    # the whole validation set, which is prohibitive for a large-vocabulary
    # MLM. Perplexity is derived from eval_loss below instead.
    final_trainer_mlm = Trainer(
        model=ARBERT_model,
        args=training_args,
        data_collator=data_collator_mlm,
        train_dataset=tokenized_train_dataset_mlm,
        eval_dataset=tokenized_eval_dataset_mlm,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
    )

    # Start pretraining for MLM
    final_trainer_mlm.train()

    # Report validation loss and perplexity of the model kept at the end
    eval_loss = final_trainer_mlm.evaluate().get("eval_loss")
    if eval_loss is not None:
        try:
            final_perplexity = math.exp(eval_loss)
        except OverflowError:
            final_perplexity = float("inf")
        print(f"Validation loss: {eval_loss:.4f} | Perplexity: {final_perplexity:.2f}")

    return final_trainer_mlm



### **Plotting Training-Validation Losses**

In [None]:
# Loss values of the adopted training run; fill the lists in before plotting

steps = []  # Add the number of training steps
training_losses = []  # Add the logged training loss values here
validation_losses = []  # Add the validation loss values from logs or callbacks

# Draw both loss curves on a single set of axes
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(steps, training_losses, label='Training Loss')
ax.plot(steps, validation_losses, label='Validation Loss')
ax.set_xlabel('Steps')
ax.set_ylabel('Loss')
ax.legend()
ax.set_title('Training and Validation Loss over Steps')
ax.grid(True)
plt.show()


## **Plotting Perplexity**

In [None]:
steps = []  # Add the number of training steps
perplexity = []  # Add the perplexity values per steps

# Draw the perplexity curve on its own axes
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(steps, perplexity, label='Perplexity')
ax.set_xlabel('Steps')
ax.set_ylabel('Perplexity')
ax.legend()
ax.set_title('Perplexity over Steps')
ax.grid(True)
plt.show()

### **Retrain and Save the Model and Tokenizer to Google Drive**

In [None]:
import os

def training_model_without_validation(full_dataset_mlm, learning_rate, num_epochs, batch_size, weight_decay, warmup_proportion, seed = 1, model=None, model_save_path="/content/drive/MyDrive/#####"):
    """Further-pretrain ARBERT with MLM on the full dataset and save the result.

    Parameters
    ----------
    full_dataset_mlm
        Tokenized dataset used entirely for training (no validation split).
    learning_rate, num_epochs, batch_size, weight_decay
        Optimisation hyperparameters forwarded to `TrainingArguments`.
    warmup_proportion : float
        Fraction of the estimated optimizer steps used for warmup.
    seed : int, default 1
        Random seed forwarded to `TrainingArguments`.
    model : optional
        Model to train. Defaults to the notebook-level `ARBERT_model`.
        NOTE(review): that same object is the one trained by
        `training_model_with_validation`; if that ran earlier in this kernel,
        training presumably continues from its weights. Pass a freshly loaded
        model for a clean retrain.
    model_save_path : str
        Folder in which the trained model and the tokenizer are saved
        (placeholder default: replace with the actual folder).

    Returns
    -------
    Trainer
        The trainer after training.
    """
    if model is None:
        model = ARBERT_model

    gradient_accumulation_steps = 2

    # Estimate the optimizer steps for the warmup length. One optimizer step
    # consumes batch_size * gradient_accumulation_steps examples (single-device
    # estimate), so the accumulation factor has to be part of the calculation.
    examples_per_update = batch_size * gradient_accumulation_steps
    updates_per_epoch = max(1, (len(full_dataset_mlm) + examples_per_update - 1) // examples_per_update)
    total_steps = int(updates_per_epoch * num_epochs)
    warmup_steps = int(total_steps * warmup_proportion)

    # `dropout` is not a TrainingArguments field and raised a TypeError, so it
    # is no longer passed (dropout is presumably governed by the model config).
    training_args = TrainingArguments(
        output_dir=final_output_dir_checkpoints_DAPT,
        overwrite_output_dir=True,
        resume_from_checkpoint=False,
        num_train_epochs=num_epochs,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        learning_rate=learning_rate,
        weight_decay=weight_decay,
        warmup_steps=warmup_steps,
        logging_steps=1000,
        logging_dir='./logs',
        save_strategy='steps',        # Save based on steps
        save_steps=1000,
        save_total_limit=2,
        gradient_accumulation_steps=gradient_accumulation_steps,
        fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present
        eval_strategy="no",           # Disable evaluation (same argument name as the validation run)
        seed=seed,
        lr_scheduler_type="linear",   # Use linear learning rate decay
    )

    # No early-stopping callback here: it reacts to evaluation events, and
    # evaluation is disabled for this run, so it could never trigger.
    final_trainer_mlm = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator_mlm,
        train_dataset=full_dataset_mlm,
        eval_dataset=None,
    )

    # Start pretraining for MLM
    final_trainer_mlm.train()

    # Save the final model (there is no "best" checkpoint without evaluation)
    # and the tokenizer to Google Drive
    os.makedirs(model_save_path, exist_ok=True)
    final_trainer_mlm.model.save_pretrained(model_save_path)
    ARBERT_tokenizer.save_pretrained(model_save_path)

    return final_trainer_mlm




