# Train the LLM

In [None]:
# Set a PyTorch environment variable to optimize CUDA memory allocation.
# A `!export ...` line runs in a short-lived subshell, so the variable never
# reaches this notebook's Python process. Writing to os.environ sets it for the
# current process; it must run before the first CUDA allocation to take effect.
import os

os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [None]:

import pandas as pd  
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments  # Transformers library for NLP
from torch.utils.data import Dataset                                                 # PyTorch Dataset class
import os  
from google.colab import drive                                                       # For Google Drive integration in Colab
import torch                                                                         # PyTorch for deep learning
from datasets import load                                                            # Hugging Face datasets library (not used in the code but imported)
import torch.nn as nn                                                                # PyTorch neural network module

In [None]:

# Mount Google Drive to access files stored there
drive.mount('/content/drive')

# Define the file path for the dataset in Google Drive
file_path = os.path.join('/content/drive/My Drive/Colab/', "Phishing_validation_emails - Cleaned.xlsx")

# Fail early with a clear message if the dataset is not where we expect it
if not os.path.exists(file_path):
    raise FileNotFoundError(f"File not found at: {file_path}")

# Read the dataset from the Excel file
data = pd.read_excel(file_path)

# Validate the schema before any cleaning, and report every missing column at once
required_columns = ["Email Text", "Email Type", "Word Count"]
missing_columns = [column for column in required_columns if column not in data.columns]
if missing_columns:
    raise ValueError(f"Missing required column: {', '.join(missing_columns)}")

# Only gaps in the required columns make a row unusable. Restricting the check
# to them avoids discarding rows because of an unrelated (e.g. empty) column.
if data[required_columns].isnull().values.any():
    print("Warning: Dataset contains missing values. Cleaning data...")
    # Drop rows with missing values in any required column
    data = data.dropna(subset=required_columns)

# Define the percentage of data to use for training

In [None]:

percentage = 0.2        # Fraction of rows sampled into the training set (0.2 means 20% of the data)

# Split the dataset into training and evaluation sets

In [None]:

# Number of training rows: `percentage` of the dataset, rounded down
num_train_samples = int(len(data) * percentage)

# Randomly sample the training rows; the fixed seed keeps the split reproducible
train_data = data.sample(n=num_train_samples, random_state=42)

# Evaluate on exactly the rows that were not sampled. Dropping the first
# `num_train_samples` rows instead would leave sampled training rows in the
# evaluation set, because the sample is random rather than a prefix.
# NOTE: relies on `data` having unique index labels.
eval_data = data.drop(train_data.index)

# Custom Dataset class to handle email data

In [None]:

class EmailDataset(Dataset):
    """Serve rows of an email DataFrame as fixed-length causal-LM examples.

    Each row is rendered as a prompt of the form
    "Type: <type>, Words: <count>\\nEmail:\\n<text>", tokenized, and padded or
    truncated to ``max_length`` tokens.

    Args:
        data: DataFrame with "Email Text", "Email Type" and "Word Count" columns.
        tokenizer: Optional tokenizer to reuse. It must already have a padding
            token. When omitted, the pretrained "gpt2" tokenizer is loaded and
            its EOS token is used for padding.
        max_length: Number of tokens every example is padded or truncated to.
    """

    def __init__(self, data, tokenizer=None, max_length=64):
        self.data = data
        if tokenizer is None:
            tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
            tokenizer.pad_token = tokenizer.eos_token  # Use the EOS token for padding
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows (examples) in the dataset."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the tokenized example at position ``idx``.

        Returns:
            Dict with "input_ids", "attention_mask" and "labels" tensors. The
            labels copy the input ids, with padded positions set to -100.
        """
        # Look the row up once instead of once per column
        row = self.data.iloc[idx]
        email_text = row["Email Text"]
        email_type = row["Email Type"]
        word_count = row["Word Count"]
        # Create the input text format
        input_text = f"Type: {email_type}, Words: {word_count}\nEmail:\n{email_text}"
        # Tokenize to a fixed length so examples can be batched directly
        tokenized_input = self.tokenizer(
            input_text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
        )
        input_ids = torch.tensor(tokenized_input["input_ids"])
        attention_mask = torch.tensor(tokenized_input["attention_mask"])
        # Exclude padding from the loss. The mask is used rather than the token
        # id because padding may share its id with EOS. -100 is the default
        # ignore_index of torch's CrossEntropyLoss.
        labels = input_ids.clone()
        labels[attention_mask == 0] = -100
        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels,
        }


# Load a pre-trained GPT-2 model and tokenizer

In [None]:

# Tokenizer for the pretrained "gpt2" checkpoint; GPT-2 is given EOS as its padding token here
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# Pretrained GPT-2 language model, placed on the GPU when one is available and on the CPU otherwise
model = GPT2LMHeadModel.from_pretrained("gpt2").to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

# Define training arguments

In [None]:

training_args = TrainingArguments(
    # --- Output locations ---
    output_dir="./results",          # Model checkpoints are written here
    logging_dir="./logs",            # Log files are written here
    report_to="none",                # No external experiment trackers (e.g., WandB)
    # --- Schedule ---
    num_train_epochs=3,              # Passes over the training set
    evaluation_strategy="epoch",     # Run evaluation once per epoch
    # --- Batching: one sample per step, gradients accumulated over 8 steps ---
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=8,
    eval_accumulation_steps=4,       # Accumulate evaluation results for this many steps
    # --- Checkpointing ---
    save_steps=10,                   # Checkpoint every 10 steps
    save_total_limit=2,              # Retain only the 2 most recent checkpoints
    # --- Precision ---
    fp16=False,                      # Do not use 16-bit floating-point precision
)

# Create training and evaluation datasets

In [None]:

# Wrap each DataFrame split in an EmailDataset (training split first, then evaluation split)
train_dataset, eval_dataset = EmailDataset(train_data), EmailDataset(eval_data)

# Define a custom loss function

In [None]:


def compute_loss(model, inputs, return_outputs=False, **kwargs):
    """Compute the next-token cross-entropy loss for a causal language model.

    The logits at position ``t`` are scored against the label at position
    ``t + 1``. Scoring them against the label at ``t`` would reward the model
    for echoing its own input rather than predicting the next token.

    Args:
        model: Callable invoked as ``model(**inputs)``; its result must expose
            "logits" via ``.get``.
        inputs: Mapping of model inputs that includes "labels".
        return_outputs: When True, return ``(loss, outputs)`` instead of the loss.
        **kwargs: Accepted and ignored, so extra keyword arguments passed by
            the caller do not raise.

    Returns:
        The scalar loss tensor, or ``(loss, outputs)`` if ``return_outputs`` is True.
    """
    labels = inputs.get("labels")  # Extract labels from inputs
    outputs = model(**inputs)  # Forward pass
    logits = outputs.get("logits")  # Extract logits
    # Shift so each position predicts the following token: drop the last
    # logit (nothing follows it) and the first label (nothing predicts it).
    shift_logits = logits[:, :-1, :].contiguous()
    shift_labels = labels[:, 1:].contiguous()
    # The default ignore_index (-100) skips label positions marked as ignored
    loss_fct = torch.nn.CrossEntropyLoss()
    loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
    return (loss, outputs) if return_outputs else loss

# Metric function used during evaluation
def compute_metrics(eval_pred):
    """Return the next-token cross-entropy of the predictions as ``{"eval_loss": float}``.

    Args:
        eval_pred: A pair ``(logits, labels)``; both are converted with
            ``torch.as_tensor``, so arrays and tensors are accepted.
    """
    raw_logits, raw_labels = eval_pred
    all_logits = torch.as_tensor(raw_logits)
    all_labels = torch.as_tensor(raw_labels)

    # Align predictions with targets: the logits at position t are compared
    # with the label at position t + 1, so drop the last logit and first label.
    vocab_size = all_logits.size(-1)
    shifted_logits = all_logits[:, :-1].contiguous().view(-1, vocab_size)
    shifted_labels = all_labels[:, 1:].contiguous().view(-1)

    # Mean cross-entropy over all target positions (default reduction and ignore_index)
    loss = torch.nn.functional.cross_entropy(shifted_logits, shifted_labels)
    return {"eval_loss": loss.item()}


# Initialize the Trainer with custom settings

In [None]:

# Assemble the Trainer from the model, training arguments, datasets and metric function
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=eval_dataset,
    train_dataset=train_dataset,
    compute_metrics=compute_metrics,
)

# Replace the Trainer's loss with the custom one. Assigned on the instance, the
# plain function is not bound, so it is called as compute_loss(model, inputs, ...).
trainer.compute_loss = compute_loss

# Train the model

In [None]:

# Run fine-tuning with the Trainer configured above
trainer.train()

# Evaluate the model

In [None]:

# Run evaluation and keep the returned metrics
results = trainer.evaluate()
print("Evaluation Results:", results)
# Show the 'eval_loss' entry of the returned metrics, rounded to 4 decimal places
print(f"Loss: {results['eval_loss']:.4f}")

# Save the trained model and tokenizer

In [None]:

# Destination folder on the mounted Google Drive; model and tokenizer go to the same place
output_dir = "/content/drive/My Drive/Colab/trained_model"
model.save_pretrained(output_dir)  # Save the model weights and configuration
tokenizer.save_pretrained(output_dir)  # Save the tokenizer files next to the model
print(f"Model saved to {output_dir}")

