In [1]:
import pandas as pd

import torch
import torch.nn as nn  # neural network
import torch.nn.functional as F  # raw function
import torch.optim as optim  # optimizer
from torch.utils.data import TensorDataset, DataLoader

from copy import deepcopy
from datetime import datetime

from sklearn.model_selection import train_test_split

# Neural Network

In [2]:
# My early stopping class
class EarlyStopping:
    def __init__(self, patience=7, delta=0.001):
        """
        Early stops the training if validation loss doesn't improve after a given patience.
        :param patience: How long to wait after last time validation loss improved.
        :param delta: Minimum change in the monitored quantity to qualify as an improvement.
        """
        self.patience = patience
        self.counter = 0
        self.total_run = 0
        self.best_score = None
        self.early_stop = False
        self.delta = delta
        self.model = None

    def __call__(self, val_loss, model):
        """
        Early stopping function.
        :param val_loss: the validation loss current
        :param model: the model to save
        :return: true is early stopping should be used
        """
        self.total_run += 1
        print(f"Run number {self.total_run} ", end="")
        # if the best score is not set, set it and save the model
        if self.best_score is None:
            print(f"Initial validation loss: {val_loss}")
            self.best_score = val_loss
            self.model = deepcopy(model)
            return False, self.model

        # if the score is worse than the best score, increment the counter
        elif val_loss > self.best_score - self.delta:
            self.counter += 1
            print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            # if the counter is greater than the patience, stop the training
            if self.counter >= self.patience:
                return True, self.model

        # if the score is better than the best score, reset the counter and save the model
        else:
            print(f"Validation loss decreased ({self.best_score:.6f} --> {val_loss:.6f}).  Saving model ..."
                  f"run since best: {self.counter}")
            self.best_score = val_loss
            self.model = deepcopy(model)
            self.counter = 0
            return False, self.model

        return False, self.model


In [3]:
# Neural Network class
class Net(nn.Module):
    """Neural Network class"""
    def __init__(self, input_size, h1, h2, h3):
        """
        Initialized the neural network with 3 hidden layers, with different sizes
        :param input_size: the size of the input layer
        :param h1: the size of the first hidden layer
        :param h2: size of the second hidden layer
        :param h3: size of the third hidden layer
        """
        super().__init__()
        self.fc1 = nn.Linear(input_size, h1)
        self.fc2 = nn.Linear(h1, h2)
        self.fc3 = nn.Linear(h2, h3)
        self.fc4 = nn.Linear(h3, 1)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = self.fc4(x)
        return torch.sigmoid(x)

In [4]:
def ham_accuracy(y_pred, y_true):
    """
    Calculate the accuracy of the ham
    :param y_pred: the prediction
    :param y_true: the true value
    :return: the accuracy of the ham
    """
    # round the prediction
    y_pred = torch.round(y_pred)
    # see if its a ham if it is check if pred is correct
    total = 0
    correct = 0
    for i in range(len(y_true)):
        if y_true[i] == 0: # its ham so check if pred is correct
            total += 1
            if y_pred[i] == 0:
                correct += 1

    return correct/total

In [5]:
# Data Config
EPOCH = 1000
CUTOFF = 0.01
# percent saved for validation
VAL = 0.2

In [6]:

list_of_file = ["balanced_trim_compiled_10.csv", "balanced_trim_compiled_20.csv", "balanced_trim_compiled_30.csv",
                "balanced_trim_compiled_40.csv", "balanced_trim_compiled_50.csv", "balanced_trim_compiled_60.csv",
                "balanced_trim_compiled_70.csv", "balanced_trim_compiled_80.csv", "balanced_trim_compiled_90.csv",
                "balanced_trim_compiled_100.csv", "balanced_trim_compiled_145.csv"]

h1_list = [64, 128]
h2_list = [64, 128, 256]
h3_list = [64, 128]
lr_list = [0.001, 0.01]
batch_size_list = [32, 64]

total_run = len(list_of_file) * len(h1_list) * len(h2_list) * len(h3_list) * len(lr_list) * len(batch_size_list)
run_count = 0

# Validation data
VAL = 0.2

# best Parameters
best_score = 0.0
best_params = {"h1": 0, "h2": 0, "h3": 0, "lr": 0, "batch_size": 0, "val_loss": 1000, "file": ""}
best_model = None

start_time = datetime.now()

# try different file sets
for file in list_of_file:
    # load and split data
    dataset = pd.read_csv(file)
    n_feature = pd.read_csv(file).shape[1] - 1

    # get equal number of spam and ham
    dataset = dataset.sample(frac=1)
    dataset = dataset.groupby("spam").head(min(dataset["spam"].value_counts()))

    train_data, val_data = train_test_split(dataset, test_size=VAL, random_state=42)

    train_y = train_data["spam"].values
    train_x = train_data.drop(["spam"], axis=1).values
    val_y = val_data["spam"].values
    val_x = val_data.drop(["spam"], axis=1).values

    # grid search part
    for batch_size in batch_size_list:
        # create the data loaders
        train_data = TensorDataset(torch.from_numpy(train_x).float(), torch.from_numpy(train_y).float())
        train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
        val_data = TensorDataset(torch.from_numpy(val_x).float(), torch.from_numpy(val_y).float())
        val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=True)

        # try different hyper parameters
        for h1 in h1_list:
            for h2 in h2_list:
                for h3 in h3_list:
                    for lr in lr_list:
                        run_count += 1
                        # create the model
                        model = Net(n_feature, h1, h2, h3)
                        # create the optimizer
                        optimizer = optim.Adam(model.parameters(), lr=lr)
                        # create the loss function
                        loss_function = nn.BCELoss()

                        # Configure the early stopping
                        early_stopping = EarlyStopping(patience=10, delta=0.001)

                        # start training for EPOCH epochs
                        print(f"[{run_count} / {total_run}] starting a new model, feature = {n_feature}, h1 = {h1}, h2 = {h2}, h3 = {h3}, lr = {lr}, batch_size = {batch_size}")
                        for epoch in range(EPOCH):
                            print(f"   Epoch {epoch + 1} of {EPOCH} \n    ", end="")

                            # train the model
                            for data in train_loader:
                                X, y = data
                                model.zero_grad()
                                output = model(X)
                                loss = loss_function(output, y.view(-1, 1))
                                loss.backward()
                                optimizer.step()

                            # validate the model
                            val_loss = 0
                            y_pred = []
                            y_true = []
                            total_output = []
                            for data in val_loader:
                                X, y = data
                                output = model(X)
                                # save the output just in case its done
                                y_pred.append(output)
                                y_true.append(y)
                                # calculate the loss
                                loss = loss_function(output, y.view(-1, 1))
                                val_loss += loss.item()
                            val_loss /= len(val_loader)

                            # early stopping
                            early_stop, model = early_stopping(val_loss, model)
                            if early_stop:
                                print("   Early stopping")
                                # check if the model is better than the best model
                                ham_a = ham_accuracy(torch.cat(y_pred), torch.cat(y_true))
                                if ham_a > best_score:
                                    print(f"   SUCCESS: model is better than the best model {ham_a} > {best_score}")
                                    best_score = ham_a
                                    best_params["h1"] = h1
                                    best_params["h2"] = h2
                                    best_params["h3"] = h3
                                    best_params["lr"] = lr
                                    best_params["batch_size"] = batch_size
                                    best_params["val_loss"] = val_loss
                                    best_params["file"] = file
                                    best_model = model
                                else:
                                    print(f"   FAILURE: model is not better than the best model {ham_a} < {best_score}")

                                print("Been running for: ", datetime.now() - start_time)
                                print()
                                break # exit the loop

# print the best model
print()
print(f"BEEP BOOP COMPLETED THIS TOOK {datetime.now() - start_time}")
print(f"Best model score: {best_score}")
print(f"Best model: {best_params}")

[1 / 528] starting a new model, feature = 9, h1 = 64, h2 = 64, h3 = 64, lr = 0.001, batch_size = 32
   Epoch 1 of 1000 
    Run number 1 Initial validation loss: 0.2394098496879451
   Epoch 2 of 1000 
    Run number 2 EarlyStopping counter: 1 out of 10
   Epoch 3 of 1000 
    Run number 3 EarlyStopping counter: 2 out of 10
   Epoch 4 of 1000 
    Run number 4 EarlyStopping counter: 3 out of 10
   Epoch 5 of 1000 
    Run number 5 EarlyStopping counter: 4 out of 10
   Epoch 6 of 1000 
    Run number 6 EarlyStopping counter: 5 out of 10
   Epoch 7 of 1000 
    Run number 7 EarlyStopping counter: 6 out of 10
   Epoch 8 of 1000 
    Run number 8 EarlyStopping counter: 7 out of 10
   Epoch 9 of 1000 
    Run number 9 EarlyStopping counter: 8 out of 10
   Epoch 10 of 1000 
    Run number 10 EarlyStopping counter: 9 out of 10
   Epoch 11 of 1000 
    Run number 11 EarlyStopping counter: 10 out of 10
   Early stopping
   SUCCESS: model is better than the best model 0.8666345690900337 > 0.0
Bee

In [7]:
# save the best model
torch.save(best_model.state_dict(), "best_model.pt")