# Encoder Classifier Training
Training a classifier using a pretrained autoencoder as a base

In [2]:
import torch
from torch import nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from tqdm import tqdm
import numpy as np
from sklearn.metrics import f1_score
import os

### Importing the data

In [3]:
DIRECTORY = "data/Regular_Processed/With_Ordinal_Encoding/All_Positive/"

X_train = np.load(os.path.join(DIRECTORY, "X_train.npy"))
X_valid = np.load(os.path.join(DIRECTORY, "X_valid.npy"))
y_train = np.load(os.path.join(DIRECTORY, "y_train.npy"))
y_valid = np.load(os.path.join(DIRECTORY, "y_valid.npy"))
X_test = np.load(os.path.join(DIRECTORY, "X_test.npy"))

for arr in [X_train, X_valid, y_train, y_valid, X_test]:
    print(arr.shape)

(988, 35)
(247, 35)
(988,)
(247,)
(824, 35)


In [4]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cpu


### Defining the model

In [11]:
PATH_TO_MODELS = "models/With_Ordinal_Encoding/All_Positive/"
MODEL_NAME = "EnClass_35_12_6_ReLU_Sigmoid_10"  #InputColumns_EmbeddingSize_HiddenSize_ActivationFunction(Encoder&Hidden)_ActivationFunctionFinal_Dropout%

class Encoder(nn.Module):
    def __init__(self, input_columns=66, output_size=18, hidden_size=33, dropout_p=0.1, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        self.ffn = nn.Sequential(
            nn.Linear(input_columns, hidden_size),
            nn.ReLU(),
            nn.Dropout(dropout_p),
            nn.Linear(hidden_size, output_size),
            nn.ReLU()
        )
    
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.ffn(x)
    
class En_Classifier(nn.Module):
    def __init__(self, input_columns=35, hidden_size_encoder=21, embedding_size = 12, hidden_size=6, output_size=3, dropout_p=0.1, encoder_checkpoint_path: str=None, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        self.encoder = Encoder(input_columns=input_columns, output_size=embedding_size, hidden_size=hidden_size_encoder, dropout_p=dropout_p)
        if encoder_checkpoint_path is not None:
            self.encoder.load_state_dict(torch.load(encoder_checkpoint_path))
        self.classifier = nn.Sequential(
            nn.Linear(embedding_size, hidden_size),
            nn.ReLU(),
            nn.Dropout(dropout_p),
            nn.Linear(hidden_size, output_size),
            nn.Sigmoid()
        )
    
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        embeddings = self.encoder(x)
        return self.classifier(embeddings)

In [6]:
class Horse_Health_Dataset(Dataset):
    def __init__(self, x: np.ndarray, y: np.ndarray):
        if x.shape[0] != y.shape[0]:
            raise Exception("Dataset Error: Sizes of X and y dont match")
        
        x_tensor = torch.from_numpy(x)
        y_tensor = torch.from_numpy(y)
        self.X = x_tensor.to(device)
        self.y = y_tensor.to(device)
        self.length = x.shape[0]

    def __len__(self):
        return self.length
    
    def __getitem__(self, index):
        return self.X[index], self.y[index]
    
def create_dataloader(X, y, batch_size=128):
    """Returns a torch dataloader for the given dataset and batch_size"""
    dataset = Horse_Health_Dataset(X, y)
    dataloaders = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True)

    return dataloaders

### Training Pipeline

In [7]:
class EarlyStopper:
    """Implements Early Stoppage of training when there is not progress in validation set"""
    def __init__(self, patience=10, min_delta=0):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.min_validation_loss = np.inf

    def early_stop(self, validation_loss):
        if validation_loss < self.min_validation_loss:
            self.min_validation_loss = validation_loss
            self.counter = 0
        elif validation_loss > (self.min_validation_loss + self.min_delta):
            self.counter += 1
            if self.counter >= self.patience:
                return True
        return False

In [8]:
def train_loop(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    train_loss, train_correct, train_f1 = 0, 0, 0
    for batch, (X, y) in enumerate(dataloader):
        # Compute prediction and loss
        y = y.type(torch.LongTensor).to(device)
        X = X.type(torch.FloatTensor).to(device)
        pred = model(X)
        # print(f"pred size = {str(pred.size())}")
        # print(f"y size = {y.size()}")
        loss = loss_fn(pred, y)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        train_correct += (torch.argmax(pred, dim=1) == y).sum().item()
        train_f1 += f1_score(y_pred=torch.argmax(pred, dim=1), y_true=y, average='micro')

    train_loss /= num_batches
    train_correct /= size
    train_f1 /= num_batches

    return train_loss, train_correct, train_f1

def test_loop(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    test_loss, test_correct, test_f1 = 0, 0, 0

    with torch.no_grad():
        for X, y in dataloader:
            y = y.type(torch.LongTensor).to(device)
            X = X.type(torch.FloatTensor).to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            test_correct += (torch.argmax(pred, dim=1) == y).sum().item()
            test_f1 += f1_score(y_pred=torch.argmax(pred, dim=1), y_true=y, average='micro')
            
    test_loss /= num_batches
    test_correct /= size
    test_f1 /= num_batches
    
    return test_loss, test_correct, test_f1

def train(train_dataloader: DataLoader, validation_dataloader: DataLoader, model: nn.Module, loss_fn, optimizer, epochs=100, patience=5):
    early_stopper = EarlyStopper(patience=patience)
    tr_loss, tr_accuracy, tr_f1 = [], [], []
    va_loss, va_accuracy, va_f1 = [], [], []
    for t in range(epochs):
        train_loss, train_correct, train_f1 = train_loop(train_dataloader, model, loss_fn, optimizer)
        valid_loss, valid_correct, valid_f1 = test_loop(validation_dataloader, model, loss_fn)

        tr_loss.append(train_loss), tr_accuracy.append(train_correct), tr_f1.append(train_f1)
        va_loss.append(valid_loss), va_accuracy.append(valid_correct), va_f1.append(valid_f1)

        print(f"Epoch {t+1}: Train_accuracy: {(100*train_correct):>0.2f}%, Train_loss: {train_loss:>8f} Train_F1_batchwise: {train_f1:>0.2f}, Validation_accuracy: {(100*valid_correct):>0.2f}%, Validation_loss: {valid_loss:>8f}, Validation_F1_batchwise :{valid_f1:>0.2f}")

        if (t + 1) % 5 == 0:
            torch.save(model.state_dict(), f"{PATH_TO_MODELS}/{MODEL_NAME}_epoch_{t+1}.pt")

        if early_stopper.early_stop(valid_loss):
            print("Early Stopping Cutoff!")
            break

    return tr_accuracy, tr_loss, tr_f1, va_accuracy, va_loss, va_f1

### Training the architecture without the pretrained encoder weights

In [12]:
# Create the dataloaders
train_dataloader = create_dataloader(batch_size=128, X=X_train, y=y_train)
valid_dataloader = create_dataloader(batch_size=128, X=X_valid, y=y_valid)

# Create the model
model = En_Classifier()
model.to(device)

# Define the optimizer and loss function
LEARNING_RATE = 0.01
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
loss_fn = nn.CrossEntropyLoss()  # Cannot use Cross Entropy Loss with Softmax

# Train the model
train_accuracy, train_loss, train_f1, valid_accuracy, valid_loss, valid_f1 = train(train_dataloader=train_dataloader, validation_dataloader=valid_dataloader,
                                                                                   model=model, loss_fn=loss_fn, optimizer=optimizer, epochs=100, patience=7)

Epoch 1: Train_accuracy: 30.06%, Train_loss: 1.097959 Train_F1_batchwise: 0.31, Validation_accuracy: 40.08%, Validation_loss: 1.078447, Validation_F1_batchwise :0.40
Epoch 2: Train_accuracy: 47.27%, Train_loss: 1.034670 Train_F1_batchwise: 0.47, Validation_accuracy: 42.51%, Validation_loss: 1.063895, Validation_F1_batchwise :0.43
Epoch 3: Train_accuracy: 48.99%, Train_loss: 1.008877 Train_F1_batchwise: 0.49, Validation_accuracy: 48.58%, Validation_loss: 1.014528, Validation_F1_batchwise :0.49
Epoch 4: Train_accuracy: 55.77%, Train_loss: 0.954915 Train_F1_batchwise: 0.56, Validation_accuracy: 54.25%, Validation_loss: 0.965341, Validation_F1_batchwise :0.54
Epoch 5: Train_accuracy: 58.40%, Train_loss: 0.930941 Train_F1_batchwise: 0.59, Validation_accuracy: 54.66%, Validation_loss: 0.961243, Validation_F1_batchwise :0.55
Epoch 6: Train_accuracy: 61.03%, Train_loss: 0.912406 Train_F1_batchwise: 0.61, Validation_accuracy: 56.28%, Validation_loss: 0.951634, Validation_F1_batchwise :0.56
Epoc

In [13]:
# Check final F1 Score
final_valid_dataloader = create_dataloader(batch_size=len(X_valid), X=X_valid, y=y_valid)
test_loss, test_correct, test_f1 = test_loop(final_valid_dataloader, model, loss_fn)
print(f"Final Validation F1 Score is: {test_f1}")

Final Validation F1 Score is: 0.6356275303643725


### Training with the pretrained encoder weights

In [10]:
# Create the dataloaders
train_dataloader = create_dataloader(batch_size=128, X=X_train, y=y_train)
valid_dataloader = create_dataloader(batch_size=128, X=X_valid, y=y_valid)

# Create the model
model = En_Classifier(encoder_checkpoint_path="models/AutoEncoder/AutoEn_66_18_ReLU_10_justEncoder_epoch_45.pt")
model.to(device)

# Define the optimizer and loss function
LEARNING_RATE = 0.01
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
loss_fn = nn.CrossEntropyLoss()  # Cannot use Cross Entropy Loss with Softmax

# Train the model
train_accuracy, train_loss, train_f1, valid_accuracy, valid_loss, valid_f1 = train(train_dataloader=train_dataloader, validation_dataloader=valid_dataloader,
                                                                                   model=model, loss_fn=loss_fn, optimizer=optimizer, epochs=100, patience=7)

Epoch 1: Train_accuracy: 44.13%, Train_loss: 1.073614 Train_F1_batchwise: 0.44, Validation_accuracy: 54.66%, Validation_loss: 1.013204, Validation_F1_batchwise :0.55
Epoch 2: Train_accuracy: 44.74%, Train_loss: 1.017569 Train_F1_batchwise: 0.45, Validation_accuracy: 57.89%, Validation_loss: 0.936946, Validation_F1_batchwise :0.58
Epoch 3: Train_accuracy: 58.40%, Train_loss: 0.943994 Train_F1_batchwise: 0.58, Validation_accuracy: 66.80%, Validation_loss: 0.883458, Validation_F1_batchwise :0.67
Epoch 4: Train_accuracy: 65.59%, Train_loss: 0.896909 Train_F1_batchwise: 0.66, Validation_accuracy: 64.78%, Validation_loss: 0.867224, Validation_F1_batchwise :0.65
Epoch 5: Train_accuracy: 67.51%, Train_loss: 0.860222 Train_F1_batchwise: 0.68, Validation_accuracy: 66.80%, Validation_loss: 0.873647, Validation_F1_batchwise :0.67
Epoch 6: Train_accuracy: 68.52%, Train_loss: 0.851586 Train_F1_batchwise: 0.68, Validation_accuracy: 68.02%, Validation_loss: 0.850407, Validation_F1_batchwise :0.68
Epoc

In [11]:
# Check final F1 Score
final_valid_dataloader = create_dataloader(batch_size=len(X_valid), X=X_valid, y=y_valid)
test_loss, test_correct, test_f1 = test_loop(final_valid_dataloader, model, loss_fn)
print(f"Final Validation F1 Score is: {test_f1}")

Final Validation F1 Score is: 0.6680161943319838


### Simple Classifier Model

In [20]:
PATH_TO_MODELS = "models/With_Ordinal_Encoding/All_Positive/"
MODEL_NAME = "NNClass_35_12_6_Sigmoid_Sigmoid_10"  #InputColumns_EmbeddingSize_HiddenSize_ActivationFunction(Encoder&Hidden)_ActivationFunctionFinal_Dropout%
    
class NN_Classifier(nn.Module):
    def __init__(self, input_columns=35, embedding_size = 12, hidden_size=6, output_size=3, dropout_p=0.1, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        self.classifier = nn.Sequential(
            nn.Linear(input_columns, embedding_size),
            nn.Sigmoid(),
            nn.Dropout(dropout_p),
            nn.Linear(embedding_size, hidden_size),
            nn.Sigmoid(),
            nn.Dropout(dropout_p),
            nn.Linear(hidden_size, output_size),
            nn.Sigmoid()
        )
    
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.classifier(x)

In [21]:
# Create the dataloaders
train_dataloader = create_dataloader(batch_size=128, X=X_train, y=y_train)
valid_dataloader = create_dataloader(batch_size=128, X=X_valid, y=y_valid)

# Create the model
model = NN_Classifier()
model.to(device)

# Define the optimizer and loss function
LEARNING_RATE = 0.01
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
loss_fn = nn.CrossEntropyLoss()  # Cannot use Cross Entropy Loss with Softmax

# Train the model
train_accuracy, train_loss, train_f1, valid_accuracy, valid_loss, valid_f1 = train(train_dataloader=train_dataloader, validation_dataloader=valid_dataloader,
                                                                                   model=model, loss_fn=loss_fn, optimizer=optimizer, epochs=100, patience=7)

# Check final F1 Score
final_valid_dataloader = create_dataloader(batch_size=len(X_valid), X=X_valid, y=y_valid)
test_loss, test_correct, test_f1 = test_loop(final_valid_dataloader, model, loss_fn)
print(f"Final Validation F1 Score is: {test_f1}")

Epoch 1: Train_accuracy: 35.43%, Train_loss: 1.096637 Train_F1_batchwise: 0.36, Validation_accuracy: 41.70%, Validation_loss: 1.088151, Validation_F1_batchwise :0.42
Epoch 2: Train_accuracy: 47.17%, Train_loss: 1.068463 Train_F1_batchwise: 0.47, Validation_accuracy: 42.11%, Validation_loss: 1.078060, Validation_F1_batchwise :0.42
Epoch 3: Train_accuracy: 47.57%, Train_loss: 1.053587 Train_F1_batchwise: 0.47, Validation_accuracy: 42.11%, Validation_loss: 1.077504, Validation_F1_batchwise :0.42
Epoch 4: Train_accuracy: 47.57%, Train_loss: 1.041486 Train_F1_batchwise: 0.48, Validation_accuracy: 42.11%, Validation_loss: 1.075436, Validation_F1_batchwise :0.42
Epoch 5: Train_accuracy: 47.57%, Train_loss: 1.036722 Train_F1_batchwise: 0.47, Validation_accuracy: 42.11%, Validation_loss: 1.075579, Validation_F1_batchwise :0.42
Epoch 6: Train_accuracy: 47.57%, Train_loss: 1.030790 Train_F1_batchwise: 0.48, Validation_accuracy: 42.11%, Validation_loss: 1.064564, Validation_F1_batchwise :0.42
Epoc

In [23]:
# test_dataloader = create_dataloader(batch_size=128, X=X_test, y=y_valid)
test_x_tensor = torch.from_numpy(X_test)
test_x_tensor = test_x_tensor.type(torch.FloatTensor).to(device)
pred = model(test_x_tensor)
y_pred = torch.argmax(pred, dim=1)

In [26]:
import pandas as pd

PRED_CATEGORIES = ['died', 'euthanized', 'lived']
pred_list = [PRED_CATEGORIES[el] for el in y_pred.tolist()]

submission_df = pd.DataFrame()
df_test_raw = pd.read_csv("data/test.csv")
submission_df['id'] = df_test_raw['id']
submission_df['outcome'] = pred_list

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [28]:
DIR = "submissions/With_Ordinal_Encoding/All_Positive/"
submission_df.to_csv(os.path.join(DIR, "NN_35_12_6_Sigmoid_Sigmoid_epoch_60.csv"), index=False)