# Setup

## Loading the data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, MinMaxScaler

from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

import random


# Read the training CSV and the test CSV (in that order) into DataFrames
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('./ae-data/training_set.csv', './ae-data/test_set.csv')
)

## Altered Dictionary

The "Harder to explain" variables are:

IMD_Decile_From_LSOA - IMD Decile Description. Goes from most deprived (1) to least deprived (10). Best to set to 5 if missing for now. The IMD Overall Ranking to identify which one of ten groups a Super Output Area belongs to, from most deprived through to least deprived.

Sex - Based on the data dictionary, 1 for Male and 2 for Female, 9 for Indeterminate. Make 0 (unknown) for missing.

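AE_HRG - No idea what this represents, so it is left unexplained here. Not sure if it's important; the column itself is kept (missing values become "Nothing" and it is one-hot encoded below).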


# Preprocessing

## Handle missing values

In [None]:
## Removing columns with an excessive share of null values
## THIS CELL IS VERY MUCH EDITABLE

# Fraction of nulls above which a column is considered too sparse to keep
missing_threshold = 0.5

# Per-column share of missing values in the training set
null_share = train.isnull().mean()
too_sparse = null_share > missing_threshold

# 'Length_Of_Stay_Days' is exempt from the sparsity rule: after some
# exploratory analysis, when it is present it is a good (perfect?) indicator
# of whether someone has been admitted.
keep_despite_sparsity = train.columns == 'Length_Of_Stay_Days'

# 'AE_Arrive_HourOfDay' is always dropped, however complete it is
always_drop = train.columns == 'AE_Arrive_HourOfDay'

columns_to_drop_train = train.columns[(too_sparse & ~keep_despite_sparsity) | always_drop]

print(columns_to_drop_train)

# Remove the same columns from the training set and from the test set
train_drop = train.drop(columns=columns_to_drop_train)
test_drop = test.drop(columns=columns_to_drop_train)

In [None]:
# A few acceptable data transformations, applied identically to the training
# and test frames.

# Fill value for 'Provider_Patient_Distance_Miles': the training-set mean,
# truncated to an int. It is computed once, before any filling, and reused for
# the test set so no test-set statistics are involved.
distance_fill = int(train_drop['Provider_Patient_Distance_Miles'].mean())

for frame in (train_drop, test_drop):
    # Missing 'Sex' becomes 0 (unknown)
    frame['Sex'] = frame['Sex'].fillna(0)

    # Missing distances become the (truncated) training-set mean
    frame['Provider_Patient_Distance_Miles'] = frame['Provider_Patient_Distance_Miles'].fillna(distance_fill)

    # Missing 'IMD_Decile_From_LSOA' becomes 5, the median value of the column
    frame['IMD_Decile_From_LSOA'] = frame['IMD_Decile_From_LSOA'].fillna(5)

    # Missing 'Length_Of_Stay_Days' becomes 0. Thought it would make more sense
    # for the stay not to have been entered in the database than for the
    # patient to have stayed without it being recorded.
    # The column is filled from itself: it was previously overwritten with the
    # values of 'Provider_Patient_Distance_Miles', which discarded the feature.
    frame['Length_Of_Stay_Days'] = frame['Length_Of_Stay_Days'].fillna(0)

    # Missing 'AE_HRG' becomes the placeholder category "Nothing"
    frame['AE_HRG'] = frame['AE_HRG'].fillna('Nothing')


In [None]:
# ## Data removals
# ## Planning to remove all null values for Arrival Hour. This is because it feels like a pretty important feature and there are only a couple hundred missing, doubt it will sway things too much.

# train_drop = train_drop.dropna(subset=['AE_Arrive_HourOfDay'])

# test_drop = test_drop.dropna(subset=['AE_Arrive_HourOfDay'])

In [None]:
# Print the per-column count of missing values, first for the training set
# and then for the test set
for remaining_frame in (train_drop, test_drop):
    print(remaining_frame.isnull().sum())

## Train/Val Splitting

In [None]:
## Splitting the labelled data into training, validation and internal test sets

# Share of the labelled data held out as the internal test set
test_size = 0.20
# Share of what remains after that hold-out that is used for validation
validation_size = 0.25

# Target variable
y_train = train_drop['Admitted_Flag']

# Features: everything except the target and the 'ProvID' / 'Record_ID' columns
X_train = train_drop.drop(columns=['Admitted_Flag', 'ProvID', 'Record_ID'])

# Keep the 'Record_ID' values of the external test set, then remove the same
# two identifier columns from its features
test_record_id = test_drop['Record_ID']
output_test = test_drop.drop(columns=['ProvID', 'Record_ID'])

# First split: carve the internal test set out of the labelled data
X_train, X_test, y_train, y_test = train_test_split(
    X_train, y_train, test_size=test_size, random_state=42
)

# Second split: carve the validation set out of what is left
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=validation_size, random_state=42
)

## Categorical Data Encoding

The categorical variables in our dataset are:

- Age Band
- AE_Arrive_HourOfDay
- AE_HRG

We will perform one-hot encoding on the following variables:

- AE_Arrive_HourOfDay (currently dropped in the missing-value step above, so it is not actually encoded)
- AE_HRG

And we will perform ordinal encoding on the following variables (because we want to preserve the order of the categories, since they aren't really independent of each other):

- Age Band (There is a natural ordering to the age bands. If this hurts classification downstream, we should try making this one-hot as well)

In [None]:
# One-hot encode 'AE_HRG' in each split.
# ('AE_Arrive_HourOfDay' used to be encoded here as well:
#  pd.get_dummies(train_drop, columns=['AE_Arrive_HourOfDay','AE_HRG']))
X_train = pd.get_dummies(X_train, columns=['AE_HRG'])
X_test = pd.get_dummies(X_test, columns=['AE_HRG'])
X_val = pd.get_dummies(X_val, columns=['AE_HRG'])

# get_dummies only creates columns for the categories present in the frame it
# is given, so the three splits can end up with different dummy columns.
# Align the test and validation frames to the training columns: categories
# missing from a split become all-zero columns, and categories never seen in
# training are dropped.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)
X_val = X_val.reindex(columns=X_train.columns, fill_value=0)

# Ordinal-encode 'Age_Band'; the encoder is fitted on the training split only.
# NOTE(review): OrdinalEncoder orders categories by sorted label value, which
# may not match the natural age order - confirm against the band labels.
encoder = OrdinalEncoder()
encoder.fit(X_train[['Age_Band']])

X_train['Age_Band'] = encoder.transform(X_train[['Age_Band']])
X_test['Age_Band'] = encoder.transform(X_test[['Age_Band']])
X_val['Age_Band'] = encoder.transform(X_val[['Age_Band']])


## Date Processing

In [None]:
# Replace the raw 'AE_Arrive_Date' column of every split with numeric
# year / month / day / day-of-week columns
for split_frame in (X_train, X_test, X_val):
    # Parse the raw values into datetimes
    arrival = pd.to_datetime(split_frame['AE_Arrive_Date'])

    # Derive the individual date components
    split_frame['Arrival_Year'] = arrival.dt.year
    split_frame['Arrival_Month'] = arrival.dt.month
    split_frame['Arrival_Day'] = arrival.dt.day
    split_frame['Arrival_DayOfWeek'] = arrival.dt.dayofweek  # Monday=0, Sunday=6

    # The original date column is no longer needed
    split_frame.drop(columns='AE_Arrive_Date', inplace=True)

## Feature Scaling

In [None]:
# Choose standardization or min-max scaling.
# Tree-based models (decision trees, random forests) are not affected by the
# scale of the data, so they do not need this step; logistic regression, SVM,
# perceptron, kNN, neural networks, etc. do.

# Alternative: scaler = StandardScaler() - resulting distribution has a mean
# of 0 and a standard deviation of 1
# MinMaxScaler transforms the data to a range between 0 and 1.
# The scaler is fitted on the training split only.
scaler = MinMaxScaler().fit(X_train)

# Apply the fitted scaler to every split and wrap the result back into a
# DataFrame with the original column names (the row index is not carried over)
X_train, X_test, X_val = (
    pd.DataFrame(scaler.transform(split_frame), columns=split_frame.columns)
    for split_frame in (X_train, X_test, X_val)
)

## RUN, THAT (HYPERPARAMETER GRID SEARCH) MODEL

In [None]:
# # Define hyperparameter grids for each model
# param_grid_lr = {
#     'lr__C': [5, 10, 15, 20],
#     'lr__penalty': ['l1', 'l2'],
#     'lr__solver': ['liblinear']
# }

# param_grid_rf = {
#     'rf__n_estimators': [200, 250, 300],
#     'rf__max_depth': [None, 8, 10, 12, 14],
#     'rf__min_samples_split': [2, 5, 10],
#     'rf__min_samples_leaf': [1, 2, 4]
# }

# param_grid_svm = {
#     'svm__C': [5, 10, 15, 20],
#     'svm__kernel': ['linear', 'rbf', 'poly'],
#     'svm__degree': [2, 3, 4]
# }


# # Random Forests pipeline
# pipeline_rf = Pipeline([
#     ('rf', RandomForestClassifier(random_state=42))
# ])

# # SVM pipeline
# pipeline_svm = Pipeline([
#     ('svm', SVC(random_state=42))
# ])

# # Logistic Regression pipeline
# pipeline_lr = Pipeline([
#     ('lr', LogisticRegression(random_state=42))
# ])

# param_grids = [param_grid_rf, param_grid_svm, param_grid_lr]
# pipelines = [pipeline_rf, pipeline_svm, pipeline_lr]
# pipeline_names = ['Random Forests', 'SVM', 'Logistic Regression']

# for i, (pipeline, param_grid) in enumerate(zip(pipelines, param_grids)):
#     grid_search = GridSearchCV(pipeline, param_grid, scoring='accuracy', cv=5, n_jobs=-1)
#     # Fit the pipeline on the training data
#     grid_search.fit(X_train, y_train)

#     best_params = grid_search.best_params_
#     best_pipeline = grid_search.best_estimator_   

#     # Display the best hyperparameters
#     print(f"Best hyperparameters for {pipeline_names[i]}: {best_params}")
     
#     # Make predictions on the validation data
#     y_val_pred = best_pipeline.predict(X_val)
    
#     # Evaluate the predictions
#     accuracy = accuracy_score(y_val, y_val_pred)
    
#     print(f'{pipeline_names[i]} validation accuracy: {accuracy}')


## Slightly more advanced models

### Neural Network Class Functions

In [None]:
class FeedforwardNeuralNetModel(nn.Module):
    """Fully connected network: input_dim -> 128 -> 64 -> 32 -> 1.

    Each hidden layer is followed by a LeakyReLU (negative slope 0.01); the
    single output unit is passed through a sigmoid.
    """

    def __init__(self, input_dim):
        super().__init__()

        # Hidden block 1: input_dim --> 128
        self.fc1 = nn.Linear(input_dim, 128)
        self.relu1 = nn.LeakyReLU(negative_slope=0.01)

        # Hidden block 2: 128 --> 64
        self.fc2 = nn.Linear(128, 64)
        self.relu2 = nn.LeakyReLU(negative_slope=0.01)

        # Hidden block 3: 64 --> 32
        self.fc3 = nn.Linear(64, 32)
        self.relu3 = nn.LeakyReLU(negative_slope=0.01)

        # Readout: 32 --> 1, squashed by a sigmoid
        self.fc4 = nn.Linear(32, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Apply the three hidden blocks, then the sigmoid readout, to ``x``."""
        hidden = self.relu1(self.fc1(x))
        hidden = self.relu2(self.fc2(hidden))
        hidden = self.relu3(self.fc3(hidden))
        return self.sigmoid(self.fc4(hidden))

### Run the Neural Nets

In [None]:
# Use the Apple-silicon MPS backend when it is available, otherwise the CPU.
# (Accessed through the documented `torch.backends.mps` path rather than the
# accidental `torch.torch` attribute.)
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

In [None]:
# Candidate hyperparameter values for the neural network

# Optimizer step sizes
learning_rates = [
    1e-3,
    1e-4,
    1e-5,
]

# Mini-batch sizes
batch_sizes = [
    32,
    64,
    128,
    256,
]

# Optimizer classes
optimizers = [
    optim.Adam,
    optim.RMSprop,
    optim.SGD,
]

# Epoch counts
epochs_range = [
    50,
    200,
    500,
]


In [None]:
# Convert the variables to work with Torch

# A default model / loss / optimizer, sized by the number of feature columns
input_dim = X_train.shape[1]
model = FeedforwardNeuralNetModel(input_dim)

criterion = nn.BCELoss()  # Binary Cross Entropy Loss for binary classification
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Features become float32 tensors; targets are turned into a column
# (one row per sample, one value per row) to match the model's output shape
X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).unsqueeze(1)

X_val_tensor = torch.tensor(X_val.values, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val.values, dtype=torch.float32).unsqueeze(1)

# Pair features with targets so a DataLoader can batch them together
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)

In [None]:
# Define a function to train and validate the model
def train_and_validate(model, criterion, optimizer, train_loader, val_loader, epochs, trial):
    """Train ``model`` and return the best validation accuracy it reached.

    Runs up to ``epochs`` passes over ``train_loader``. After every pass the
    model is evaluated on ``val_loader``. Training stops early once the average
    validation loss has failed to improve for more than ``patience`` (10)
    consecutive epochs.

    Args:
        model: network to train; its outputs are passed to ``criterion`` and
            thresholded at 0.5 to obtain predictions.
        criterion: loss callable invoked as ``criterion(outputs, targets)``.
        optimizer: optimizer used to update the model after every batch.
        train_loader: sized iterable yielding ``(X_batch, y_batch)`` pairs.
        val_loader: sized iterable yielding ``(X_batch, y_batch)`` pairs.
        epochs: maximum number of epochs to run.
        trial: identifier that is only used in the final summary printout.

    Returns:
        The highest per-epoch validation accuracy observed
        (``correct predictions / samples``), or ``0.0`` if no epoch ran.
    """
    # Move batches to wherever the model's parameters live instead of relying
    # on a notebook-level `device` global; fall back to the CPU for a model
    # without parameters.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    best_accuracy = 0.0
    best_epoch = 0
    best_val_loss = float('inf')
    patience_counter = 0
    patience = 10

    for epoch in range(epochs):
        # ---- Training pass ----
        model.train()  # Set model to training mode
        total_train_loss = 0

        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(run_device), y_batch.to(run_device)

            # Forward pass
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_train_loss += loss.item()

        # Loss is averaged over batches, not over samples
        avg_train_loss = total_train_loss / len(train_loader)
        print(f'Epoch [{epoch+1}/{epochs}], Average Training Loss: {avg_train_loss:.4f}')

        # ---- Validation pass ----
        model.eval()  # Set model to evaluation mode
        total_val_loss = 0
        correct_preds = 0
        total_preds = 0

        with torch.no_grad():
            for X_val_batch, y_val_batch in val_loader:
                X_val_batch, y_val_batch = X_val_batch.to(run_device), y_val_batch.to(run_device)

                val_outputs = model(X_val_batch)
                val_loss = criterion(val_outputs, y_val_batch)
                total_val_loss += val_loss.item()

                # Outputs above 0.5 are counted as the positive class
                val_preds = (val_outputs > 0.5).float()
                correct_preds += (val_preds == y_val_batch).sum().item()
                total_preds += y_val_batch.size(0)

        avg_val_loss = total_val_loss / len(val_loader)
        avg_val_accuracy = correct_preds / total_preds

        # Record the accuracy BEFORE deciding whether to stop, so the metrics
        # of the epoch that triggers early stopping are not discarded.
        if avg_val_accuracy > best_accuracy:
            best_accuracy = avg_val_accuracy
            best_epoch = epoch + 1

        print(f'Epoch [{epoch+1}/{epochs}], Average Validation Loss: {avg_val_loss:.4f}, Average Validation Accuracy: {avg_val_accuracy:.4f}')

        # Check if validation loss has improved
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            patience_counter = 0
        else:
            patience_counter += 1

        # Check if we have waited for too long without improvement.
        # Sign of overfitting
        if patience_counter > patience:
            # Report the 1-based epoch number, like every other message here
            print(f"Early stopping at epoch {epoch+1}. Best loss was {best_val_loss:.4f}")
            break

    print(f'Best Validation Accuracy: {best_accuracy:.4f} at epoch {best_epoch} for trial {trial}')

    return best_accuracy



In [None]:
# Random search over the hyperparameter candidates: each trial draws one
# value per hyperparameter, trains a fresh model and keeps the settings of
# the trial with the highest validation accuracy.
best_accuracy = 0
best_params = {}
NUM_TRAILS = 10
input_dim = X_train.shape[1]

for t in range(NUM_TRAILS):
    # Draw this trial's configuration
    lr = random.choice(learning_rates)
    batch_size = random.choice(batch_sizes)
    optimizer_choice = random.choice(optimizers)
    epochs = random.choice(epochs_range)

    trial_params = {
        "Learning Rate": lr,
        "Batch Size": batch_size,
        "Optimizer": optimizer_choice,
        "Epochs": epochs,
    }

    # Only the training batches are shuffled
    train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False)

    # Fresh model, optimizer and loss for every trial
    model = FeedforwardNeuralNetModel(input_dim).to(device)
    optimizer = optimizer_choice(model.parameters(), lr=lr)
    criterion = nn.BCELoss()

    accuracy = train_and_validate(model, criterion, optimizer, train_loader, val_loader, epochs, t)

    # Remember the configuration only when it strictly beats the best so far
    if accuracy > best_accuracy:
        best_accuracy, best_params = accuracy, trial_params

print(f"Best accuracy: {best_accuracy}")
print(f"Best parameters: {best_params}")



In [None]:
# Show the hyperparameter combination stored in `best_params`
print(best_params)

Best for Relu activation forward NN : {'Learning Rate': 0.0001, 'Batch Size': 64, 'Optimizer': <class 'torch.optim.rmsprop.RMSprop'>, 'Epochs': 500}

Best for LeakyRelu : 