# Multilayer Perceptron for Predicting Airplane Passenger Satisfaction

## Cell 1: Imports
This cell imports the necessary libraries for retrieving the data from Kaggle, creating the model, and testing/visualizing its accuracy.

In [None]:
# Imports
import kagglehub
import pandas
import torch
import sagemaker

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

from sagemaker.pytorch import PyTorch

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn
import matplotlib.pyplot as plot

## Cell 2: Setup
This cell handles basic setup for creating the model.

In [None]:
# Configure the SageMaker handles shared by the rest of the notebook
prefix = 'sagemaker/airplane-satisfaction'  # S3 key prefix used when uploading data
session = sagemaker.Session()
bucket = session.default_bucket()
role = sagemaker.get_execution_role()  # role handed to the training job

## Cell 3: Retrieve Data
This cell retrieves the dataset from Kaggle and stores the paths for the training and testing portions of the data.

In [None]:
# Download the Airplane Satisfaction dataset from Kaggle
dataset_handle = "teejmahal20/airline-passenger-satisfaction"
path = kagglehub.dataset_download(dataset_handle)

# Remember where the training and testing CSVs sit inside the download
training_path = "{}/train.csv".format(path)
testing_path = "{}/test.csv".format(path)

## Cell 4: Preprocess Data
This cell handles the preprocessing of the data. Categorical columns are encoded into numerical representations, and unneeded columns are removed. The data is then split into inputs and the output, null values are filled in with the column median, and the inputs are scaled.

In [None]:
# Preprocess data

# First create dataframes
training_DF = pandas.read_csv(training_path)
testing_DF = pandas.read_csv(testing_path)

# Encode columns
# Every encoder is fitted on the training split only and then reused on the
# testing split, so both splits share one category-to-integer mapping.
encode_gender = LabelEncoder()
encode_customer_type = LabelEncoder()
encode_travel_type = LabelEncoder()
encode_satisfaction = LabelEncoder()

# (encoder, source column, name of the new encoded column)
label_encodings = [
    (encode_gender, 'Gender', 'Gender Encoded'),
    (encode_customer_type, 'Customer Type', 'Customer Type Encoded'),
    (encode_travel_type, 'Type of Travel', 'Travel Type Encoded'),
    (encode_satisfaction, 'satisfaction', 'Satisfaction Encoded'),
]

for encoder, source_column, encoded_column in label_encodings:
    training_DF[encoded_column] = encoder.fit_transform(training_DF[source_column])
    testing_DF[encoded_column] = encoder.transform(testing_DF[source_column])

# One-hot encode the travel class
training_DF = pandas.get_dummies(training_DF, columns=['Class'], prefix='Class', dtype=int)
testing_DF = pandas.get_dummies(testing_DF, columns=['Class'], prefix='Class', dtype=int)

# Drop unneeded columns
dropColumns = ['Unnamed: 0', 'id', 'Gender', 'Customer Type', 'Type of Travel', 'Inflight wifi service', 'satisfaction']

training_DF = training_DF.drop(columns=dropColumns)
testing_DF = testing_DF.drop(columns=dropColumns)

# Split into X and Y
training_DF_X = training_DF.drop(columns=['Satisfaction Encoded'])
testing_DF_X = testing_DF.drop(columns=['Satisfaction Encoded'])

training_DF_Y = training_DF['Satisfaction Encoded']
testing_DF_Y = testing_DF['Satisfaction Encoded']

# get_dummies ran on each split independently, so force the testing features
# into the training column layout: a dummy column missing from the testing
# split becomes all zeros, and the column order matches what the scaler sees.
testing_DF_X = testing_DF_X.reindex(columns=training_DF_X.columns, fill_value=0)

# Fill in null values
print(training_DF_X.isnull().sum())
print(testing_DF_X.isnull().sum())

# Use the training median for both splits so no statistic is learned from the
# testing data (same rule as the encoders and the scaler).
arrival_delay_median = training_DF_X['Arrival Delay in Minutes'].median()
null_fill_values = {'Arrival Delay in Minutes': arrival_delay_median}

training_DF_X = training_DF_X.fillna(null_fill_values)
testing_DF_X = testing_DF_X.fillna(null_fill_values)

print(training_DF_X.isnull().sum())
print(testing_DF_X.isnull().sum())

# Scale the data (statistics come from the training split only)
scaler = StandardScaler()

scaled_training_X = scaler.fit_transform(training_DF_X)
scaled_testing_X = scaler.transform(testing_DF_X)

## Cell 5: Upload Data
This cell uploads the preprocessed training data to S3 for use in training the model.

In [None]:
# Upload preprocessed data to S3

# Wrap the scaled array in a dataframe again so the feature names are kept
scaled_training_DF_X = pandas.DataFrame(scaled_training_X, columns=training_DF_X.columns)

# Label first, features after: train.py reads column 0 as the target
training_labels = training_DF_Y.reset_index(drop=True)
training_final_DF = pandas.concat([training_labels, scaled_training_DF_X], axis=1)

# Written without header or index so the file holds values only
local_training_file = 'training_final.csv'
training_final_DF.to_csv(local_training_file, header=False, index=False)

training_final_path = session.upload_data(local_training_file, key_prefix=f'{prefix}/train')

## Cell 6: Model Dimensions
This cell calculates and displays the number of input columns and the number of unique outputs for use in creating the model.

In [None]:
# Model dimensions
# First the number of input feature columns, then the number of distinct labels
feature_count = scaled_training_DF_X.shape[1]
class_count = training_DF_Y.nunique()

print(feature_count)
print(class_count)

## Cell 7: Create Training Script
This cell writes the Python script `train.py`, which defines the model along with the functions used to train it and to load it and make predictions once deployed.

In [None]:
%%writefile train.py

import torch
import argparse
import os
import pandas


class MLP(torch.nn.Module):
    """Multilayer perceptron with a single hidden layer.

    The network is Linear -> ReLU -> Linear and returns the raw output of the
    second linear layer.

    Args:
        input_dim: Number of features in each input row.
        hidden_dim: Width of the hidden layer.
        output_dim: Number of values produced per input row.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        # The attribute names become the state_dict keys stored in model.pth,
        # so they have to stay as they are.
        self.fc1 = torch.nn.Linear(input_dim, hidden_dim)
        self.relu = torch.nn.ReLU()
        self.fc2 = torch.nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Run x through fc1, the ReLU activation, and fc2, in that order."""
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)

# Handles the training of the model
def train(args):
    """Fit an MLP on the training CSV and save its weights.

    Reads ``training_final.csv`` (no header row) from ``args.data_dir``, using
    column 0 as the class label and all remaining columns as features. The
    model is built from ``args.input_dim``, ``args.hidden_dim`` and
    ``args.output_dim``, optimised with Adam at learning rate ``args.lr`` for
    ``args.epochs`` epochs, and its state dict is written to
    ``model.pth`` inside ``args.model_dir``.
    """
    csv_path = os.path.join(args.data_dir, 'training_final.csv')
    frame = pandas.read_csv(csv_path, header=None)

    # Column 0 holds the label, everything after it is a feature
    x_train = torch.tensor(frame.iloc[:, 1:].values).float()
    y_train = torch.tensor(frame.iloc[:, 0].values).long()

    model = MLP(args.input_dim, args.hidden_dim, args.output_dim)
    loss_fn = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

    # One optimisation step per epoch, computed on the whole training set at once
    for epoch in range(args.epochs):
        optimizer.zero_grad()
        loss = loss_fn(model(x_train), y_train)
        loss.backward()
        optimizer.step()
        print(f'Epoch {epoch+1}/{args.epochs}, Loss: {loss.item():.4f}')

    weights_path = os.path.join(args.model_dir, 'model.pth')
    torch.save(model.state_dict(), weights_path)

# Loads model
def model_fn(model_dir):
    """Rebuild the trained MLP from ``model.pth`` and return it in eval mode.

    The layer sizes are read from the saved weights rather than hard-coded, so
    the loaded model always matches whatever ``input_dim`` / ``hidden_dim`` /
    ``output_dim`` the training run used.

    Args:
        model_dir: Directory that contains ``model.pth`` as written by
            ``train``.

    Returns:
        The MLP with the saved weights loaded, switched to evaluation mode.
    """
    print("Loading model.")
    # map_location keeps loading working on a CPU-only host even if the
    # weights were saved from another device.
    state_dict = torch.load(os.path.join(model_dir, 'model.pth'), map_location='cpu')

    # torch.nn.Linear stores its weight as (out_features, in_features)
    hidden_dim, input_dim = state_dict['fc1.weight'].shape
    output_dim = state_dict['fc2.weight'].shape[0]

    model = MLP(input_dim=input_dim, hidden_dim=hidden_dim, output_dim=output_dim)
    model.load_state_dict(state_dict)
    model.eval()
    return model

# Predicts outputs for the given input using the model
def predict_fn(input_data, model):
    """Return ``model(input_data)`` computed with gradient tracking disabled.

    Args:
        input_data: Input passed straight to the model.
        model: Callable model, e.g. the object returned by ``model_fn``.
    """
    print("Making a prediction.")
    # Inference only, so skip building the autograd graph
    with torch.no_grad():
        prediction = model(input_data)
    return prediction

# Code block for executing the creation and training of the model
if __name__ == '__main__':
    parser = argparse.ArgumentParser()

    # Hyperparameters; the names match the keys of the estimator's
    # `hyperparameters` dict in the notebook.
    parser.add_argument('--epochs', type=int, default=100)
    parser.add_argument('--lr', type=float, default=0.01)
    parser.add_argument('--input_dim', type=int, default=23)
    parser.add_argument('--hidden_dim', type=int, default=100)
    parser.add_argument('--output_dim', type=int, default=2)

    # Inside a SageMaker training container these environment variables supply
    # the defaults. They are read with .get() so that building the parser does
    # not raise KeyError elsewhere; when a variable is missing, the matching
    # flag becomes mandatory instead.
    model_dir_default = os.environ.get('SM_MODEL_DIR')
    data_dir_default = os.environ.get('SM_CHANNEL_TRAINING')

    parser.add_argument('--model-dir', type=str, default=model_dir_default,
                        required=model_dir_default is None)
    parser.add_argument('--data-dir', type=str, default=data_dir_default,
                        required=data_dir_default is None)

    args = parser.parse_args()
    train(args)

## Cell 8: Run Training Job
This cell starts the model's training job, which then runs through to completion.

In [None]:
# Run training job

# Take the layer sizes from the prepared data instead of copying them by hand,
# so they cannot drift from the uploaded CSV if the preprocessing changes.
input_dim = int(scaled_training_DF_X.shape[1])   # one input per feature column
output_dim = int(training_DF_Y.nunique())        # one output per distinct label

estimator = PyTorch(entry_point='train.py',
                    role=role,
                    instance_count=1,
                    instance_type='ml.m5.large',
                    framework_version='1.8.0',
                    py_version='py3',
                    hyperparameters={
                        'epochs': 100,
                        'lr': 0.01,
                        'input_dim': input_dim,
                        'hidden_dim': 100,
                        'output_dim': output_dim
                    })

# The 'training' channel is where train.py looks for training_final.csv
estimator.fit({'training': training_final_path})

## Cell 9: Deploy Model
This cell handles the deployment of the model allowing future use for predicting from the testing data set.

In [None]:
# Deploy model
# Host the trained estimator on a single-instance endpoint
endpoint_instance_count = 1
endpoint_instance_type = 'ml.t2.medium'

predictor = estimator.deploy(initial_instance_count=endpoint_instance_count,
                             instance_type=endpoint_instance_type)

## Cell 10: Predict test data set results
This cell classifies the passengers in the dataset according to the model. The print statements show a quick check of the first 15 classifications, comparing the real classifications and those chosen by the model.

In [None]:
# Predict results from test set
# Send the scaled test features to the endpoint as a float tensor
input_tensor = torch.tensor(scaled_testing_X).float()
predictions = predictor.predict(input_tensor)

# For each row, the predicted class is the index of the largest model output
prediction_scores = torch.tensor(predictions)
predicted_classes = torch.argmax(prediction_scores, axis=1)

# Quick accuracy check
first_true_labels = testing_DF_Y.values[:15]
first_predicted_labels = predicted_classes.numpy()[:15]
print(f"True labels (first 15): {first_true_labels}")
print(f"Predicted labels (first 15): {first_predicted_labels}")

## Cell 11: Comprehensive Accuracy Test
This cell does a more complete test of the model's accuracy, generating a score for the model's overall accuracy as well as displaying the model's precision, recall, f1-score, and support for each unique output. Finally, the cell generates a confusion matrix to visualize the accuracy of the model for each unique output.

In [None]:
# Test exact accuracy
# Display names for the two encoded classes, used in the report and the plot
target_names = ['Dissatisfied', 'Satisfied']

accuracy = accuracy_score(testing_DF_Y, predicted_classes)
print(f"Overall Model Accuracy: {accuracy:.2%}\n")

print("Classification Report:")
report = classification_report(testing_DF_Y, predicted_classes, target_names=target_names)
print(report)

print("Confusion Matrix:")
cm = confusion_matrix(testing_DF_Y, predicted_classes)

# Draw the matrix as an annotated heatmap
plot.figure(figsize=(8, 6))
seaborn.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=target_names,
    yticklabels=target_names,
)
plot.xlabel('Predicted Label')
plot.ylabel('True Label')
plot.show()

## Cell 12: Delete the Endpoint
This cell deletes the endpoint and concludes the deployment of the model.

In [None]:
# Tear down the endpoint created by the deploy cell
predictor.delete_endpoint()
print("Deleted endpoint")