In [None]:
import torch
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from torcheval.metrics import BinaryAccuracy

In [None]:
# Load the train and test splits from their CSV files (train first, then test)
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ('data/train.csv', 'data/test.csv')
)

In [None]:
# Display the freshly loaded training dataframe as this cell's output
train_df

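Before any preprocessing we inspect the data: a first glimpse of the rows, summary statistics of the numerical columns, the number of missing values, and the distinct values of the non-numerical columns.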

In [None]:
# First glimpse of the data.
# Ending the cell with the bare expression lets Jupyter render the frame as a
# rich table instead of the plain-text dump that print() produces.
train_df.head()

In [None]:
# Summary statistics of the training dataframe (describe() output),
# shown through Jupyter's rich display rather than print().
train_df.describe()

In [None]:
# Check for NaN and inf values in BOTH datasets, one row per column.
# isna() only flags NaN/None, so +/-inf entries in the numerical columns are
# counted separately. A column that does not exist in one of the frames
# (or is not numerical, for the *_inf counts) shows up as NaN in this summary.
data_quality = pd.concat(
    {
        'train_missing': train_df.isna().sum(),
        'test_missing': test_df.isna().sum(),
        'train_inf': train_df.select_dtypes('number').abs().eq(float('inf')).sum(),
        'test_inf': test_df.select_dtypes('number').abs().eq(float('inf')).sum(),
    },
    axis=1,
)
data_quality

In [None]:
# For every non-numerical feature, print its distinct values followed by how many there are
categories = ['PassengerId', 'CryoSleep', 'Cabin', 'HomePlanet', 'Destination', 'Name', 'Transported']
for category, column in train_df[categories].items():
    distinct_values = column.unique()
    print(category, distinct_values)
    print(len(distinct_values))

In [None]:
# Split the 'Cabin' category into its components
def split_cabin(df):
    """Return a new frame in which 'Cabin' is replaced by 'deck', 'num' and 'side'.

    Each 'Cabin' value is split on '/'; the three resulting pieces are appended
    as the columns 'deck', 'num' and 'side', and 'Cabin' itself is dropped.
    Rows with a missing 'Cabin' get missing values in all three new columns.

    A frame without a 'Cabin' column is returned unchanged, so re-running this
    cell after the split has already happened does not raise.
    """
    if 'Cabin' not in df.columns:
        return df
    parts = df['Cabin'].str.split('/', expand=True)
    # Raises if the split did not produce exactly three columns
    parts.columns = ['deck', 'num', 'side']
    return pd.concat([df.drop(columns='Cabin'), parts], axis=1)


# Same transformation for both datasets
train_df = split_cabin(train_df)
test_df = split_cabin(test_df)

In [None]:
# Function to complete missing values with adequate attributes
def fill_missing_values(df):
    """Return a copy of ``df`` in which every missing value has a default.

    Defaults applied per column:
      * 'HomePlanet'            -> 'Earth'
      * 'Destination'           -> 'TRAPPIST-1e'
      * 'CryoSleep', 'VIP'      -> False (columns are returned with dtype bool)
      * 'deck', 'num', 'side'   -> 'Unknown'
      * 'Age'                   -> median of the 'Age' column of ``df``
      * every remaining column  -> 0

    The input frame is not modified.
    """
    df = df.copy()  # work on a copy so the caller's frame stays untouched
    df['HomePlanet'] = df['HomePlanet'].fillna('Earth')
    # Missing flags must be filled BEFORE the cast to bool: NaN is truthy, so
    # astype(bool) on the raw column would turn every missing entry into True.
    # The nullable 'boolean' dtype keeps the gaps as <NA> until fillna runs.
    df['CryoSleep'] = df['CryoSleep'].astype('boolean').fillna(False).astype(bool)
    df['deck'] = df['deck'].fillna('Unknown')
    df['num'] = df['num'].fillna('Unknown')
    df['side'] = df['side'].fillna('Unknown')
    df['Destination'] = df['Destination'].fillna('TRAPPIST-1e')
    df['Age'] = df['Age'].fillna(df['Age'].median())
    df['VIP'] = df['VIP'].astype('boolean').fillna(False).astype(bool)
    # Whatever is still missing (the columns not handled above) becomes 0
    df = df.fillna(0)
    return df

In [None]:
# Run the missing-value helper over both dataframes (train first, then test)
train_df, test_df = (fill_missing_values(frame) for frame in (train_df, test_df))

Neural nets can only process numerical data, but there are several columns that contain categorical data.

We will map every categorical value to its index in the sorted array of unique values of its column.

In [None]:
# Build one LabelEncoder per non-numerical column.
# Train and test must share the same value -> index mapping, so each encoder
# is fitted on the values of both dataframes stacked on top of each other.
label_cols = ['HomePlanet', 'CryoSleep', 'deck', 'num', 'side', 'Destination', 'VIP']
combined_data = pd.concat([frame[label_cols] for frame in (train_df, test_df)], axis=0)
label_encoders = {
    col: LabelEncoder().fit(values) for col, values in combined_data.items()
}

In [None]:
# Replace each categorical column by its encoded values, in train and test alike
for col, encoder in label_encoders.items():
    for frame in (train_df, test_df):
        frame[col] = encoder.transform(frame[col])

In [None]:
# Display the training dataframe again, now that the label encoders have been applied
train_df

In [None]:
# Training features: drop the identifiers (PassengerId, Name) and the label
# column, since the label is the value we want to predict
X_train = train_df.drop(columns=['PassengerId', 'Name', 'Transported'])
# Training labels: the Transported column alone, cast to integers
Y_train = train_df['Transported'].astype(int)
# Test features: only the identifiers are dropped - the test dataset carries
# no labels, they are exactly what we will try to predict
x_test = test_df.drop(columns=['PassengerId', 'Name']).to_numpy()

In [None]:
# Hold out 20% of the training data as a validation subset used to monitor the
# model; random_state is fixed so the same split is produced on every run
x_train, x_val, y_train, y_val = train_test_split(
    X_train.to_numpy(),
    Y_train.to_numpy(),
    test_size=0.2,
    random_state=42,
)

In [None]:
# Dimensions of the 3-layer neural net: input -> hidden -> output
# Input layer: one unit per feature, i.e. the last dimension of the training data
input_size = x_train.shape[-1]
# Hidden layer: arbitrary size - free to tune
hidden_size = 5
# Output layer: tied to the number of classes; a True/False target needs a single output value
output_size = 1

In [None]:
# Seed PyTorch's random number generator so the randomly initialised layer
# weights - and therefore the training results - are the same on every run
torch.manual_seed(42)

# Generate the neural net model: linear -> sigmoid -> linear -> sigmoid.
# The final sigmoid squashes the single output into (0, 1), which is the
# probability-like input that BCELoss expects.
model = torch.nn.Sequential(
    torch.nn.Linear(input_size, hidden_size),
    torch.nn.Sigmoid(),
    torch.nn.Linear(hidden_size, output_size),
    torch.nn.Sigmoid()
)

In [None]:
# Training setup: step size, loss function, accuracy metric and optimizer
learning_rate = 0.01
# Binary cross-entropy is the loss being minimised
loss_func = torch.nn.BCELoss()
# Accuracy metric, reported for information only
metric = BinaryAccuracy()
# Plain stochastic gradient descent over all model parameters
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

In [None]:
# Actual training loop: every iteration uses the whole training set at once
n_iterations = 5000  # total number of optimisation steps
eval_every = 100     # run a validation step once every this many iterations

# The data does not change between iterations, so convert it to tensors once
# instead of rebuilding them on every pass through the loop
x_train_t = torch.from_numpy(x_train).type(torch.FloatTensor)
y_train_t = torch.tensor(y_train, dtype=torch.float32)
x_val_t = torch.from_numpy(x_val).type(torch.FloatTensor)
y_val_t = torch.tensor(y_val, dtype=torch.float32)
y_val_labels = torch.tensor(y_val)

for itr in range(n_iterations):
    # Set the model in training mode
    model.train()
    # Forward propagate the training input through the entire network.
    # squeeze(-1) drops only the trailing output dimension, so the result
    # keeps the same shape as the targets even for a single-sample batch.
    out = model(x_train_t).squeeze(-1)
    # Compute loss function
    loss = loss_func(out, y_train_t)
    # Reset previously memorized gradients (by default, they accumulate for every iteration)
    optimizer.zero_grad()
    # Compute the gradients
    loss.backward()
    # Perform optimization step - update the weights according to the computed gradients
    optimizer.step()

    # Periodically run a validation step to see how our model performs
    if itr % eval_every == 0:
        # Set the model in evaluation mode
        model.eval()
        # No gradients are needed for evaluation, so skip building the autograd graph
        with torch.no_grad():
            # Forward propagate the validation input through the entire network
            val_out = model(x_val_t).squeeze(-1)
            # Compute the validation loss just for information
            val_loss = loss_func(val_out, y_val_t)
        # Clear the metric first: without the reset it would keep accumulating
        # the predictions of all earlier checkpoints and report a blended accuracy
        metric.reset()
        metric.update(val_out, y_val_labels)
        print(metric.compute())
        print(val_loss)


In [None]:
# Report the accuracy computed from the state currently held by `metric`
metric.compute()