In [1]:
!pip install torchmetrics

Collecting torchmetrics
  Downloading torchmetrics-1.2.0-py3-none-any.whl (805 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m805.2/805.2 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
Collecting lightning-utilities>=0.8.0 (from torchmetrics)
  Downloading lightning_utilities-0.9.0-py3-none-any.whl (23 kB)
Installing collected packages: lightning-utilities, torchmetrics
Successfully installed lightning-utilities-0.9.0 torchmetrics-1.2.0


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data_utils
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torchmetrics import Accuracy, AUROC, Precision, Recall, F1Score
from sklearn.model_selection import train_test_split
from collections import OrderedDict

In [3]:
# Put the data in your Google Drive
# You can get the data here: https://www.kaggle.com/competitions/titanic/data
# Mounting exposes the Drive contents under /content/drive, which the read_csv paths in the next cell rely on
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Locations of the Kaggle Titanic CSVs on the mounted Drive
TRAIN_CSV = '/content/drive/MyDrive/TitanicData/train.csv'
TEST_CSV = '/content/drive/MyDrive/TitanicData/test.csv'

# Load both splits as DataFrames
X_train, X_test = pd.read_csv(TRAIN_CSV), pd.read_csv(TEST_CSV)

In [5]:
X_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [6]:
# Clean up the data a bit: derive a Title from each passenger name and a numeric family size.
# If the title is a rare one, put it in a single 'Rare' class.
RARE_TITLES = ['Lady', 'the Countess', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona', 'Ms', 'Mme', 'Mlle']


def extract_title(name):
    """Return the title from a name shaped like "Braund, Mr. Owen Harris".

    The title is the text between the first comma and the period that follows it,
    with surrounding whitespace removed.
    """
    return name.split(',')[1].split('.')[0].strip()


def add_title_and_family_size(frame):
    """Add the 'Title' and 'FamilyS' columns to ``frame`` in place.

    'Title' is extracted from 'Name', with every entry of RARE_TITLES collapsed to 'Rare'.
    'FamilyS' is SibSp + Parch + 1 (the passenger is counted too).
    """
    # A plain list is assigned positionally, so the titles land on the right rows
    # whatever the frame's index looks like (a pd.Series would be aligned on index labels).
    frame['Title'] = [extract_title(name) for name in frame['Name']]
    frame['Title'] = frame['Title'].replace(RARE_TITLES, 'Rare')
    frame['FamilyS'] = frame['SibSp'] + frame['Parch'] + 1


# The same feature engineering is applied to both splits
add_title_and_family_size(X_train)
add_title_and_family_size(X_test)

In [7]:
X_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,FamilyS
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr,2
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs,2
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Mrs,2
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Mr,1


In [8]:
# Turn a numeric family size into a coarse category label
def family(x):
    """Return the size bucket for a family of ``x`` people.

    Below 2 is 'Single', exactly 2 is 'Couple', anything up to 4 is 'InterM',
    and every other value is 'Large'.
    """
    if x < 2:
        return 'Single'
    if x == 2:
        return 'Couple'
    return 'InterM' if x <= 4 else 'Large'

# Replace the numeric family size with its category label in both splits
for _frame in (X_train, X_test):
    _frame['FamilyS'] = _frame['FamilyS'].apply(family)

In [9]:
# Fill missing data: most frequent value for Embarked, median for Age and Fare.
# The filled column is assigned back to the frame instead of calling
# fillna(..., inplace=True) on the selected column. That form is chained assignment:
# pandas warns about it, and with Copy-on-Write it no longer updates the parent frame.
X_train['Embarked'] = X_train['Embarked'].fillna(X_train['Embarked'].mode()[0])
X_test['Embarked'] = X_test['Embarked'].fillna(X_test['Embarked'].mode()[0])
X_train['Age'] = X_train['Age'].fillna(X_train['Age'].median())
X_test['Age'] = X_test['Age'].fillna(X_test['Age'].median())
X_test['Fare'] = X_test['Fare'].fillna(X_test['Fare'].median())
# NOTE(review): the test split is imputed with its own statistics rather than the
# training split's - confirm that this is intended.

In [10]:
# Columns that are not fed to the model; Name, SibSp and Parch have already been
# turned into the Title and FamilyS features above
UNUSED_COLUMNS = ['PassengerId', 'Cabin', 'Name', 'SibSp', 'Parch', 'Ticket']
X_train = X_train.drop(columns=UNUSED_COLUMNS)
X_test = X_test.drop(columns=UNUSED_COLUMNS)

In [11]:
X_train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,Title,FamilyS
0,0,3,male,22.0,7.25,S,Mr,Couple
1,1,1,female,38.0,71.2833,C,Mrs,Couple
2,1,3,female,26.0,7.925,S,Miss,Single
3,1,1,female,35.0,53.1,S,Mrs,Couple
4,0,3,male,35.0,8.05,S,Mr,Single


In [12]:
# Insert one-hot encoding for categorical variables
CATEGORICAL_COLUMNS = ['Pclass', 'Sex', 'Embarked', 'Title', 'FamilyS']

# The dummy dtype is pinned to 0/1 integers. Newer pandas versions default to boolean
# dummies; mixed with the float Age/Fare columns that would make `.values` an object
# array, which torch.from_numpy cannot convert later in the notebook.
X_train = pd.get_dummies(X_train, columns=CATEGORICAL_COLUMNS, dtype='uint8')
X_test = pd.get_dummies(X_test, columns=CATEGORICAL_COLUMNS, dtype='uint8')

# The two splits are encoded independently, so make the test features follow the
# training feature columns exactly (same set, same order). A category that never
# occurs in the test split becomes an all-zero column.
feature_columns = [col for col in X_train.columns if col != 'Survived']
X_test = X_test.reindex(columns=feature_columns, fill_value=0)

In [13]:
# Look at the dummy variables
X_train.head()

Unnamed: 0,Survived,Age,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Rare,FamilyS_Couple,FamilyS_InterM,FamilyS_Large,FamilyS_Single
0,0,22.0,7.25,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,0
1,1,38.0,71.2833,1,0,0,1,0,1,0,0,0,0,0,1,0,1,0,0,0
2,1,26.0,7.925,0,0,1,1,0,0,0,1,0,1,0,0,0,0,0,0,1
3,1,35.0,53.1,1,0,0,1,0,0,0,1,0,0,0,1,0,1,0,0,0
4,0,35.0,8.05,0,0,1,0,1,0,0,1,0,0,1,0,0,0,0,0,1


In [14]:
# Get the data as NumPy arrays.
# The label is selected by name rather than by position, so features and target stay
# correct even if 'Survived' is not the first column of the frame.
X_train, Y_train = X_train.drop(columns='Survived').values, X_train['Survived'].values
X_test = X_test.values

In [15]:
# Train / Val split: hold out 10% of the labelled rows for validation.
# random_state pins the shuffle so the same rows land in train and validation on every run.
SPLIT_SEED = 1
X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size=0.1, random_state=SPLIT_SEED)

In [16]:
class MLP(nn.Module):
    """Two-layer perceptron for binary classification.

    Computes ``sigmoid(theta2(relu(theta1(x))))`` and returns one probability per input row.

    Args:
        in_features: number of input features per row (defaults to 19).
        hidden_features: width of the hidden layer (defaults to 9).
    """

    def __init__(self, in_features=19, hidden_features=9):
        super().__init__()

        # Note: these layers have a bias term
        self.theta1 = nn.Linear(in_features, hidden_features)
        self.theta2 = nn.Linear(hidden_features, 1)

        # Parameter-free activations, built once here instead of on every forward call
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Map a batch ``x`` (= a^{[0]}) to probabilities a^{[2]}, one output column per row."""
        # z^{[1]}: linear function of a^{[0]}
        z1 = self.theta1(x)

        # a^{[1]}: ReLU of z^{[1]}
        a1 = self.relu(z1)

        # z^{[2]}: linear function of a^{[1]}
        z2 = self.theta2(a1)

        # a^{[2]}: sigmoid of z^{[2]}
        return self.sigmoid(z2)

In [17]:
# The arrays have to become float32 tensors for the model and loss cells below to work
# The feature arrays have shape (N, 19): N rows and 19 features per row
def _float32_tensor(array):
    """Wrap a NumPy array as a torch tensor and cast it to float32."""
    return torch.from_numpy(array).to(torch.float32)


X_train, X_val = _float32_tensor(X_train), _float32_tensor(X_val)
Y_train, Y_val = _float32_tensor(Y_train), _float32_tensor(Y_val)
X_test = _float32_tensor(X_test)

In [18]:
# Seed torch's random number generator before any model is built
torch.manual_seed(1)

# Load the model as we did for Logistic Regression
# NOTE(review): this instance is replaced by the nn.Sequential model right below and is never trained.
# Building it presumably still draws from the seeded generator, so removing the line would
# change the Sequential model's initial weights - confirm before deleting.
model = MLP()

# An alternative way to specify the model
# NOTE(review): the hidden layer here has 8 units, whereas the MLP class above uses 9.
model = nn.Sequential(OrderedDict([
          ('ff1', nn.Linear(19, 8)),
          ('relu', nn.ReLU()),
          ('ff2', nn.Linear(8, 1)),
          ('sigmoid', nn.Sigmoid())
        ]))

# The loss we use; note this loss is -[y * log(a) + (1-y) * log(1-a)] where a is a probability
# We need to pass in an "a" tensor and a "y" tensor for this to work correctly
loss = nn.BCELoss()

# This is the optimizer we use: plain SGD with a learning rate of 0.001; this learning rate
# might be bad, or a different optimizer might work better
# NOTE(review): an earlier note here called this a "more aggressive" learning rate, which
# does not match lr=0.001 - confirm the intended value.
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)

# This is a utility we use to get shuffled batches of training data; each batch holds up to 16 rows
train_dl = DataLoader(TensorDataset(X_train, Y_train), batch_size=16, shuffle=True)

# Here, we have a small metric for AUC
auroc = AUROC(task="binary")

# Number of passes over the training data made by the training loop
EPOCHS = 100

In [19]:
# Loss on the full training set before the training loop below has run, as a baseline
Y_pred_train = model(X_train)
initial_train_loss = loss(Y_pred_train.squeeze(1), Y_train)
print(initial_train_loss)

tensor(0.9772, grad_fn=<BinaryCrossEntropyBackward0>)


In [20]:
# Whereas Logistic Regression got to about 71 % AUC, this model gets to 74 %

# Apply mini-batch stochastic gradient descent: every optimizer step uses one batch from train_dl
for epoch in range(EPOCHS):
  for x_batch, y_batch in train_dl:
    y_pred_batch = model(x_batch)

    # Note that this loss wants probabilities, not logits.
    # squeeze(..., 1) turns the (batch, 1) model output into (batch,) to match y_batch;
    # naming the dimension keeps a batch holding a single row from collapsing to a scalar.
    # This is the loss over the batch, an approximation of the loss over the entire training data.
    loss_batch = loss(torch.squeeze(y_pred_batch, 1), y_batch)

    # Clear the gradients left over from the previous step, so that the update
    # theta = theta - alpha * dL/dtheta uses a dL/dtheta computed on this batch only
    optimizer.zero_grad()

    # Here we get dL/dtheta, using the loss above evaluated at the OLD parameter values
    loss_batch.backward()

    # This is theta = theta - alpha * dL/dtheta
    # Here, we get the new parameter values
    optimizer.step()

  # Metrics are reported every 10th epoch and after the final one. The full-dataset
  # evaluation is only run on those epochs, since its results are not used otherwise.
  if epoch % 10 == 0 or epoch == EPOCHS - 1:
    # No gradients are needed for evaluation; this idiom switches gradient tracking off
    with torch.no_grad():
      Y_pred_train = torch.squeeze(model(X_train), 1)
      loss_train = loss(Y_pred_train, Y_train)
      Y_pred_val = torch.squeeze(model(X_val), 1)
      loss_val = loss(Y_pred_val, Y_val)

      print('Epoch {}: loss_train {} loss_val {}'.format(
          epoch, loss_train, loss_val
          )
      )
      # torchmetrics documents integer labels as the target for the binary task, so the
      # float 0/1 labels are cast before being handed to the metric
      print('AUC train {} AUC val {}'.format(
          auroc(Y_pred_train, Y_train.long()), auroc(Y_pred_val, Y_val.long()))
      )
      print('\n')

Epoch 0: loss_train 0.6971562504768372 loss_val 0.7438750267028809
AUC train 0.5456255674362183 AUC val 0.4441666603088379


Epoch 10: loss_train 0.6336497664451599 loss_val 0.6640235781669617
AUC train 0.6414647102355957 AUC val 0.5608333945274353


Epoch 20: loss_train 0.6252090334892273 loss_val 0.6618676781654358
AUC train 0.6676825881004333 AUC val 0.5902777910232544


Epoch 30: loss_train 0.620816171169281 loss_val 0.6604453325271606
AUC train 0.6838130950927734 AUC val 0.5980556011199951


Epoch 40: loss_train 0.6310644745826721 loss_val 0.6866475939750671
AUC train 0.7145600318908691 AUC val 0.6208333373069763


Epoch 50: loss_train 0.613675057888031 loss_val 0.6549158692359924
AUC train 0.7094606757164001 AUC val 0.6141666769981384


Epoch 60: loss_train 0.6106309294700623 loss_val 0.6494110822677612
AUC train 0.712711751461029 AUC val 0.6108333468437195


Epoch 70: loss_train 0.6078869700431824 loss_val 0.6438906192779541
AUC train 0.7200461626052856 AUC val 0.613055527210235

Some observations

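- The training and validation losses are both trending downward, so the model does appear to be optimizing
- AUC is going up on both the training and validation sets as training progresses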

Scratch Commands

In [21]:
# Computing AUROC on a tiny hand-made example as a sanity check of the metric
from torch import tensor
from torchmetrics import AUROC

# Five scores with their ground-truth labels: the positives score 0.08, 0.19 and 0.34,
# the negatives 0.13 and 0.26. Of the six positive/negative pairs, three are ranked
# correctly, so the expected AUROC is 0.5.
target = tensor([0, 0, 1, 1, 1])
preds = tensor([0.13, 0.26, 0.08, 0.19, 0.34])
auroc = AUROC(task="binary")
auroc(preds, target)

tensor(0.5000)