<a href="https://colab.research.google.com/github/HansDampf37/Kaggle/blob/main/Kaggle_Titanic_Survival_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [516]:
# !pip install shap
import shap
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from torchsummary import summary
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

# Load and Prepare datasets
- We factorize features to numbers
- I suppose that Cabins with the same starting number are closer together -> we split *cabin* into the features *cabinNumber* an *cabinLetter*


In [517]:
labeled = pd.read_csv('data/train.csv')
unlabeled = pd.read_csv('data/test.csv')
labeled.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [518]:
def preprocess_data(data):
  data['CabinLetter'] = data['Cabin'].str.extract(r'([A-Za-z])', expand=False)
  data['CabinLetter'] = pd.factorize(data['CabinLetter'])[0]
  data['CabinNumber'] = data['Cabin'].str.extract(r'(\d+)', expand=False)
  data['CabinNumber'] = pd.to_numeric(data['CabinNumber'], errors='coerce')
  data['Sex'] = pd.factorize(data['Sex'])[0]
  # data['Ticket'] = pd.factorize(data['Ticket'])[0]
  data['Embarked'] = pd.factorize(data['Embarked'])[0]
  data.fillna(-1, inplace=True)
  data.drop(columns=['Name', 'Cabin', 'Ticket', 'PassengerId'], inplace=True)


y = labeled['Survived']
X = labeled.drop(columns=["Survived"])
preprocess_data(X)
preprocess_data(unlabeled)

X.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,CabinLetter,CabinNumber
0,3,0,22.0,1,0,7.25,0,-1,-1.0
1,1,1,38.0,1,0,71.2833,1,0,85.0
2,3,1,26.0,0,0,7.925,0,-1,-1.0
3,1,1,35.0,1,0,53.1,0,0,123.0
4,3,0,35.0,0,0,8.05,0,-1,-1.0


# Train a simple SVM to compare against

In [519]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
print(f"Train size: {X_train.shape[0]}")
print(f"Test size: {X_test.shape[0]}")


svc = SVC(probability=True, random_state=42)
svc.fit(X_train, y_train)
print(f'SVC Training Accuracy: {svc.score(X_train, y_train)}')
print(f'SVC Test Accuracy: {svc.score(X_test, y_test)}')

rfc = RandomForestClassifier(random_state=42, n_estimators=100)
rfc.fit(X_train, y_train)
print(f'RFC Training Accuracy: {rfc.score(X_train, y_train)}')
print(f'RFC Test Accuracy: {rfc.score(X_test, y_test)}')

Train size: 801
Test size: 90
SVC Training Accuracy: 0.6928838951310862
SVC Test Accuracy: 0.7444444444444445
RFC Training Accuracy: 0.9850187265917603
RFC Test Accuracy: 0.8111111111111111


# Implement a model

Implement a classifier model

In [520]:
class Classifier(nn.Module):
  def __init__(self, input_size, output_size):
    super(Classifier, self).__init__()
    self.fc1 = nn.Linear(input_size, 16)
    self.fc2 = nn.Linear(16, 16)
    self.fc3 = nn.Linear(16, 16)
    self.fc4 = nn.Linear(16, output_size)
    self.relu = nn.ReLU()
    self.optimizer = optim.Adam(self.parameters(), lr=0.001)
    self.loss_fn = nn.CrossEntropyLoss()

  def forward(self, x):
    x = self.relu(self.fc1(x))
    x = self.relu(self.fc2(x))
    x = self.relu(self.fc3(x))
    x = self.fc4(x)
    return x

model = Classifier(X_train.shape[1], 2) # Assuming binary classification (Survived or not)
summary(model, (1, X_train.shape[1])) # Adjust input shape for summary

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1                [-1, 1, 16]             160
              ReLU-2                [-1, 1, 16]               0
            Linear-3                [-1, 1, 16]             272
              ReLU-4                [-1, 1, 16]               0
            Linear-5                [-1, 1, 16]             272
              ReLU-6                [-1, 1, 16]               0
            Linear-7                 [-1, 1, 2]              34
Total params: 738
Trainable params: 738
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.00
Params size (MB): 0.00
Estimated Total Size (MB): 0.00
----------------------------------------------------------------


# Train it

In [None]:
# Convert Pandas Series to PyTorch tensors
train_X_tensor = torch.tensor(X_train.values.astype(np.float32), dtype=torch.float32)
train_y_tensor = torch.tensor(y_train.values, dtype=torch.long)
train_dataset = TensorDataset(train_X_tensor, train_y_tensor)
train_loader = DataLoader(train_dataset, batch_size=10, shuffle=True)

valid_X_tensor = torch.tensor(X_test.values.astype(np.float32), dtype=torch.float32)
valid_y_tensor = torch.tensor(y_test.values, dtype=torch.long)
valid_dataset = TensorDataset(valid_X_tensor, valid_y_tensor)
valid_loader = DataLoader(valid_dataset, batch_size=10, shuffle=False)

def evaluate_test_accuracy(model, test_loader):
    model.eval() # Set model to evaluation mode
    with torch.no_grad(): # Disable gradient calculation for test
        val_correct = 0
        val_samples = 0
        for batch_X, batch_y in test_loader:
            outputs = model(batch_X)
            _, predicted = torch.max(outputs, 1)
            val_correct += (predicted == batch_y).sum().item()
            val_samples += batch_y.size(0)

        val_accuracy = 100 * val_correct / val_samples
    model.train() # Set model back to training mode
    return val_accuracy

def train(model, train_loader, test_loader, num_epochs = 400):
  model.train()
  training_accuracies = []
  test_accuracies = []
  for epoch in range(num_epochs):
      total_correct = 0
      total_samples = 0
      for batch_X, batch_y in train_loader:
          # Forward pass
          outputs = model(batch_X)
          loss = model.loss_fn(outputs, batch_y)

          # Backward pass and optimization
          model.optimizer.zero_grad()
          loss.backward()
          model.optimizer.step()

          # Accuracy calculation
          _, predicted = torch.max(outputs, 1)  # Get predicted class indices
          total_correct += (predicted == batch_y).sum().item()
          total_samples += batch_y.size(0)

      accuracy = 100 * total_correct / total_samples
      test_accuracy = evaluate_test_accuracy(model, test_loader)
      training_accuracies.append(accuracy)
      test_accuracies.append(test_accuracy)

      print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}, Accuracy: {accuracy:.2f}%, Test Accuracy: {test_accuracy:.2f}')
  return training_accuracies, test_accuracies


def plot(training_accuracies, test_accuracies):
  plt.figure(figsize=(10, 6))
  plt.plot(training_accuracies, label='Training Accuracy')
  plt.plot(test_accuracies, label='Validation Accuracy')
  plt.xlabel('Epoch')
  plt.ylabel('Accuracy')
  plt.legend()
  plt.show()

training_accuracies, test_accuracies = train(model, train_loader, valid_loader)
plot(training_accuracies, test_accuracies)

Epoch [1/400], Loss: 0.4558, Accuracy: 59.43%, Test Accuracy: 71.11
Epoch [2/400], Loss: 0.4212, Accuracy: 67.54%, Test Accuracy: 71.11
Epoch [3/400], Loss: 0.5294, Accuracy: 66.42%, Test Accuracy: 72.22
Epoch [4/400], Loss: 0.8351, Accuracy: 67.42%, Test Accuracy: 71.11
Epoch [5/400], Loss: 0.5825, Accuracy: 69.41%, Test Accuracy: 71.11
Epoch [6/400], Loss: 0.9083, Accuracy: 68.91%, Test Accuracy: 72.22
Epoch [7/400], Loss: 0.9790, Accuracy: 68.79%, Test Accuracy: 72.22
Epoch [8/400], Loss: 0.7557, Accuracy: 69.41%, Test Accuracy: 76.67
Epoch [9/400], Loss: 0.7179, Accuracy: 68.79%, Test Accuracy: 74.44
Epoch [10/400], Loss: 0.6900, Accuracy: 70.79%, Test Accuracy: 74.44
Epoch [11/400], Loss: 0.5599, Accuracy: 70.41%, Test Accuracy: 73.33
Epoch [12/400], Loss: 0.4003, Accuracy: 70.16%, Test Accuracy: 74.44
Epoch [13/400], Loss: 0.5751, Accuracy: 70.41%, Test Accuracy: 76.67
Epoch [14/400], Loss: 0.2093, Accuracy: 71.91%, Test Accuracy: 76.67
Epoch [15/400], Loss: 0.3782, Accuracy: 71.

# Explain predictions with SHAP
As my model requires tensors as input and SHAP requires np arrays, I introduced a model_wrapper, that transforms incoming data to a tensor.

In [None]:
def explain(model, data):
  def model_wrapper(data):
      # Transform data to tensor
      data_tensor = torch.tensor(data, dtype=torch.float32)
      with torch.no_grad():
          # Run the model and convert the output back to NumPy array
          return model(data_tensor).numpy()

  explainer = shap.KernelExplainer(model_wrapper, data)
  return explainer(X_test)

background_indices = np.random.choice(X_train.shape[0], 50, replace=False)
background = X_train.iloc[background_indices]
shap_values = explain(model, background)

In [None]:
def plot_shap(shap_values):
  plt.title("Feature Importance for dead people")
  shap.plots.bar(shap_values[:, :, 0])
  plt.title("Feature Importance for survivors")
  shap.plots.bar(shap_values[:, :, 1])
  shap.plots.waterfall(shap_values[2, :, 1])

plot_shap(shap_values)

# Ensemble
Try to get better predictions by aggregating predictions about relationships, position, and the person itself.

In [None]:
relationship_columns = ['SibSp', 'Parch', 'Age', 'Sex']
personal_columns = ['Age', 'Sex', 'Pclass', 'Fare']
location_columns = ['Pclass', 'CabinNumber', 'CabinLetter', 'Embarked']

def split(data):
  X_train_relationships = data[relationship_columns]
  X_train_personal = data[personal_columns]
  X_train_loc = data[location_columns]
  relationship_columns_indices = [data.columns.get_loc(col) for col in relationship_columns]
  personal_columns_indices = [data.columns.get_loc(col) for col in personal_columns]
  location_columns_indices = [data.columns.get_loc(col) for col in location_columns]

  return X_train_relationships, X_train_personal, X_train_loc, relationship_columns_indices, personal_columns_indices, location_columns_indices

def get_models():
  X_train_relationships, X_train_personal, X_train_loc, relationship_columns_indices, personal_columns_indices, location_columns_indices = split(X_train)
  X_test_relationships, X_test_personal, X_test_loc, _, _, _ = split(X_test)

  relationship_model = Classifier(X_train_relationships.shape[1], 2)
  personal_model = Classifier(X_train_personal.shape[1], 2)
  loc_model = Classifier(X_train_loc.shape[1], 2)

  relationship_dataset = TensorDataset(torch.tensor(X_train_relationships.values.astype(np.float32), dtype=torch.float32), torch.tensor(y_train.values, dtype=torch.long))
  relationship_loader = DataLoader(relationship_dataset, batch_size=10, shuffle=True)
  relationship_test_dataset = TensorDataset(torch.tensor(X_test_relationships.values.astype(np.float32), dtype=torch.float32), torch.tensor(y_test.values, dtype=torch.long))
  relationship_test_loader = DataLoader(relationship_test_dataset, batch_size=10, shuffle=False)
  personal_dataset = TensorDataset(torch.tensor(X_train_personal.values.astype(np.float32), dtype=torch.float32), torch.tensor(y_train.values, dtype=torch.long))
  personal_loader = DataLoader(personal_dataset, batch_size=10, shuffle=True)
  personal_test_dataset = TensorDataset(torch.tensor(X_test_personal.values.astype(np.float32), dtype=torch.float32), torch.tensor(y_test.values, dtype=torch.long))
  personal_test_loader = DataLoader(personal_test_dataset, batch_size=10, shuffle=False)
  loc_dataset = TensorDataset(torch.tensor(X_train_loc.values.astype(np.float32), dtype=torch.float32), torch.tensor(y_train.values, dtype=torch.long))
  loc_loader = DataLoader(loc_dataset, batch_size=10, shuffle=True)
  loc_test_dataset = TensorDataset(torch.tensor(X_test_loc.values.astype(np.float32), dtype=torch.float32), torch.tensor(y_test.values, dtype=torch.long))
  loc_test_loader = DataLoader(loc_test_dataset, batch_size=10, shuffle=False)

  a_rel, b_rel = train(relationship_model, relationship_loader, relationship_test_loader, 100)
  a_pers, b_pers = train(personal_model, personal_loader, personal_test_loader, 200)
  a_loc, b_loc = train(loc_model, loc_loader, loc_test_loader, 100)
  plot(a_rel, b_rel)
  plot(a_pers, b_pers)
  plot(a_loc, b_loc)

  return relationship_model, personal_model, loc_model, relationship_columns_indices, personal_columns_indices, location_columns_indices

relationship_model, personal_model, loc_model, relationship_columns_indices, personal_columns_indices, location_columns_indices = get_models()

In [None]:
class EnsembleModel(nn.Module):
  def __init__(self, loc_model, relationship_model, personal_model):
    super(EnsembleModel, self).__init__()
    self.loc_model = loc_model
    self.relationship_model = relationship_model
    self.personal_model = personal_model
    self.x = nn.Parameter(torch.randn(1, 2))
    self.y = nn.Parameter(torch.randn(1, 2))
    self.z = nn.Parameter(torch.randn(1, 2))
    self.loss_fn = nn.CrossEntropyLoss()
    self.optimizer = optim.Adam(self.parameters(), lr=0.001)

  def forward(self, x):
    loc_output = self.x * self.loc_model(x[:, location_columns_indices])
    relationship_output = self.y * self.relationship_model(x[:, relationship_columns_indices])
    personal_output = self.z * self.personal_model(x[:, personal_columns_indices])
    return (loc_output + relationship_output + personal_output) / (self.x + self.y + self.z)

ensemble_model = EnsembleModel(loc_model, relationship_model, personal_model)
a, b = train(ensemble_model, train_loader, valid_loader, 100)
plot(a, b)

In [None]:
print(f"FF Model: {evaluate_test_accuracy(model, valid_loader)}")
print(f"Ensemble: {evaluate_test_accuracy(ensemble_model, valid_loader)}")

# Revisit Data for additional features
I joined the following features into the data:
- Title: Mrs, Mr, Dr, ...
- Ticket Number & Prefix
- Cabin Number & Letter

The standard classifier does not perform well under these conditions as the causaility between the new features is too sparse.

So I focused on examining the Ensembles performance with these new features.

In [None]:
labeled = pd.read_csv('data/train.csv')
unlabeled = pd.read_csv('data/test.csv')
labeled.head()

In [None]:
def preprocess_data_feature_engineering(data):
  data['CabinLetter'] = data['Cabin'].str.extract(r'([A-Za-z])', expand=False)
  data['CabinNumber'] = data['Cabin'].str.extract(r'(\d+)', expand=False)
  data['Title'] = data['Name'].apply(extract_title)
  data['TicketPrefix'] = data['Ticket'].apply(lambda x: x.split()[0] if len(x.split()) > 1 else None)
  data['TicketNumber'] = data['Ticket'].apply(lambda x: x.split()[-1])

  data['CabinLetter'] = pd.factorize(data['CabinLetter'])[0]
  data['CabinNumber'] = pd.to_numeric(data['CabinNumber'], errors='coerce')
  data['Sex'] = pd.factorize(data['Sex'])[0]
  data['Title'] = pd.factorize(data['Title'])[0]
  data['TicketPrefix'] = pd.factorize(data['TicketPrefix'])[0]
  data['TicketNumber'] = pd.to_numeric(data['TicketNumber'], errors='coerce')
  data['Embarked'] = pd.factorize(data['Embarked'])[0]
  data.fillna(-1, inplace=True)
  data.drop(columns=['Name', 'Cabin', 'Ticket', 'PassengerId'], inplace=True)

def extract_title(name):
    title = name.split(',')[1].split('.')[0].strip()
    return title

y = labeled['Survived']
X = labeled.drop(columns=["Survived"])
preprocess_data_feature_engineering(X)
preprocess_data_feature_engineering(unlabeled)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
print(f"Train size: {X_train.shape[0]}")
print(f"Test size: {X_test.shape[0]}")

train_X_tensor_new = torch.tensor(X_train.values.astype(np.float32), dtype=torch.float32)
train_y_tensor_new = torch.tensor(y_train.values, dtype=torch.long)
train_dataset_new = TensorDataset(train_X_tensor_new, train_y_tensor_new)
train_loader_new = DataLoader(train_dataset_new, batch_size=10, shuffle=True)

valid_X_tensor_new = torch.tensor(X_test.values.astype(np.float32), dtype=torch.float32)
valid_y_tensor_new = torch.tensor(y_test.values, dtype=torch.long)
valid_dataset_new = TensorDataset(valid_X_tensor_new, valid_y_tensor_new)
valid_loader_new = DataLoader(valid_dataset_new, batch_size=10, shuffle=False)

X.head()

In [None]:
relationship_columns = ['SibSp', 'Parch', 'Age', 'Sex', 'Title']
personal_columns = ['Age', 'Sex', 'Pclass', 'Fare', 'Title']
location_columns = ['Pclass', 'CabinNumber', 'CabinLetter', 'Embarked', 'TicketPrefix', 'TicketNumber']
relationship_model, personal_model, loc_model, relationship_columns_indices, personal_columns_indices, location_columns_indices = get_models()
ensemble_model_new = EnsembleModel(loc_model, relationship_model, personal_model)
a, b = train(ensemble_model_new, train_loader_new, valid_loader_new, 100)
plot(a, b)

In [None]:
print(f"FF Model: {evaluate_test_accuracy(model, valid_loader)}")
print(f"New Ensemble: {evaluate_test_accuracy(ensemble_model_new, valid_loader_new)}")
print(f"Old Ensemble: {evaluate_test_accuracy(ensemble_model, valid_loader)}")