<a href="https://colab.research.google.com/github/MStamirski/Spaceship-Titanic/blob/main/Model_PyTorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Feature datasets

In [None]:
# Colab-only setup: mount Google Drive into the runtime, then make the
# project folder the working directory so relative paths used by later
# cells resolve inside it.
from google.colab import drive
drive.mount("/content/drive")
%cd "/content/drive/MyDrive/Colab_Notebooks/SDA_upskill/Spaceship"

In [None]:
!pip install import-ipynb

In [None]:
import import_ipynb

In [None]:
from FeaturesEngineering import get_features, categories_one_hot_encoding, categories_target_encoding, categories_leave_one_out_encoding

In [None]:
# Build the feature table for the 'train' split, then derive three variants
# of it, one per categorical-encoding helper imported from FeaturesEngineering.
# NOTE(review): judging by the helper names these are one-hot, target and
# leave-one-out encodings; the helpers themselves are not visible here.
df = get_features('train')
df_ohe = categories_one_hot_encoding(df)
df_te = categories_target_encoding(df)
df_looe = categories_leave_one_out_encoding(df)

In [None]:
def get_subsets(dataset):
  """Split a feature table into model inputs and the target column.

  Args:
    dataset: DataFrame containing at least the 'PassengerId' and
      'Transported' columns.

  Returns:
    Tuple ``(X, y)``: ``X`` is ``dataset`` without the 'PassengerId' and
    'Transported' columns, ``y`` is the 'Transported' column. ``dataset``
    itself is left unmodified.
  """
  non_feature_columns = ['PassengerId', 'Transported']
  target = dataset['Transported']
  features = dataset.drop(columns=non_feature_columns)
  return features, target

# Logistic Regression in PyTorch

In [None]:
import numpy as np
from sklearn.model_selection import KFold
import torch
from torch.utils.data import DataLoader, SubsetRandomSampler

In [None]:
class LogisticRegression(torch.nn.Module):
  """Single linear layer followed by an element-wise sigmoid.

  Args:
    input_dim: Number of input features per sample.
    output_dim: Number of outputs per sample.
  """

  def __init__(self, input_dim, output_dim):
    super().__init__()
    self.linear = torch.nn.Linear(input_dim, output_dim)

  def forward(self, x):
    """Return ``sigmoid(linear(x))`` for the input tensor ``x``."""
    logits = self.linear(x)
    return torch.sigmoid(logits)

In [None]:
def train_epoch(model, dataloader, criterion, optimizer):
  """Run one optimisation pass over ``dataloader``.

  Args:
    model: Module mapping a feature batch to predictions.
    dataloader: Iterable yielding ``(features, targets)`` batches.
    criterion: Loss callable invoked as ``criterion(predictions, targets)``.
    optimizer: Optimizer whose ``zero_grad``/``step`` are called per batch.

  Returns:
    Tuple ``(loss_sum, n_correct)``: the sum of the per-batch loss values
    and the number of samples whose rounded prediction equals the target.
  """
  model.train()
  running_loss = 0.0
  n_correct = 0

  for batch_X, batch_y in dataloader:
    optimizer.zero_grad()

    # Squeeze both sides so predictions and targets have matching shapes.
    probs = torch.squeeze(model(batch_X))
    targets = torch.squeeze(batch_y)

    batch_loss = criterion(probs, targets)
    batch_loss.backward()
    optimizer.step()

    running_loss += batch_loss.item()
    # Rounding the prediction turns it into a hard 0/1 label.
    hits = probs.round().detach().numpy() == targets.detach().numpy()
    n_correct += np.sum(hits)

  return running_loss, n_correct

In [None]:
def valid_epoch(model, dataloader, criterion):
  """Evaluate ``model`` on ``dataloader`` without updating its weights.

  Args:
    model: Module mapping a feature batch to predictions.
    dataloader: Iterable yielding ``(features, targets)`` batches.
    criterion: Loss callable invoked as ``criterion(predictions, targets)``.

  Returns:
    Tuple ``(loss_sum, n_correct)``: the sum of the per-batch loss values
    and the number of samples whose rounded prediction equals the target.
  """
  model.eval()
  running_loss = 0.0
  n_correct = 0

  # Gradient tracking is disabled for the whole evaluation pass.
  with torch.no_grad():
    for batch_X, batch_y in dataloader:
      probs = torch.squeeze(model(batch_X))
      targets = torch.squeeze(batch_y)

      running_loss += criterion(probs, targets).item()

      hard_labels = probs.round().detach().numpy()
      expected = targets.detach().numpy()
      n_correct += np.sum(hard_labels == expected)

  return running_loss, n_correct

In [None]:
def model_cross_validation(model, X, y, n_splits, n_epochs, learning_rate):
  """Estimate accuracy of ``model`` with shuffled K-fold cross-validation.

  Every fold trains a fresh deep copy of ``model`` with SGD and BCELoss, so
  no weights learned in one fold leak into another and the ``model`` object
  passed in is left untouched. Per-epoch metrics are printed; the metrics of
  each fold's last epoch are averaged for the final report.

  Args:
    model: Untrained ``torch.nn.Module`` used as the template for each fold.
    X: Feature table; anything ``np.array`` can turn into a float matrix.
    y: Targets aligned with ``X`` (BCELoss expects values in [0, 1]).
    n_splits: Number of folds passed to ``KFold``.
    n_epochs: Training epochs per fold; must be at least 1.
    learning_rate: SGD learning rate.

  Returns:
    Mean last-epoch test accuracy over all folds, in percent, rounded to
    four decimals.

  Raises:
    ValueError: If ``n_epochs`` is smaller than 1 (there would be no epoch
      to report).
  """
  import copy  # local import: only needed for the per-fold model copies

  if n_epochs < 1:
    raise ValueError("n_epochs must be at least 1")

  Xt = torch.tensor(np.array(X), dtype=torch.float32)
  yt = torch.tensor(np.array(y), dtype=torch.float32)
  # The (features, target) pairs are identical for every fold; only the
  # samplers differ, so build the list once.
  dataset = list(zip(Xt, yt))

  splits = KFold(n_splits = n_splits, shuffle = True, random_state = 42)

  history = {'train_loss': [], 'test_loss': [],'train_acc':[],'test_acc':[]}

  for fold, (train_idx, val_idx) in enumerate(splits.split(np.arange(len(yt)))):

    print(f"\nFold {fold+1}")

    train_loader = DataLoader(dataset, sampler=SubsetRandomSampler(train_idx))
    test_loader = DataLoader(dataset, sampler=SubsetRandomSampler(val_idx))

    # Start each fold from the same initial weights. Reusing one module
    # across folds would validate on rows it had already been trained on.
    fold_model = copy.deepcopy(model)
    criterion = torch.nn.BCELoss()
    optimizer = torch.optim.SGD(fold_model.parameters(), lr=learning_rate)

    for epoch in range(n_epochs):
      train_loss, train_correct = train_epoch(fold_model, train_loader, criterion, optimizer)
      test_loss, test_correct = valid_epoch(fold_model, test_loader, criterion)

      # The loaders use DataLoader's default batch size of one sample, so
      # dividing the summed batch losses by the sample count is a true mean.
      train_loss = round(train_loss / len(train_loader.sampler), 4)
      train_acc = round(train_correct / len(train_loader.sampler) * 100, 4)
      test_loss = round(test_loss / len(test_loader.sampler), 4)
      test_acc = round(test_correct / len(test_loader.sampler) * 100, 4)

      print(f"Epoch:{epoch+1}/{n_epochs} AVG Training Loss:{train_loss} AVG Test Loss:{test_loss} AVG Training Acc {train_acc}% AVG Test Acc {test_acc}%")

    # scores for last epoch
    history['train_loss'].append(train_loss)
    history['test_loss'].append(test_loss)
    history['train_acc'].append(train_acc)
    history['test_acc'].append(test_acc)

  # mean scores for all folds
  avg_train_loss = round(np.mean(history['train_loss']), 4)
  avg_test_loss = round(np.mean(history['test_loss']), 4)
  avg_train_acc = round(np.mean(history['train_acc']), 4)
  avg_test_acc = round(np.mean(history['test_acc']), 4)

  print(f"\nPerformance of {n_splits} fold cross validation")
  print(f"Average Training Loss: {avg_train_loss} \t Average Test Loss: {avg_test_loss} \t Average Training Acc: {avg_train_acc} \t Average Test Acc: {avg_test_acc}")

  return avg_test_acc

# One-hot encoding

In [None]:
# Split the df_ohe table into inputs/target and size the model to match:
# one input per feature column, a single output.
X, y = get_subsets(df_ohe)
input_dim = X.shape[1]
output_dim = 1
model = LogisticRegression(input_dim, output_dim)

In [None]:
# 10-fold cross-validation, 10 epochs per fold; the returned value is the
# mean last-epoch test accuracy (in percent) across folds.
acc_ohe = model_cross_validation(model, X, y, n_splits=10, n_epochs=10, learning_rate=0.01)


Fold 1
Epoch:1/10 AVG Training Loss:0.4972 AVG Test Loss:0.4907 AVG Training Acc 76.3262% AVG Test Acc 76.2069%
Epoch:2/10 AVG Training Loss:0.4585 AVG Test Loss:0.4835 AVG Training Acc 78.8189% AVG Test Acc 77.0115%
Epoch:3/10 AVG Training Loss:0.452 AVG Test Loss:0.4887 AVG Training Acc 78.8572% AVG Test Acc 76.3218%
Epoch:4/10 AVG Training Loss:0.4492 AVG Test Loss:0.4831 AVG Training Acc 79.1768% AVG Test Acc 76.5517%
Epoch:5/10 AVG Training Loss:0.4473 AVG Test Loss:0.4855 AVG Training Acc 78.985% AVG Test Acc 76.8966%
Epoch:6/10 AVG Training Loss:0.4473 AVG Test Loss:0.4811 AVG Training Acc 78.8956% AVG Test Acc 76.6667%
Epoch:7/10 AVG Training Loss:0.4462 AVG Test Loss:0.484 AVG Training Acc 78.9339% AVG Test Acc 76.5517%
Epoch:8/10 AVG Training Loss:0.4468 AVG Test Loss:0.4831 AVG Training Acc 78.8317% AVG Test Acc 76.5517%
Epoch:9/10 AVG Training Loss:0.447 AVG Test Loss:0.4813 AVG Training Acc 79.0106% AVG Test Acc 76.7816%
Epoch:10/10 AVG Training Loss:0.4467 AVG Test Loss:

# Target encoding

In [None]:
# Split the df_te table into inputs/target and size the model to match:
# one input per feature column, a single output.
X, y = get_subsets(df_te)
input_dim = X.shape[1]
output_dim = 1
model = LogisticRegression(input_dim, output_dim)

In [None]:
# 10-fold cross-validation, 10 epochs per fold; the returned value is the
# mean last-epoch test accuracy (in percent) across folds.
acc_te = model_cross_validation(model, X, y, n_splits=10, n_epochs=10, learning_rate=0.01)


Fold 1
Epoch:1/10 AVG Training Loss:0.6233 AVG Test Loss:0.5902 AVG Training Acc 68.4264% AVG Test Acc 70.8046%
Epoch:2/10 AVG Training Loss:0.5625 AVG Test Loss:0.5622 AVG Training Acc 73.9742% AVG Test Acc 72.6437%
Epoch:3/10 AVG Training Loss:0.5492 AVG Test Loss:0.5579 AVG Training Acc 73.6418% AVG Test Acc 75.0575%
Epoch:4/10 AVG Training Loss:0.5431 AVG Test Loss:0.5505 AVG Training Acc 74.2682% AVG Test Acc 72.2989%
Epoch:5/10 AVG Training Loss:0.5379 AVG Test Loss:0.5453 AVG Training Acc 74.4088% AVG Test Acc 73.908%
Epoch:6/10 AVG Training Loss:0.5339 AVG Test Loss:0.5479 AVG Training Acc 74.8178% AVG Test Acc 75.1724%
Epoch:7/10 AVG Training Loss:0.5306 AVG Test Loss:0.5395 AVG Training Acc 74.7795% AVG Test Acc 73.7931%
Epoch:8/10 AVG Training Loss:0.5279 AVG Test Loss:0.5435 AVG Training Acc 74.6261% AVG Test Acc 75.1724%
Epoch:9/10 AVG Training Loss:0.5258 AVG Test Loss:0.5414 AVG Training Acc 75.1246% AVG Test Acc 75.1724%
Epoch:10/10 AVG Training Loss:0.5239 AVG Test Lo

# Leave-one-out encoding

In [None]:
# Split the df_looe table into inputs/target and size the model to match:
# one input per feature column, a single output.
X, y = get_subsets(df_looe)
input_dim = X.shape[1]
output_dim = 1
model = LogisticRegression(input_dim, output_dim)

In [None]:
# 10-fold cross-validation, 10 epochs per fold; the returned value is the
# mean last-epoch test accuracy (in percent) across folds.
acc_looe = model_cross_validation(model, X, y, n_splits=10, n_epochs=10, learning_rate=0.01)


Fold 1
Epoch:1/10 AVG Training Loss:0.6162 AVG Test Loss:0.5772 AVG Training Acc 69.4746% AVG Test Acc 72.5287%
Epoch:2/10 AVG Training Loss:0.5588 AVG Test Loss:0.5613 AVG Training Acc 73.9358% AVG Test Acc 74.8276%
Epoch:3/10 AVG Training Loss:0.5469 AVG Test Loss:0.5536 AVG Training Acc 74.1404% AVG Test Acc 74.9425%
Epoch:4/10 AVG Training Loss:0.5413 AVG Test Loss:0.5578 AVG Training Acc 74.2937% AVG Test Acc 75.1724%
Epoch:5/10 AVG Training Loss:0.5365 AVG Test Loss:0.5579 AVG Training Acc 74.6517% AVG Test Acc 75.5172%
Epoch:6/10 AVG Training Loss:0.534 AVG Test Loss:0.5497 AVG Training Acc 74.5622% AVG Test Acc 75.0575%
Epoch:7/10 AVG Training Loss:0.5306 AVG Test Loss:0.5425 AVG Training Acc 74.7795% AVG Test Acc 75.2874%
Epoch:8/10 AVG Training Loss:0.529 AVG Test Loss:0.5389 AVG Training Acc 74.984% AVG Test Acc 74.5977%
Epoch:9/10 AVG Training Loss:0.527 AVG Test Loss:0.5365 AVG Training Acc 75.1758% AVG Test Acc 74.5977%
Epoch:10/10 AVG Training Loss:0.5256 AVG Test Loss:

# Save results

In [None]:
# One result row per categorical encoding. Fields, in order: model name,
# encoding label, cross-validated accuracy, then 0 and "---" as placeholders
# for the remaining two columns.
# The model name is spelled identically in every row so the results file can
# be grouped or filtered on it.
data = [
  ['PyTorch LogisticRegression', 'OHE', acc_ohe, 0, "---"],
  ['PyTorch LogisticRegression', 'TE', acc_te, 0, "---"],
  ['PyTorch LogisticRegression', 'LOOE', acc_looe, 0, "---"],
]

In [None]:
import csv
from os.path import exists
resfile = 'spaceship_results.csv'

In [None]:
# Append the result rows to the shared results file, writing the header
# first when the file does not exist yet.
header = ['Model', 'Categories_encoding', 'Initial_accuracy', 'Tuned_Accuracy', 'Important_Features']
write_header = not exists(resfile)

# Mode 'a' creates the file if it is missing, so one open call covers both
# cases. newline='' is what the csv module expects for files it writes;
# without it rows can end up separated by blank lines on Windows.
# The with-block closes the file even if a write fails.
with open(resfile, 'a', newline='') as f:
  writer = csv.writer(f)
  if write_header:
    writer.writerow(header)
  writer.writerows(data)