### Imports

In [1]:
import pandas as pd
import numpy as np
from typing import List


# Sci-kit learn
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import train_test_split

# PyTorch
import torch
import torch.nn as nn
import torch.functional as F
from torch.utils.data import Dataset, DataLoader, random_split

  from .autonotebook import tqdm as notebook_tqdm


### Load in Data

In [2]:
df_train = pd.read_csv('../data/spaceship_titanic/train.csv')

In [3]:
# Remove the predicted label
training_labels = df_train.pop('Transported')

In [4]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
dtypes: float64(6), object(7)
memory usage: 883.0+ KB


### Drop Columns

In [5]:
Columns_List = List[str]
def drop_columns(df: pd.DataFrame, cols: Columns_List):
    df.drop(cols, axis = 1, inplace = True)
    return df

df_train = drop_columns(df_train, ["Name", "Cabin"])

### Identify column types

In [6]:
cat_cols = df_train.select_dtypes(include=['object']).columns.to_list()
cont_cols = df_train.select_dtypes(include=['float64']).columns.to_list()

# Exclude passenger ID as categorical column
cat_cols.pop(cat_cols.index('PassengerId'))

print(f"Categorical columns: {cat_cols}")
print(f"Continuous columns: {cont_cols}")

Categorical columns: ['HomePlanet', 'CryoSleep', 'Destination', 'VIP']
Continuous columns: ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']


### Encode categorical variables

In [7]:
ord_encoder = OrdinalEncoder()
df_train[cat_cols] = ord_encoder.fit_transform(df_train[cat_cols])

### Feature Engineering

### Impute Missing Values

In [8]:
# Continuous variables

iterative_imputer = IterativeImputer()

df_train[cont_cols] = pd.DataFrame(iterative_imputer.fit_transform(df_train[cont_cols]), columns = cont_cols)

In [9]:
# Categorical variables
categorical_imputer = SimpleImputer(strategy='most_frequent')
df_train[cat_cols] = pd.DataFrame(categorical_imputer.fit_transform(df_train[cat_cols]), columns = cat_cols)


In [10]:
# Sanity check on any remaining missing values
df_train.isnull().sum().sum()

0

#### Get group number

In [11]:
# Group number is in passenger id (first half)
df_train['group'] = df_train['PassengerId'].str.split('_').str[0]
df_train['group'] = pd.to_numeric(df_train['group'])

In [12]:
# Sanity check
df_train['group'][:10]

0    1
1    2
2    3
3    3
4    4
5    5
6    6
7    6
8    7
9    8
Name: group, dtype: int64

In [13]:
# Remove passengerId as its not needed anymore
df_train.drop('PassengerId', axis = 1, inplace=True)

#### Noramlize values

In [14]:
std_scaler = StandardScaler()
normalized_cols = [col + '_norm' for col in cont_cols]
df_train[normalized_cols] = std_scaler.fit_transform(df_train[cont_cols])

In [15]:
# Sanity check on normalized columns
df_train[normalized_cols].head()

Unnamed: 0,Age_norm,RoomService_norm,FoodCourt_norm,ShoppingMall_norm,Spa_norm,VRDeck_norm
0,0.709373,-0.34042,-0.286919,-0.290836,-0.276256,-0.26814
1,-0.336374,-0.17521,-0.281279,-0.248989,0.21162,-0.229322
2,2.033985,-0.275245,1.954387,-0.290836,5.691115,-0.224911
3,0.291074,-0.34042,0.517218,0.330181,2.682103,-0.097871
4,-0.894105,0.118835,-0.243046,-0.038077,0.225839,-0.266375


#### Compute Feature Importance

In [16]:
df_train.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,group,Age_norm,RoomService_norm,FoodCourt_norm,ShoppingMall_norm,Spa_norm,VRDeck_norm
0,1.0,0.0,2.0,39.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0.709373,-0.34042,-0.286919,-0.290836,-0.276256,-0.26814
1,0.0,0.0,2.0,24.0,0.0,109.0,9.0,25.0,549.0,44.0,2,-0.336374,-0.17521,-0.281279,-0.248989,0.21162,-0.229322
2,1.0,0.0,2.0,58.0,1.0,43.0,3576.0,0.0,6715.0,49.0,3,2.033985,-0.275245,1.954387,-0.290836,5.691115,-0.224911
3,1.0,0.0,2.0,33.0,0.0,0.0,1283.0,371.0,3329.0,193.0,3,0.291074,-0.34042,0.517218,0.330181,2.682103,-0.097871
4,0.0,0.0,2.0,16.0,0.0,303.0,70.0,151.0,565.0,2.0,4,-0.894105,0.118835,-0.243046,-0.038077,0.225839,-0.266375


In [17]:
mi_scores = mutual_info_classif(df_train, training_labels)
mi_scores = pd.Series(mi_scores, name="MI Scores", index=df_train.columns)
mi_scores = mi_scores.sort_values(ascending=False)
mi_scores

CryoSleep            0.102184
RoomService_norm     0.077112
Spa_norm             0.074521
Spa                  0.074032
RoomService          0.069947
VRDeck_norm          0.069318
VRDeck               0.063898
ShoppingMall         0.056423
ShoppingMall_norm    0.050805
FoodCourt_norm       0.044293
FoodCourt            0.043170
HomePlanet           0.021183
group                0.019579
Age_norm             0.017392
Age                  0.014423
Destination          0.005573
VIP                  0.003340
Name: MI Scores, dtype: float64

In [18]:
# Drop Destination and VIP
df_train.drop(['Destination', 'VIP'], axis = 1, inplace = True)

#### Save the data after preprocessing

In [19]:
len(training_labels)

8693

In [20]:
df_train = pd.concat([df_train, training_labels], axis=1)
df_train.head()

Unnamed: 0,HomePlanet,CryoSleep,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,group,Age_norm,RoomService_norm,FoodCourt_norm,ShoppingMall_norm,Spa_norm,VRDeck_norm,Transported
0,1.0,0.0,39.0,0.0,0.0,0.0,0.0,0.0,1,0.709373,-0.34042,-0.286919,-0.290836,-0.276256,-0.26814,False
1,0.0,0.0,24.0,109.0,9.0,25.0,549.0,44.0,2,-0.336374,-0.17521,-0.281279,-0.248989,0.21162,-0.229322,True
2,1.0,0.0,58.0,43.0,3576.0,0.0,6715.0,49.0,3,2.033985,-0.275245,1.954387,-0.290836,5.691115,-0.224911,False
3,1.0,0.0,33.0,0.0,1283.0,371.0,3329.0,193.0,3,0.291074,-0.34042,0.517218,0.330181,2.682103,-0.097871,False
4,0.0,0.0,16.0,303.0,70.0,151.0,565.0,2.0,4,-0.894105,0.118835,-0.243046,-0.038077,0.225839,-0.266375,True


#### Save preprocessed data as checkpoint

In [21]:
df_train.to_csv('../data/spaceship_titanic/preprocessed_train.csv', index=False)

#### Split Data

In [22]:
# X_train, X_val, y_train, y_val = train_test_split(df_train, training_labels, test_size=0.2)

In [23]:
class SSTitanic(Dataset):
    def __init__(self, csv_file_path: str):
        
        df = pd.read_csv(csv_file_path)
        
        # Note that the data has been preprocessed e.g. Normalized and encoding done
        target_column = ['Transported']
        
        # Convert to tensor
        self.X = df.drop(target_column, axis = 1).values
        self.X = torch.tensor(self.X, dtype=torch.float32)
        
        # Convert to tensor
        self.y = df[target_column].values
        self.y = torch.tensor(self.y, dtype=torch.bool)
        
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]
    

In [24]:
sst = SSTitanic('../data/spaceship_titanic/preprocessed_train.csv')

#### Model

In [25]:
# Define the model

class Net(nn.Module):
    
    def __init__(self, d_in, h = 15, d_out = 1):
        super().__init__()
        
        self.fc1 = nn.Linear(d_in, h)
        self.fc2 = nn.Linear(h, d_out)
        self.relu = nn.ReLU()
        
    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        
        return x.squeeze()

#### Training Loop

In [30]:
def train(csv_file, n_epochs = 100):
    """Train the model

    Args:
        csv_file (_type_): _description_
        epochs (int, optional): _description_. Defaults to 100.
    """
    
    # Load the dataset
    dataset = SSTitanic(csv_file)
    
    # Split into train and val
    train_size = int(0.8 * len(dataset))
    val_size = len(dataset) - train_size
    trainset, valset = random_split(dataset, [train_size, val_size])
    print(trainset)
    
    # DataLoaders
    trainloader = DataLoader(trainset, batch_size = 32, shuffle = True)
    valloader = DataLoader(valset, batch_size = 32, shuffle = False)
    
    # Set device
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print(f"Device is: {device}")
    
    # Set the model settings
    D_in, H = 15, 32
    net = Net(D_in, H).to(device)
    
    # Loss function
    criterion = nn.MSELoss()
    
    # Optimizer
    optimizer = torch.optim.Adam(net.parameters(), weight_decay = 0.0001)
    
    # Train the network
    loss_per_iter = list()
    loss_per_batch = list()
    
    for epoch in range(n_epochs):
        
        # Define params each epoch
        running_loss = 0.0
        for i, data in enumerate(trainloader):
            # print(f"index: {i}")
            inputs, labels = data
            inputs = inputs.to(device)
            labels = labels.to(device)
            
            # Perform the usual actions of zero-ing gradients
            optimizer.zero_grad()
            
            # Forward, backward, step
            outputs = net(inputs.float())
            loss = criterion(outputs, labels.float())
            loss.backward() # Backward
            optimizer.step() # Step
            
            # Save loss to plot
            running_loss += loss.item()
            loss_per_iter.append(loss.item())
            
            if i % 200 == 199: # Print every 200 mini-batches
                print(f"[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:3f}")
        
        loss_per_batch.append(running_loss / (i+1))
        running_loss = 0.0
    
    # Save the model
    torch.save(net.state_dict(), '../spaceship_titanic/spaceship_titanic_model.pt')

In [31]:
train('../data/spaceship_titanic/preprocessed_train.csv')

<torch.utils.data.dataset.Subset object at 0x7fe845195fd0>
Device is: cpu


  return F.mse_loss(input, target, reduction=self.reduction)


[1,   200] loss: 1203.723688
[2,   200] loss: 24.469788


  return F.mse_loss(input, target, reduction=self.reduction)


[3,   200] loss: 12.067620
[4,   200] loss: 7.698335
[5,   200] loss: 5.843028
[6,   200] loss: 4.365382
[7,   200] loss: 3.522465
[8,   200] loss: 3.186078
[9,   200] loss: 3.342534
[10,   200] loss: 2.629864
[11,   200] loss: 2.043056
[12,   200] loss: 2.358492
[13,   200] loss: 2.155452
[14,   200] loss: 1.906594
[15,   200] loss: 1.833351
[16,   200] loss: 2.597180
[17,   200] loss: 1.711489
[18,   200] loss: 1.683115
[19,   200] loss: 1.768632
[20,   200] loss: 1.561512
[21,   200] loss: 1.860848
[22,   200] loss: 1.778822
[23,   200] loss: 1.096048
[24,   200] loss: 1.513407
[25,   200] loss: 1.464254
[26,   200] loss: 1.409062
[27,   200] loss: 1.863907
[28,   200] loss: 0.940433
[29,   200] loss: 2.395499
[30,   200] loss: 0.802450
[31,   200] loss: 8.208389
[32,   200] loss: 0.865786
[33,   200] loss: 0.603694
[34,   200] loss: 1.573040
[35,   200] loss: 0.626593
[36,   200] loss: 0.487896
[37,   200] loss: 0.355243
[38,   200] loss: 0.633378
[39,   200] loss: 3.672196
[40,   