In [38]:
import pandas as pd

In [39]:
TRAIN_PATH = './train.csv'
TEST_PATH = './test.csv'

In [40]:
train_df = pd.read_csv(TRAIN_PATH, index_col='PassengerId')
train_df[['deck', 'num', 'side']] = train_df.pop('Cabin').str.split('/', expand=True)
train_df[['fname', 'lname']] = train_df.pop('Name').str.split(expand=True)
train_df.head()

Unnamed: 0_level_0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,deck,num,side,fname,lname
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0001_01,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False,B,0,P,Maham,Ofracculy
0002_01,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True,F,0,S,Juanna,Vines
0003_01,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False,A,0,S,Altark,Susent
0003_02,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False,A,0,S,Solam,Susent
0004_01,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True,F,1,S,Willy,Santantines


In [41]:
y = train_df.pop('Transported').astype(int)
X = train_df

In [42]:
num_features = ['Age', 'RoomService', 'FoodCourt',
                'ShoppingMall', 'Spa', 'VRDeck', 'num']
cat_features = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP',
                'deck', 'side', 'fname', 'lname']

In [43]:
from sklearn.impute import SimpleImputer

num_imputer = SimpleImputer(strategy="mean")
cat_imputer = SimpleImputer(strategy="most_frequent")

X[num_features] = num_imputer.fit_transform(X[num_features])
X[cat_features] = cat_imputer.fit_transform(X[cat_features])

fname_freq = X['fname'].value_counts()
lname_freq = X['lname'].value_counts()

X['fname'] = X['fname'].apply(lambda val: val if fname_freq.get(val, 0) > 9 else 'other')
X['lname'] = X['lname'].apply(lambda val: val if lname_freq.get(val, 0) > 9 else 'other')

X[cat_features] = X[cat_features].astype('category')

In [44]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

def kfold_scorer(clf, X, y, **kwargs):
    kf = KFold(n_splits=5)
    best_score = float('-inf')
    best_clf = None

    for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]
        clf.fit(
            X_train, y_train, eval_set=[(X_val, y_val)], **kwargs
        )
        y_pred = clf.predict(X_val)
        score = accuracy_score(y_val, y_pred)
        print(f'Fold {fold}: {score}')
        if score > best_score:
            best_score = score
            best_clf = clf
            
    print('Best Score:', best_score)
    return best_clf

In [45]:
# Training CatBoostClassifier

from catboost import CatBoostClassifier

clf_dict = {}
clf = CatBoostClassifier()

best_clf = kfold_scorer(
            clf, X, y,
            cat_features=cat_features,
            verbose=False)
clf_dict['CatBoostClassifier'] = best_clf

Fold 0: 0.78953421506613
Fold 1: 0.8021851638872916
Fold 2: 0.8142610695802185
Fold 3: 0.8348676639815881
Fold 4: 0.807825086306099
Best Score: 0.8348676639815881


In [46]:
# Training XGBoostClassifier

from xgboost.sklearn import XGBClassifier

clf = XGBClassifier(tree_method='approx', enable_categorical=True)

best_clf = kfold_scorer(clf, X, y, verbose=False)
clf_dict['XGBClassifier'] = best_clf

Fold 0: 0.7613571017826337
Fold 1: 0.7941345600920069
Fold 2: 0.7998849913743531
Fold 3: 0.8394706559263522
Fold 4: 0.7905638665132336
Best Score: 0.8394706559263522


In [47]:
# Training LightGBMClassifier
from lightgbm import LGBMClassifier

clf = LGBMClassifier()

best_clf = kfold_scorer(clf, X, y, categorical_feature=cat_features, verbose=False)
clf_dict['LGBMClassifier'] = best_clf

New categorical_feature is ['CryoSleep', 'Destination', 'HomePlanet', 'VIP', 'deck', 'fname', 'lname', 'side']


Fold 0: 0.7688326624496837


New categorical_feature is ['CryoSleep', 'Destination', 'HomePlanet', 'VIP', 'deck', 'fname', 'lname', 'side']


Fold 1: 0.7975848188614146


New categorical_feature is ['CryoSleep', 'Destination', 'HomePlanet', 'VIP', 'deck', 'fname', 'lname', 'side']


Fold 2: 0.8171362852213916


New categorical_feature is ['CryoSleep', 'Destination', 'HomePlanet', 'VIP', 'deck', 'fname', 'lname', 'side']


Fold 3: 0.8365937859608745


New categorical_feature is ['CryoSleep', 'Destination', 'HomePlanet', 'VIP', 'deck', 'fname', 'lname', 'side']


Fold 4: 0.80897583429229
Best Score: 0.8365937859608745


In [60]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, RobustScaler, OrdinalEncoder
from sklearn.pipeline import Pipeline
from pytorch_tabnet.tab_model import TabNetClassifier
import torch

device = 'cuda'
num_transformer = RobustScaler()
cat_transformer = OrdinalEncoder()

Xt = X.copy()
Xt[cat_features] = cat_transformer.fit_transform(X[cat_features])
Xt[num_features] = num_transformer.fit_transform(X[num_features])

cat_idxs = [i for i, col in enumerate(Xt.columns) if col in cat_features]
cat_dims = [len(X[cat].cat.categories) for cat in Xt.columns if cat in cat_features]

clf = TabNetClassifier(
    n_d=64, n_a=64,
    cat_idxs=cat_idxs,
    cat_dims=cat_dims,
    cat_emb_dim=64,
    gamma=1.5, n_independent=2, n_shared=2,
    lambda_sparse=1e-4, momentum=0.3, clip_value=2., optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2), scheduler_params = {"gamma": 0.95, "step_size": 20},
    scheduler_fn=torch.optim.lr_scheduler.StepLR, epsilon=1e-15,
    device_name = device
)



In [61]:
from sklearn.model_selection import train_test_split

MAX_EPOCHS = 100
BATCH_SIZE = 512
VIRTUAL_BATCH_SIZE = 128

X_train, X_val, y_train, y_val = train_test_split(Xt.values, y, test_size=0.25,
                                                  shuffle=True, random_state=1)

clf.fit(X_train, y_train, eval_set=[(X_val, y_val)], batch_size=BATCH_SIZE, max_epochs=MAX_EPOCHS, virtual_batch_size=VIRTUAL_BATCH_SIZE, patience=20, num_workers=0, weights=1, drop_last=False, eval_metric=['accuracy'])

epoch 0  | loss: 0.88383 | val_0_accuracy: 0.64581 |  0:00:03s
epoch 1  | loss: 0.66924 | val_0_accuracy: 0.5506  |  0:00:07s
epoch 2  | loss: 0.57409 | val_0_accuracy: 0.66375 |  0:00:11s
epoch 3  | loss: 0.561   | val_0_accuracy: 0.68307 |  0:00:15s
epoch 4  | loss: 0.54576 | val_0_accuracy: 0.71159 |  0:00:19s
epoch 5  | loss: 0.52874 | val_0_accuracy: 0.68721 |  0:00:22s
epoch 6  | loss: 0.52498 | val_0_accuracy: 0.69779 |  0:00:26s
epoch 7  | loss: 0.5165  | val_0_accuracy: 0.69779 |  0:00:30s
epoch 8  | loss: 0.52113 | val_0_accuracy: 0.70515 |  0:00:34s
epoch 9  | loss: 0.51389 | val_0_accuracy: 0.71803 |  0:00:37s
epoch 10 | loss: 0.51608 | val_0_accuracy: 0.71251 |  0:00:41s
epoch 11 | loss: 0.51945 | val_0_accuracy: 0.71757 |  0:00:45s
epoch 12 | loss: 0.50671 | val_0_accuracy: 0.72171 |  0:00:48s
epoch 13 | loss: 0.51387 | val_0_accuracy: 0.71297 |  0:00:52s
epoch 14 | loss: 0.50844 | val_0_accuracy: 0.73091 |  0:00:56s
epoch 15 | loss: 0.49672 | val_0_accuracy: 0.73045 |  0



In [62]:
class TabNetPredictor:
    
    def __init__(self, clf):
        self.clf = clf
    
    def predict(self, X):
        Xt = X.copy()
        Xt[cat_features] = cat_transformer.transform(X[cat_features])
        Xt[num_features] = num_transformer.transform(X[num_features])
        return self.clf.predict(Xt.values)
        
clf_dict['TabNetClassifier'] = TabNetPredictor(clf)

In [63]:
for name, clf in clf_dict.items():
    y_pred = clf.predict(X)
    score = accuracy_score(y, y_pred)
    print(f'{name}: {score}')

CatBoostClassifier: 0.8519498447026344
XGBClassifier: 0.9126883699528356
LGBMClassifier: 0.875647072357069
TabNetClassifier: 0.8181295295064995


In [64]:
test_df = pd.read_csv(TEST_PATH, index_col='PassengerId')
test_df[['deck', 'num', 'side']] = test_df.pop('Cabin').str.split('/', expand=True)
test_df[['fname', 'lname']] = test_df.pop('Name').str.split(expand=True)

test_df[num_features] = num_imputer.transform(test_df[num_features])
test_df[cat_features] = cat_imputer.transform(test_df[cat_features])


test_df['fname'] = test_df['fname'].apply(lambda val: val if fname_freq.get(val, 0) > 9 else 'other')
test_df['lname'] = test_df['lname'].apply(lambda val: val if lname_freq.get(val, 0) > 9 else 'other')

test_df[cat_features] = test_df[cat_features].astype('category')

In [65]:
import numpy as np

y_preds = []
for name, clf in clf_dict.items():
    y_preds.append(clf.predict(test_df)[..., None])
    
weights = [2, 2, 1, 1]
y_pred = (np.hstack(y_preds)*np.array(weights)).sum(axis=1)/sum(weights) >= 0.5

In [66]:
submit_df = pd.read_csv('./sample_submission.csv', index_col='PassengerId')
submit_df['Transported'] = y_pred

In [67]:
submit_df.to_csv('submit.csv')