In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# DAE : Does Denoising Autoencoder

credits: https://www.kaggle.com/adegladius/tbapril21-data
https://www.kaggle.com/jeongyoonlee/dae-with-2-lines-of-code-with-kaggler

In [2]:
# imports

import lightgbm as lgb
import numpy as np
import pandas as pd
from pathlib import Path
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, confusion_matrix
import warnings



In [3]:
!pip install kaggler

Collecting kaggler
  Downloading Kaggler-0.9.4.tar.gz (820 kB)
[K     |████████████████████████████████| 820 kB 4.3 MB/s eta 0:00:01
Building wheels for collected packages: kaggler
  Building wheel for kaggler (setup.py) ... [?25ldone
[?25h  Created wheel for kaggler: filename=Kaggler-0.9.4-cp37-cp37m-linux_x86_64.whl size=2958497 sha256=1073e588853cb66f9d287a56ee0512a6349dcc75e7d76c10cf9717522a8dd293
  Stored in directory: /root/.cache/pip/wheels/7e/ef/b7/f249348c07943183235167e6208e3a3571cfa96ae2f8218d6c
Successfully built kaggler
Installing collected packages: kaggler
Successfully installed kaggler-0.9.4


In [4]:
import kaggler
from kaggler.model import AutoLGB
from kaggler.preprocessing import DAE, TargetEncoder, LabelEncoder

print(f'Kaggler: {kaggler.__version__}')


Kaggler: 0.9.4


In [5]:
warnings.simplefilter('ignore')
pd.set_option('max_columns', 100)

In [13]:
feature_name = 'dae_te'
algo_name = 'lgb'
model_name = f'{algo_name}_{feature_name}'

data_dir = Path('/kaggle/input/tabular-playground-series-apr-2021/')
trn_file = '../input/tbapril21-data/train_titanic_tb.csv'
tst_file = '../input/tbapril21-data/test_titanic_tb.csv'
sample_file = '../input/tbapril21-data/sample_submission_tb.csv'
pseudo_label_file = '../input/tps-apr-2021-pseudo-label-dae/REMEK-TPS04-FINAL005.csv'

feature_file = f'{feature_name}.csv'
predict_val_file = f'{model_name}.val.txt'
predict_tst_file = f'{model_name}.tst.txt'
submission_file = f'{model_name}.sub.csv'

target_col = 'Survived'
id_col = 'PassengerId'


In [11]:
n_fold = 5
seed = 42
encoding_dim = 64

In [14]:
trn = pd.read_csv(trn_file, index_col=id_col)
tst = pd.read_csv(tst_file, index_col=id_col)
sub = pd.read_csv(sample_file, index_col=id_col)
pseudo_label = pd.read_csv(pseudo_label_file, index_col=id_col)
print(trn.shape, tst.shape, sub.shape, pseudo_label.shape)

(100000, 11) (100000, 10) (100000, 1) (100000, 1)


In [15]:
tst[target_col] = pseudo_label[target_col]
n_trn = trn.shape[0]
df = pd.concat([trn, tst], axis=0)
df.head()


Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,1,1,"Oconnor, Frankie",male,,2,0,209245,27.14,C12239,S
1,0,3,"Bryan, Drew",male,,0,0,27323,13.35,,S
2,0,3,"Owens, Kenneth",male,0.33,1,2,CA 457703,71.29,,S
3,0,3,"Kramer, James",male,19.0,0,0,A. 10866,13.04,,S
4,1,3,"Bond, Michael",male,25.0,0,0,427635,7.76,,S


In [16]:
# Feature engineering code from https://www.kaggle.com/udbhavpangotra/tps-apr21-eda-model

df['Embarked'] = df['Embarked'].fillna('No')
df['Cabin'] = df['Cabin'].fillna('_')
df['CabinType'] = df['Cabin'].apply(lambda x:x[0])
df.Ticket = df.Ticket.map(lambda x:str(x).split()[0] if len(str(x).split()) > 1 else 'X')

df['Age'].fillna(round(df['Age'].median()), inplace=True,)
df['Age'] = df['Age'].apply(round).astype(int)

# Fare, fillna with mean value
fare_map = df[['Fare', 'Pclass']].dropna().groupby('Pclass').median().to_dict()
df['Fare'] = df['Fare'].fillna(df['Pclass'].map(fare_map['Fare']))

df['FirstName'] = df['Name'].str.split(', ').str[0]
df['SecondName'] = df['Name'].str.split(', ').str[1]

df['n'] = 1

gb = df.groupby('FirstName')
df_names = gb['n'].sum()
df['SameFirstName'] = df['FirstName'].apply(lambda x:df_names[x]).fillna(1)

gb = df.groupby('SecondName')
df_names = gb['n'].sum()
df['SameSecondName'] = df['SecondName'].apply(lambda x:df_names[x]).fillna(1)

df['Sex'] = (df['Sex'] == 'male').astype(int)

df['FamilySize'] = df.SibSp + df.Parch + 1

feature_cols = ['Pclass', 'Age','Embarked','Parch','SibSp','Fare','CabinType','Ticket','SameFirstName', 'SameSecondName', 'Sex',
                'FamilySize', 'FirstName', 'SecondName']
cat_cols = ['Pclass','Embarked','CabinType','Ticket', 'FirstName', 'SecondName']
num_cols = [x for x in feature_cols if x not in cat_cols]
print(len(feature_cols), len(cat_cols), len(num_cols))

14 6 8


In [17]:
for col in ['SameFirstName', 'SameSecondName', 'Fare', 'FamilySize', 'Parch', 'SibSp']:
    df[col] = np.log2(1 + df[col])
    
scaler = StandardScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])


Label encoding with rare category grouping and missing value imputation

In [18]:
lbe = LabelEncoder(min_obs=50)
df[cat_cols] = lbe.fit_transform(df[cat_cols]).astype(int)


Target encoding with smoothing and 5-fold cross-validation

In [19]:
cv = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)
te = TargetEncoder(cv=cv)
df_te = te.fit_transform(df[cat_cols], df[target_col])
df_te.columns = [f'te_{col}' for col in cat_cols]
df_te.head()


Unnamed: 0_level_0,te_Pclass,te_Embarked,te_CabinType,te_Ticket,te_FirstName,te_SecondName
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.590054,0.263077,0.725503,0.362868,0.385527,0.318182
1,0.3856,0.3856,0.3856,0.3856,0.3856,0.3856
2,0.23734,0.263239,0.282278,0.269231,0.410853,0.143337
3,0.238793,0.263077,0.284236,0.098983,0.523807,0.146459
4,0.237259,0.26363,0.282163,0.363188,0.385958,0.143773


DAE

In [20]:
dae = DAE(cat_cols=cat_cols, num_cols=num_cols, encoding_dim=encoding_dim)
X = dae.fit_transform(df[feature_cols])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Restoring model weights from the end of the best epoch.
Epoch 00044: early stopping


In [21]:
df_dae = pd.DataFrame(X, columns=[f'dae_{i}' for i in range(encoding_dim)])
print(df_dae.shape)


(200000, 64)


### Part 2: Model Training

AutoLGB for Feature Selection and Hyperparameter Optimization

In [22]:
X = pd.concat([df[feature_cols], df_te, df_dae], axis=1)
y = df[target_col]
X_tst = X.iloc[n_trn:]

p = np.zeros_like(y, dtype=float)
p_tst = np.zeros((tst.shape[0],))
print(f'Training a stacking ensemble LightGBM model:')
for i, (i_trn, i_val) in enumerate(cv.split(X, y)):
    if i == 0:
        clf = AutoLGB(objective='binary', metric='auc', sample_size=len(i_trn), random_state=seed)
        clf.tune(X.iloc[i_trn], y[i_trn])
        features = clf.features
        params = clf.params
        n_best = clf.n_best
        print(f'{n_best}')
        print(f'{params}')
        print(f'{features}')
    
    trn_data = lgb.Dataset(X.iloc[i_trn], y[i_trn])
    val_data = lgb.Dataset(X.iloc[i_val], y[i_val])
    clf = lgb.train(params, trn_data, n_best, val_data, verbose_eval=100)
    p[i_val] = clf.predict(X.iloc[i_val])
    p_tst += clf.predict(X_tst) / n_fold
    print(f'CV #{i + 1} AUC: {roc_auc_score(y[i_val], p[i_val]):.6f}')


Training a stacking ensemble LightGBM model:
100%|██████████| 10/10 [00:35<00:00,  3.54s/trial, best loss: -0.9379150852807484]
100%|██████████| 100/100 [08:38<00:00,  5.18s/trial, best loss: -0.9381988224079066]
429
{'bagging_freq': 1, 'verbosity': -1, 'seed': 42, 'num_threads': -1, 'feature_pre_filter': False, 'objective': 'binary', 'metric': 'auc', 'boosting': 'gbdt', 'bagging_fraction': 0.9, 'feature_fraction': 0.8, 'lambda_l1': 1, 'lambda_l2': 0, 'learning_rate': 0.016462967247240133, 'max_depth': 8, 'min_child_samples': 10, 'num_leaves': 127}
['Sex', 'Embarked', 'te_CabinType', 'Pclass', 'CabinType', 'dae_52', 'te_Ticket', 'dae_41', 'Fare', 'Ticket', 'te_Embarked', 'dae_60', 'te_SecondName', 'Age', 'te_Pclass', 'dae_32', 'dae_18', 'dae_46', 'dae_21', 'dae_6', 'dae_29', 'dae_59', 'dae_26', 'dae_53', 'dae_35', 'dae_5', 'dae_2', 'dae_62', 'dae_14', 'dae_38', 'dae_12', 'dae_55']
[100]	valid_0's auc: 0.937772
[200]	valid_0's auc: 0.938963
[300]	valid_0's auc: 0.939564
[400]	valid_0's 

In [23]:
np.savetxt(predict_val_file, p, fmt='%.6f')
np.savetxt(predict_tst_file, p_tst, fmt='%.6f')


In [24]:
print(f'  CV AUC: {roc_auc_score(y, p):.6f}')
print(f'Test AUC: {roc_auc_score(pseudo_label[target_col], p_tst)}')

  CV AUC: 0.935012
Test AUC: 0.9999363121805189


In [25]:
n_pos = int(0.34911 * tst.shape[0])
th = sorted(p_tst, reverse=True)[n_pos]
print(th)
confusion_matrix(pseudo_label[target_col], (p_tst > th).astype(int))


0.32484361188468314


array([[65069,   586],
       [   20, 34325]])

In [26]:
sub[target_col] = (p_tst > th).astype(int)
sub.to_csv(submission_file)