In [1]:
import time
import warnings

import lightgbm as lgbm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer
from sklearn.impute import SimpleImputer
from sklearn.metrics import log_loss, accuracy_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder

warnings.filterwarnings('ignore')

In [2]:
# Read the Titanic train/test CSV splits from the local data directory.
train_csv_path = "data/titanic_data/train.csv"
test_csv_path = "data/titanic_data/test.csv"

train = pd.read_csv(train_csv_path)
test = pd.read_csv(test_csv_path)

In [3]:
# Tag every row with its split of origin so the stacked frame can be separated
# again after the shared preprocessing. The flag is written onto the raw
# frames themselves, so `train` and `test` each gain an "isTrain" column.
for frame, is_train_flag in ((train, True), (test, False)):
    frame["isTrain"] = is_train_flag

# Stack both splits into one frame with a fresh 0..n-1 index.
tt = pd.concat([train, test], ignore_index=True)

In [4]:
# Preview the first rows of the combined train+test frame.
tt.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,isTrain
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,True
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,True
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,True
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,True
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,True


In [5]:
# Encode Sex as 0 (male) / 1 (female).
# The identity entries (0 -> 0, 1 -> 1) make this cell safe to re-run:
# Series.map turns every value that is missing from the dict into NaN, so
# applying a two-entry mapping to an already-encoded column would silently
# wipe it out.
SEX_CODES = {"male": 0, "female": 1, 0: 0, 1: 1}
tt["Sex"] = tt["Sex"].map(SEX_CODES)

In [6]:
# Model features, split into categorical and numeric groups.
inputs = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare"]
cat_inputs = ["Pclass", "Sex"]
num_inputs = [name for name in inputs if name not in cat_inputs]

# Positions of the categorical features within the `inputs` column order.
feature_columns = train[inputs].columns
cat_idx = list(map(feature_columns.get_loc, cat_inputs))

# Give the categorical features a pandas "category" dtype in the combined frame.
for col in cat_inputs:
    tt[col] = tt[col].astype("category")

tt[inputs].head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare
0,3,0,22.0,1,0,7.25
1,1,1,38.0,1,0,71.2833
2,3,1,26.0,0,0,7.925
3,1,1,35.0,1,0,53.1
4,3,0,35.0,0,0,8.05


In [7]:
# (rows, columns) of the raw train split, the raw test split and the combined frame.
tuple(frame.shape for frame in (train, test, tt))

((891, 13), (418, 12), (1309, 13))

In [8]:
# Number of missing values per model feature across train and test combined.
tt.loc[:, inputs].isna().sum()

Pclass      0
Sex         0
Age       263
SibSp       0
Parch       0
Fare        1
dtype: int64

In [9]:
# Categorical features: fill gaps with the most frequent value, then encode
# each category as an integer code.
# OrdinalEncoder is the feature-matrix encoder; LabelEncoder is meant for the
# target vector only (its fit/transform accept a single 1-D `y`), so it raises
# a TypeError when a Pipeline calls it with (X, y).
cat_preproc_pipe = Pipeline([
    ("imp", SimpleImputer(strategy="most_frequent")),
    ("encoder", OrdinalEncoder()),
])

# Numeric features: model-based imputation of the missing values.
num_preproc_pipe = Pipeline([("imp", IterativeImputer())])

In [10]:
# Send each feature group through its own preprocessing pipeline.
# NOTE(review): `preprocessor` is not referenced by the later cells shown in
# this notebook; the baseline pipeline uses a bare IterativeImputer instead.
column_transformers = [
    ("num", num_preproc_pipe, num_inputs),
    ("cat", cat_preproc_pipe, cat_inputs),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

In [11]:
# Baseline LightGBM classifier with library-default hyperparameters.
# NOTE(review): no random_state is set here — confirm results are reproducible
# across runs before comparing scores.
clf = lgbm.LGBMClassifier()

In [12]:
# Separate the combined frame back into its two splits using the isTrain flag.
# From here on `train` / `test` refer to these processed rows (encoded Sex,
# categorical dtypes), not to the raw frames read from CSV.
is_train_mask = tt["isTrain"]
train = tt.loc[is_train_mask]
test = tt.loc[~is_train_mask]

In [13]:
# Baseline model: impute missing values across all input columns, then fit
# the LightGBM classifier on the imputed matrix.
baseline_steps = [
    ("imp", IterativeImputer()),
    ("clf", clf),
]
baseline_pipe = Pipeline(steps=baseline_steps)

In [14]:
%%time
fit_params = {"clf__categorical_feature":cat_inputs,
              "clf__feature_name":inputs}
baseline_pipe.fit(train[inputs], y=train["Survived"], **fit_params)

preds = baseline_pipe.predict_proba(train[inputs])[:,1]
#print(preds)
loss = log_loss(train["Survived"], preds)
roc_auc = roc_auc_score(train["Survived"], preds)
#accuracy = accuracy_score(train["Survived"], preds)
print(f"logloss: {loss:.5f}")
print(f"roc_auc: {roc_auc:.5f}")


logloss: 0.15894
roc_auc: 0.99093
CPU times: total: 1.17 s
Wall time: 297 ms


In [15]:
# Stratified K-fold cross-validation of the baseline pipeline.
# Each fold refits `baseline_pipe` on the training part and reports logloss
# and ROC AUC on the held-out part; per-fold logloss is collected in `scores`.
N_SPLITS = 7
strat_kf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=1121218)

# The fit keywords are the same for every fold, so build them once.
# NOTE(review): no eval_set is passed to fit, so "clf__eval_metric" presumably
# has no effect during training — confirm, or wire up a validation set.
fit_params = {"clf__categorical_feature": cat_inputs,
              "clf__feature_name": inputs,
              "clf__eval_metric": "binary_logloss"}

scores = np.empty(N_SPLITS)
X, y = train[inputs], train["Survived"]
for idx, (train_idx, test_idx) in enumerate(strat_kf.split(X, y)):
    print("=" * 12 + f"Training fold {idx}" + 12 * "=")
    start = time.time()

    # split() yields positional indices, so both X and y must be sliced with
    # .iloc; plain y[...] is a label lookup and is only correct while the
    # index happens to be 0..n-1.
    X_train, X_val = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[test_idx]

    baseline_pipe.fit(X_train, y_train, **fit_params)

    preds = baseline_pipe.predict_proba(X_val)[:, 1]
    loss = log_loss(y_val, preds)
    roc_auc = roc_auc_score(y_val, preds)
    scores[idx] = loss  # record the fold's out-of-fold logloss
    print(f"logloss: {loss:.5f}")
    print(f"roc_auc: {roc_auc:.5f}")
    runtime = time.time() - start
    print(f"Fold {idx} finished with score: {loss:.5f} in {runtime:.2f} seconds.\n")

print(f"Mean logloss over {N_SPLITS} folds: {scores.mean():.5f} (std {scores.std():.5f})")

logloss: 0.61809
roc_auc: 0.83066
Fold 0 finished with score: 0.61809 in 0.19 seconds.

logloss: 0.39990
roc_auc: 0.90429
Fold 1 finished with score: 0.39990 in 0.14 seconds.

logloss: 0.31262
roc_auc: 0.92524
Fold 2 finished with score: 0.31262 in 0.17 seconds.

logloss: 0.44455
roc_auc: 0.87781
Fold 3 finished with score: 0.44455 in 0.17 seconds.

logloss: 0.52153
roc_auc: 0.84563
Fold 4 finished with score: 0.52153 in 0.18 seconds.

logloss: 0.51910
roc_auc: 0.84956
Fold 5 finished with score: 0.51910 in 0.17 seconds.

logloss: 0.41729
roc_auc: 0.88161
Fold 6 finished with score: 0.41729 in 0.18 seconds.

