In [20]:
import lightgbm as lgbm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import log_loss, accuracy_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
import time
import warnings
warnings.filterwarnings('ignore')
import scipy.stats as scs

In [2]:
train = pd.read_csv("data/titanic_data/train.csv")
test = pd.read_csv("data/titanic_data/test.csv")

In [3]:
train["isTrain"] = True
test["isTrain"] = False

tt = pd.concat([train, test]).reset_index(drop=True).copy()

In [4]:
tt.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,isTrain
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,True
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,True
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,True
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,True
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,True


In [5]:
#tt["Sex"] = tt["Sex"].map({"male": 0, "female": 1})

In [6]:
inputs = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]
cat_inputs = ["Pclass", "Sex", "Embarked"]
num_inputs = ["Age", "SibSp", "Parch", "Fare"]

cat_idx = [train[inputs].columns.get_loc(col) for col in cat_inputs]

# Convert cat_features to pd.Categorical dtype
for col in cat_inputs:
    tt[col] = pd.Categorical(tt[col])

tt[inputs].head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,22.0,1,0,7.25,S
1,1,female,38.0,1,0,71.2833,C
2,3,female,26.0,0,0,7.925,S
3,1,female,35.0,1,0,53.1,S
4,3,male,35.0,0,0,8.05,S


In [7]:
train.shape, test.shape, tt.shape

((891, 13), (418, 12), (1309, 13))

In [8]:
tt[inputs].isna().sum()

Pclass        0
Sex           0
Age         263
SibSp         0
Parch         0
Fare          1
Embarked      2
dtype: int64

In [9]:
cat_preproc_pipe = Pipeline([("imp", SimpleImputer(strategy="most_frequent")), ("encoder", OrdinalEncoder())])
num_preproc_pipe = Pipeline([("imp", IterativeImputer())])

In [10]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_preproc_pipe, num_inputs),
        ('cat', cat_preproc_pipe, cat_inputs)
    ]
)

In [36]:
clf = lgbm.LGBMClassifier(objective="binary",
                          random_state=42)

In [14]:
train = tt[tt["isTrain"]]
test = tt[~tt["isTrain"]]

In [15]:
# baseline_pipe = Pipeline([("imp", IterativeImputer()),
#                           ("clf", clf)])

baseline_pipe = Pipeline([
    ('imp', preprocessor),
    ('clf', clf)
])

In [16]:
%%time
fit_params = {"clf__categorical_feature":cat_inputs,
              "clf__feature_name":inputs}
baseline_pipe.fit(train[inputs], y=train["Survived"], **fit_params)

preds = baseline_pipe.predict_proba(train[inputs])[:,1]
#print(preds)
loss = log_loss(train["Survived"], preds)
roc_auc = roc_auc_score(train["Survived"], preds)
#accuracy = accuracy_score(train["Survived"], preds)
print(f"logloss: {loss:.5f}")
print(f"roc_auc: {roc_auc:.5f}")


logloss: 0.17711
roc_auc: 0.98830
CPU times: total: 2.3 s
Wall time: 520 ms


In [47]:
%%time
lgbm_hpsearch = {
    'clf__max_depth': scs.randint(1, 10),
    'clf__num_leaves': scs.randint(10, 100),
    'clf__n_estimators': scs.randint(10, 500)
}

lgbm_CV = RandomizedSearchCV(baseline_pipe,
                             lgbm_hpsearch,
                             cv=5,
                             scoring='roc_auc',
                             n_iter=1000,
                             #verbose=2,
                             random_state=42,)

lgbm_CV.fit(train[inputs], train["Survived"], 
            clf__categorical_feature=cat_inputs,
           clf__feature_name=inputs)
                             

CPU times: total: 2h 20min 15s
Wall time: 22min 12s


In [48]:
print(lgbm_CV.best_params_)
print(f"AUC: {round(lgbm_CV.best_score_, 3)}")

{'clf__max_depth': 2, 'clf__n_estimators': 477, 'clf__num_leaves': 84}
AUC: 0.858
