In [293]:
# 0. Load libraries #

import numpy as np
import pandas as pd
import os, time, warnings
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.svm import SVC
from sklearn.preprocessing import LabelBinarizer, LabelEncoder, OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, cross_val_predict, GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.inspection import permutation_importance
from xgboost import XGBClassifier

pd.set_option('display.max_columns', 20)
pd.set_option('mode.chained_assignment', None)
pd.set_option('display.expand_frame_repr', False)
warnings.filterwarnings('ignore')

def draw_histograms(df, variables, n_rows, n_cols):
    # stolen from https://stackoverflow.com/questions/29530355/plotting-multiple-histograms-in-grid
    fig=plt.figure()
    for i, var_name in enumerate(variables):
        ax=fig.add_subplot(n_rows,n_cols,i+1)
        df[var_name].hist(bins=10,ax=ax)
        ax.set_title(var_name+" Distribution")
    fig.tight_layout()  
    plt.show()

In [294]:
# 1. Load data #

time1 = time.time()

path = '../input/titanic/train.csv'
df = pd.read_csv(path) 
df.drop(columns=['Name', 'Ticket', 'Cabin'],inplace=True)
pred=pd.read_csv('../input/titanic/test.csv')
pred.drop(columns=['Name', 'Ticket', 'Cabin'],inplace=True)

print(df.shape, pred.shape)
df.head()

(891, 9) (418, 8)


Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,male,22.0,1,0,7.25,S
1,2,1,1,female,38.0,1,0,71.2833,C
2,3,1,3,female,26.0,0,0,7.925,S
3,4,1,1,female,35.0,1,0,53.1,S
4,5,0,3,male,35.0,0,0,8.05,S


In [295]:
# 2. pEDA #

df.Survived.value_counts()
#df.Age.hist()
#sns.scatterplot(x='Age', y='Survived', data=df)

0    549
1    342
Name: Survived, dtype: int64

In [296]:
df.Parch.value_counts()

0    678
1    118
2     80
5      5
3      5
4      4
6      1
Name: Parch, dtype: int64

In [297]:
# 3. Train-test split #

df['Age2'] = df['Age']**2
df.drop(columns = ['PassengerId'], inplace = True)
train_y = df[['Survived']]
train_x = df.drop(columns = ['Survived'])

#bin_cols = [col for col in train_x.columns if train_x[col].nunique()==2]
cat_cols = [col for col in train_x.columns if train_x[col].nunique() in range(2,10)]
num_cols = list(set(train_x.columns)-set(cat_cols))

print('categorical features: ', cat_cols, 'numerical features: ', num_cols)

X_train, X_test, y_train, y_test = train_test_split(train_x, train_y, test_size = 0.2, random_state=4)
print(X_train.shape, X_test.shape, y_train.shape)

X_train.info()

categorical features:  ['Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked'] numerical features:  ['Age', 'Fare', 'Age2']
(712, 8) (179, 8) (712, 1)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 712 entries, 42 to 122
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    712 non-null    int64  
 1   Sex       712 non-null    object 
 2   Age       570 non-null    float64
 3   SibSp     712 non-null    int64  
 4   Parch     712 non-null    int64  
 5   Fare      712 non-null    float64
 6   Embarked  710 non-null    object 
 7   Age2      570 non-null    float64
dtypes: float64(3), int64(3), object(2)
memory usage: 50.1+ KB


In [298]:
# 4. Misisng values #

# add dummy for missing Age
X_train.loc[X_train['Age'].isnull(),'misAge']=1
X_train.loc[X_train['Age'].notnull(),'misAge']=0
X_test.loc[X_test['Age'].isnull(),'misAge']=1
X_test.loc[X_test['Age'].notnull(),'misAge']=0

X_train[num_cols] = X_train[num_cols].fillna(value=X_train[num_cols].median())
X_test[num_cols] = X_test[num_cols].fillna(value=X_train[num_cols].median())
X_train[cat_cols] = X_train[cat_cols].fillna(value=X_train[cat_cols].mode().iloc[0])
X_test[cat_cols] = X_test[cat_cols].fillna(value=X_train[cat_cols].mode().iloc[0])
X_train.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 712 entries, 42 to 122
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    712 non-null    int64  
 1   Sex       712 non-null    object 
 2   Age       712 non-null    float64
 3   SibSp     712 non-null    int64  
 4   Parch     712 non-null    int64  
 5   Fare      712 non-null    float64
 6   Embarked  712 non-null    object 
 7   Age2      712 non-null    float64
 8   misAge    712 non-null    float64
dtypes: float64(4), int64(3), object(2)
memory usage: 55.6+ KB


In [301]:
# extra feature engineering (manual)

_,bin = pd.qcut(X_train.Age, 15, retbins = True, labels = False, duplicates = 'drop')
X_train['AgeDecile'] = pd.cut(X_train.Age, labels=False, duplicates = 'drop', bins = bin ,include_lowest = True)
X_test['AgeDecile'] = pd.cut(X_test.Age, labels=False, duplicates = 'drop', bins = bin ,include_lowest = True)
#X_train.Age_decile.value_counts()

_,bin = pd.qcut(X_train.SibSp, 30, retbins = True, labels = False, duplicates = 'drop')
X_train['SibspNtile'] = pd.cut(X_train.SibSp, labels=False, duplicates = 'drop', bins = bin ,include_lowest = True)
X_test['SibspNtile'] = pd.cut(X_test.SibSp, labels=False, duplicates = 'drop', bins = bin ,include_lowest = True)

_,bin = pd.qcut(X_train.Parch, 60, retbins = True, labels = False, duplicates = 'drop')
X_train['ParchNtile'] = pd.cut(X_train.Parch, labels=False, duplicates = 'drop', bins = bin ,include_lowest = True)
X_test['ParchNtile'] = pd.cut(X_test.Parch, labels=False, duplicates = 'drop', bins = bin ,include_lowest = True)

cat_cols.extend(['misAge', 'AgeDecile', 'SibspNtile', 'ParchNtile'])
cat_cols = list(set(cat_cols)-set(['SibSp', 'Parch']))


In [302]:
cat_cols

['Embarked',
 'Pclass',
 'AgeDecile',
 'misAge',
 'Sex',
 'SibspNtile',
 'ParchNtile']

In [303]:
# 5.1 Feature engineering, dealing with skew #

skewed_vars = list(X_train.skew()[abs(X_train.skew())>3].index)

for col in X_train.columns:
    if (col in skewed_vars) and (col in num_cols):
        X_train[col] = np.log1p(X_train[col])
        X_test[col] = np.log1p(X_test[col])

In [304]:
X_train

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Age2,misAge,AgeDecile,SibspNtile,ParchNtile
42,3,male,28.5,0,0,2.185579,C,812.25,1.0,5,0,0
684,2,male,60.0,1,1,3.688879,S,3600.00,0.0,11,0,0
605,3,male,36.0,1,0,2.806386,S,1296.00,0.0,8,0,0
409,3,female,28.5,3,1,3.275887,S,812.25,1.0,5,2,0
740,1,male,28.5,0,0,3.433987,S,812.25,1.0,5,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
360,3,male,40.0,1,4,3.363842,S,1600.00,0.0,9,0,3
709,3,male,28.5,1,1,2.787834,C,812.25,1.0,5,0,0
439,2,male,31.0,0,0,2.442347,S,961.00,0.0,6,0,0
174,1,male,56.0,0,0,3.456184,C,3136.00,0.0,11,0,0


In [305]:
# 5.2 Feature engineering #

# in general, if I plan using raw ols, I should drop one group. o/w, it is beteer to leabe all ohc groups.

feature_transformer = ColumnTransformer([
    ("num", StandardScaler(), num_cols),
    ("cat", OneHotEncoder(sparse = False, handle_unknown="ignore"), cat_cols),
    ])

X_train = pd.DataFrame(feature_transformer.fit_transform(X_train), columns=feature_transformer.get_feature_names_out())
X_test = pd.DataFrame(feature_transformer.transform(X_test), columns=feature_transformer.get_feature_names_out())

fewfeatures = ['num__Age', 'num__Age2', 'num__Fare', 'cat__Sex_male', 'cat__Pclass_2', 'cat__Pclass_3']

X_train

Unnamed: 0,num__Age,num__Fare,num__Age2,cat__Embarked_C,cat__Embarked_Q,cat__Embarked_S,cat__Pclass_1,cat__Pclass_2,cat__Pclass_3,cat__AgeDecile_0,...,cat__Sex_female,cat__Sex_male,cat__SibspNtile_0,cat__SibspNtile_1,cat__SibspNtile_2,cat__SibspNtile_3,cat__ParchNtile_0,cat__ParchNtile_1,cat__ParchNtile_2,cat__ParchNtile_3
0,-0.093207,-0.815590,-0.265187,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,2.318849,0.720712,2.797115,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.481092,-0.181155,0.266205,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,-0.093207,0.298654,-0.265187,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
4,-0.093207,0.460224,-0.265187,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
707,0.787384,0.388539,0.600144,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
708,-0.093207,-0.200114,-0.265187,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
709,0.098226,-0.553186,-0.101788,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
710,2.012556,0.482909,2.287418,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [307]:
# 6. Fit models #

lr = LogisticRegression()
param_grid = {'C':[1, 3, 10, 30, 100]}
lrm = GridSearchCV(lr, param_grid, cv=8)
lrm.fit(X_train, y_train)
print('Logistic ', lrm.best_params_, accuracy_score(y_train, lrm.predict(X_train)))

Logistic  {'C': 30} 0.8146067415730337


In [308]:
svm = SVC()
param_grid = {'C':[0.3, 1, 3, 10]}
svmm = GridSearchCV(svm, param_grid, cv=8)
svmm.fit(X_train, y_train)
print('SVM ', svmm.best_params_, accuracy_score(y_train, svmm.predict(X_train)))

SVM  {'C': 1} 0.8384831460674157


In [316]:
knn = KNeighborsClassifier()
param_grid = dict(n_neighbors=range(2,20))
knnm = GridSearchCV(knn, param_grid, cv=4)
knnm.fit(X_train[fewfeatures], y_train)
print('KNN ', knnm.best_params_, accuracy_score(y_train, knnm.predict(X_train[fewfeatures])))

KNN  {'n_neighbors': 5} 0.8637640449438202


In [317]:
time1 = time.time()
rf = RandomForestClassifier()
param_grid = {'n_estimators':[100,200], 'max_depth':[2,4,6,8], 'max_features':[4,5,6]}
rfm = GridSearchCV(rf, param_grid, cv=4)
rfm.fit(X_train, y_train)
print('RF ', rfm.best_params_, accuracy_score(y_train, rfm.predict(X_train)), time.time()-time1)

RF  {'max_depth': 8, 'max_features': 4, 'n_estimators': 100} 0.8806179775280899 27.724525690078735


In [309]:
time1 = time.time()
xgb = XGBClassifier()
# use 'gpu_hist' for more than 100,000 examples.
param_grid = {'n_estimators':[100,200], 'max_depth':[2,4,6], 'eta':[0.03, 0.05], 'subsample':[0.4, 0.6],
             'colsample_bytree':[0.6, 0.8]}
xgbm = GridSearchCV(xgb, param_grid, cv=2)
xgbm.fit(X_train, y_train)
print('XGB ', xgbm.best_params_, accuracy_score(y_train, xgbm.predict(X_train)), time.time()-time1)

XGB  {'colsample_bytree': 0.8, 'eta': 0.03, 'max_depth': 4, 'n_estimators': 100, 'subsample': 0.6} 0.8623595505617978 10.90445351600647


In [319]:
# 7. accuracy #

print('Logistic ', accuracy_score(y_test, lrm.predict(X_test)))
print('SVM ', accuracy_score(y_test, svmm.predict(X_test)))
print('KNN ', accuracy_score(y_test, knnm.predict(X_test[fewfeatures])))
print('RF ', accuracy_score(y_test, rfm.predict(X_test)))
print('XGB ', accuracy_score(y_test, xgbm.predict(X_test)))


Logistic  0.8491620111731844
SVM  0.8491620111731844
KNN  0.8324022346368715
RF  0.8435754189944135
XGB  0.8324022346368715


In [310]:
# 8. feature importance #

results = permutation_importance(xgbm, X_test, y_test, scoring='accuracy', n_jobs=-1)
fi_lr = pd.DataFrame({'col':X_test.columns, 'FI':results.importances_mean})
fi_lr.sort_values('FI', ascending = False)

Unnamed: 0,col,FI
23,cat__Sex_female,0.1150838
0,num__Age,0.03910615
8,cat__Pclass_3,0.03798883
25,cat__SibspNtile_0,0.0122905
15,cat__AgeDecile_6,0.0
31,cat__ParchNtile_2,0.0
30,cat__ParchNtile_1,0.0
28,cat__SibspNtile_3,0.0
27,cat__SibspNtile_2,0.0
26,cat__SibspNtile_1,0.0
