In [8]:
# 0. Load libraries #

import numpy as np
import pandas as pd
import os, time, warnings
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.svm import SVC
from sklearn.preprocessing import LabelBinarizer, LabelEncoder, OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, cross_val_predict, GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.inspection import permutation_importance
from xgboost import XGBClassifier

pd.set_option('display.max_columns', 20)
pd.set_option('mode.chained_assignment', None)
pd.set_option('display.expand_frame_repr', False)
warnings.filterwarnings('ignore')

def draw_histograms(df, variables, n_rows, n_cols):
    # stolen from https://stackoverflow.com/questions/29530355/plotting-multiple-histograms-in-grid
    fig=plt.figure()
    for i, var_name in enumerate(variables):
        ax=fig.add_subplot(n_rows,n_cols,i+1)
        df[var_name].hist(bins=10,ax=ax)
        ax.set_title(var_name+" Distribution")
    fig.tight_layout()  
    plt.show()

# 1. Load data #

time0 = time.time()

path = '../input/titanic/train.csv'
df = pd.read_csv(path) 
df.drop(columns=['Name', 'Ticket', 'Cabin'],inplace=True)
pred=pd.read_csv('../input/titanic/test.csv')
pred.drop(columns=['Name', 'Ticket', 'Cabin'],inplace=True)

print(df.shape, pred.shape)
#df.head()

# 2. pEDA #

#df.Survived.value_counts()

# 3. Train-test split #

df['Age2'] = df['Age']**2
df.drop(columns = ['PassengerId'], inplace = True)
train_y = df[['Survived']]
train_x = df.drop(columns = ['Survived'])

#bin_cols = [col for col in train_x.columns if train_x[col].nunique()==2]
cat_cols = [col for col in train_x.columns if train_x[col].nunique() in range(2,10)]
num_cols = list(set(train_x.columns)-set(cat_cols))

print('categorical features: ', cat_cols, 'numerical features: ', num_cols)

X_train, X_test, y_train, y_test = train_test_split(train_x, train_y, test_size = 0.2, random_state=9)
print(X_train.shape, X_test.shape, y_train.shape)

X_train.info()

# 4. Misisng values #

# add dummy for missing Age
X_train.loc[X_train['Age'].isnull(),'misAge']=1
X_train.loc[X_train['Age'].notnull(),'misAge']=0
X_test.loc[X_test['Age'].isnull(),'misAge']=1
X_test.loc[X_test['Age'].notnull(),'misAge']=0

X_train[num_cols] = X_train[num_cols].fillna(value=X_train[num_cols].median())
X_test[num_cols] = X_test[num_cols].fillna(value=X_train[num_cols].median())
X_train[cat_cols] = X_train[cat_cols].fillna(value=X_train[cat_cols].mode().iloc[0])
X_test[cat_cols] = X_test[cat_cols].fillna(value=X_train[cat_cols].mode().iloc[0])
X_train.info()


# extra feature engineering (manual)

_,bin = pd.qcut(X_train.Age, 15, retbins = True, labels = False, duplicates = 'drop')
X_train['AgeDecile'] = pd.cut(X_train.Age, labels=False, duplicates = 'drop', bins = bin ,include_lowest = True)
X_test['AgeDecile'] = pd.cut(X_test.Age, labels=False, duplicates = 'drop', bins = bin ,include_lowest = True)
#X_train.Age_decile.value_counts()

_,bin = pd.qcut(X_train.SibSp, 30, retbins = True, labels = False, duplicates = 'drop')
X_train['SibspNtile'] = pd.cut(X_train.SibSp, labels=False, duplicates = 'drop', bins = bin ,include_lowest = True)
X_test['SibspNtile'] = pd.cut(X_test.SibSp, labels=False, duplicates = 'drop', bins = bin ,include_lowest = True)

_,bin = pd.qcut(X_train.Parch, 60, retbins = True, labels = False, duplicates = 'drop')
X_train['ParchNtile'] = pd.cut(X_train.Parch, labels=False, duplicates = 'drop', bins = bin ,include_lowest = True)
X_test['ParchNtile'] = pd.cut(X_test.Parch, labels=False, duplicates = 'drop', bins = bin ,include_lowest = True)

cat_cols.extend(['misAge', 'AgeDecile', 'SibspNtile', 'ParchNtile'])
cat_cols = list(set(cat_cols)-set(['SibSp', 'Parch']))


# 5.1 Feature engineering, dealing with skew #

skewed_vars = list(X_train.skew()[abs(X_train.skew())>3].index)

for col in X_train.columns:
    if (col in skewed_vars) and (col in num_cols):
        X_train[col] = np.log1p(X_train[col])
        X_test[col] = np.log1p(X_test[col])

# 5.2 Feature engineering #

# in general, if I plan using raw ols, I should drop one group. o/w, it is beteer to leabe all ohc groups.

feature_transformer = ColumnTransformer([
    ("num", StandardScaler(), num_cols),
    ("cat", OneHotEncoder(sparse = False, handle_unknown="ignore"), cat_cols),
    ])

X_train = pd.DataFrame(feature_transformer.fit_transform(X_train), columns=feature_transformer.get_feature_names_out())
X_test = pd.DataFrame(feature_transformer.transform(X_test), columns=feature_transformer.get_feature_names_out())

fewfeatures = ['num__Age', 'num__Age2', 'num__Fare', 'cat__Sex_male', 'cat__Pclass_2', 'cat__Pclass_3']

X_train

# 6. Fit models #

lr = LogisticRegression()
param_grid = {'C':[0.3, 1, 3, 10, 30, 100]}
lrm = GridSearchCV(lr, param_grid, cv=8)
lrm.fit(X_train, y_train)
print('Logistic ', lrm.best_params_, accuracy_score(y_train, lrm.predict(X_train)))

svm = SVC()
param_grid = {'C':[0.3, 1, 3, 10]}
svmm = GridSearchCV(svm, param_grid, cv=8)
svmm.fit(X_train, y_train)
print('SVM ', svmm.best_params_, accuracy_score(y_train, svmm.predict(X_train)))

knn = KNeighborsClassifier()
param_grid = dict(n_neighbors=range(2,20))
knnm = GridSearchCV(knn, param_grid, cv=8)
knnm.fit(X_train[fewfeatures], y_train)
print('KNN ', knnm.best_params_, accuracy_score(y_train, knnm.predict(X_train[fewfeatures])))

time1 = time.time()
rf = RandomForestClassifier()
param_grid = {'n_estimators':[100,200], 'max_depth':[2,4,6,8], 'max_features':[4,5,6]}
rfm = GridSearchCV(rf, param_grid, cv=4)
rfm.fit(X_train, y_train)
print('RF ', rfm.best_params_, accuracy_score(y_train, rfm.predict(X_train)), time.time()-time1)

time1 = time.time()
xgb = XGBClassifier()
# use 'gpu_hist' for more than 100,000 examples.
param_grid = {'n_estimators':[100,200], 'max_depth':[2,4,6], 'eta':[0.03, 0.05], 'subsample':[0.4, 0.6],
             'colsample_bytree':[0.6, 0.8]}
xgbm = GridSearchCV(xgb, param_grid, cv=2)
xgbm.fit(X_train, y_train)
print('XGB ', xgbm.best_params_, accuracy_score(y_train, xgbm.predict(X_train)), time.time()-time1)

# 7. accuracy #

print('Out of Sample:')
print('Logistic ', accuracy_score(y_test, lrm.predict(X_test)))
print('SVM ', accuracy_score(y_test, svmm.predict(X_test)))
print('KNN ', accuracy_score(y_test, knnm.predict(X_test[fewfeatures])))
print('RF ', accuracy_score(y_test, rfm.predict(X_test)))
print('XGB ', accuracy_score(y_test, xgbm.predict(X_test)))
print('Total time ', time.time()-time0)

# VotingClassifier:

estimator = []
estimator.append(('LR', LogisticRegression(C=10)))
estimator.append(('SVM', SVC(C=1, probability = True)))
#estimator.append(('KNN', KNeighborsClassifier(n_neighbors=4)))
estimator.append(('RF', RandomForestClassifier(max_depth=6, max_features=4, n_estimators=200)))
estimator.append(('XGB', XGBClassifier(eta=0.4, max_depth=4, n_estimators=200, 
                                       subsample=0.6, colsample_bytree=0.8)))
vot_soft = VotingClassifier(estimators = estimator, voting ='soft')
vot_soft.fit(X_train, y_train)
print('VotingClassifier5 ', accuracy_score(y_train, vot_soft.predict(X_train)))
print('VotingClassifier5 ', accuracy_score(y_test, vot_soft.predict(X_test)))


In [310]:
# 8. feature importance #

results = permutation_importance(xgbm, X_test, y_test, scoring='accuracy', n_jobs=-1)
fi_lr = pd.DataFrame({'col':X_test.columns, 'FI':results.importances_mean})
fi_lr.sort_values('FI', ascending = False)

Unnamed: 0,col,FI
23,cat__Sex_female,0.1150838
0,num__Age,0.03910615
8,cat__Pclass_3,0.03798883
25,cat__SibspNtile_0,0.0122905
15,cat__AgeDecile_6,0.0
31,cat__ParchNtile_2,0.0
30,cat__ParchNtile_1,0.0
28,cat__SibspNtile_3,0.0
27,cat__SibspNtile_2,0.0
26,cat__SibspNtile_1,0.0
