### This is Prod script, building and deploying simple XGB model for Titanic

Since it uses only RF and XGBoost, it is simpler than dev script for this project


In [25]:
# 0. Load libraries #

import numpy as np
import pandas as pd
import os, time, warnings, optuna

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.preprocessing import LabelBinarizer, LabelEncoder, OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import cross_val_score, cross_val_predict, GridSearchCV, train_test_split, KFold
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, precision_recall_curve, auc
from sklearn.metrics import mean_squared_error, mean_absolute_error, roc_auc_score
from xgboost import XGBClassifier

pd.set_option('display.max_columns', 20)
warnings.filterwarnings('ignore')

# Load custom pre-processing functions:

def fillna_mp_i1(df_train, df_test, df_pred, num_features, cat_features, num_fill='median', cat_fill='mode'):
    """This function speeds up filling missing values for 3 main datasets using different imputation methods.
    Later may replace it with some subclass.
    Example: fillna_mp_i1(X_train, X_test, X_pred, num_cols, cat_cols)"""
    # set df_pred to None if it does not exist
    if not ((cat_fill=='mode') and (num_fill=='median')):
        print ('Imputation method not Implemented yet!')
        return None
    
    df_train[num_features] = df_train[num_features].fillna(value=df_train[num_features].median())
    df_test[num_features] = df_test[num_features].fillna(value=df_train[num_features].median())
    df_train[cat_features] = df_train[cat_features].fillna(value=df_train[cat_features].mode().iloc[0])
    df_test[cat_features] = df_test[cat_features].fillna(value=df_train[cat_features].mode().iloc[0])
    if (df_pred is not None):
        df_pred[num_features] = df_pred[num_features].fillna(value=df_train[num_features].median())
        df_pred[cat_features] = df_pred[cat_features].fillna(value=df_train[cat_features].mode().iloc[0])
    df_train[num_features+cat_features].count
    
    all_good = (
    (np.prod(df_train[num_features+cat_features].shape)==df_train[num_features+cat_features].count().sum()) and 
    (np.prod(df_test[num_features+cat_features].shape) == df_test[num_features+cat_features].count().sum()) and 
    (np.prod(df_pred[num_features+cat_features].shape) == df_pred[num_features+cat_features].count().sum()))
    if (all_good):
        print('Missing values imputed successfully')
    else:
        print('There are still some missing values...')
    
def add_misDummy_mp_i1(df_train, df_test, df_pred, features):
    """This function creates new dummy columns for missing features.
    Example: add_misDummy_mp_i1(X_train, X_test, X_pred, ['Age'])"""
    # set df_pred to None if it does not exist
    for feature_name in features:
        misColName = 'mis'+feature_name
        df_train.loc[df_train[feature_name].isnull(), misColName]=1
        df_train.loc[df_train[feature_name].notnull(), misColName]=0
        df_test.loc[df_test[feature_name].isnull(), misColName]=1
        df_test.loc[df_test[feature_name].notnull(), misColName]=0
        if (df_pred is not None):
            df_pred.loc[df_pred[feature_name].isnull(), misColName]=1
            df_pred.loc[df_pred[feature_name].notnull(), misColName]=0
   


In [26]:
# 1. Load data #

time0 = time.time()

os.chdir('/home/jupyter/projects_data/titanic')
df = pd.read_csv('train.csv') 

df.drop(columns=['Name', 'Ticket', 'Cabin', 'PassengerId'],inplace=True)
pred = pd.read_csv('test.csv')
pred0 = pred.copy()
pred.drop(columns=['Name', 'Ticket', 'Cabin', 'PassengerId'],inplace=True)

print(df.shape, pred.shape)
display(df.head())

(891, 8) (418, 7)


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [27]:
# 2. EDA, adding features #

df['Age2'] = df['Age']**2
pred['Age2'] = pred['Age']**2

# 3. Train-test split #

train_y = df[['Survived']]
train_x = df.drop(columns = ['Survived'])
X_pred = pred.copy()

cat_cols = ['Sex', 'Embarked']
num_cols = list(set(train_x.columns)-set(cat_cols))

print('categorical features: ', cat_cols, 'numerical features: ', num_cols)

X_train, X_test, y_train, y_test = train_test_split(train_x, train_y, test_size = 0.2, random_state=1)
print(X_train.shape, X_test.shape, y_train.shape, X_pred.shape)

X_train.info()

categorical features:  ['Sex', 'Embarked'] numerical features:  ['Parch', 'Pclass', 'Age2', 'SibSp', 'Fare', 'Age']
(801, 8) (90, 8) (801, 1) (418, 8)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 801 entries, 825 to 863
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    801 non-null    int64  
 1   Sex       801 non-null    object 
 2   Age       646 non-null    float64
 3   SibSp     801 non-null    int64  
 4   Parch     801 non-null    int64  
 5   Fare      801 non-null    float64
 6   Embarked  799 non-null    object 
 7   Age2      646 non-null    float64
dtypes: float64(3), int64(3), object(2)
memory usage: 56.3+ KB


In [28]:
X_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Age2
825,3,male,,0,0,6.95,Q,
8,3,female,27.0,0,2,11.1333,S,729.0
689,1,female,15.0,0,1,211.3375,S,225.0
513,1,female,54.0,1,0,59.4,C,2916.0
729,3,female,25.0,1,0,7.925,S,625.0


In [30]:
# 4. Misisng values #

add_misDummy_mp_i1(X_train, X_test, X_pred, ['Age'])

fillna_mp_i1(X_train, X_test, X_pred, num_cols, cat_cols)
#[X_train.count(), X_test.count(), X_pred.count()]

cat_cols.extend(['misAge'])

feature_transformer = ColumnTransformer([
        ("cat", OneHotEncoder(sparse = False, handle_unknown="ignore", drop='if_binary'), cat_cols)],
        remainder = "passthrough"
    )

X_train = pd.DataFrame(feature_transformer.fit_transform(X_train), columns=feature_transformer.get_feature_names_out())
X_test = pd.DataFrame(feature_transformer.transform(X_test), columns=feature_transformer.get_feature_names_out())
X_pred = pd.DataFrame(feature_transformer.transform(X_pred), columns=feature_transformer.get_feature_names_out())

Missing values imputed successfully


Unnamed: 0,cat__Sex_male,cat__Embarked_C,cat__Embarked_Q,cat__Embarked_S,cat__misAge_1.0,remainder__Pclass,remainder__Age,remainder__SibSp,remainder__Parch,remainder__Fare,remainder__Age2
0,1.0,0.0,1.0,0.0,1.0,3.0,28.0,0.0,0.0,6.9500,784.0
1,0.0,0.0,0.0,1.0,0.0,3.0,27.0,0.0,2.0,11.1333,729.0
2,0.0,0.0,0.0,1.0,0.0,1.0,15.0,0.0,1.0,211.3375,225.0
3,0.0,1.0,0.0,0.0,0.0,1.0,54.0,1.0,0.0,59.4000,2916.0
4,0.0,0.0,0.0,1.0,0.0,3.0,25.0,1.0,0.0,7.9250,625.0
...,...,...,...,...,...,...,...,...,...,...,...
796,1.0,0.0,0.0,1.0,0.0,3.0,19.0,0.0,0.0,14.5000,361.0
797,1.0,0.0,0.0,1.0,0.0,3.0,32.0,0.0,0.0,56.4958,1024.0
798,0.0,1.0,0.0,0.0,0.0,1.0,41.0,0.0,0.0,134.5000,1681.0
799,0.0,1.0,0.0,0.0,0.0,1.0,44.0,0.0,1.0,57.9792,1936.0


In [38]:
# 6. Fit models #

time1 = time.time()
rf = RandomForestClassifier()
param_grid = {'n_estimators':[100,200], 
              'max_depth':[2,4,6,8], 
              'max_features':[4,5,6]}
rfm = GridSearchCV(rf, param_grid, cv=2)
rfm.fit(X_train, y_train)
print('RF ', 
      rfm.best_params_, 
      '\n',
      accuracy_score(y_train, rfm.predict(X_train)), 
      roc_auc_score(y_train, rfm.predict(X_train)), time.time()-time1)

time1 = time.time()
xgb = XGBClassifier()
# use 'gpu_hist' for more than 10,000 examples.
param_grid = {'n_estimators':[200], 
              'max_depth':[3,4], 
              'eta':[0.03, 0.04, 0.05], 
              'subsample':[0.8],
             'colsample_bytree':[0.6]}
xgbm = GridSearchCV(xgb, param_grid, cv=2)
xgbm.fit(X_train, y_train)
print('XGB ', 
      xgbm.best_params_, 
      '\n',
      accuracy_score(y_train, xgbm.predict(X_train)), 
      roc_auc_score(y_train, xgbm.predict(X_train)), 
      time.time()-time1)


# 7. model evaluation #

print('Out of Sample:')
print('RF ', 
      accuracy_score(y_test, rfm.predict(X_test)), 
      roc_auc_score(y_test, rfm.predict(X_test)))
print('XGB ', 
      accuracy_score(y_test, xgbm.predict(X_test)), 
      roc_auc_score(y_test, xgbm.predict(X_test)))
print('Total time ', time.time()-time0)

print('Total time for script: ', time.time() - time0)

RF  {'max_depth': 6, 'max_features': 4, 'n_estimators': 100} 
 0.8689138576779026 0.8415940991689531 11.240473747253418
XGB  {'colsample_bytree': 0.6, 'eta': 0.03, 'max_depth': 3, 'n_estimators': 200, 'subsample': 0.8} 
 0.8651685393258427 0.8411666467851604 6.390944004058838
Out of Sample:
RF  0.8222222222222222 0.8009049773755655
XGB  0.8 0.7782805429864253
Total time  409.2218556404114
Total time for script:  409.2219297885895
