### This is a script, building simple RF model for Titanic



In [1]:
project_name = 'My First Project'
project_id = 'valid-heuristic-369117'
regionn = 'us-west1'

In [2]:
# 0. Load libraries #

import numpy as np
import pandas as pd
import seaborn as sns
import os, time, warnings, optuna, pickle, joblib

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.preprocessing import LabelBinarizer, LabelEncoder, OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import cross_val_score, cross_val_predict, GridSearchCV, train_test_split, KFold
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, precision_recall_curve, auc
from sklearn.metrics import mean_squared_error, mean_absolute_error, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from google.cloud import bigquery, storage

pd.set_option('display.max_columns', 20)
warnings.filterwarnings('ignore')

# Load custom pre-processing functions:

def fillna_mp_i1(df_train, df_test, df_pred, num_features, cat_features, num_fill='median', cat_fill='mode'):
    """This function speeds up filling missing values for 3 main datasets using different imputation methods.
    Later may replace it with some subclass.
    Example: fillna_mp_i1(X_train, X_test, X_pred, num_cols, cat_cols)"""
    # set df_pred to None if it does not exist
    if not ((cat_fill=='mode') and (num_fill=='median')):
        print ('Imputation method not Implemented yet!')
        return None
    
    df_train[num_features] = df_train[num_features].fillna(value=df_train[num_features].median())
    df_test[num_features] = df_test[num_features].fillna(value=df_train[num_features].median())
    df_train[cat_features] = df_train[cat_features].fillna(value=df_train[cat_features].mode().iloc[0])
    df_test[cat_features] = df_test[cat_features].fillna(value=df_train[cat_features].mode().iloc[0])
    if (df_pred is not None):
        df_pred[num_features] = df_pred[num_features].fillna(value=df_train[num_features].median())
        df_pred[cat_features] = df_pred[cat_features].fillna(value=df_train[cat_features].mode().iloc[0])
    df_train[num_features+cat_features].count
    
    all_good = (
    (np.prod(df_train[num_features+cat_features].shape)==df_train[num_features+cat_features].count().sum()) and 
    (np.prod(df_test[num_features+cat_features].shape) == df_test[num_features+cat_features].count().sum()) and 
    (np.prod(df_pred[num_features+cat_features].shape) == df_pred[num_features+cat_features].count().sum()))
    if (all_good):
        print('Missing values imputed successfully')
    else:
        print('There are still some missing values...')
    
def add_misDummy_mp_i1(df_train, df_test, df_pred, features):
    """This function creates new dummy columns for missing features.
    Example: add_misDummy_mp_i1(X_train, X_test, X_pred, ['Age'])"""
    # set df_pred to None if it does not exist
    for feature_name in features:
        misColName = 'mis'+feature_name
        df_train.loc[df_train[feature_name].isnull(), misColName]=1
        df_train.loc[df_train[feature_name].notnull(), misColName]=0
        df_test.loc[df_test[feature_name].isnull(), misColName]=1
        df_test.loc[df_test[feature_name].notnull(), misColName]=0
        if (df_pred is not None):
            df_pred.loc[df_pred[feature_name].isnull(), misColName]=1
            df_pred.loc[df_pred[feature_name].notnull(), misColName]=0
   

In [3]:
# 1. Load data #

time0 = time.time()

os.chdir('/home/jupyter/projects_data/titanic')
df = pd.read_csv('train.csv') 

df.drop(columns=['Name', 'Ticket', 'Cabin', 'PassengerId'],inplace=True)
pred = pd.read_csv('test.csv')
pred.drop(columns=['Name', 'Ticket', 'Cabin', 'PassengerId'],inplace=True)

print(df.shape, pred.shape)

(891, 8) (418, 7)


In [4]:
# 2. EDA, adding features #

df['Age2'] = df['Age']**2
pred['Age2'] = pred['Age']**2

# 3. Train-test split #

train_y = df[['Survived']]
train_x = df.drop(columns = ['Survived'])
X_pred = pred.copy()

cat_cols = ['Sex', 'Embarked']
num_cols = list(set(train_x.columns)-set(cat_cols))

print('categorical features: ', cat_cols, 'numerical features: ', num_cols)

X_train, X_test, y_train, y_test = train_test_split(train_x, train_y, test_size = 0.2, random_state=4)
print(X_train.shape, X_test.shape, y_train.shape, X_pred.shape)

X_train.info()

categorical features:  ['Sex', 'Embarked'] numerical features:  ['Age2', 'Age', 'Parch', 'Fare', 'SibSp', 'Pclass']
(712, 8) (179, 8) (712, 1) (418, 8)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 712 entries, 42 to 122
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    712 non-null    int64  
 1   Sex       712 non-null    object 
 2   Age       570 non-null    float64
 3   SibSp     712 non-null    int64  
 4   Parch     712 non-null    int64  
 5   Fare      712 non-null    float64
 6   Embarked  710 non-null    object 
 7   Age2      570 non-null    float64
dtypes: float64(3), int64(3), object(2)
memory usage: 50.1+ KB


In [5]:
# 4. Misisng values #

add_misDummy_mp_i1(X_train, X_test, X_pred, ['Age'])

fillna_mp_i1(X_train, X_test, X_pred, num_cols, cat_cols)

cat_cols.extend(['misAge'])

feature_transformer = ColumnTransformer([
        ("cat", OneHotEncoder(sparse = False, handle_unknown="ignore", drop='if_binary'), cat_cols)],
        remainder = "passthrough"
    )

X_train = pd.DataFrame(feature_transformer.fit_transform(X_train), columns=feature_transformer.get_feature_names_out())
X_test = pd.DataFrame(feature_transformer.transform(X_test), columns=feature_transformer.get_feature_names_out())
X_pred = pd.DataFrame(feature_transformer.transform(X_pred), columns=feature_transformer.get_feature_names_out())

Missing values imputed successfully


In [6]:
# 6. Fit models #

time1 = time.time()
rf = RandomForestClassifier()
param_grid = {'n_estimators':[100, 200], 
              'max_depth':[3, 4, 5, 6, 7], 
              'max_features':[4, 5, 6]}
rfm = GridSearchCV(rf, param_grid, cv=2)
rfm.fit(X_train, y_train)
print('RF ', 
      rfm.best_params_, 
      '\n',
      accuracy_score(y_train, rfm.predict(X_train)), 
      roc_auc_score(y_train, rfm.predict(X_train)), time.time()-time1)

time1 = time.time()
xgb = XGBClassifier()
# use 'gpu_hist' for more than 10,000 examples.
param_grid = {'n_estimators':[150, 250], 
              'max_depth':[2, 3, 4], 
              'eta':[0.01, 0.02, 0.03, 0.04, 0.05, 0.06], 
              'subsample':[0.7],
              'colsample_bytree':[0.6]}
xgbm = GridSearchCV(xgb, param_grid, cv=2)
xgbm.fit(X_train, y_train)
print('XGB ', 
      xgbm.best_params_, 
      '\n',
      accuracy_score(y_train, xgbm.predict(X_train)), 
      roc_auc_score(y_train, xgbm.predict(X_train)), 
      time.time()-time1)


# 7. model evaluation #

print('Out of Sample:')
print('RF ', 
      accuracy_score(y_test, rfm.predict(X_test)), 
      roc_auc_score(y_test, rfm.predict(X_test)))
print('XGB ', 
      accuracy_score(y_test, xgbm.predict(X_test)), 
      roc_auc_score(y_test, xgbm.predict(X_test)))
print('Total time ', time.time()-time0)

print('Total time for training part: ', time.time() - time0)

RF  {'max_depth': 5, 'max_features': 4, 'n_estimators': 100} 
 0.8553370786516854 0.8314118423222827 12.838726043701172
XGB  {'colsample_bytree': 0.6, 'eta': 0.01, 'max_depth': 4, 'n_estimators': 250, 'subsample': 0.7} 
 0.8581460674157303 0.8386194952993569 33.143598318099976
Out of Sample:
RF  0.8379888268156425 0.7913865546218487
XGB  0.8212290502793296 0.7829131652661064
Total time  46.11894345283508
Total time for training part:  46.119004249572754


The results are somewhat surprising. I have played for more than 1 hours with hyprparmeters and RF still usually beats XGB. 
If I do hyperparemter tuning rigorously (e.g., Optuna), xgb will probably beat RF eventually. But do not want to waste more time on this, given that thi is Prod script. So I use RF.

In [7]:
X_train

Unnamed: 0,cat__Sex_male,cat__Embarked_C,cat__Embarked_Q,cat__Embarked_S,cat__misAge_1.0,remainder__Pclass,remainder__Age,remainder__SibSp,remainder__Parch,remainder__Fare,remainder__Age2
0,1.0,1.0,0.0,0.0,1.0,3.0,28.5,0.0,0.0,7.8958,812.25
1,1.0,0.0,0.0,1.0,0.0,2.0,60.0,1.0,1.0,39.0000,3600.00
2,1.0,0.0,0.0,1.0,0.0,3.0,36.0,1.0,0.0,15.5500,1296.00
3,0.0,0.0,0.0,1.0,1.0,3.0,28.5,3.0,1.0,25.4667,812.25
4,1.0,0.0,0.0,1.0,1.0,1.0,28.5,0.0,0.0,30.0000,812.25
...,...,...,...,...,...,...,...,...,...,...,...
707,1.0,0.0,0.0,1.0,0.0,3.0,40.0,1.0,4.0,27.9000,1600.00
708,1.0,1.0,0.0,0.0,1.0,3.0,28.5,1.0,1.0,15.2458,812.25
709,1.0,0.0,0.0,1.0,0.0,2.0,31.0,0.0,0.0,10.5000,961.00
710,1.0,1.0,0.0,0.0,0.0,1.0,56.0,0.0,0.0,30.6958,3136.00


In [8]:
os.chdir('/home/jupyter/project_repos/pg_titanic/pg_titanic/titanic-app')

artifact_filename_rf = 'rf_model.pkl'
joblib.dump(rfm, artifact_filename_rf)

model_bucket = 'gs://pmykola-projectsgcp-artifacts/titanic'
storage_path = os.path.join(model_bucket, artifact_filename_rf)
blob = storage.blob.Blob.from_string(storage_path, client=storage.Client(project=project_id))
blob.upload_from_filename(os.getcwd()+'/'+artifact_filename_rf)

In [9]:
file = open(artifact_filename_rf, "rb")
trained_model = joblib.load(file)
prediction = trained_model.predict([list(X_test.iloc[0,:])])
print('rf', prediction)

rf [0]
