In [1]:
import pandas as pd 
import numpy as np
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline


# explicitly require this experimental feature
from sklearn.experimental import enable_iterative_imputer  # noqa
# now you can import normally from sklearn.impute
from sklearn.impute import IterativeImputer, SimpleImputer

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.preprocessing import PolynomialFeatures

from mlxtend.feature_selection import SequentialFeatureSelector as SFS



In [2]:
# Shared random seed: passed as random_state to the CV splitters and estimators below.
rs = 1

In [6]:
def onehot_name(df):
    """
    Add title-indicator columns derived from the ``name`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column ``name``.

    Returns
    -------
    pandas.DataFrame
        New frame with the added columns ``Name`` (``name`` with any
        parenthesised part removed) and the 0/1 indicators ``Mr``, ``Miss``,
        ``Mrs`` and ``Master``. Rows with a missing name get 0 in every
        indicator.
    """
    # Raw strings: '\.' inside a normal literal is an invalid escape sequence.
    # NOTE(review): 'Dr.' appears in both the Mrs and Mr groups, so it sets
    # both indicators - confirm this is intended.
    mrs = '|'.join([r'Mrs\.', r'Mme\.', r'Ms\.', r'Dr\.', r'Lady\.', r'Countess\.'])
    mr = '|'.join([r'Mr\.', r'Don\.', r'Rev\.', r'Dr\.', r'Major\.', r'Sir\.',
                   r'Col\.', r'Capt\.', r'Jonkheer\.'])
    miss = '|'.join([r'Miss\.', r'Mlle\.'])
    # na=False keeps the result boolean when a name is missing, so astype(int)
    # cannot fail on NaN.
    return df.assign(Name = lambda x : x['name'].str.replace(r'\(.*\)', '', regex=True),
                     Mr =  lambda x : x['name'].str.contains(mr, na=False).astype(int),
                     Miss = lambda x : x['name'].str.contains(miss, na=False).astype(int),
                     Mrs = lambda x : x['name'].str.contains(mrs, na=False).astype(int),
                     # Literal substring match (no regex), same as find('Master.') > -1.
                     Master = lambda x : x['name'].str.contains('Master.', regex=False, na=False).astype(int))

In [3]:
def feature_egneenering(df):
    """
    Add family-based features to a passenger frame.

    Reads the columns ``sibsp``, ``parch`` and ``fare`` and returns a new
    frame (the input is not modified) with three extra columns:

    * ``family_size`` - ``sibsp + parch + 1``
    * ``is_alone``    - 1 when ``family_size`` is below 2, otherwise 0
    * ``calc_fare``   - ``fare`` divided by ``family_size``
    """
    out = df.copy()
    out['family_size'] = out['sibsp'] + out['parch'] + 1
    out['is_alone'] = (out['family_size'] < 2).astype(int)
    out['calc_fare'] = out['fare'] / out['family_size']
    return out


In [53]:
def read_data(file):
    """
    Read a passenger CSV and return the engineered feature frame.

    Parameters
    ----------
    file : str or path-like
        CSV file passed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by ``passengerid`` with lower-cased column names, the
        title indicators from ``onehot_name``, the family features from
        ``feature_egneenering``, and without ``cabin``, ``name`` and
        ``ticket``.
    """
    # np.int / np.float were removed from NumPy; use the explicit 64-bit types.
    df = (pd.read_csv(file, dtype = {
                            "PassengerId":np.int32,
                            "Name": "object",
                            "Pclass":np.int64,
                            "Survived":np.int32,
                            "Sex":"object",
                            "Age":np.float64,
                            "SibSp":np.int64,
                            "Embarked":"object",
                            "Cabin":"object",
                            "Fare":np.float64,
                        },)
            .rename(columns=str.lower)
            .set_index("passengerid")
            .pipe(onehot_name)
            .pipe(feature_egneenering)
            # Second rename lower-cases the columns added by onehot_name
            # (Name, Mr, Miss, Mrs, Master); the cleaned 'Name' then shares the
            # label 'name' with the raw column and both are dropped below.
            .rename(columns=str.lower)
            .drop(['cabin','name','ticket'],axis=1)
        )
    return df
    

In [54]:
# Load the training set through read_data (lower-cased columns, engineered features).
filename = "../data/train.csv"
data = read_data(filename)

In [55]:
# Separate the target column from the feature columns.
y_train = data['survived']
X_train = data.drop(columns=['survived'])

In [59]:
X_train.dtypes

pclass           int64
sex             object
age            float64
sibsp            int64
parch            int64
fare           float64
embarked        object
mr               int64
miss             int64
mrs              int64
master           int64
family_size      int64
is_alone         int64
calc_fare      float64
dtype: object

array(['pclass', 'age', 'sibsp', 'parch', 'fare', 'mr', 'miss', 'mrs',
       'master', 'family_size', 'is_alone', 'calc_fare'], dtype=object)

In [57]:
X_train.select_dtypes('O')

Unnamed: 0_level_0,sex,embarked
passengerid,Unnamed: 1_level_1,Unnamed: 2_level_1
1,male,S
2,female,C
3,female,S
4,female,S
5,male,S
...,...,...
887,male,S
888,female,S
889,female,S
890,male,C


In [70]:
# Preprocessing: object-typed columns are treated as categorical, all other
# columns as numeric, and each group gets its own pipeline.
categorical_features = X_train.select_dtypes(include='O').columns.values
numeric_features = X_train.select_dtypes(exclude='O').columns.values

# Categorical: fill missing values with the most frequent level, then one-hot
# encode (first level dropped; unseen levels raise an error).
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
    ('onehot', OneHotEncoder(drop="first", handle_unknown='error')),
])

# Numeric: iterative imputation starting from the median, then standardisation.
numeric_transformer = Pipeline([
    ('imputer', IterativeImputer(initial_strategy='median', random_state=0)),
    ('scaler', StandardScaler()),
])

preprocessor = ColumnTransformer(transformers=[
    ('cat', categorical_transformer, categorical_features),
    ('num', numeric_transformer, numeric_features),
])

In [72]:
def make_params(string0,string1,string2,string3=None):
    """
    Build a list of single-point parameter grids in which a hyper-parameter
    value is always set identically on two pipeline steps.

    Parameters
    ----------
    string0 : sequence of two str
        ``string0[0]`` is the step whose parameters are set directly
        (``<step>__<param>``); ``string0[1]`` is the step whose wrapped
        estimator receives the same value (``<step>__estimator__<param>``).
    string1 : [key, values]
        Entry copied unchanged into every grid.
    string2 : [param_name, values]
        Hyper-parameter to enumerate.
    string3 : [param_name, values], optional
        Second hyper-parameter; when given, every combination with
        ``string2`` is produced, with ``string3`` varying slowest.

    Returns
    -------
    list of dict
        One dict per value (or value combination); each value is wrapped in a
        one-element list.
    """
    def mirrored(name, value):
        # Same value for the direct step and for the wrapped estimator.
        return {string0[0] + '__' + name: [value],
                string0[1] + '__estimator__' + name: [value]}

    if not string3:
        return [{string1[0]: string1[1], **mirrored(string2[0], el)}
                for el in string2[1]]

    return [{string1[0]: string1[1],
             **mirrored(string2[0], el),
             **mirrored(string3[0], el2)}
            for el2 in string3[1]
            for el in string2[1]]

In [73]:
# Number of raw input columns; used below as the upper bound of the SFS k_features range.
no_feat = X_train.shape[1]

# Cross-validation splitters: outer and inner loops of the nested CV, plus one for SFS.
# All three are shuffled stratified folds seeded with rs.
cv_outer = StratifiedKFold(shuffle=True, random_state = rs)
cv_inner = StratifiedKFold(shuffle=True, random_state = rs)
cv_sfs = StratifiedKFold(shuffle=True, random_state = rs)

[1, 2, 3, 4, 5, 6, 7, 8, 9]

In [82]:
## Random forest classifier with sequential forward feature selection (SFS)
est = RandomForestClassifier(n_estimators=10,random_state=rs)
# SFS ranks feature subsets with accuracy, the same metric the grid search
# optimises. (The previous 'neg_mean_absolute_error' is a regression metric;
# on 0/1 labels it equals accuracy - 1, so the subset ranking is unchanged.)
sfs1 = SFS(estimator=est, 
           k_features=3,
           forward=True, 
           floating=False, 
           scoring='accuracy',
           cv=cv_sfs)

# Step names are referenced by the parameter-grid keys built below.
pipe_rfr_sfs = Pipeline([('pre_pros',  preprocessor),
                 ('sfs', sfs1),
                 ('rfc',est)])

# k_features is searched as a single (min, max) range; n_estimators and
# max_depth are set to the same value on the final forest ('rfc') and on the
# estimator used inside SFS ('sfs__estimator').
string1 = ['sfs__k_features', [(2,no_feat)]]
string2 = ['n_estimators',list(range(1,10))]#[1,5,10,50,100,250,500,]]
string3 = ['max_depth',list(range(1,10))]#[2,5,10,20,50,75,150]]
param_grid_rfr_sfs = make_params(['rfc','sfs'],string1,string2,string3)

gs_rf_sfs = GridSearchCV(estimator = pipe_rfr_sfs,
                  param_grid = param_grid_rfr_sfs,
                  cv = cv_inner,
                  scoring = "accuracy",
                  n_jobs=-1,
                  verbose=2)

In [83]:
%%time
gs_rf_sfs.fit(X_train,y_train)

Fitting 5 folds for each of 81 candidates, totalling 405 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 13.4min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed: 28.6min
[Parallel(n_jobs=-1)]: Done 405 out of 405 | elapsed: 32.5min finished


CPU times: user 11.4 s, sys: 327 ms, total: 11.8 s
Wall time: 32min 37s


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=1, shuffle=True),
             error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('pre_pros',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('cat',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('imputer',
                                                                                          SimpleImputer(add_indicator=False,
                                                                                      

In [84]:
gs_rf_sfs.best_score_

0.8271734354403364

In [85]:
gs_rf_sfs.best_params_


{'rfc__max_depth': 5,
 'rfc__n_estimators': 9,
 'sfs__estimator__max_depth': 5,
 'sfs__estimator__n_estimators': 9,
 'sfs__k_features': (2, 14)}

In [86]:
submit(gs_rf_sfs.best_estimator_,'fine_tuned_rf_sfs.csv')

Saved file: fine_tuned_rf_sfs.csv


In [80]:
gs_rf_sfs.best_params_

{'rfc__max_depth': 5,
 'rfc__n_estimators': 1,
 'sfs__estimator__max_depth': 5,
 'sfs__estimator__n_estimators': 1,
 'sfs__k_features': (2, 14)}

In [78]:
def submit(model,filename,test_file="../data/test.csv"):
    """
    Predict on a test file and write a two-column submission CSV.

    Parameters
    ----------
    model : fitted estimator
        Object with a ``predict`` method that accepts the frame returned by
        ``read_data``.
    filename : str or path-like
        Destination of the submission CSV.
    test_file : str or path-like, optional
        CSV to predict on; defaults to ``"../data/test.csv"``.

    Returns
    -------
    None
        The file is written with columns ``PassengerId`` (the frame index)
        and ``Survived`` (the predictions), and a confirmation is printed.
    """
    X_test = read_data(test_file)
    predictions = model.predict(X_test)
    submission = pd.DataFrame({'PassengerId':X_test.index,'Survived':predictions})
    submission.to_csv(filename,index=False)
    # f-string so a pathlib.Path filename works as well as a str.
    print(f'Saved file: {filename}')

In [79]:
submit(gs_rf_sfs.best_estimator_,"new_features_rf_sfs.csv")

Saved file: new_features_rf_sfs.csv


In [77]:
# Inline version of the steps performed by submit(): read the test file,
# predict with the best grid-search pipeline, and write the submission CSV.
X_test = read_data("../data/test.csv")
predictions = gs_rf_sfs.best_estimator_.predict(X_test)
submission = pd.DataFrame({'PassengerId':X_test.index,'Survived':predictions})
filename = 'rf_sfs_sub.csv'

submission.to_csv(filename,index=False)

print('Saved file: ' + filename)

In [42]:
%%time
gs_rf_sfs.fit(X_train,y_train)

Fitting 5 folds for each of 56 candidates, totalling 280 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  7.8min
[Parallel(n_jobs=-1)]: Done 280 out of 280 | elapsed: 14.4min finished


CPU times: user 4min 34s, sys: 4.17 s, total: 4min 38s
Wall time: 19min 18s


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=1, shuffle=True),
             error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('pre_pros',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('cat',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('imputer',
                                                                                          SimpleImputer(add_indicator=False,
                                                                                      

In [43]:
gs_rf_sfs.best_score_

0.858602724248321

In [24]:
# Random forest classifier pipeline: preprocessing followed by the forest.
pipe_rfr = Pipeline([("pre_pros",preprocessor), 
                     ("rfr",RandomForestClassifier(n_estimators=10,random_state=rs)) ])
params_rfr = {"rfr__n_estimators":[1,5,10,50,100,250,500,1000],
              "rfr__max_depth":[2,5,10,20,50,75,150]}

# Logistic regression pipeline: preprocessing, PCA, then the classifier.
pipe_lr = Pipeline([('pre_pros',preprocessor),
                     ('pca',PCA()),
                     ('lr',LogisticRegression(random_state=rs))])
params_lr = {'pca__n_components':[1,3,5],
             'lr__C':[0.0001,0.001,0.1,1,10,100,1000]}

# Cross-validation splitters for the inner and outer loops of the nested CV
# (cv_sfs is redefined here but not used in this cell).
cv_outer = StratifiedKFold(shuffle=True, random_state = rs)
cv_inner = StratifiedKFold(shuffle=True, random_state = rs)
cv_sfs = StratifiedKFold(shuffle=True, random_state = rs)

# Grid searches (inner loop); both are scored with accuracy on cv_inner.
gs_rf = GridSearchCV(estimator = pipe_rfr,
                  param_grid = params_rfr,
                  cv = cv_inner,
                  scoring = "accuracy",
                  n_jobs=-1)

gs_lr = GridSearchCV(estimator = pipe_lr,
                  param_grid = params_lr,
                  cv = cv_inner,
                  scoring = "accuracy",
                  n_jobs=-1)



## Baseline

In [25]:
# Dumb baseline: predict the rounded mean label (the majority class for 0/1 labels) for everyone
avg_dumb_baseline = np.ones((y_train.shape))*np.rint(y_train.astype(int).mean())
acc_dumb_baseline = (y_train==avg_dumb_baseline).sum()/len(y_train)

# Smarter baseline: women survive and men die.
# read_data lower-cases every column name, so the column is 'sex' (X_train.Sex raises AttributeError).
baseline_pred = X_train['sex'].map({'female':1,'male':0})
acc_baseline = (y_train==baseline_pred).sum()/len(y_train)

# Report both accuracies
print(f'Accuracy for dumb baseline {acc_dumb_baseline*100:.2f}%')
print(f'Accuracy for baseline {acc_baseline*100:.2f}%')

Accuracy for dumb baseline 61.62%
Accuracy for baseline 78.68%


## Nested cross-validation

In [26]:
%%time
nested_score_rfr = cross_val_score(gs_rf,X_train,y_train,
                                 cv=cv_outer,
                                 scoring='accuracy')

CPU times: user 12min 26s, sys: 10.8 s, total: 12min 37s
Wall time: 12min 8s


In [27]:
nested_score_rfr

array([0.80446927, 0.83707865, 0.7752809 , 0.8258427 , 0.86516854])

In [28]:
%%time 
nested_score_lr = cross_val_score(gs_lr,X_train,y_train,
                                 cv=cv_outer,
                                 scoring='accuracy')

CPU times: user 2.68 s, sys: 229 ms, total: 2.91 s
Wall time: 41min 14s


In [29]:
nested_score_lr

array([0.80446927, 0.78651685, 0.79213483, 0.76404494, 0.79775281])

In [30]:
# Logistic regression on a polynomially expanded feature space:
# preprocessing -> polynomial features -> PCA -> classifier.
pipe_lr_k = Pipeline([('pre_pros',preprocessor),
                     ('inc_spac',PolynomialFeatures()),
                     ('pca',PCA()),
                     ('lr',LogisticRegression(random_state=rs))])
# PolynomialFeatures emits C(n + d, d) columns for n inputs and degree d.
# With the 14 raw columns (more after one-hot encoding) degree 5 already gives
# ~11,600 columns and degree 10 ~2 million, which makes the nested CV
# infeasible, so the search is limited to low degrees.
params_lr_k = {
             'inc_spac__degree':[1,2,3],
             'pca__n_components':[1,3,5],
             'lr__C':[0.0001,0.001,0.1,1,10,100,1000]}

# Cross-validation splitters for the inner and outer loops of the nested CV
cv_outer = StratifiedKFold(shuffle=True, random_state = rs)
cv_inner = StratifiedKFold(shuffle=True, random_state = rs)

# Grid search (inner loop), scored with accuracy
gs_lr_k = GridSearchCV(estimator = pipe_lr_k,
                  param_grid = params_lr_k,
                  cv = cv_inner,
                  scoring = "accuracy",
                  n_jobs=2)



In [None]:
%%time 
nested_score_lr_k = cross_val_score(gs_lr_k,X_train,y_train,
                                    cv=cv_outer,
                                    scoring='accuracy',
                                    verbose=2,
                                    n_jobs=-1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


## train best model

In [271]:
gs_lr.fit(X_train,y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=1, shuffle=True),
             error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('pre_pros',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('cat',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('imputer',
                                                                                          SimpleImputer(add_indicator=False,
                                                                                      

## Submission

In [45]:
X_test = read_data("../data/test.csv")

In [46]:
X_test.columns

Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'Mr',
       'Miss', 'Mrs', 'Master'],
      dtype='object')

In [44]:
X_train.columns

Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'Mr',
       'Miss', 'Mrs', 'Master'],
      dtype='object')

In [48]:
# Predictions of the tuned random forest + SFS pipeline on the test set
predictions = gs_rf_sfs.best_estimator_.predict(X_test)

# Sex-only baseline for comparison. read_data lower-cases every column name,
# so the column is 'sex' (X_test.Sex raises AttributeError).
prediction_baseline = X_test['sex'].map({'female':1,'male':0})
baseline_sub = pd.DataFrame({'PassengerId':X_test.index,'Survived':prediction_baseline})

In [49]:
submission = pd.DataFrame({'PassengerId':X_test.index,'Survived':predictions})

In [51]:
filename = 'rf_sfs_sub.csv'

submission.to_csv(filename,index=False)

print('Saved file: ' + filename)

Saved file: rf_sfs_sub.csv
