In [1]:
import pandas as pd 
import numpy as np
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline


# explicitly require this experimental feature
from sklearn.experimental import enable_iterative_imputer  # noqa
# now you can import normally from sklearn.impute
from sklearn.impute import IterativeImputer, SimpleImputer

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.preprocessing import PolynomialFeatures

from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.svm import SVC


In [2]:
# Random seed passed as random_state to the estimators and CV splitters below.
rs = 1

In [3]:
def onehot_name(df):
    """Add 0/1 title-indicator columns derived from the ``Name`` column.

    Any parenthesised part of a name is removed first; the cleaned name is
    then searched for honorifics and four integer columns are added:
    ``Mr``, ``Miss``, ``Mrs`` and ``Master``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a string column ``Name``.

    Returns
    -------
    pandas.DataFrame
        New frame with ``Name`` cleaned and the four indicator columns
        added. A row whose ``Name`` is missing gets 0 in every indicator
        instead of raising on the NaN -> int conversion.
    """
    # Raw strings: ``\.`` is a regex escape for a literal dot, not a Python
    # string escape (non-raw form triggers an invalid-escape warning).
    mrs = '|'.join([r'Mrs\.', r'Mme\.', r'Ms\.', r'Dr\.', r'Lady\.', r'Countess\.'])
    mr = '|'.join([r'Mr\.', r'Don\.', r'Rev\.', r'Dr\.', r'Major\.', r'Sir\.',
                   r'Col\.', r'Capt\.', r'Jonkheer\.'])
    miss = '|'.join([r'Miss\.', r'Mlle\.'])
    # NOTE(review): 'Dr.' appears in both the Mr and Mrs groups, so a doctor
    # sets both flags — confirm this is intended.
    return df.assign(
        # Name is cleaned first; the lambdas below see the cleaned column.
        Name=lambda x: x['Name'].str.replace(r'\(.*\)', '', regex=True),
        Mr=lambda x: x['Name'].str.contains(mr, na=False).astype(int),
        Miss=lambda x: x['Name'].str.contains(miss, na=False).astype(int),
        Mrs=lambda x: x['Name'].str.contains(mrs, na=False).astype(int),
        # Literal substring match, same as the former str.find('Master.') > -1.
        Master=lambda x: x['Name'].str.contains('Master.', regex=False, na=False).astype(int),
    )

In [4]:
def read_data(file):
    """Read a passenger CSV and return a model-ready feature frame.

    The file is parsed with explicit column dtypes, indexed by
    ``PassengerId``, passed through ``onehot_name`` (which adds the
    ``Mr``/``Miss``/``Mrs``/``Master`` flags), and finally stripped of the
    ``Name``, ``Ticket`` and ``Cabin`` columns.

    Parameters
    ----------
    file : str or path-like
        Location of the CSV file (anything ``pandas.read_csv`` accepts).

    Returns
    -------
    pandas.DataFrame
        Frame indexed by ``PassengerId``. ``Survived`` is kept when the
        file contains it.
    """
    df = (pd.read_csv(file, dtype = {
                            "PassengerId":np.int32,
                            "Name": "object",
                            "Pclass":"object",
                            "Survived":np.int32,
                            "Sex":"object",
                            # np.float was removed in NumPy 1.24; float64 is
                            # the dtype it used to alias.
                            "Age":np.float64,
                            "SibSp":"object",
                            "Embarked":"object",
                            "Cabin":"object",
                            "Fare":np.float64,
                        })
            .set_index("PassengerId")
            .pipe(onehot_name)
            .drop(['Name','Ticket','Cabin'],axis=1)
        )
    return df
    

In [5]:
# Load the training set through read_data (indexed by PassengerId, title flags added).
filename = "../data/train.csv"
data = read_data(filename)

In [6]:
#data['age_cat'] = np.nan
#data.apply(find_age_cat,axis=1).age_cat.str.contains('unknown').sum()

In [7]:
# Separate the target column from the feature matrix.
y_train = data['Survived']
X_train = data.drop(columns=['Survived'])

In [8]:
# We create the preprocessing pipelines for both numeric and categorical data.
numeric_features = ['Age','Fare','Mr', 'Miss', 'Mrs', 'Master']
numeric_transformer = Pipeline(steps=[
    ('imputer', IterativeImputer(random_state=0,initial_strategy='median')),
    ('scaler', StandardScaler())])

categorical_features = ['Embarked', 'Sex', 'Pclass']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent',missing_values=np.nan)),
    ('onehot', OneHotEncoder(handle_unknown='error', drop="first"))
    ])

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_features),
        ('num', numeric_transformer, numeric_features)
        ])

In [9]:
def make_params(string0,string1,string2,string3=None):
    """Build a list of single-point parameter grids for a pipeline with SFS.

    Every tuned hyper-parameter value is written twice — once for the final
    estimator step and once for the estimator wrapped by the feature
    selector — so both always receive the same setting.

    Parameters
    ----------
    string0 : sequence
        ``[estimator_step_name, selector_step_name]``.
    string1 : sequence
        ``[key, values]`` copied unchanged into every grid.
    string2 : sequence
        ``[param_name, values]``; one grid per value.
    string3 : sequence, optional
        Second ``[param_name, values]`` pair. When given, one grid is made
        per combination, with ``string2`` values varying fastest.

    Returns
    -------
    list of dict
        Grids whose tuned entries are one-element lists.
    """
    def _mirrored(spec, value):
        # Same value under the estimator step and under the selector's estimator.
        return {string0[0]+'__'+spec[0]:[value],
                string0[1]+'__estimator__'+spec[0]:[value]}

    if string3:
        return [{string1[0]:string1[1], **_mirrored(string2, inner), **_mirrored(string3, outer)}
                for outer in string3[1]
                for inner in string2[1]]
    return [{string1[0]:string1[1], **_mirrored(string2, inner)}
            for inner in string2[1]]

In [10]:
# Number of raw feature columns; used below as the upper bound of sfs__k_features.
# NOTE(review): SFS runs on the preprocessor's output, whose width need not
# equal the raw column count — confirm this bound is what is intended.
no_feat = X_train.shape[1]

# Cross-validation splitters for the outer and inner loops of the nested CV,
# plus a separate one used inside the sequential feature selector.
cv_outer = StratifiedKFold(shuffle=True, random_state = rs)
cv_inner = StratifiedKFold(shuffle=True, random_state = rs)
cv_sfs = StratifiedKFold(shuffle=True, random_state = rs)

In [None]:
## LR
# Logistic regression preceded by sequential forward selection (SFS).
est = LogisticRegression(random_state=rs)
sfs1 = SFS(estimator=est, 
           forward=True, 
           floating=False, 
           scoring='accuracy',
           cv=cv_sfs)

# The comma after the first step is required: without it Python tries to
# call the ('pre_pros', preprocessor) tuple and raises TypeError.
pipe_lr_sfs = Pipeline([('pre_pros',  preprocessor),
                 ('sfs', sfs1),
                 ('lr',est)])

string1 = ['sfs__k_features', [(2,no_feat)]]
# Bare parameter name: make_params adds the 'lr__' and 'sfs__estimator__'
# prefixes itself, so 'lr__C' would become the invalid 'lr__lr__C'.
# Grid is a plain log scale; the last value was a duplicated 100.
string2 = ['C', [0.0001,0.001,0.01,0.1,1,10,100,1000]]
param_grid_lr_sfs = make_params(['lr','sfs'],string1,string2)

# Inner-loop grid search; wrapped by cross_val_score for the nested CV score.
gs_lr_sfs = GridSearchCV(estimator = pipe_lr_sfs,
                  param_grid = param_grid_lr_sfs,
                  cv = cv_inner,
                  scoring = "accuracy",
                  n_jobs=-1,
                  verbose=2)

In [21]:
## RFR
# Random-forest classifier preceded by sequential forward selection (SFS).
est = RandomForestClassifier(random_state=rs, n_estimators=10)
sfs1 = SFS(
    estimator=est,
    scoring='accuracy',
    cv=cv_sfs,
    forward=True,
    floating=False,
)

pipe_rfr_sfs = Pipeline(steps=[
    ('pre_pros', preprocessor),
    ('sfs', sfs1),
    ('rfc', est),
])

# Tuned values are mirrored onto the forest inside SFS by make_params.
string1 = ['sfs__k_features', [(2, no_feat)]]
string2 = ['n_estimators', [1, 5, 10, 50, 100, 250, 500, 1000]]
string3 = ['max_depth', [2, 5, 10, 20, 50, 75, 150]]
param_grid_rfr_sfs = make_params(['rfc', 'sfs'], string1, string2, string3)

# Inner-loop grid search over the forest settings.
gs_rf_sfs = GridSearchCV(
    estimator=pipe_rfr_sfs,
    param_grid=param_grid_rfr_sfs,
    scoring="accuracy",
    cv=cv_inner,
    verbose=2,
    n_jobs=-1,
)

In [13]:
## SVC
# Support-vector classifier preceded by sequential forward selection (SFS).
est = SVC()
sfs1 = SFS(estimator=est, 
           forward=True, 
           floating=False, 
           scoring='accuracy',
           cv=cv_sfs)

pipe_svc_sfs = Pipeline([('pre_pros',  preprocessor),
                 ('sfs', sfs1),
                 ('svc',est)])

string1 = ['sfs__k_features', [(2,no_feat)]]
# Log-scale grids. The last entry of each list used to be a second 100,
# which made the search fit identical candidates twice; it now continues
# the decade pattern.
string2 = ['C', [0.0001,0.001,0.01,0.1,1,10,100,1000]]
string3 = ['gamma',[0.0001,0.001,0.01,0.1,1,10,100,1000]]
param_grid_svc_sfs = make_params(['svc','sfs'],string1,string2,string3)

# Inner-loop grid search over C and gamma.
gs_svc_sfs = GridSearchCV(estimator = pipe_svc_sfs,
                  param_grid = param_grid_svc_sfs,
                  cv = cv_inner,
                  scoring = "accuracy",
                  n_jobs=-1,
                  verbose=2)

## Nested cross validation score

### RFR with Sequential Forward Selection

In [22]:
%%time
# Nested CV: every outer fold reruns the whole random-forest grid search
# (with its own inner CV) on that fold's training part.
nested_score_rfr = cross_val_score(gs_rf_sfs,X_train,y_train,
                                 cv=cv_outer,
                                 scoring='accuracy')

Fitting 5 folds for each of 56 candidates, totalling 280 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  8.3min
[Parallel(n_jobs=-1)]: Done 280 out of 280 | elapsed: 15.4min finished


Fitting 5 folds for each of 56 candidates, totalling 280 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  8.0min
[Parallel(n_jobs=-1)]: Done 280 out of 280 | elapsed: 14.4min finished


Fitting 5 folds for each of 56 candidates, totalling 280 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  7.6min
[Parallel(n_jobs=-1)]: Done 280 out of 280 | elapsed: 13.6min finished


Fitting 5 folds for each of 56 candidates, totalling 280 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  7.3min
[Parallel(n_jobs=-1)]: Done 280 out of 280 | elapsed: 13.5min finished


Fitting 5 folds for each of 56 candidates, totalling 280 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  7.6min
[Parallel(n_jobs=-1)]: Done 280 out of 280 | elapsed: 15.1min finished


CPU times: user 5min 13s, sys: 5.87 s, total: 5min 19s
Wall time: 1h 17min 31s


### SVC with Sequential Forward Selection

In [24]:
%%time
# Nested CV: every outer fold reruns the whole SVC grid search
# (with its own inner CV) on that fold's training part.
nested_score_svc = cross_val_score(gs_svc_sfs,X_train,y_train,
                                 cv=cv_outer,
                                 scoring='accuracy')

Fitting 5 folds for each of 64 candidates, totalling 320 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   39.9s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 320 out of 320 | elapsed:  5.3min finished


Fitting 5 folds for each of 64 candidates, totalling 320 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   32.3s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 320 out of 320 | elapsed:  5.2min finished


Fitting 5 folds for each of 64 candidates, totalling 320 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   36.9s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 320 out of 320 | elapsed:  5.9min finished


Fitting 5 folds for each of 64 candidates, totalling 320 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   44.2s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 320 out of 320 | elapsed:  5.7min finished


Fitting 5 folds for each of 64 candidates, totalling 320 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  5.5min
[Parallel(n_jobs=-1)]: Done 320 out of 320 | elapsed:  8.9min finished


CPU times: user 48 s, sys: 941 ms, total: 49 s
Wall time: 31min 45s


### Logistic regression

In [None]:
%%time
# Nested CV: every outer fold reruns the whole logistic-regression grid
# search (with its own inner CV) on that fold's training part.
nested_score_lr = cross_val_score(gs_lr_sfs,X_train,y_train,
                                 cv=cv_outer,
                                 scoring='accuracy')

### Baseline

In [None]:
# Dumb baseline: predict the rounded mean of the target (the majority class)
# for every passenger.
avg_dumb_baseline = np.full(y_train.shape, np.rint(y_train.astype(int).mean()))
acc_dumb_baseline = y_train.eq(avg_dumb_baseline).sum() / len(y_train)

# Smarter baseline: every woman survives, every man dies.
baseline_pred = X_train['Sex'].map({'female':1,'male':0})
acc_baseline = y_train.eq(baseline_pred).sum() / len(y_train)

# Report both accuracies as percentages.
for message in (f'Accuracy for dumb baseline {acc_dumb_baseline*100:.2f}%',
                f'Accuracy for baseline {acc_baseline*100:.2f}%'):
    print(message)

In [68]:
# Requires get_gs_info to be defined and gs_rf_sfs to be fitted before this cell runs.
get_gs_info(gs_rf_sfs)

Best model score: 0.8451446864603603
Best model params: {'rfc__max_depth': 9, 'rfc__n_estimators': 12, 'sfs__estimator__max_depth': 9, 'sfs__estimator__n_estimators': 12, 'sfs__k_features': (2, 11)}


In [67]:
def get_gs_info(model):
    """Print the best score and best parameter set of a fitted search object.

    Parameters
    ----------
    model : object
        Anything exposing ``best_score_`` and ``best_params_``.

    Returns
    -------
    None
        The two values are written to stdout, one per line.
    """
    print(f'Best model score: {model.best_score_!s}')
    print(f'Best model params: {model.best_params_!s}')

In [71]:
# Refit the best pipeline found by the grid search on the full training set.
# Requires gs_rf_sfs to have been fitted before this cell runs.
model = gs_rf_sfs.best_estimator_.fit(X_train,y_train)

In [72]:
# Requires submit to be defined before this cell runs.
submit(model,'two_more.csv')

Saved file: two_more.csv


In [14]:
import os
#os.system(command)

In [16]:
# NOTE(review): scratch shell call; the value shown is the command's exit status.
os.system('ls')

0

In [69]:
def submit(model,filename,test_file="../data/test.csv"):
    """Predict on the test set and write a submission CSV.

    Parameters
    ----------
    model : object
        Fitted estimator whose ``predict`` accepts the frame returned by
        ``read_data``.
    filename : str
        Path of the CSV file to write.
    test_file : str, optional
        CSV handed to ``read_data``. Defaults to the location that was
        previously hard-coded, so existing calls behave as before.

    Returns
    -------
    None
        Writes ``filename`` with the columns ``PassengerId`` and
        ``Survived`` (no index column) and prints a confirmation line.
    """
    X_test = read_data(test_file)
    predictions = model.predict(X_test)
    # PassengerId is the frame's index after read_data, so take it from there.
    submission = pd.DataFrame({'PassengerId':X_test.index,'Survived':predictions})
    submission.to_csv(filename,index=False)
    print('Saved file: ' + filename)

In [42]:
%%time
# Run the random-forest grid search on the full training set; afterwards
# best_score_, best_params_ and best_estimator_ are available on gs_rf_sfs.
gs_rf_sfs.fit(X_train,y_train)

Fitting 5 folds for each of 56 candidates, totalling 280 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  7.8min
[Parallel(n_jobs=-1)]: Done 280 out of 280 | elapsed: 14.4min finished


CPU times: user 4min 34s, sys: 4.17 s, total: 4min 38s
Wall time: 19min 18s


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=1, shuffle=True),
             error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('pre_pros',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('cat',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('imputer',
                                                                                          SimpleImputer(add_indicator=False,
                                                                                      

In [43]:
gs_rf_sfs.best_score_

0.858602724248321

In [24]:
# Random-forest pipeline without feature selection.
pipe_rfr = Pipeline([("pre_pros",preprocessor), 
                     ("rfr",RandomForestClassifier(n_estimators=10,random_state=rs)) ])
params_rfr = {"rfr__n_estimators":[1,5,10,50,100,250,500,1000],
              "rfr__max_depth":[2,5,10,20,50,75,150]}

# Grid search over the pipeline above. The nested-CV cell further down
# uses `gs_rf`, which was otherwise never created from this pipeline and
# grid; built here the same way as the notebook's other searches.
# NOTE(review): reconstructed — confirm the settings match the original run.
gs_rf = GridSearchCV(estimator = pipe_rfr,
                  param_grid = params_rfr,
                  cv = cv_inner,
                  scoring = "accuracy")





## Baseline

In [25]:
# NOTE(review): this cell repeats the baseline computation from the earlier
# "Baseline" cell verbatim; one of the two can probably be removed.
# Dumb baseline: predict the rounded mean of the target (the majority class)
# for every passenger.
avg_dumb_baseline = np.ones((y_train.shape))*np.rint(y_train.astype(int).mean())
acc_dumb_baseline = (y_train==avg_dumb_baseline).sum()/len(y_train)

# Smarter baseline: every woman survives, every man dies.
baseline_pred = X_train.Sex.map({'female':1,'male':0})
acc_baseline = (y_train==baseline_pred).sum()/len(y_train)

# Report both accuracies as percentages.
print(f'Accuracy for dumb baseline {acc_dumb_baseline*100:.2f}%')
print(f'Accuracy for baseline {acc_baseline*100:.2f}%')

Accuracy for dumb baseline 61.62%
Accuracy for baseline 78.68%


## nested cross validations

In [26]:
%%time
# Nested CV for the random-forest search `gs_rf`, which must be defined
# before this cell runs.
# Note: this overwrites nested_score_rfr from the SFS variant computed earlier.
nested_score_rfr = cross_val_score(gs_rf,X_train,y_train,
                                 cv=cv_outer,
                                 scoring='accuracy')

CPU times: user 12min 26s, sys: 10.8 s, total: 12min 37s
Wall time: 12min 8s


In [27]:
nested_score_rfr

array([0.80446927, 0.83707865, 0.7752809 , 0.8258427 , 0.86516854])

In [28]:
%%time 
# NOTE(review): `gs_lr` is not defined in any visible cell (only gs_lr_sfs
# and gs_lr_k are) — confirm where it comes from.
# Note: this overwrites nested_score_lr from the SFS variant computed earlier.
nested_score_lr = cross_val_score(gs_lr,X_train,y_train,
                                 cv=cv_outer,
                                 scoring='accuracy')

CPU times: user 2.68 s, sys: 229 ms, total: 2.91 s
Wall time: 41min 14s


In [29]:
nested_score_lr

array([0.80446927, 0.78651685, 0.79213483, 0.76404494, 0.79775281])

In [30]:
# Logistic regression on polynomial features reduced by PCA.
pipe_lr_k = Pipeline(steps=[
    ('pre_pros', preprocessor),
    ('inc_spac', PolynomialFeatures()),
    ('pca', PCA()),
    ('lr', LogisticRegression(random_state=rs)),
])
params_lr_k = {
    'inc_spac__degree': [1, 2, 5, 10],
    'pca__n_components': [1, 3, 5],
    'lr__C': [0.0001, 0.001, 0.1, 1, 10, 100, 1000],
}

# Cross-validation splitters for the outer and inner loops of the nested CV.
cv_outer = StratifiedKFold(random_state=rs, shuffle=True)
cv_inner = StratifiedKFold(random_state=rs, shuffle=True)

# Inner-loop grid search.
gs_lr_k = GridSearchCV(
    estimator=pipe_lr_k,
    param_grid=params_lr_k,
    scoring="accuracy",
    cv=cv_inner,
    n_jobs=2,
)



In [None]:
%%time 
# Nested CV for the polynomial + PCA logistic-regression search; the outer
# folds are evaluated in parallel (n_jobs=-1).
nested_score_lr_k = cross_val_score(gs_lr_k,X_train,y_train,
                                    cv=cv_outer,
                                    scoring='accuracy',
                                    verbose=2,
                                    n_jobs=-1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


## train best model

In [271]:
# NOTE(review): `gs_lr` is not defined in any visible cell — confirm where it comes from.
gs_lr.fit(X_train,y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=1, shuffle=True),
             error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('pre_pros',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('cat',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('imputer',
                                                                                          SimpleImputer(add_indicator=False,
                                                                                      

## Submission

In [45]:
# Load the test set with the same reader used for the training data.
X_test = read_data("../data/test.csv")

In [46]:
X_test.columns

Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'Mr',
       'Miss', 'Mrs', 'Master'],
      dtype='object')

In [44]:
X_train.columns

Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'Mr',
       'Miss', 'Mrs', 'Master'],
      dtype='object')

In [48]:
# Predict with the best random-forest + SFS pipeline found by the grid search.
predictions = gs_rf_sfs.best_estimator_.predict(X_test)

# Sex-only baseline submission (female -> 1, male -> 0).
# NOTE(review): baseline_sub is built here but never written in the visible cells.
prediction_baseline = X_test.Sex.map({'female':1,'male':0})
baseline_sub = pd.DataFrame({'PassengerId':X_test.index,'Survived':prediction_baseline})

In [49]:
# Two-column submission frame: PassengerId (taken from the index) and the predicted label.
submission = pd.DataFrame({'PassengerId':X_test.index,'Survived':predictions})

In [51]:
# Write the submission without the index column.
# Note: this reuses (and overwrites) the `filename` variable set when loading the training data.
filename = 'rf_sfs_sub.csv'

submission.to_csv(filename,index=False)

print('Saved file: ' + filename)

Saved file: rf_sfs_sub.csv
