In [1]:
import pandas as pd 
import numpy as np

# explicitly require this experimental feature
from sklearn.experimental import enable_iterative_imputer  # noqa
# now you can import normally from sklearn.impute
from sklearn.impute import IterativeImputer, SimpleImputer

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

from mlxtend.feature_selection import SequentialFeatureSelector as SFS
import os

In [4]:
rfr_pred = pd.read_csv('rf_sfs_sub.csv')
svc_pred = pd.read_csv('fist_svc.csv')

In [5]:
(rfr_pred != svc_pred).sum()

PassengerId     0
Survived       37
dtype: int64

In [13]:
rs=1

In [None]:
def get_gs_info(model):
    """
    
    """
    print('Best model score: ' + str(model.best_score_))
    print('Best model params: '+ str(model.best_params_))

In [6]:
def onehot_name(df):
    mrs = '|'.join(['Mrs\.','Mme\.', 'Ms\.', 'Dr\.', 'Lady\.', 'Countess\.'])    
    mr = '|'.join(['Mr\.','Don\.','Rev\.','Dr\.', 'Major\.', 'Sir\.', 'Col\.', 'Capt\.', 'Jonkheer\.'])
    miss = '|'.join(['Miss\.', 'Mlle\.'])
    return df.assign(Name = lambda x : x['name'].str.replace('\(.*\)', '', regex=True),
                     Mr =  lambda x : x['name'].str.contains(mr).astype(int) ,
                     Miss = lambda x : x['name'].str.contains(miss).astype(int),
                     Mrs = lambda x : x['name'].str.contains(mrs).astype(int),
                     Master = lambda x : (x['name'].str.find('Master.') > -1).astype(int))

In [7]:
def feature_egneenering(df):
    """
    
    """
    df = (df.assign(family_size = df.sibsp + df.parch+1,
                    is_alone = lambda x: x.family_size < 2,
                    calc_fare = lambda x: x.fare/x.family_size)
            .astype({'is_alone':int})
           )
    return df


In [8]:
def read_data(file):
    """
    
    """
    df = (pd.read_csv(file, dtype = {
                            "PassengerId":np.int32,
                            "Name": "object",
                            "Pclass":np.int,
                            "Survived":np.int32,
                            "Sex":"object",
                            "Age":np.float,
                            "SibSp":np.int,
                            "Embarked":"object",
                            "Cabin":"object",
                            "Fare":np.float64,
                        },)
            .rename(columns=str.lower)
            .set_index("passengerid")
            .pipe(onehot_name)
            .pipe(feature_egneenering)
            .rename(columns=str.lower)
            .drop(['cabin','name','ticket'],axis=1)
        )
    return df
    

In [9]:
filename = "../data/train.csv"
data = read_data(filename)
X_train = data.drop(['survived'],axis=1) 
y_train = data.survived

In [10]:
# We create the preprocessing pipelines for both numeric and categorical data.
numeric_features = X_train.select_dtypes(exclude = 'O').columns.values #['age','fare','mr', 'miss', 'mrs', 'master']
numeric_transformer = Pipeline(steps=[
    ('imputer', IterativeImputer(random_state=0,initial_strategy='median')),
    ('scaler', StandardScaler())])

categorical_features = X_train.select_dtypes(include = 'O').columns.values#['Embarked', 'Sex', 'Pclass']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent',missing_values=np.nan)),
    ('onehot', OneHotEncoder(handle_unknown='error', drop="first"))
    ])

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_features),
        ('num', numeric_transformer, numeric_features)
        ])

In [11]:
def make_params(string0,string1,string2,string3=None):
    params = []
    if string3:
        for el2 in string3[1]:
            for el in string2[1]:
                params.append({string1[0]:string1[1],
                              string0[0]+'__'+string2[0]:[el],
                              string0[1]+'__estimator__'+string2[0]:[el],
                              string0[0]+'__'+string3[0]:[el2],
                              string0[1]+'__estimator__'+string3[0]:[el2]})
    else:
        for el in string2[1]:
            params.append({string1[0]:string1[1],
                         string0[0]+'__'+string2[0]:[el],
                         string0[1]+'__estimator__'+string2[0]:[el]})
    return params

In [14]:
no_feat = X_train.shape[1]

#cross validation for inner and outer loopn in nested cv
cv_outer = StratifiedKFold(shuffle=True, random_state = rs)
cv_inner = StratifiedKFold(shuffle=True, random_state = rs)
cv_sfs = StratifiedKFold(shuffle=True, random_state = rs)

In [17]:
## RFR
est = RandomForestClassifier(n_estimators=10,random_state=rs)
sfs1 = SFS(estimator=est, 
           k_features=3,
           forward=True, 
           floating=False, 
           scoring='neg_mean_absolute_error',
           cv=cv_sfs)

pipe_rfr_sfs = Pipeline([('pre_pros',  preprocessor),
                 ('sfs', sfs1),
                 ('rfc',est)])

string1 = ['sfs__k_features', [(2,no_feat)]]
string2 = ['n_estimators',[1,5,10,50,100,250,500,1000]]
string3 = ['max_depth',[2,5,10,20,50,75,150]]
param_grid_rfr_sfs = make_params(['rfc','sfs'],string1,string2,string3)

gs_rf_sfs = GridSearchCV(estimator = pipe_rfr_sfs,
                  param_grid = param_grid_rfr_sfs,
                  cv = cv_inner,
                  scoring = "accuracy",
                  n_jobs=-1,
                  verbose=2)

In [40]:
## SVC
est = SVC(probability=True)
sfs2 = SFS(estimator=est, 
           k_features=3,
           forward=True, 
           floating=False, 
           scoring='neg_mean_absolute_error',
           cv=cv_sfs)

pipe_svc_sfs = Pipeline([('pre_pros',  preprocessor),
                 ('sfs', sfs2),
                 ('svc',est)])

string1 = ['sfs__k_features', [(2,no_feat)]]
string2 = ['C', [0.0001,0.001,0.01,0.1,1,10,100,100]]
string3 = ['gamma',[0.0001,0.001,0.01,0.1,1,10,100,100]]
param_grid_svc_sfs = make_params(['svc','sfs'],string1,string2,string3)

gs_svc_sfs = GridSearchCV(estimator = pipe_svc_sfs,
                  param_grid = param_grid_svc_sfs,
                  cv = cv_inner,
                  scoring = "accuracy",
                  n_jobs=-1,
                  verbose=2)

In [19]:
%%time
gs_rf_sfs.fit(X_train,y_train)

Fitting 5 folds for each of 56 candidates, totalling 280 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 11.7min
[Parallel(n_jobs=-1)]: Done 280 out of 280 | elapsed: 21.3min finished


CPU times: user 3.7 s, sys: 210 ms, total: 3.91 s
Wall time: 21min 17s


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=1, shuffle=True),
             error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('pre_pros',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('cat',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('imputer',
                                                                                          SimpleImputer(add_indicator=False,
                                                                                      

In [41]:
%%time
gs_svc_sfs.fit(X_train,y_train)

Fitting 5 folds for each of 64 candidates, totalling 320 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  4.6min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 20.0min
[Parallel(n_jobs=-1)]: Done 320 out of 320 | elapsed: 44.2min finished


CPU times: user 34.4 s, sys: 374 ms, total: 34.8 s
Wall time: 44min 45s


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=1, shuffle=True),
             error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('pre_pros',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('cat',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('imputer',
                                                                                          SimpleImputer(add_indicator=False,
                                                                                      

In [27]:
## KNN
est = KNeighborsClassifier(weights = 'distance')
sfs3 = SFS(estimator=est, 
           k_features=3,
           forward=True, 
           floating=False, 
           scoring='neg_mean_absolute_error',
           cv=cv_sfs)

pipe_knn_sfs = Pipeline([('pre_pros',  preprocessor),
                 ('sfs', sfs3),
                 ('knn',est)])

string1 = ['sfs__k_features', [(2,no_feat)]]
string2 = ['n_neighbors', [1,2,3,5,10,20,30]]
string3 = ['p',[1,2,3]]
param_grid_knn_sfs = make_params(['knn','sfs'],string1,string2,string3)

gs_knn_sfs = GridSearchCV(estimator = pipe_knn_sfs,
                  param_grid = param_grid_knn_sfs,
                  cv = cv_inner,
                  scoring = "accuracy",
                  n_jobs=-1,
                  verbose=2)

In [28]:
%%time
gs_knn_sfs.fit(X_train,y_train)

Fitting 5 folds for each of 21 candidates, totalling 105 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   35.4s
[Parallel(n_jobs=-1)]: Done 105 out of 105 | elapsed:  1.6min finished


CPU times: user 2.84 s, sys: 96 ms, total: 2.94 s
Wall time: 1min 37s


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=1, shuffle=True),
             error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('pre_pros',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('cat',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('imputer',
                                                                                          SimpleImputer(add_indicator=False,
                                                                                      

In [30]:
get_gs_info(gs_knn_sfs)

Best model score: 0.8339087314041805
Best model params: {'knn__n_neighbors': 20, 'knn__p': 1, 'sfs__estimator__n_neighbors': 20, 'sfs__estimator__p': 1, 'sfs__k_features': (2, 14)}


In [42]:
get_gs_info(gs_svc_sfs) # with prob

Best model score: 0.8271483271608814
Best model params: {'sfs__estimator__C': 10, 'sfs__estimator__gamma': 0.01, 'sfs__k_features': (2, 14), 'svc__C': 10, 'svc__gamma': 0.01}


In [25]:
get_gs_info(gs_svc_sfs) # without prob

Best model score: 0.8271483271608814
Best model params: {'sfs__estimator__C': 10, 'sfs__estimator__gamma': 0.01, 'sfs__k_features': (2, 14), 'svc__C': 10, 'svc__gamma': 0.01}


In [24]:
get_gs_info(gs_rf_sfs)

Best model score: 0.8226727763480008
Best model params: {'rfc__max_depth': 5, 'rfc__n_estimators': 1, 'sfs__estimator__max_depth': 5, 'sfs__estimator__n_estimators': 1, 'sfs__k_features': (2, 14)}


In [43]:
knn_model = gs_knn_sfs.best_estimator_
rf_model = gs_rf_sfs.best_estimator_
svc_model = gs_svc_sfs.best_estimator_

estimators=[('knn', knn_model), ('rf', rf_model), ('svc', svc_model)]
#create our voting classifier, inputting our models
ensemble = VotingClassifier(estimators, voting='hard')


In [44]:
%%time
ensamble_score = cross_val_score(ensemble,X_train,y_train,cv = cv_inner,n_jobs=-1)
ensamble_score

CPU times: user 36 ms, sys: 5.06 ms, total: 41 ms
Wall time: 1min 3s


array([0.81564246, 0.85393258, 0.80898876, 0.81460674, 0.85955056])

In [38]:
%%time
ensemble.fit(X_train,y_train)

VotingClassifier(estimators=[('knn',
                              Pipeline(memory=None,
                                       steps=[('pre_pros',
                                               ColumnTransformer(n_jobs=None,
                                                                 remainder='drop',
                                                                 sparse_threshold=0.3,
                                                                 transformer_weights=None,
                                                                 transformers=[('cat',
                                                                                Pipeline(memory=None,
                                                                                         steps=[('imputer',
                                                                                                 SimpleImputer(add_indicator=False,
                                                                                    

In [35]:
def submit(model,filename):
    """
    
    """
    X_test = read_data("../data/test.csv")
    predictions = model.predict(X_test)
    submission = pd.DataFrame({'PassengerId':X_test.index,'Survived':predictions})
    submission.to_csv(filename,index=False)
    print('Saved file: ' + filename)
    return

In [39]:
submit(ensemble,'ensamble_first.csv')

Saved file: ensamble_first.csv
