In [2]:
import pandas as pd
import numpy as np
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline


# explicitly require this experimental feature
from sklearn.experimental import enable_iterative_imputer  # noqa
# now you can import normally from sklearn.impute
from sklearn.impute import IterativeImputer
from sklearn.impute import SimpleImputer

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA

In [207]:
from sklearn.decomposition import PCA

In [3]:
# Shared seed: passed as random_state to the random forest, the logistic
# regression and both StratifiedKFold splitters further down.
rs = 1

In [None]:
def onehot_name(df):
    """Add 0/1 title-indicator columns derived from the ``Name`` column.

    Any parenthesised text is first stripped from ``Name``; the cleaned name is
    then searched for honorifics and four integer columns are added:
    ``Mr``, ``Miss``, ``Mrs`` and ``Master``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a string column called ``Name``.

    Returns
    -------
    pandas.DataFrame
        A new frame (``df`` itself is not modified) with the cleaned ``Name``
        and the four indicator columns.

    Notes
    -----
    * ``Dr.`` appears in both the male and the female title group, so a
      doctor is flagged in ``Mr`` and ``Mrs`` at the same time.
    * Rows whose ``Name`` is missing get 0 in every indicator instead of
      raising during the integer cast.
    """
    # Raw strings so the backslash reaches the regex engine untouched and Python
    # does not warn about an invalid escape sequence in the literal.
    mrs = '|'.join([r'Mrs\.', r'Mme\.', r'Ms\.', r'Dr\.', r'Lady\.', r'Countess\.'])
    mr = '|'.join([r'Mr\.', r'Don\.', r'Rev\.', r'Dr\.', r'Major\.', r'Sir\.',
                   r'Col\.', r'Capt\.', r'Jonkheer\.'])
    miss = '|'.join([r'Miss\.', r'Mlle\.'])

    # assign() evaluates the keyword arguments in order, so every indicator
    # below is computed from the already cleaned Name column.
    # na=False maps a missing name to False rather than NaN, which could not be
    # cast to int.
    return df.assign(
        Name=lambda x: x['Name'].str.replace(r'\(.*\)', '', regex=True),
        Mr=lambda x: x['Name'].str.contains(mr, na=False).astype(int),
        Miss=lambda x: x['Name'].str.contains(miss, na=False).astype(int),
        Mrs=lambda x: x['Name'].str.contains(mrs, na=False).astype(int),
        # 'Master.' is matched literally (the dot is not a wildcard here).
        Master=lambda x: x['Name'].str.contains('Master.', regex=False, na=False).astype(int),
    )

In [None]:
def find_age_cat(x):
    """Label one passenger row with a coarse age category taken from its title.

    The title is looked up case-insensitively in ``x['Name']`` and the result
    is stored under ``x['age_cat']``:

    * ``'younger'`` when the name contains ``Master.``
    * ``'older'``   when it contains ``Mr.`` or ``Mrs.``
    * ``'unknown'`` otherwise, including when the name is not a string

    Parameters
    ----------
    x : pandas.Series
        A single row exposing a ``Name`` entry (e.g. as passed by
        ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    pandas.Series
        The same row object with ``age_cat`` set.
    """
    name = x['Name']

    # A missing name (NaN) has no .lower(); treat it as an unknown title.
    if not isinstance(name, str):
        x['age_cat'] = 'unknown'
        return x

    lowered = name.lower()

    # Titles are matched together with their trailing dot so that surnames that
    # merely contain the letters "mr" (e.g. "Hamre") are not taken for "Mr.".
    # Item assignment is used so that age_cat is stored as row data even when
    # the row did not have that key beforehand.
    if 'master.' in lowered:
        x['age_cat'] = 'younger'
    elif 'mr.' in lowered or 'mrs.' in lowered:
        x['age_cat'] = 'older'
    else:
        x['age_cat'] = 'unknown'
    return x


In [256]:
def read_data(file):
    """Read a passenger CSV and return it with title indicators added.

    The file is parsed with explicit column dtypes, indexed by
    ``PassengerId``, passed through ``onehot_name`` (which adds the
    ``Mr``/``Miss``/``Mrs``/``Master`` columns) and finally stripped of the
    ``Name``, ``Ticket`` and ``Cabin`` columns.

    Parameters
    ----------
    file : str or path-like
        Location of the CSV file, forwarded to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The loaded frame, indexed by ``PassengerId``.
    """
    column_dtypes = {
        "PassengerId": np.int32,
        "Name": "object",
        "Pclass": "object",
        # NOTE(review): presumably ignored by pandas when the file has no
        # 'Survived' column (as for the test set) - confirm.
        "Survived": np.int32,
        "Sex": "object",
        # np.float was only an alias of the builtin float and has been removed
        # from NumPy; np.float64 is the same 64-bit type.
        "Age": np.float64,
        "SibSp": "object",
        "Embarked": "object",
        "Cabin": "object",
        "Fare": np.float64,
    }
    df = (pd.read_csv(file, dtype=column_dtypes)
            .set_index("PassengerId")
            .pipe(onehot_name)
            .drop(['Name', 'Ticket', 'Cabin'], axis=1)
        )
    return df
    

In [258]:
# Load the training set; read_data() indexes it by PassengerId, adds the title
# indicator columns and drops Name, Ticket and Cabin.
filename = "../data/train.csv"
data = read_data(filename)

In [127]:
#data['age_cat'] = np.nan
#data.apply(find_age_cat,axis=1).age_cat.str.contains('unknown').sum()

In [260]:
# NOTE(review): in the cells visible here X_train is only assigned in the next
# cell, so on a fresh kernel this cell has to run after it.
X_train.columns

Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked',
       'Mr', 'Miss', 'Mrs', 'Master'],
      dtype='object')

In [261]:
# Split the loaded frame into the feature matrix and the target column.
X_train = data.drop(columns=['Survived'])
y_train = data['Survived']

In [262]:
# Preprocessing: every column group gets its own impute-then-transform pipeline.

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode while dropping the first level of each feature.
categorical_features = ['Embarked', 'Sex', 'Pclass']
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
    ('onehot', OneHotEncoder(drop="first", handle_unknown='error')),
])

# Numeric columns (including the 0/1 title indicators): iterative imputation
# initialised with the median, followed by standardisation.
numeric_features = ['Age', 'Fare', 'Mr', 'Miss', 'Mrs', 'Master']
numeric_transformer = Pipeline([
    ('imputer', IterativeImputer(initial_strategy='median', random_state=0)),
    ('scaler', StandardScaler()),
])

# The categorical block comes first in the output, then the numeric block.
# No remainder is passed, so columns in neither list are left out.
preprocessor = ColumnTransformer([
    ('cat', categorical_transformer, categorical_features),
    ('num', numeric_transformer, numeric_features),
])

In [264]:
# Stratified splitters for the nested CV: the inner one drives the grid
# searches, the outer one is used when scoring them.
cv_inner = StratifiedKFold(shuffle=True, random_state=rs)
cv_outer = StratifiedKFold(shuffle=True, random_state=rs)

# Random forest: preprocessing followed by the classifier.
pipe_rfr = Pipeline(steps=[
    ("pre_pros", preprocessor),
    ("rfr", RandomForestClassifier(n_estimators=10, random_state=rs)),
])
params_rfr = dict(
    rfr__n_estimators=[1, 5, 10, 50, 100, 250, 500, 1000],
    rfr__max_depth=[2, 5, 10, 20, 50, 75, 150],
)

# Logistic regression: preprocessing, PCA, then the classifier.
pipe_lr = Pipeline(steps=[
    ('pre_pros', preprocessor),
    ('pca', PCA()),
    ('lr', LogisticRegression(random_state=rs)),
])
params_lr = dict(
    pca__n_components=[1, 3, 5],
    lr__C=[0.0001, 0.001, 0.1, 1, 10, 100, 1000],
)

# Accuracy-scored grid searches over the two pipelines, both on the inner splitter.
gs_rf = GridSearchCV(pipe_rfr, params_rfr,
                     scoring="accuracy",
                     cv=cv_inner,
                     n_jobs=1)

gs_lr = GridSearchCV(pipe_lr, params_lr,
                     scoring="accuracy",
                     cv=cv_inner,
                     n_jobs=1)



## Baseline

In [265]:
# Constant baseline: every passenger gets the rounded mean of the training labels.
avg_dumb_baseline = np.full(y_train.shape, np.rint(y_train.astype(int).mean()))
acc_dumb_baseline = y_train.eq(avg_dumb_baseline).sum() / len(y_train)

# Sex-only baseline: women are predicted to survive, men are not.
baseline_pred = X_train['Sex'].map({'female': 1, 'male': 0})
acc_baseline = y_train.eq(baseline_pred).sum() / len(y_train)

# Report both accuracies as percentages.
print(f'Accuracy for dumb baseline {acc_dumb_baseline*100:.2f}%')
print(f'Accuracy for baseline {acc_baseline*100:.2f}%')

Accuracy for dumb baseline 61.62%
Accuracy for baseline 78.68%


## Nested cross-validation

In [None]:
%%time
# Nested CV for the random forest: the grid search gs_rf (tuned on cv_inner)
# is scored for accuracy on every split of cv_outer.
nested_score_rfr = cross_val_score(gs_rf,X_train,y_train,
                                 cv=cv_outer,
                                 scoring='accuracy')

In [None]:
nested_score_rfr

In [None]:
%% time 
nested_score_lr = cross_val_score(gs_lr,X_train,y_train,
                                 cv=cv_outer,
                                 scoring='accuracy')

In [None]:
nested_score_lr

## train best model

In [25]:
# Run the random-forest grid search on the full training set.
gs_rf.fit(X_train,y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=1, shuffle=True),
             error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('pre_pros',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('cat',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('imputer',
                                                                                          SimpleImputer(add_indicator=False,
                                                                                      

## Submission

In [209]:
# NOTE(review): same call as in the "train best model" section; running both
# cells repeats the whole grid search.
gs_rf.fit(X_train,y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=1, shuffle=True),
             error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('pre_pros',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('cat',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('imputer',
                                                                                          SimpleImputer(add_indicator=False,
                                                                                      

In [222]:
# Load the test set through the same read_data() that produced the training frame.
X_test = read_data("../data/test.csv")

In [223]:
# read_data() already applies onehot_name and then drops 'Name', so the title
# indicators are only derived here when the raw 'Name' column is still present.
# Calling onehot_name without that column raises KeyError.
if 'Name' in X_test.columns:
    X_test = X_test.pipe(onehot_name)

In [224]:
X_test.columns

Index(['Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare',
       'Cabin', 'Embarked', 'Mr', 'Miss', 'Mrs', 'Master'],
      dtype='object')

In [225]:
# errors='ignore': read_data() already removes 'Name' and 'Ticket', and a plain
# drop of absent labels raises KeyError.
X_test.drop(['Name', 'Ticket'], axis=1, errors='ignore')

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Mr,Miss,Mrs,Master
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
892,3,male,34.5,0,0,7.8292,,Q,1,0,0,0
893,3,female,47.0,1,0,7.0000,,S,0,0,1,0
894,2,male,62.0,0,0,9.6875,,Q,1,0,0,0
895,3,male,27.0,0,0,8.6625,,S,1,0,0,0
896,3,female,22.0,1,1,12.2875,,S,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1305,3,male,,0,0,8.0500,,S,1,0,0,0
1306,1,female,39.0,0,0,108.9000,C105,C,0,0,0,0
1307,3,male,38.5,0,0,7.2500,,S,1,0,0,0
1308,3,male,,0,0,8.0500,,S,1,0,0,0


In [226]:
X_train

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Mr,Miss,Mrs,Master
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,3,male,22.0,1,0,7.2500,,S,1,0,0,0
2,1,female,38.0,1,0,71.2833,C85,C,0,0,1,0
3,3,female,26.0,0,0,7.9250,,S,0,1,0,0
4,1,female,35.0,1,0,53.1000,C123,S,0,0,1,0
5,3,male,35.0,0,0,8.0500,,S,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
887,2,male,27.0,0,0,13.0000,,S,1,0,0,0
888,1,female,19.0,0,0,30.0000,B42,S,0,1,0,0
889,3,female,,1,2,23.4500,,S,0,1,0,0
890,1,male,26.0,0,0,30.0000,C148,C,1,0,0,0


In [228]:
# Predict with the refitted best pipeline of the random-forest grid search.
# errors='ignore': read_data() already removes 'Name' and 'Ticket', and a plain
# drop of absent labels raises KeyError.
predictions = gs_rf.best_estimator_.predict(
    X_test.drop(['Name', 'Ticket'], axis=1, errors='ignore'))

In [231]:
X_test.index

Int64Index([ 892,  893,  894,  895,  896,  897,  898,  899,  900,  901,
            ...
            1300, 1301, 1302, 1303, 1304, 1305, 1306, 1307, 1308, 1309],
           dtype='int64', name='PassengerId', length=418)

In [247]:
# Sex-only baseline on the test set: 'female' is mapped to 1 and 'male' to 0,
# then paired with the PassengerId index to form a submission frame.
prediction_baseline = X_test.Sex.map({'female':1,'male':0})
baseline_sub = pd.DataFrame({'PassengerId':X_test.index,'Survived':prediction_baseline})

In [232]:
# Pair the model predictions with the PassengerId index of the test set.
# NOTE(review): in the cells visible here only baseline_sub is written to disk;
# this frame is never saved - confirm whether that is intended.
submission = pd.DataFrame({'PassengerId':X_test.index,'Survived':predictions})

In [248]:
# Write the sex-only baseline frame to CSV (without the index) and report the file name.
filename = 'baseline_sub.csv'
baseline_sub.to_csv(path_or_buf=filename, index=False)
print('Saved file: ' + filename)

Saved file: baseline_sub.csv
