In [70]:
# https://jorisvandenbossche.github.io/blog/2018/05/28/scikit-learn-columntransformer/

In [7]:
import pandas as pd
import numpy as np
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder, Imputer, StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.linear_model import LogisticRegression

df = pd.read_csv('./titanic_data/train.csv')

# drop less useful columns
df.drop(['Name','Ticket','Cabin'],axis='columns', inplace=True)

# combine & drop
df['family_mem_count'] = df.SibSp + df.Parch
df.drop(['SibSp','Parch'],axis='columns', inplace=True)


In [12]:
df['Sex'] = df['Sex'].astype('category')
df['Pclass'] = df['Pclass'].astype('category')
df['Embarked'] = df['Embarked'].astype('category')
df['Survived'] = df['Survived'].astype('category')

df = df.fillna({'Embarked':'S', 'Age':np.mean(df.Age)})

numerical_columns = ['Age','Fare']
categorical_columns = ['Pclass','Sex','Embarked']

numerical_columns, categorical_columns

(['Age', 'Fare'], ['Pclass', 'Sex', 'Embarked'])

In [25]:
colomn_trans_logi = make_column_transformer(
    (StandardScaler(), numerical_columns),
    (OneHotEncoder(), categorical_columns),remainder='passthrough')

pipe_logi = make_pipeline(colomn_trans_logi, LogisticRegression(solver='lbfgs', max_iter=500))

X = df.drop('Survived', axis='columns')
y = df.Survived

In [26]:
param_grid = {
    'logisticregression__C': [0.1, 1.0, 1.0]
}

grid_cv = GridSearchCV(pipe_logi, param_grid=param_grid, scoring='accuracy', cv=5)

In [27]:
grid_cv.fit(X,y)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('columntransformer',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='passthrough',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('standardscaler',
                                                                         StandardScaler(copy=True,
                                                                                        with_mean=True,
                                                                                        with_std=True),
                                                                         ['Age',
                                             

In [29]:
grid_cv.score(X,y)

0.8047138047138047

In [30]:
grid_cv.best_params_

{'logisticregression__C': 0.1}

In [31]:
grid_cv.best_score_

0.8058361391694725

In [36]:
df_test = pd.read_csv('./titanic_data/test.csv')
df_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [37]:
df_test.columns

Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [39]:
df_test['family_mem_count'] = df_test.SibSp + df_test.Parch
df_test.drop(['SibSp','Parch'],axis='columns', inplace=True)

In [42]:
df_test.drop(['Name','Ticket','Cabin'], axis='columns', inplace=True)

In [44]:
df_test.isnull().sum()

PassengerId          0
Pclass               0
Sex                  0
Age                 86
Fare                 1
Embarked             0
family_mem_count     0
dtype: int64

In [45]:
df_test = df_test.fillna({'Age':np.mean(df_test.Age), 'Fare': np.mean(df_test.Fare)})

In [46]:
df_test.isnull().sum()

PassengerId         0
Pclass              0
Sex                 0
Age                 0
Fare                0
Embarked            0
family_mem_count    0
dtype: int64

In [48]:
test_y_predicted = grid_cv.predict(df_test)

In [55]:
submission_data = {'PassengerId':list(df_test.PassengerId), 'Survived':list(test_y_predicted)}

submission_df = pd.DataFrame(submission_data)
submission_df.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


In [65]:
submission_df.sort_values('PassengerId',ascending=True, inplace=True)
submission_df.to_csv('titanic_submission.csv',index=False)

In [68]:
import joblib as jbl

jbl.dump(grid_cv, 'model_logi')

['model_logi']

In [69]:
model_logi = jbl.load('model_logi')
model_logi.best_score_

0.8058361391694725