In [48]:
# https://jorisvandenbossche.github.io/blog/2018/05/28/scikit-learn-columntransformer/

In [10]:
import pandas as pd
import numpy as np

# Load the training CSV from a path relative to the notebook's working directory.
df = pd.read_csv('./titanic_data/train.csv')
# Show only the first row as a quick look at the available columns.
df.head(1)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S


In [11]:
def drop_add_unwanted_columns(df_):
    """Drop unused columns and fold the two family columns into one count.

    Removes ``Name``, ``Ticket`` and ``Cabin``, appends ``family_mem_count``
    computed as ``SibSp + Parch``, and then removes ``SibSp`` and ``Parch``.

    The input frame is not modified; a new DataFrame is returned. This keeps
    the function safe to call on a frame that other code still holds a
    reference to.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Frame containing at least the columns ``Name``, ``Ticket``, ``Cabin``,
        ``SibSp`` and ``Parch``.

    Returns
    -------
    pandas.DataFrame
        New frame with the remaining original columns (in their original
        order) followed by ``family_mem_count``.

    Raises
    ------
    KeyError
        If any of the required columns is missing from ``df_``.
    """
    return (
        df_
        # drop less useful columns
        .drop(columns=['Name', 'Ticket', 'Cabin'])
        # combine the two family columns into a single count ...
        .assign(family_mem_count=lambda frame: frame['SibSp'] + frame['Parch'])
        # ... and drop the originals now that they are folded in
        .drop(columns=['SibSp', 'Parch'])
    )
    
def convert_categorial_columns(df_):
    """Cast the categorical columns to pandas ``category`` dtype.

    ``Sex``, ``Pclass`` and ``Embarked`` are always converted. ``Survived`` is
    converted only when it is present, so the function also accepts frames
    without a label column (such as the test data used in this notebook).

    The input frame is not modified; a new DataFrame is returned.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Frame containing at least ``Sex``, ``Pclass`` and ``Embarked``.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns in the same order, with the listed
        columns cast to ``category``.

    Raises
    ------
    KeyError
        If ``Sex``, ``Pclass`` or ``Embarked`` is missing from ``df_``.
    """
    categorical_columns = ['Sex', 'Pclass', 'Embarked']
    # The label column is optional: only cast it when the frame carries it.
    if 'Survived' in df_.columns:
        categorical_columns.append('Survived')
    # astype with a mapping returns a new frame and leaves df_ as it was.
    return df_.astype({column: 'category' for column in categorical_columns})

def fill_missing_values(df_):
    """Fill missing ``Embarked``, ``Age`` and ``Fare`` values.

    ``Embarked`` is filled with its most frequent *non-missing* value, and
    ``Age`` and ``Fare`` are filled with their column means (missing values
    are skipped when computing the mean).

    The most frequent value is taken from ``value_counts()`` with missing
    values excluded. Counting missing values as a candidate would pick NaN as
    the "fill value" whenever NaN is the most common entry, which would leave
    the column unfilled.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Frame containing the columns ``Embarked``, ``Age`` and ``Fare``.

    Returns
    -------
    pandas.DataFrame
        New frame with the fills applied; ``df_`` itself is not modified.
        If ``Embarked`` has no observed value at all it is left as is.
    """
    fill_values = {'Age': df_.Age.mean(), 'Fare': df_.Fare.mean()}

    # value_counts() sorts by frequency and drops NaN by default.
    embarked_counts = df_.Embarked.value_counts()
    # A categorical column also lists unused categories with a count of 0,
    # so require at least one real observation before choosing a fill value.
    if len(embarked_counts) > 0 and embarked_counts.iloc[0] > 0:
        fill_values['Embarked'] = embarked_counts.index[0]

    return df_.fillna(fill_values)

def get_feature_target_columns(df_):
    """Split a prepared frame into model inputs and the label column.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Frame containing the seven feature columns listed below and the
        ``Survived`` column.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is a DataFrame of the feature columns in the
        fixed order given here and ``y`` is the ``Survived`` Series.
    """
    feature_names = [
        'PassengerId',
        'Pclass',
        'Sex',
        'Age',
        'Fare',
        'Embarked',
        'family_mem_count',
    ]
    features = df_[feature_names]
    target = df_['Survived']
    return features, target

In [12]:
# Apply the three preparation steps in order: column pruning, dtype
# conversion, then missing-value filling.
df = (
    df
    .pipe(drop_add_unwanted_columns)
    .pipe(convert_categorial_columns)
    .pipe(fill_missing_values)
)

# Total number of missing cells left anywhere in the frame.
df.isnull().sum().sum()

0

In [13]:
from sklearn.compose import make_column_transformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# One-hot encode the three categorical columns; all remaining columns are
# passed through unchanged. Scaling of Age/Fare is currently not applied.
column_transf = make_column_transformer(
    (OneHotEncoder(), ['Pclass', 'Sex', 'Embarked']),
    remainder='passthrough',
)

# Preprocessing followed by a random forest with a fixed seed.
clf = RandomForestClassifier(n_estimators=10, random_state=1)
pipe = make_pipeline(column_transf, clf)

X, y = get_feature_target_columns(df)

# Grid over forest size (10, 15, ..., 345) and split criterion. The
# 'randomforestclassifier__' prefix addresses the classifier step inside the
# pipeline; pipe.get_params().keys() lists every tunable name.
params = {
    'randomforestclassifier__n_estimators': np.arange(10, 350, 5),
    'randomforestclassifier__criterion': ['gini', 'entropy'],
}
grid_cv = GridSearchCV(
    estimator=pipe,
    param_grid=params,
    scoring='accuracy',
    cv=5,
)

In [14]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

# Run the cross-validated search on the training portion only.
grid_cv.fit(X_train, y_train)

# Best parameter combination and its mean cross-validated score.
(grid_cv.best_params_, grid_cv.best_score_)

({'randomforestclassifier__criterion': 'entropy',
  'randomforestclassifier__n_estimators': 135},
 0.8170594837261503)

In [None]:
# Predict labels for the held-out rows produced by train_test_split.
y_pred = grid_cv.predict(X_test)
# Score the search object on the held-out split (the search was configured with scoring='accuracy').
print('score:',grid_cv.score(X_test, y_test))

In [41]:
# Load the test CSV from a path relative to the notebook's working directory.
df_test = pd.read_csv('./titanic_data/test.csv')
# Preview the first rows of the raw, unprocessed test data.
df_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [43]:
# Re-read the raw test file so this cell always starts from unprocessed data,
# regardless of what earlier cells did to df_test.
df_test = pd.read_csv('./titanic_data/test.csv')

# Same three preparation steps, in the same order, as used for the training frame.
df_test = drop_add_unwanted_columns(df_test)
df_test = convert_categorial_columns(df_test)
df_test = fill_missing_values(df_test)

# Total number of missing cells left anywhere in the frame. The second sum()
# collapses the per-column counts into one scalar, matching the check done on
# the training frame.
df_test.isnull().sum().sum()

0

In [44]:
# Predict a label for every row of the prepared test frame.
test_y_predicted = grid_cv.predict(df_test)

# One row per passenger: the id alongside the predicted label.
submission_data = {
    'PassengerId': list(df_test['PassengerId']),
    'Survived': list(test_y_predicted),
}
submission_df = pd.DataFrame(submission_data)
print(submission_df.head())

# Order rows by passenger id, then write the file without the index column.
submission_df = submission_df.sort_values(by='PassengerId', ascending=True)
submission_df.to_csv('titanic_submission.csv', index=False)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0


In [47]:
import joblib as jbl

# Persist the fitted grid-search object. The previous name `grgrid_cvv` is not
# defined anywhere in the notebook and raises NameError on a fresh kernel.
jbl.dump(grid_cv, 'model_logi')

# Reload the file to confirm it round-trips and still exposes the search results.
# NOTE: joblib.load unpickles its input; only load files from a trusted source.
model_logi = jbl.load('model_logi')
model_logi.best_score_

['model_logi']