## Training Model

In [40]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score

In [27]:
# Base folder for the dataset files; a relative path, so it is resolved against
# the kernel's current working directory. Paths below are built by plain string
# concatenation, which is why the trailing slash matters.
data_dir = "../data/"

In [29]:
def data_cleaning(save_df=False):
    # data load
    df = pd.read_csv(data_dir + "raw/phpMYEkMl.csv")

    # replace char
    df = df.replace("?", np.nan)

    # drop columns
    drop_cols = ["name", "sibsp", "parch", "ticket", "cabin", "boat", "body", "home.dest"]
    df = df.drop(columns=drop_cols)

    # clean columns
    df["sex"] = np.where(df["sex"]=="female", 0, 1)
    df["sex"] = df["sex"].astype("int64")

    df["age"] = df["age"].astype("float")
    df["age"] = df["age"].fillna(value=df["age"].mean())
    df["age"] = round(df["age"],0)
    df["age"] = df["age"].astype("int64")

    df["fare"] = df["fare"].astype("float")
    df["fare"] = df["fare"].fillna(value=df["fare"].mean())
    df["fare"] = round(df["fare"],2)

    df["embarked"] = df["embarked"].fillna(method='ffill').fillna(method='bfill')
    df["embarked"] = df["embarked"].replace("S", 1)
    df["embarked"] = df["embarked"].replace("C", 2)
    df["embarked"] = df["embarked"].replace("Q", 3)    
    df["embarked"] = df["embarked"].astype("int64")
    
    # save dataframe
    if save_df == True:
        df.to_csv(data_dir + "clean/titanic.csv", index=False)

    # return dataframe cleaned
    return df

In [30]:
# Build the cleaned frame in memory; save_df is left at its default (False),
# so nothing is written to disk here.
df = data_cleaning()
# Bare last expression: rendered as the cell's output.
df

Unnamed: 0,pclass,survived,sex,age,fare,embarked
0,1,1,0,29,211.34,1
1,1,1,1,1,151.55,1
2,1,0,0,2,151.55,1
3,1,0,1,30,151.55,1
4,1,0,0,25,151.55,1
...,...,...,...,...,...,...
1304,3,0,0,14,14.45,2
1305,3,0,0,30,14.45,2
1306,3,0,1,26,7.22,2
1307,3,0,1,27,7.22,2


In [32]:
# Fraction of rows held out for evaluation.
test_size = 0.20

# Separate seeds so the data shuffle and the estimator can be varied independently.
seed_split = 43
seed_model = 44

In [31]:
# Target: the survival flag.
y = df['survived']
# Features: every other column of the cleaned frame.
X = df.drop(columns='survived')


In [33]:
# Hold out `test_size` of the rows; `seed_split` pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=seed_split,
)

# Shapes of the four pieces, shown as the cell output.
tuple(part.shape for part in (X_train, y_train, X_test, y_test))

((1047, 5), (1047,), (262, 5), (262,))

In [36]:
# Fit the scaler on the training rows only, then apply that same fitted
# transform to both splits.
scaler = MinMaxScaler().fit(X_train)

X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [38]:
# Logistic regression with class_weight='balanced'; `seed_model` is passed as
# random_state.
# NOTE(review): the output stored with this cell shows C=0.0005, which this
# code does not set. Confirm which configuration produced the recorded metrics.
model = LogisticRegression(class_weight='balanced', random_state=seed_model)
# The fit call is the last expression, so the fitted estimator is displayed.
model.fit(X_train, y_train)

LogisticRegression(C=0.0005, class_weight='balanced', random_state=44)

In [41]:
# Report ROC-AUC (from positive-class probabilities) and accuracy (from hard
# predictions) for each split.
# The loop variables are deliberately NOT named `x` / `y`: rebinding the
# notebook-level `y` here would leave it pointing at `y_test`, so re-running
# the split cell afterwards would pair the full `X` with a shorter `y`.
eval_sets = [('train', X_train, y_train), ('test', X_test, y_test)]
for split_name, features, target in eval_sets:
    class_pred = model.predict(features)
    # Column 1 of predict_proba is the probability of the second class in model.classes_.
    proba_pred = model.predict_proba(features)[:, 1]
    print('{} roc-auc : {}'.format(split_name, roc_auc_score(target, proba_pred)))
    print('{} accuracy: {}'.format(split_name, accuracy_score(target, class_pred)))
    print()

train roc-auc : 0.8278259396701724
train accuracy: 0.7812798471824259

test roc-auc : 0.844754016064257
test accuracy: 0.7824427480916031

