# Titanic - Machine Learning from Disaster
### Start here! Predict survival on the Titanic and get familiar with ML basics

Competition page: [Titanic - Machine Learning from Disaster](https://www.kaggle.com/c/titanic)

# Imports

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

from eli5 import show_weights
from eli5.sklearn import PermutationImportance

from xgboost.sklearn import XGBClassifier

from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline

from category_encoders import OrdinalEncoder
from sklearn.impute import SimpleImputer

from sklearn.model_selection import GridSearchCV

from sklearn.metrics import accuracy_score

import warnings
warnings.filterwarnings(action='ignore')

# EDA

In [2]:
# Load the complete labelled dataset; later cells split it into train/validation parts.
df = pd.read_csv(filepath_or_buffer='train.csv')

In [3]:
# Number of missing (NaN) entries in each column
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [4]:
# Cardinality: number of distinct non-null values in each column
df.nunique(axis=0, dropna=True)

PassengerId    891
Survived         2
Pclass           3
Name           891
Sex              2
Age             88
SibSp            7
Parch            7
Ticket         681
Fare           248
Cabin          147
Embarked         3
dtype: int64

In [5]:
# Columns fed to the model, and the label column.
# `target` is a one-element list, so every y_* below is a single-column DataFrame.
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
target = ['Survived']

# Hold out part of the labelled data for validation (seeded, default split size).
train, val = train_test_split(df, random_state=0)

# Separate model inputs (X) from the label (y) in each part.
X_train, y_train = train[features], train[target]
X_val, y_val = val[features], val[target]

# The test file is reduced to the same feature columns, in the same order.
X_test = pd.read_csv('test.csv')[features]

# Feature Importance

In [6]:
# Preprocess outside the model so that the permutation importance below is
# computed on the already encoded/imputed validation matrix.
# OrdinalEncoder turns the categorical columns into integer codes and
# SimpleImputer (default settings) fills the remaining missing values.
preprocessing = make_pipeline(OrdinalEncoder(), SimpleImputer())

# Fit the preprocessing on the training part only, then reuse it for validation.
X_train_processed = preprocessing.fit_transform(X_train)
X_val_processed = preprocessing.transform(X_val)

# Baseline XGBoost classifier with default hyperparameters.
model = XGBClassifier(eval_metric='logloss')
model.fit(X_train_processed, y_train)

# Permutation importance shuffles one column at a time and measures the drop
# in validation accuracy. The shuffles are random, so the permuter is seeded
# (same seed as the train/validation split) to keep the reported weights
# identical across Restart & Run All.
permuter = PermutationImportance(model, scoring='accuracy', random_state=0)
permuter.fit(X_val_processed, y_val)

# The processed matrix carries no column labels, so names are taken from the
# raw validation frame.
# NOTE(review): assumes preprocessing keeps column count and order - confirm.
feature_names = X_val.columns.tolist()

# top=None lists every feature instead of truncating the table.
show_weights(
    permuter,
    top=None,
    feature_names=feature_names
)

Weight,Feature
0.1623  ± 0.0328,Sex
0.0762  ± 0.0428,Pclass
0.0682  ± 0.0297,Age
0.0574  ± 0.0243,Fare
0.0215  ± 0.0132,Parch
0.0197  ± 0.0122,Embarked
0.0081  ± 0.0105,SibSp


# Tuning

In [7]:
# Full pipeline (encode -> impute -> classify) so that preprocessing is refit
# inside every cross-validation fold.
pipe = make_pipeline(OrdinalEncoder(), SimpleImputer(), XGBClassifier(eval_metric='logloss'))

# Search space: 5 tree counts x 5 depths x 4 learning rates = 100 candidates.
# Keys use make_pipeline's auto-generated step name as prefix.
dist = dict(
    xgbclassifier__n_estimators=range(50, 150, 20),
    xgbclassifier__max_depth=range(1, 6),
    xgbclassifier__learning_rate=[0.1, 0.2, 0.3, 0.4],
)

# Exhaustive 3-fold search scored by accuracy, using every available core.
clf = GridSearchCV(pipe, param_grid=dist, scoring='accuracy', cv=3, n_jobs=-1, verbose=1)
clf.fit(X_train, y_train)

# The first label is Korean for "best hyperparameters".
print('최적 하이퍼파라미터: ', clf.best_params_)
print('Score: ', clf.best_score_)

Fitting 3 folds for each of 100 candidates, totalling 300 fits
최적 하이퍼파라미터:  {'xgbclassifier__learning_rate': 0.3, 'xgbclassifier__max_depth': 3, 'xgbclassifier__n_estimators': 70}
Score:  0.8158674369436701


In [8]:
# Best pipeline found by the grid search (GridSearchCV refits it by default).
final_model = clf.best_estimator_

# Accuracy on the held-out validation split, which the search never saw.
y_pred = final_model.predict(X_val)
accuracy_score(y_true=y_val, y_pred=y_pred)

0.8475336322869955

In [9]:
# Predict the unlabelled test set and write the submission file.
y_pred = final_model.predict(X_test)

# Take PassengerId from test.csv itself - the file the test features were read
# from - so ids and predictions are aligned row for row by construction,
# instead of relying on a separate sample-submission file having the same order.
passenger_ids = pd.read_csv('test.csv', usecols=['PassengerId'])['PassengerId']

submit = pd.DataFrame({'PassengerId': passenger_ids, 'Survived': y_pred})
submit.to_csv('submission_ver1.csv', index=False)