In [1]:
import pickle
import pandas as pd

# Load the three pickled inputs used by the rest of the notebook: the training
# features, the test features, and the training labels.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only load these files if they come from a trusted source.
with open('titanic_step4_importance_train.pickle', 'rb') as train_file:
    train_importance = pickle.load(train_file)

with open('titanic_step4_importance_test.pickle', 'rb') as test_file:
    test_importance = pickle.load(test_file)

with open('titanic_step4_importance_train_y.pickle', 'rb') as answer_file:
    train_answer = pickle.load(answer_file)

In [2]:
import numpy as np

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides warnings raised while fitting the models below
# (e.g. convergence or deprecation warnings) — consider narrowing the filter.
warnings.filterwarnings('ignore')

### 모델 재트레이닝

In [6]:
# Logistic regression model.
# Hyperparameters are hard-coded; presumably they come from an earlier tuning
# run — confirm before changing them.
logreg_params = {
    'penalty': 'l2',
    'C': 18.288277344191805,
    'random_state': 1,
}
logreg_model = LogisticRegression(**logreg_params)

# XGBoost classifier, evaluated with log loss.
# Hyperparameters are hard-coded; presumably they come from an earlier tuning
# run — confirm before changing them.
xgb_params = {
    'eval_metric': 'logloss',
    'n_estimators': 10,
    'learning_rate': 0.17,
    'max_depth': 6,
    'min_child_weight': 1,
    'gamma': 0.2,
    'reg_alpha': 0.01,
    'subsample': 0.9,
    'colsample_bytree': 0.85,
    'random_state': 1,
}
xgb_model = XGBClassifier(**xgb_params)
# Random forest classifier: 200 trees with no depth limit.
# Hyperparameters are hard-coded; presumably they come from an earlier tuning
# run — confirm before changing them.
random_forest_params = {
    'n_estimators': 200,
    'max_depth': None,
    'max_features': 0.8,
    'min_samples_split': 6,
    'min_samples_leaf': 2,
    'random_state': 1,
}
random_model = RandomForestClassifier(**random_forest_params)

# Extra-trees classifier: 50 trees with no depth limit.
# Hyperparameters are hard-coded; presumably they come from an earlier tuning
# run — confirm before changing them.
extra_trees_params = {
    'n_estimators': 50,
    'max_depth': None,
    'max_features': 0.5,
    'min_samples_split': 10,
    'min_samples_leaf': 2,
    'random_state': 1,
}
extra_model = ExtraTreesClassifier(**extra_trees_params)

### Voting Classifier
- 앙상블(ensemble)의 세 번째 기법으로(기법 1은 Bagging, 기법 2는 Boosting), 여러 모델의 예측 결과를 놓고 투표를 하는 Voting 기법이 있음
- 해당 기법을 사용하여 성능이 괜찮은 모델을 기반으로 Voting 을 해서, 또다른 예측 모델을 구성할 수 있음
- Voting 기법도 크게 Hard Voting 기법과 Soft Voting 기법이 존재함

### Hard Voting Classifier
- 여러 모델이 예측한 분류 중, 가장 많은 모델이 예측한 분류를 선택하는 기법

<img src="https://www.fun-coding.org/00_Images/hardvoting.png">

In [7]:
from sklearn.ensemble import VotingClassifier

# Hard-voting ensemble: each named estimator casts one vote per sample.
hard_voting_members = [
    ('Logistic Regression', logreg_model),
    ('XGBoost', xgb_model),
    ('Random Forest', random_model),
    ('Extra Trees', extra_model),
]
grid_hard = VotingClassifier(estimators=hard_voting_members, voting='hard')

# 5-fold cross-validated accuracy on the training data, printed as a percentage.
score = cross_val_score(
    grid_hard,
    train_importance,
    train_answer,
    scoring='accuracy',
    cv=5,
)
print(score.mean() * 100)

84.96076831335134


### Soft Voting Classifier
- 여러 모델이 각 분류에 대한 확률을 예측한 후, 모델별 예측 확률의 평균을 내어, 평균 확률이 가장 높은 분류를 선택하는 기법

<img src="https://www.fun-coding.org/00_Images/softvoting.png">

In [8]:
# Soft voting with the tuned parameters: the same four estimators, combined
# with voting='soft' instead of 'hard'.
soft_voting_members = [
    ('Logistic Regression', logreg_model),
    ('XGBoost', xgb_model),
    ('Random Forest', random_model),
    ('Extra Trees', extra_model),
]
grid_soft = VotingClassifier(estimators=soft_voting_members, voting='soft')

# 5-fold cross-validated accuracy on the training data, printed as a percentage.
score = cross_val_score(
    grid_soft,
    train_importance,
    train_answer,
    scoring='accuracy',
    cv=5,
)
print(score.mean() * 100)

84.28661100998053


In [9]:
# Read the test CSV and start the submission frame from its PassengerId
# column; 'Survived' is left empty here and filled in after prediction.
test = pd.read_csv('../Study_For_MachineLearning/test.csv')
submission = (
    pd.DataFrame(columns=['PassengerId', 'Survived'])
    .assign(PassengerId=test['PassengerId'])
)
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,
1,893,
2,894,
3,895,
4,896,


In [10]:
# Fit the hard-voting ensemble on the full training set before predicting.
grid_hard.fit(X=train_importance, y=train_answer)

In [11]:
# Predict with the fitted hard-voting ensemble, store the result as 'Survived',
# and cast every column of the submission frame to int.
hard_predictions = grid_hard.predict(test_importance)
submission = submission.assign(Survived=hard_predictions).astype("int")
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


In [12]:
# Write the submission as CSV with a header row and without the index column.
submission.to_csv(path_or_buf='titanic_predict.csv', index=False, header=True)

In [13]:
# Submit titanic_predict.csv to the Kaggle "titanic" competition with the
# message "Test", using the kaggle command-line tool via a shell escape.
# NOTE(review): presumably requires the kaggle CLI and API credentials to be
# configured on this machine — confirm.
!kaggle competitions submit -c titanic -f titanic_predict.csv -m "Test"

Successfully submitted to Titanic - Machine Learning from Disaster



  0%|          | 0.00/3.18k [00:00<?, ?B/s]
100%|##########| 3.18k/3.18k [00:00<00:00, 29.1kB/s]
100%|##########| 3.18k/3.18k [00:03<00:00, 878B/s]  
