# titanic data로 부스트 총 정리!

In [None]:
# Mount Google Drive at /gdrive so the CSV can be read by path in the next cell.
# force_remount=True requests a fresh mount even if /gdrive is already mounted.
from google.colab import drive
drive.mount('/gdrive', force_remount=True)

In [None]:
import pandas as pd

# Load the Titanic CSV from the mounted Drive.
# The frame is bound to `data` because the preview below and every later cell
# read `data`; the original bound it only to `titanic`, so `display(data.head())`
# raised NameError on a fresh kernel.
data = pd.read_csv('/gdrive/My Drive/data/titanic.csv')
titanic = data  # keep the original name available as an alias
display(data.head())

In [None]:
# Alternative loader: upload the CSV from the local machine instead of Drive.

import io

import pandas as pd
from google.colab import files

# The upload result is indexed by file name; the stored bytes are wrapped in a
# BytesIO buffer so pandas can parse them like a file.
uploaded = files.upload()
data = pd.read_csv(
    io.BytesIO(uploaded['titanic.csv'])
)
display(data.head())

In [None]:
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

In [None]:
# Inspect the frame: column names, dtypes and non-null counts.
data.info()

# Author's note: 891 rows; Age and Cabin contain null values.

In [None]:
# Separate the label from the features: pop() removes the 'Survived' column
# from `data` in place and returns it as the target Series.
y_data = data.pop('Survived')
display(data.head())

In [None]:
# Remove the columns that are not used as model inputs.
drop_columns = [
    'PassengerId',
    'Name',
    'Age',
    'SibSp',
    'Ticket',
    'Cabin',
    'Parch',
    'Embarked',
]
data.drop(columns=drop_columns, inplace=True)
display(data.head())

# Author's note: Age could be kept as well; because it is real-valued, the
# author suggests converting it to a categorical feature first (the reason is
# left as an open question in the original note).

In [None]:
# One-hot encode the Sex column, then replace any remaining missing values
# with 0.0.
data = pd.get_dummies(data, columns=['Sex']).fillna(value=0.0)
display(data.head())

In [None]:
# Split features and labels into train and test sets.
# `state` seeds the shuffle; `test_size` is the fraction held out for testing.
state = 42
test_size = 0.30

X_train, X_test, y_train, y_test = train_test_split(
    data,
    y_data,
    test_size=test_size,
    random_state=state,
)

# Gradient Boosting

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
# Grid search over learning rate, tree depth and number of estimators to find
# the best-scoring GradientBoostingClassifier configuration.
parameters = {
    "learning_rate": [0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2],
    "max_depth": [3, 5, 8],
    "n_estimators": [10, 50, 100],
}

# 5-fold cross-validation on the training split only.
gb_cv = GridSearchCV(
    GradientBoostingClassifier(),
    param_grid=parameters,
    cv=5,
    n_jobs=-1,
)
gb_cv.fit(X_train, y_train)

print('final params', gb_cv.best_params_)
print('best score', gb_cv.best_score_)

# Tabulate the per-candidate results, best rank first, and show the top 10.
cv_result_df = pd.DataFrame(gb_cv.cv_results_).sort_values(by=['rank_test_score'])
cv_result_df[['params', 'mean_test_score', 'rank_test_score']].head(10)

In [None]:
# Print the first six columns of the ranked cross-validation results.
print(cv_result_df.iloc[:, :6])

In [None]:
# Build a classification report for the tuned gradient-boosting model.
from sklearn.metrics import classification_report

# Hyper-parameters are hard-coded; per the author they are the best values
# reported by the grid search above.
gb_clf2 = GradientBoostingClassifier(
    n_estimators=50,
    learning_rate=0.2,
    max_depth=5,
    random_state=0,
)

# Fit on the training split, then predict the held-out test split.
predictions = gb_clf2.fit(X_train, y_train).predict(X_test)

# Compare true labels with predictions.
# Author's note: this model reached about 81% in the original run.
print('Classification Report')
print(classification_report(y_test, predictions))

In [None]:
# Plot the fitted model's feature importances as a horizontal bar chart.
# The importances are labelled with the feature column names and sorted in
# ascending order (the sort_values default).
feature_imp = pd.Series(
    gb_clf2.feature_importances_,
    index=data.columns,
).sort_values()
print(feature_imp)

feature_imp.plot(kind='barh', grid=True, figsize=(5, 5))
plt.show()

# XGBoost

In [None]:
from xgboost import plot_importance
from xgboost import XGBClassifier

In [None]:
# Grid search for the best XGBClassifier hyper-parameters.
xgg = XGBClassifier()

parameters = {
    # Author's note: 0.01 or 0.02 is commonly used for the learning rate.
    "learning_rate": [0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2],
    "max_depth": [3, 5, 8],
    "n_estimators": [10, 50, 100],
}

# 5-fold cross-validation scored by accuracy, run in a single job.
xgg_cv = GridSearchCV(
    xgg,
    param_grid=parameters,
    cv=5,
    scoring='accuracy',
    n_jobs=1,
)
xgg_cv.fit(X_train, y_train)

print('final params', xgg_cv.best_params_)
print('best score', xgg_cv.best_score_)

# Tabulate the per-candidate results, best rank first, and show the top 10.
cv_result_df = pd.DataFrame(xgg_cv.cv_results_).sort_values(by=['rank_test_score'])
cv_result_df[['params', 'mean_test_score', 'rank_test_score']].head(10)

In [None]:
# Build a classification report for the tuned XGBoost model.
from sklearn.metrics import classification_report

# Hyper-parameters are hard-coded; per the author they are the best values
# reported by the grid search above.
xgg_clf2 = XGBClassifier(
    n_estimators=100,
    learning_rate=0.2,
    max_depth=8,
    random_state=0,
)

# Re-wrap both splits as DataFrames labelled with the feature column names.
# NOTE(review): the splits come from train_test_split on a DataFrame, so this
# looks redundant — confirm before removing.
X_train, X_test = (
    pd.DataFrame(X_train, columns=data.columns),
    pd.DataFrame(X_test, columns=data.columns),
)

# Fit on the training split, then predict the held-out test split.
xgg_clf2.fit(X_train, y_train)
predictions = xgg_clf2.predict(X_test)

# Compare true labels with predictions.
# Author's note: this model reached about 81% in the original run.
print('Classification Report')
print(classification_report(y_test, predictions))

In [None]:
# Fetch the per-feature scores from the underlying booster and list them as
# (feature, score) pairs, highest score first.
fscore = xgg_clf2.get_booster().get_fscore()
score_f = sorted(
    fscore.items(),
    key=lambda pair: pair[1],
    reverse=True,
)
print(score_f)

In [None]:
import matplotlib.pyplot as plt
import xgboost

# Plot the XGBoost model's feature importances.
# The function is called through the xgboost module rather than by the bare
# name `plot_importance`: a later cell runs
# `from lightgbm import LGBMClassifier, plot_importance`, which rebinds that
# name, so re-running this cell afterwards would hand the XGBoost model to
# LightGBM's plotting function.
xgboost.plot_importance(xgg_clf2)
plt.show()

# LightGBM

In [None]:
from lightgbm import LGBMClassifier, plot_importance

In [None]:
# Grid search for the best LGBMClassifier hyper-parameters.
lb = LGBMClassifier()

parameters = {
    # Author's note: 0.01 or 0.02 is commonly used for the learning rate.
    "learning_rate": [0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2],
    # Author's note: max_depth interacts with num_leaves (default cited as 31),
    # so it must be chosen with care.
    # NOTE(review): the original comment gave the bound as
    # (max_depth)**2 - 1 <= num_leaves; LightGBM's documentation relates a
    # depth-d tree to at most 2**d leaves — confirm which was intended.
    "max_depth": [1, 2, 3, 4],
    "n_estimators": [10, 50, 100],
}

# 5-fold cross-validation scored by accuracy, run in a single job.
lgb_cv = GridSearchCV(
    lb,
    param_grid=parameters,
    cv=5,
    scoring='accuracy',
    n_jobs=1,
)
lgb_cv.fit(X_train, y_train)

print('final params', lgb_cv.best_params_)
print('best score', lgb_cv.best_score_)

# Tabulate the per-candidate results, best rank first, and show the top 10.
cv_result_df = pd.DataFrame(lgb_cv.cv_results_).sort_values(by=['rank_test_score'])
cv_result_df[['params', 'mean_test_score', 'rank_test_score']].head(10)

In [None]:
# Build a classification report for the tuned LightGBM model.
from sklearn.metrics import classification_report

# Hyper-parameters are hard-coded; per the author they are the best values
# reported by the grid search above.
LG_clf2 = LGBMClassifier(
    n_estimators=50,
    learning_rate=0.025,
    max_depth=4,
    random_state=0,
)

# Re-wrap both splits as DataFrames labelled with the feature column names.
# NOTE(review): the splits come from train_test_split on a DataFrame, so this
# looks redundant — confirm before removing.
X_train, X_test = (
    pd.DataFrame(X_train, columns=data.columns),
    pd.DataFrame(X_test, columns=data.columns),
)

# Fit on the training split, then predict the held-out test split.
LG_clf2.fit(X_train, y_train)
predictions = LG_clf2.predict(X_test)

# Compare true labels with predictions.
# Author's note: this model reached about 81% in the original run.
print('Classification Report')
print(classification_report(y_test, predictions))

In [None]:
import matplotlib.pyplot as plt
# Plot the LightGBM model's feature importances. `plot_importance` here is the
# name imported from lightgbm in the import cell of this section.
plot_importance(LG_clf2)
plt.show()