In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# 데이터 불러오기
data = pd.read_csv('../data/shipping_preprocessed.csv')

In [2]:
X, y = data.drop('Reached_on_Time_y_n', axis=1), data['Reached_on_Time_y_n']

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
# xgboost 모델 학습

import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier

model_xgb = xgb.XGBClassifier()
model_lgb = lgb.LGBMClassifier()
model_RF = RandomForestClassifier()

score ={}
for model in [model_xgb, model_lgb, model_RF]:
    model.fit(x_train, y_train)
    y__pred = model.predict(x_test)
    # 모델 평가
    from sklearn.metrics import roc_auc_score
    score[model] = roc_auc_score(y_test, y__pred)



[LightGBM] [Info] Number of positive: 5258, number of negative: 3541
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000428 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 843
[LightGBM] [Info] Number of data points in the train set: 8799, number of used features: 24
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.597568 -> initscore=0.395342
[LightGBM] [Info] Start training from score 0.395342


In [16]:
# Grid Search를 통한 하이퍼 파라미터 튜닝
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

# Random Forest
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

grid_RF = GridSearchCV(RandomForestClassifier(), param_grid, scoring=make_scorer(roc_auc_score))
grid_RF.fit(x_train, y_train)
best_RF = grid_RF.best_estimator_
score[best_RF] = roc_auc_score(y_test, best_RF.predict(x_test))

In [20]:
# 결과 추출
results = pd.DataFrame(grid_RF.cv_results_)

In [22]:
results.sort_values('rank_test_score')

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_min_samples_split,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
1,0.819252,0.018855,0.031878,0.000627,10,2,200,"{'max_depth': 10, 'min_samples_split': 2, 'n_e...",0.712408,0.722957,0.707436,0.700347,0.726894,0.714009,0.009783,1
8,1.201261,0.028906,0.046631,0.000953,10,10,300,"{'max_depth': 10, 'min_samples_split': 10, 'n_...",0.708798,0.720567,0.69941,0.697727,0.731548,0.71161,0.012863,2
5,1.211009,0.029197,0.046209,0.000573,10,5,300,"{'max_depth': 10, 'min_samples_split': 5, 'n_e...",0.707438,0.720323,0.701067,0.701502,0.727341,0.711534,0.010527,3
3,0.414702,0.005203,0.017474,0.000143,10,5,100,"{'max_depth': 10, 'min_samples_split': 5, 'n_e...",0.706077,0.722522,0.695458,0.702262,0.724285,0.710121,0.01138,4
6,0.411092,0.006709,0.017304,0.000237,10,10,100,"{'max_depth': 10, 'min_samples_split': 10, 'n_...",0.705501,0.71474,0.700143,0.704381,0.725021,0.709957,0.008911,5
7,0.779766,0.01246,0.031226,0.000489,10,10,200,"{'max_depth': 10, 'min_samples_split': 10, 'n_...",0.705747,0.717525,0.702439,0.693924,0.727802,0.709488,0.011882,6
4,0.838158,0.007799,0.0325,0.000317,10,5,200,"{'max_depth': 10, 'min_samples_split': 5, 'n_e...",0.709078,0.712106,0.699926,0.69607,0.727081,0.708852,0.010826,7
2,1.268065,0.054026,0.048281,0.000392,10,2,300,"{'max_depth': 10, 'min_samples_split': 2, 'n_e...",0.705109,0.719195,0.701285,0.695146,0.722843,0.708716,0.0106,8
0,0.424155,0.009109,0.017421,0.000456,10,2,100,"{'max_depth': 10, 'min_samples_split': 2, 'n_e...",0.704667,0.718489,0.693992,0.694902,0.724114,0.707233,0.012212,9
25,1.109147,0.030585,0.042891,0.000567,30,10,200,"{'max_depth': 30, 'min_samples_split': 10, 'n_...",0.685223,0.697353,0.665115,0.680407,0.684773,0.682574,0.01039,10


In [25]:
best_RF

In [34]:
# 기존 RandomForest 모델
model_RF.fit(x_train, y_train)
y_predict = model_RF.predict(x_test)

# 모델 평가
roc_auc_score(y_test, y_predict)

0.6898499539801795

In [40]:
# 최적화된 RandomForest 모델
final_model = RandomForestClassifier(max_depth=10, n_estimators=200, random_state=42)
final_model.fit(x_train, y_train)
y_predict = final_model.predict(x_test)

# 모델 평가
roc_auc_score(y_test, y_predict)

0.72720948650442

In [42]:
final_model