# LGBM 하이퍼파라미터 튜닝
### 강아지, catboost 인코딩 튜닝코드

In [24]:
import os
import numpy as np 
import pandas as pd 
import lightgbm as lgb
from lightgbm import LGBMModel,LGBMClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
from tqdm import tqdm
from itertools import product
import random
from sklearn.model_selection import StratifiedKFold
import scipy.stats as ss
from category_encoders.cat_boost import CatBoostEncoder
import category_encoders as ce
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
import lightgbm as lgb
from lightgbm import LGBMModel,LGBMClassifier
from math import sqrt
from sklearn.metrics import mean_squared_error, accuracy_score, f1_score
from statistics import *


os.chdir("C:/Users/hjh05/Downloads/ugi")

## Train튜닝 (ver. 캣부스트 인코딩)
#### 추가 파라미터 튜닝 필요

In [8]:
train_dog=pd.read_csv('data/final_train_dog.csv')

In [9]:
#범주형 변환 adoptionYN, neuterYN, sex, group_akc, color
train_dog['adoptionYN'] = train_dog['adoptionYN'].astype('category')
cols = ['neuterYN','sex','group_akc','color']
for col in cols:
    train_dog[col] = train_dog[col].astype('object')
print (train_dog.dtypes)

adoptionYN      category
weight_kg        float64
neuterYN          object
sex               object
positives          int64
negatives          int64
group_akc         object
grdp             float64
economy          float64
hospital_num       int64
color             object
dtype: object


In [10]:
x_data = train_dog.drop(['adoptionYN'],axis=1)
y_data= train_dog['adoptionYN']

In [11]:
#fold 나누기
folds = StratifiedKFold(n_splits=5, random_state=613)

In [21]:
#########수정필
#파라미터 튜닝용 df!!
learning_rates = np.array([0.05, 0.1, 0.3, 0.5, 0.7, 0.99])
num_leaves = np.array([15, 31, 63, 127, 255, 511])
max_depth = np.array([3, 12])

par= pd.DataFrame(list(product(*[learning_rates, num_leaves,max_depth])))

In [30]:
### 코드
acc=[]
f1score=[]

for i in tqdm(range(0,len(par))):
    acc_here=[]
    f1score_here=[]

    for train_index, val_index in folds.split(x_data, y_data):
        x_train, x_val = x_data.iloc[train_index], x_data.iloc[val_index]
        y_train, y_val = y_data.iloc[train_index], y_data.iloc[val_index]
    
        #scaling
        scaled_x_train = x_train
        scaled_x_val = x_val

        scaler.fit(x_train[['weight_kg', 'positives', 'negatives', 'grdp', 'economy','hospital_num']])
        scaled_x_train[['weight_kg', 'positives', 'negatives', 'grdp', 'economy','hospital_num']] = scaler.transform(scaled_x_train[['weight_kg', 'positives', 'negatives', 'grdp', 'economy','hospital_num']])
        scaled_x_val[['weight_kg', 'positives', 'negatives', 'grdp', 'economy','hospital_num']] = scaler.transform(scaled_x_val[['weight_kg', 'positives', 'negatives', 'grdp', 'economy','hospital_num']])

        #encoding?
        feature_list = list(scaled_x_train.columns)
        CBE_encoder = CatBoostEncoder()
        tuned_x_train = CBE_encoder.fit_transform(scaled_x_train[feature_list], y_train)
        tuned_x_val = CBE_encoder.transform(scaled_x_val[feature_list])
        
        #학습
        lgbm_ml = LGBMClassifier(learning_rate = par.iloc[i,0],
              num_leaves=par.iloc[i,1],
            max_depth=par.iloc[i,2],
              random_state=613)
        
        lgbm_ml.fit(tuned_x_train, y_train)

        #prediction
        y_pred = lgbm_ml.predict(tuned_x_val)
        y_pred[y_pred>=0.5]=1
        y_pred[y_pred<0.5]=0
        
        acc1 = accuracy_score(y_val, y_pred)
        acc_here.append(acc1)
        f1 = f1_score(y_val, y_pred)
        f1score_here.append(f1)
        
    acc.append(np.mean(acc_here))
    f1score.append(np.mean(f1score_here))




In [31]:
# 결과확인
par['acc']=acc
par['f1score'] = f1score

par.sort_values(by=['f1score'], axis=0, ascending=False).head()

Unnamed: 0,0,1,2,acc,f1score
19,0.1,127,12,0.740018,0.60896
17,0.1,63,12,0.743898,0.607881
7,0.05,127,12,0.743773,0.607007
21,0.1,255,12,0.736209,0.606766
23,0.1,511,12,0.736477,0.605636


In [29]:
#par.to_csv('lgbm_dog_ctb.csv', index=False, header=True,encoding='euc-kr')

##  test에 결과 적용

In [32]:
train_dog=pd.read_csv('data/final_train_dog.csv')
test_dog=pd.read_csv('data/final_test_dog.csv')

In [33]:
#train 범주형 변환 adoptionYN, neuterYN, sex, group_akc, color
train_dog['adoptionYN'] = train_dog['adoptionYN'].astype('category')
cols = ['neuterYN','sex','group_akc','color']
for col in cols:
    train_dog[col] = train_dog[col].astype('object')
print (train_dog.dtypes)

adoptionYN      category
weight_kg        float64
neuterYN          object
sex               object
positives          int64
negatives          int64
group_akc         object
grdp             float64
economy          float64
hospital_num       int64
color             object
dtype: object


In [34]:
#test 범주형 변환 adoptionYN, neuterYN, sex, group_akc, color
test_dog['adoptionYN'] = test_dog['adoptionYN'].astype('category')
test_dog['neuterYN'] = test_dog['neuterYN'].astype('object')
test_dog['sex'] = test_dog['sex'].astype('object')
test_dog['group_akc'] = test_dog['group_akc'].astype('object')
test_dog['color'] = test_dog['color'].astype('object')

In [35]:
x_train = train_dog.drop(['adoptionYN'],axis=1)
y_train= train_dog['adoptionYN']

x_test = test_dog.drop(['adoptionYN'],axis=1)
y_test= test_dog['adoptionYN']

In [36]:
#train 스케일링
scaler.fit(x_train[['weight_kg', 'positives', 'negatives', 'grdp', 'economy','hospital_num']])
x_train[['weight_kg', 'positives', 'negatives', 'grdp', 'economy','hospital_num']] = scaler.transform(x_train[['weight_kg', 'positives', 'negatives', 'grdp', 'economy','hospital_num']])

In [37]:
#test 스케일링
scaler.fit(x_test[['weight_kg', 'positives', 'negatives', 'grdp', 'economy','hospital_num']])
x_test[['weight_kg', 'positives', 'negatives', 'grdp', 'economy','hospital_num']] = scaler.transform(x_test[['weight_kg', 'positives', 'negatives', 'grdp', 'economy','hospital_num']])

In [38]:
#인코딩
feature_list = list(x_train.columns)
CBE_encoder = CatBoostEncoder()
train_cbe = CBE_encoder.fit_transform(x_train[feature_list], y_train)
test_cbe = CBE_encoder.transform(x_test[feature_list])

In [39]:
#적합
lgbm_ml = LGBMClassifier(learning_rate = 0.10, num_leaves=127, max_depth=12, random_state=613)        
lgbm_ml.fit(train_cbe, y_train)

LGBMClassifier(max_depth=12, num_leaves=127, random_state=613)

In [40]:
#예측
y_pred = lgbm_ml.predict(test_cbe)
y_pred[y_pred>=0.5]=1
y_pred[y_pred<0.5]=0

        
print(f1_score(y_test, y_pred))
print(accuracy_score(y_test, y_pred))


0.6195717191051562
0.7346461949265688
