# Decision Tree (의사결정나무)
### by 우현우 Hyunwoo Woo (Hub1)
#### Dept. of Industrial Engineering, Yonsei Univ (Industrial Statistics Lab; ISL 산업통계연구실)
#### hw.woo@yonsei.ac.kr   (010 8966 5705)
#### blog: https://hub1.tistory.com
#### github repository: https://github.com/HyunwooWoo

##### ----------------------------------------------------------------------------------

- Unofficial Algorithm in Python (C4.5)

## C4.5 알고리즘 연습

In [2]:
pip install chefboost

Collecting chefboost
  Downloading chefboost-0.0.11-py3-none-any.whl (26 kB)
Installing collected packages: chefboost
Successfully installed chefboost-0.0.11
Note: you may need to restart the kernel to use updated packages.


In [1]:
#import chefboost as chef
import pandas as pd
from chefboost import Chefboost as chef

In [302]:
df = pd.read_csv("titanic_data_clean.csv")

In [303]:
df

Unnamed: 0,Pclass,Sex,Fare,Survived
0,3,male,7.2500,0
1,1,female,71.2833,1
2,3,female,7.9250,1
3,1,female,53.1000,1
4,3,male,8.0500,0
...,...,...,...,...
886,2,male,13.0000,0
887,1,female,30.0000,1
888,3,female,23.4500,0
889,1,male,30.0000,1


In [304]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    891 non-null    int64  
 1   Sex       891 non-null    object 
 2   Fare      891 non-null    float64
 3   Survived  891 non-null    int64  
dtypes: float64(1), int64(2), object(1)
memory usage: 28.0+ KB


In [305]:
df["Pclass"] = df.Pclass.map({1:"C1", 2:"C2", 3:"C3"})
df["Survived"] = df.Survived.map({0:"dead", 1:"survive"})

# 범주형은 확실하게 object 타입으로 바꿔야 한다. (숫자로 하면 안된다)
# Python에서 C4.5 알고리즘은 확실하게 범주형, 연속형 나타내줘야함
## 아래는 그냥 참고
## df['Pclass'] = df['Pclass'].astype('object')
## df['Survived'] = df['Survived'].astype('object')

In [306]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    891 non-null    object 
 1   Sex       891 non-null    object 
 2   Fare      891 non-null    float64
 3   Survived  891 non-null    object 
dtypes: float64(1), object(3)
memory usage: 28.0+ KB


In [307]:
df.rename(columns={'Survived':'Decision'}, inplace=True) # C4.5를 하려면 Target variable 이름을 "Decision"으로 해야한다

In [308]:
df
# 0: dead, 1: survived
# 1,2,3등석을 각각 C1, C2, C3

Unnamed: 0,Pclass,Sex,Fare,Decision
0,C3,male,7.2500,dead
1,C1,female,71.2833,survive
2,C3,female,7.9250,survive
3,C1,female,53.1000,survive
4,C3,male,8.0500,dead
...,...,...,...,...
886,C2,male,13.0000,dead
887,C1,female,30.0000,survive
888,C3,female,23.4500,dead
889,C1,male,30.0000,survive


In [309]:
config = {'algorithm': 'C4.5'}
# config내에 {'max_depth': 숫자 }를 넣을 수도 있음
## ex: config = {'algorithm': 'C4.5', 'max_depth': 3}

In [314]:
model = chef.fit(df.copy(), config)
# 결과는 train만 가지고 성능을 평가한 것임.
# df가 아니라, df.copy()를 하는 이유는, 해당 모델을 쓸 경우 df 형태가 decision tree가 적용된 상태로 바뀐다.
# validation도 가능. validation_df=valid_df.copy()을 {}에 추가하면 됨

C4.5  tree is going to be built...
-------------------------
finished in  0.8468568325042725  seconds
-------------------------
Evaluate  train set
-------------------------
Accuracy:  78.78787878787878 % on  891  instances
Labels:  ['dead' 'survive']
Confusion matrix:  [[469, 109], [80, 233]]
Precision:  81.1419 %, Recall:  85.4281 %, F1:  83.2299 %


In [315]:
df.iloc[0]

Pclass        C3
Sex         male
Fare        7.25
Decision    dead
Name: 0, dtype: object

In [316]:
prediction = chef.predict(model, df.iloc[0])

In [317]:
prediction

'dead'

In [318]:
print("[actual", "-",  "predict]")
for index, instance in df.iterrows():
    prediction = chef.predict(model, instance) # dataframe의 각 행(관측치)마다 prediction 적용
    actual = instance['Decision'] # target (real)
    
    print(actual, " - ", prediction) # show "real target value - predicted target value"

[actual - predict]
dead  -  dead
survive  -  survive
survive  -  survive
survive  -  survive
dead  -  dead
dead  -  dead
dead  -  dead
dead  -  dead
survive  -  survive
survive  -  survive
survive  -  survive
survive  -  survive
dead  -  dead
dead  -  dead
dead  -  survive
survive  -  survive
dead  -  dead
survive  -  dead
dead  -  survive
survive  -  survive
dead  -  dead
survive  -  dead
survive  -  survive
survive  -  dead
dead  -  survive
survive  -  survive
dead  -  dead
dead  -  dead
survive  -  survive
dead  -  dead
dead  -  dead
survive  -  survive
survive  -  survive
dead  -  dead
dead  -  dead
dead  -  dead
survive  -  dead
dead  -  dead
dead  -  survive
survive  -  survive
dead  -  survive
dead  -  survive
dead  -  dead
survive  -  survive
survive  -  survive
dead  -  dead
dead  -  dead
survive  -  survive
dead  -  dead
dead  -  survive
dead  -  dead
dead  -  dead
survive  -  survive
survive  -  survive
dead  -  dead
survive  -  dead
survive  -  survive
dead  -  dead
survive

In [319]:
print("[actual", "-",  "predict]")
for index, instance in df.iterrows():
    prediction = chef.predict(model, instance) # dataframe의 각 행(관측치)마다 prediction 적용
    actual = instance['Decision'] # target (real)
    
    if actual == prediction:
        classified = True
    else:
        classified = False
        print("*", end='') # 만약 예측이 틀리면 *표시가 나옴
    
    print(actual, " - ", prediction) # show "real target value - predicted target value"

[actual - predict]
dead  -  dead
survive  -  survive
survive  -  survive
survive  -  survive
dead  -  dead
dead  -  dead
dead  -  dead
dead  -  dead
survive  -  survive
survive  -  survive
survive  -  survive
survive  -  survive
dead  -  dead
dead  -  dead
*dead  -  survive
survive  -  survive
dead  -  dead
*survive  -  dead
*dead  -  survive
survive  -  survive
dead  -  dead
*survive  -  dead
survive  -  survive
*survive  -  dead
*dead  -  survive
survive  -  survive
dead  -  dead
dead  -  dead
survive  -  survive
dead  -  dead
dead  -  dead
survive  -  survive
survive  -  survive
dead  -  dead
dead  -  dead
dead  -  dead
*survive  -  dead
dead  -  dead
*dead  -  survive
survive  -  survive
*dead  -  survive
*dead  -  survive
dead  -  dead
survive  -  survive
survive  -  survive
dead  -  dead
dead  -  dead
survive  -  survive
dead  -  dead
*dead  -  survive
dead  -  dead
dead  -  dead
survive  -  survive
survive  -  survive
dead  -  dead
*survive  -  dead
survive  -  survive
dead  -  

## 실전 연습 (Classification)

In [320]:
df

Unnamed: 0,Pclass,Sex,Fare,Decision
0,C3,male,7.2500,dead
1,C1,female,71.2833,survive
2,C3,female,7.9250,survive
3,C1,female,53.1000,survive
4,C3,male,8.0500,dead
...,...,...,...,...
886,C2,male,13.0000,dead
887,C1,female,30.0000,survive
888,C3,female,23.4500,dead
889,C1,male,30.0000,survive


In [321]:
# Features(x)와 Target(y) 데이터 구분하기
df_x = df[df.columns[:-1]]
df_y = df[df.columns[-1]]

In [322]:
# train: validation = 7:3 분리
from sklearn.model_selection import train_test_split
train_x, valid_x , train_y, valid_y = train_test_split(df_x, df_y, test_size = 0.3, stratify=df_y, random_state = 2021)

In [323]:
import pandas as pd
print(pd.DataFrame(train_y)['Decision'].value_counts())

dead       384
survive    239
Name: Decision, dtype: int64


In [324]:
# train data 오버샘플링
import sklearn
from imblearn.over_sampling import *

x_shuffled = sklearn.utils.shuffle(train_x, random_state=2021)
y_shuffled =sklearn.utils.shuffle(train_y, random_state=2021)
train_x, train_y = RandomOverSampler(random_state=2021).fit_resample(x_shuffled, y_shuffled)

In [325]:
import pandas as pd
print(pd.DataFrame(train_y)['Decision'].value_counts())

dead       384
survive    384
Name: Decision, dtype: int64


In [326]:
# train_df (train_x와 train_y 합치기)
train_df = train_x.copy()
train_df['Decision'] = train_y

In [327]:
train_df

Unnamed: 0,Pclass,Sex,Fare,Decision
0,C3,male,7.0500,dead
1,C1,male,52.5542,survive
2,C2,female,26.0000,dead
3,C3,male,7.8292,dead
4,C1,male,76.7292,survive
...,...,...,...,...
763,C3,female,7.7500,survive
764,C1,female,83.1583,survive
765,C1,male,26.3875,survive
766,C2,male,18.7500,survive


In [331]:
# valid_df (valid_x와 valid_y 합치기)
valid_df = valid_x.copy()
valid_df['Decision'] = valid_y

In [332]:
config = {'algorithm': 'C4.5'}

In [333]:
from chefboost import Chefboost as chef
model = chef.fit(train_df.copy(), config, validation_df=valid_df.copy())
# 결과는 train만 가지고 성능을 평가한 것임.
# df가 아니라, df.copy()를 하는 이유는, 해당 모델을 쓸 경우 df 형태가 decision tree가 적용된 상태로 바뀐다.

C4.5  tree is going to be built...
-------------------------
finished in  1.3098342418670654  seconds
-------------------------
Evaluate  train set
-------------------------
Accuracy:  77.99479166666667 % on  768  instances
Labels:  ['dead' 'survive']
Confusion matrix:  [[283, 68], [101, 316]]
Precision:  80.6268 %, Recall:  73.6979 %, F1:  77.0068 %
-------------------------
Evaluate  validation set
-------------------------
Accuracy:  76.11940298507463 % on  268  instances
Labels:  ['dead' 'survive']
Confusion matrix:  [[119, 18], [46, 85]]
Precision:  86.8613 %, Recall:  72.1212 %, F1:  78.8079 %


In [334]:
print("[actual", "-",  "predict]")
pred_y = []
for index, instance in valid_df.iterrows():
    prediction = chef.predict(model, instance) # dataframe의 각 행(관측치)마다 prediction 적용
    actual = instance['Decision'] # target (real)
    pred_y.append(prediction)
    
    print(actual, " - ", prediction) # show "real target value - predicted target value"

[actual - predict]
dead  -  dead
dead  -  dead
survive  -  survive
survive  -  survive
dead  -  survive
dead  -  survive
survive  -  survive
survive  -  survive
survive  -  survive
dead  -  survive
dead  -  survive
survive  -  survive
dead  -  survive
dead  -  survive
dead  -  survive
dead  -  dead
survive  -  survive
dead  -  survive
survive  -  survive
dead  -  dead
dead  -  survive
dead  -  dead
survive  -  survive
dead  -  survive
dead  -  dead
survive  -  survive
dead  -  dead
survive  -  survive
dead  -  dead
dead  -  dead
survive  -  survive
dead  -  survive
survive  -  survive
dead  -  dead
dead  -  dead
dead  -  dead
survive  -  dead
dead  -  dead
dead  -  dead
dead  -  dead
dead  -  dead
survive  -  survive
dead  -  dead
survive  -  survive
survive  -  survive
survive  -  survive
survive  -  survive
dead  -  dead
dead  -  survive
survive  -  survive
survive  -  survive
dead  -  dead
survive  -  survive
survive  -  survive
dead  -  dead
dead  -  dead
survive  -  dead
dead  -  

In [335]:
# 각각 confusion Matrix 구하는 법
cm_ap_00 = 0
cm_ap_01 = 0
cm_ap_10 = 0
cm_ap_11 = 0
pred_y = []
for index, instance in valid_df.iterrows():
    prediction = chef.predict(model, instance) # dataframe의 각 행(관측치)마다 prediction 적용
    actual = instance['Decision'] # target (real)
    pred_y.append(prediction)
    if actual == 'dead':
        if actual == prediction:
            cm_ap_00 = cm_ap_00 + 1
        else:
            cm_ap_01 = cm_ap_01 + 1
    else:
        if actual == prediction:
            cm_ap_11 = cm_ap_11 + 1
        else:
            cm_ap_10 = cm_ap_10 + 1

print(cm_ap_00)
print(cm_ap_01)
print(cm_ap_11)
print(cm_ap_10)

119
46
85
18


In [336]:
pred_y = pd.Series(pred_y)
pred_y

0         dead
1         dead
2      survive
3      survive
4      survive
        ...   
263    survive
264       dead
265    survive
266    survive
267       dead
Length: 268, dtype: object

In [337]:
pred_y = pred_y.map({'dead':0, 'survive':1})
valid_y = valid_y.map({'dead':0, 'survive':1})

In [338]:
pred_y

0      0
1      0
2      1
3      1
4      1
      ..
263    1
264    0
265    1
266    1
267    0
Length: 268, dtype: int64

In [339]:
# Confusion Matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(valid_y, pred_y)

# Confusion Matrix 설명

#          예측값
# 실제값  TN   FP
#         FN   TP

# 기준: y=0: Negative, y=1: Positive
# 기준: y=0: 사망, y=1: 생존

array([[119,  46],
       [ 18,  85]], dtype=int64)

In [340]:
from sklearn.metrics import classification_report

print('Classification Report')
print(classification_report(valid_y, pred_y))

# 0: dead를 Positive로 봤을때 관점
# 1: survive를 Positive로 봤을때 관점

Classification Report
              precision    recall  f1-score   support

           0       0.87      0.72      0.79       165
           1       0.65      0.83      0.73       103

    accuracy                           0.76       268
   macro avg       0.76      0.77      0.76       268
weighted avg       0.78      0.76      0.76       268



In [341]:
# 성능
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

print('Accuracy: {0: .4f}'.format(accuracy_score(valid_y, pred_y)))
print('precision: {0: .4f}'.format(precision_score(valid_y, pred_y)))
print('recall: {0: .4f}'.format(recall_score(valid_y, pred_y)))
print('f1_score: {0: .4f}'.format(f1_score(valid_y, pred_y)))
print('AUC: {0: .4f}'.format(roc_auc_score(valid_y, pred_y)))

Accuracy:  0.7612
precision:  0.6489
recall:  0.8252
f1_score:  0.7265
AUC:  0.7732


In [342]:
valid_df

Unnamed: 0,Pclass,Sex,Fare,Decision
672,C2,male,10.5000,dead
832,C3,male,7.2292,dead
435,C1,female,120.0000,survive
618,C2,female,39.0000,survive
49,C3,female,17.8000,dead
...,...,...,...,...
329,C1,female,57.9792,survive
637,C2,male,26.2500,dead
109,C3,female,24.1500,survive
141,C3,female,7.7500,survive


In [343]:
config = {'algorithm': 'C4.5'}
chef.fit(train_df.copy(), config, validation_df=valid_df.copy())


# 단 여기서의 confsuion Matrix는 survive=(Negative), dead=(Positive)로 본 것임.

#          예측값(P, N)
# 실제값(P)  TP   FN
# 실제값(N)  FP   TN

# 기준: y: 사망(Positivie), 생존(Negative)

C4.5  tree is going to be built...
-------------------------
finished in  1.2527575492858887  seconds
-------------------------
Evaluate  train set
-------------------------
Accuracy:  77.99479166666667 % on  768  instances
Labels:  ['dead' 'survive']
Confusion matrix:  [[283, 68], [101, 316]]
Precision:  80.6268 %, Recall:  73.6979 %, F1:  77.0068 %
-------------------------
Evaluate  validation set
-------------------------
Accuracy:  76.11940298507463 % on  268  instances
Labels:  ['dead' 'survive']
Confusion matrix:  [[119, 18], [46, 85]]
Precision:  86.8613 %, Recall:  72.1212 %, F1:  78.8079 %


{'trees': [<module 'outputs/rules/rules' from 'C:\\Users\\hwwoo\\test_DM\\outputs/rules/rules.py'>],
 'alphas': [],
 'config': {'algorithm': 'C4.5',
  'enableRandomForest': False,
  'num_of_trees': 5,
  'enableMultitasking': False,
  'enableGBM': False,
  'epochs': 10,
  'learning_rate': 1,
  'max_depth': 3,
  'enableAdaboost': False,
  'num_of_weak_classifier': 4,
  'enableParallelism': False,
  'num_cores': 4},
 'nan_values': [['Fare', None]]}

## 실전 연습 (Regression)

In [2]:
# Boston Housing dataset
# data source: https://www.kaggle.com/kyasar/boston-housing
import pandas as pd
df = pd.read_csv("boston_housing.csv")

In [3]:
df

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296.0,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273.0,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273.0,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273.0,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,273.0,21.0,393.45,6.48,22.0


In [None]:
# Features
# crim: Per capita crims rate by town
# chas: Charles River dummy variable (1: tract bounds river, 0: otherwise)
# tax: full-value property-tax rate per $10,000
# lstat: % lower of the population

# Target
# medv: Median value of owner-occupied homes in $1000's

In [4]:
df = df.copy().loc[:, ['crim', 'chas', 'tax', 'lstat', 'medv'] ]

In [5]:
df

Unnamed: 0,crim,chas,tax,lstat,medv
0,0.00632,0,296.0,4.98,24.0
1,0.02731,0,242.0,9.14,21.6
2,0.02729,0,242.0,4.03,34.7
3,0.03237,0,222.0,2.94,33.4
4,0.06905,0,222.0,5.33,36.2
...,...,...,...,...,...
501,0.06263,0,273.0,9.67,22.4
502,0.04527,0,273.0,9.08,20.6
503,0.06076,0,273.0,5.64,23.9
504,0.10959,0,273.0,6.48,22.0


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   crim    506 non-null    float64
 1   chas    506 non-null    int64  
 2   tax     506 non-null    float64
 3   lstat   506 non-null    float64
 4   medv    506 non-null    float64
dtypes: float64(4), int64(1)
memory usage: 19.9 KB


In [7]:
df.isnull().sum()

crim     0
chas     0
tax      0
lstat    0
medv     0
dtype: int64

In [8]:
df["chas"] = df.chas.map({0:"river", 1:"ow"})

# 범주형은 확실하게 object 타입으로 바꿔야 한다. (숫자로 하면 안된다)
# Python에서 C4.5 알고리즘은 확실하게 범주형, 연속형 나타내줘야함
## 아래는 그냥 참고
## df['chas'] = df['chas'].astype('object')

In [9]:
df.rename(columns={'medv':'Decision'}, inplace=True) # C4.5를 하려면 Target variable 이름을 "Decision"으로 해야한다
df

Unnamed: 0,crim,chas,tax,lstat,Decision
0,0.00632,river,296.0,4.98,24.0
1,0.02731,river,242.0,9.14,21.6
2,0.02729,river,242.0,4.03,34.7
3,0.03237,river,222.0,2.94,33.4
4,0.06905,river,222.0,5.33,36.2
...,...,...,...,...,...
501,0.06263,river,273.0,9.67,22.4
502,0.04527,river,273.0,9.08,20.6
503,0.06076,river,273.0,5.64,23.9
504,0.10959,river,273.0,6.48,22.0


In [10]:
# Features(x)와 Target(y) 데이터 구분하기
df_x = df[df.columns[:-1]]
df_y = df[df.columns[-1]]

In [11]:
# train: validation = 7:3 분리
from sklearn.model_selection import train_test_split
train_x, valid_x , train_y, valid_y = train_test_split(df_x, df_y, test_size = 0.3, random_state = 2021)

In [12]:
# train_df (train_x와 train_y 합치기) & valid_df (valid_x와 valid_y 합치기)
train_df = train_x.copy()
train_df['Decision'] = train_y

valid_df = valid_x.copy()
valid_df['Decision'] = valid_y

In [13]:
config = {'algorithm': 'C4.5'}
## ex: config = {'algorithm': 'C4.5', 'max_depth': 3}

In [14]:
from chefboost import Chefboost as chef
model = chef.fit(train_df.copy(), config, validation_df=valid_df.copy())
# df가 아니라, df.copy()를 하는 이유는, 해당 모델을 쓸 경우 df 형태가 decision tree가 적용된 상태로 바뀐다.

That's why, the algorithm is set to Regression to handle the data set.
Regression  tree is going to be built...
-------------------------
finished in  1.816143274307251  seconds
-------------------------
Evaluate  train set
-------------------------
MAE:  4.507627118644068
MSE:  43.77488700564972
RMSE:  6.616259290992888
RAE:  0.2681797755118155
RRSE:  0.6873722268038013
Mean:  22.71581920903954
MAE / Mean:  19.843559579177764 %
RMSE / Mean:  29.126219178395342 %
-------------------------
Evaluate  validation set
-------------------------
MAE:  5.238157894736842
MSE:  66.63815789473684
RMSE:  8.163219823987152
RAE:  0.3469176098090505
RRSE:  1.01257131142986
Mean:  22.10657894736842
MAE / Mean:  23.69501815368133 %
RMSE / Mean:  36.92665356961036 %


In [15]:
print("[actual", "-",  "predict]")
pred_y = []
for index, instance in valid_df.iterrows():
    prediction = chef.predict(model, instance) # dataframe의 각 행(관측치)마다 prediction 적용
    actual = instance['Decision'] # target (real)
    pred_y.append(prediction)
    
    print(actual, " - ", prediction) # show "real target value - predicted target value"

[actual - predict]
21.7  -  21.7
15.6  -  22.0
20.0  -  22.0
12.8  -  14.1
50.0  -  34.9
20.6  -  21.2
22.6  -  22.0
24.1  -  22.0
24.4  -  21.7
36.1  -  22.0
21.9  -  50.0
24.8  -  22.0
22.5  -  22.0
48.8  -  22.0
31.5  -  34.9
22.2  -  22.0
20.0  -  21.2
8.7  -  14.1
7.2  -  14.1
32.0  -  21.2
18.7  -  22.0
13.5  -  22.0
15.6  -  21.2
28.7  -  22.0
11.7  -  14.1
10.9  -  14.1
21.4  -  22.0
24.8  -  34.9
19.8  -  21.2
19.3  -  21.7
20.0  -  22.0
27.9  -  50.0
13.3  -  21.2
28.1  -  22.0
31.2  -  34.9
23.1  -  21.2
23.2  -  22.0
13.0  -  14.1
36.4  -  34.9
23.3  -  21.7
18.3  -  21.2
26.4  -  22.0
32.9  -  34.9
34.7  -  50.0
23.7  -  22.0
24.3  -  21.2
19.0  -  22.0
29.6  -  34.9
25.0  -  22.0
29.1  -  22.0
20.0  -  21.7
50.0  -  22.0
20.5  -  22.0
23.6  -  22.0
18.9  -  22.0
23.8  -  21.2
23.0  -  22.0
23.4  -  22.0
26.7  -  21.7
21.0  -  22.0
43.8  -  50.0
20.4  -  22.0
9.5  -  14.1
25.0  -  21.2
23.8  -  22.0
20.1  -  21.2
20.6  -  22.0
25.2  -  22.0
21.1  -  22.0
14.4  -  21.2
15.2

In [16]:
pred_y = pd.Series(pred_y)
pred_y

0      21.7
1      22.0
2      22.0
3      14.1
4      34.9
       ... 
147    21.2
148    21.2
149    14.1
150    14.1
151    22.0
Length: 152, dtype: float64

# 코드 뜯어보기

In [214]:
import inspect
print(inspect.getsource(chef))

import pandas as pd
import math
import numpy as np
import time
import imp
import pickle
import os
from os import path
import json

from chefboost.commons import functions, evaluate as eval
from chefboost.training import Preprocess, Training
from chefboost.tuning import gbm, adaboost, randomforest

#------------------------

def fit(df, config = {}, validation_df = None):
	
	"""
	Parameters:
		df (pandas data frame): Training data frame. The target column must be named as 'Decision' and it has to be in the last column
		
		config (dictionary):
			
			config = {
				'algorithm' (string): ID3, 'C4.5, CART, CHAID or Regression
				'enableParallelism' (boolean): False
				
				'enableGBM' (boolean): True,
				'epochs' (int): 7,
				'learning_rate' (int): 1,
				
				'enableRandomForest' (boolean): True,
				'num_of_trees' (int): 5,
				
				'enableAdaboost' (boolean): True,
				'num_of_weak_classifier' (int): 4
			}
			
		validation_df (pandas data frame): if nothing is passed to validation

- 참고:

오버샘플링

https://datascienceschool.net/03%20machine%20learning/14.02%20%EB%B9%84%EB%8C%80%EC%B9%AD%20%EB%8D%B0%EC%9D%B4%ED%84%B0%20%EB%AC%B8%EC%A0%9C.html


chefboost

https://pypi.org/project/chefboost/

https://www.youtube.com/watch?v=YYF993HTHf8

https://www.youtube.com/watch?v=kjhQHmtDaAA

https://github.com/serengil/chefboost

https://github.com/serengil/chefboost/find/master