### Decision Tree Regression (회귀 트리)
- 결정 트리와 결정 트리 기반의 앙상블 알고리즘은 분류뿐 아니라 회귀분석도 가능하다.
- 분류와 유사하게 분할하며, 최종 분할 후 각 분할 영역에서 실제 데이터까지의 거리들의 평균 값으로 학습 및 예측을 수행한다.

<img src='./images/decision_tree_regression01.png' width='600px' style='margin-left:20px'>

- 회귀 트리 역시 복잡한 트리 구조를 가질 경우 과적합의 위험이 있고, 트리 크기와 노드의 개수 제한등으로 개선해야 한다.

<img src='./images/decision_tree_regression02.png' width='600px' style='margin-left:20px'>

- 독립 변수들과 종속 변수 사이의 관계가 상당히 비선형적일 경우 사용하는 것이 좋다.

<img src='./images/decision_tree_regression03.png' width='600px' style='margin-left:20px'>

In [None]:
import chardet

rawdata = open('./datasets/seoul_bicycle.csv', 'rb').read()
result = chardet.detect(rawdata)
result

In [10]:
import pandas as pd

import pandas as pd

c_df = pd.read_csv('./datasets/korea_cow.csv', encoding='EUC-KR')
c_df


Unnamed: 0,일자,번호,출하주,개체번호,성별,kpn,계대,중량,최저가,낙찰가,상태,비고,종류,지역
0,2021.07.23,4,서*호,48928970,암,550.0,3.0,580,360,363,낙찰,목.배밑혹,큰소,경상남도고성
1,2021.07.23,5,이*락,102112702,암,744.0,2.0,460,320,353,낙찰,,큰소,경상남도고성
2,2021.07.23,7,문*종,156144852,암,1263.0,4.0,340,400,471,낙찰,목이모색 상처,큰소,경상남도고성
3,2021.07.23,8,문*종,136983661,암,1159.0,2.0,380,400,432,낙찰,뒷다리약간절음,큰소,경상남도고성
4,2021.07.23,9,이*만,138655532,암,1124.0,6.0,550,650,766,낙찰,,큰소,경상남도고성
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19976,2021.06.22,320,윤*식,157190517,암,0.0,1.0,0,390,0,유찰,,혈통우,전라남도 함평
19977,2021.06.22,321,윤*식,154652064,암,0.0,1.0,0,430,0,유찰,,혈통우,전라남도 함평
19978,2021.06.22,322,윤*식,156278395,암,0.0,1.0,0,450,0,유찰,,혈통우,전라남도 함평
19979,2021.06.22,323,윤*식,155232402,암,0.0,1.0,0,460,530,낙찰,정영기 -> 박손엽,혈통우,전라남도 함평


In [11]:
columns = ['성별', '중량','상태', '종류','낙찰가']
pre_c_df = c_df.loc[:,columns]

In [12]:
pre_c_df = pre_c_df[pre_c_df.상태 == '낙찰']
pre_c_df

Unnamed: 0,성별,중량,상태,종류,낙찰가
0,암,580,낙찰,큰소,363
1,암,460,낙찰,큰소,353
2,암,340,낙찰,큰소,471
3,암,380,낙찰,큰소,432
4,암,550,낙찰,큰소,766
...,...,...,...,...,...
19973,암,0,낙찰,혈통우,460
19974,암,0,낙찰,혈통우,451
19975,암,0,낙찰,혈통우,480
19979,암,0,낙찰,혈통우,530


In [13]:
pre_c_df = pre_c_df[pre_c_df.성별.isin(['수', '암'])]

In [15]:
m_cow = pre_c_df[pre_c_df.성별 == '수'].sample(7426, random_state=124)
f_cow = pre_c_df[pre_c_df.성별 == '암']
pre_c_df = pd.concat([m_cow,f_cow])

In [17]:
pre_c_df =pre_c_df.drop(labels=['중량'], axis=1)
pre_c_df =pre_c_df.drop(labels=['상태'], axis=1)

In [19]:
pre_c_df.reset_index()

Unnamed: 0,index,성별,종류,낙찰가
0,10679,수,혈통우,291
1,17948,수,혈통우,459
2,13777,수,혈통우,289
3,1691,수,큰소,556
4,9690,수,혈통우,519
...,...,...,...,...
14847,19973,암,혈통우,460
14848,19974,암,혈통우,451
14849,19975,암,혈통우,480
14850,19979,암,혈통우,530


In [20]:
s_c = pre_c_df[pre_c_df.종류 == '혈통우'].sample(4523, random_state=124)
b_c = pre_c_df[pre_c_df.종류 == '큰소']
pre_c_df = pd.concat([s_c, b_c])

In [22]:
pre_c_df.reset_index(drop=True, )

Unnamed: 0,성별,종류,낙찰가
0,수,혈통우,336
1,수,혈통우,549
2,암,혈통우,428
3,수,혈통우,376
4,수,혈통우,579
...,...,...,...
9041,암,큰소,856
9042,암,큰소,520
9043,암,큰소,907
9044,암,큰소,927


In [23]:
from sklearn.preprocessing import LabelEncoder

columns = ['성별' , '종류' ]
encoders = {}

for column in columns:
    encoder = LabelEncoder()
    result = encoder.fit_transform(pre_c_df[column])
    pre_c_df[column] = result
    encoders[column] = encoder.classes_
    

In [24]:
from statsmodels.api import OLS

features, targets = pre_c_df.iloc[:, :-1], pre_c_df.iloc[:, -1]

model = OLS(targets, features)
print(model.fit().summary())

                                 OLS Regression Results                                
Dep. Variable:                    낙찰가   R-squared (uncentered):                   0.741
Model:                            OLS   Adj. R-squared (uncentered):              0.741
Method:                 Least Squares   F-statistic:                          1.294e+04
Date:                Mon, 29 Apr 2024   Prob (F-statistic):                        0.00
Time:                        11:43:37   Log-Likelihood:                         -63376.
No. Observations:                9046   AIC:                                  1.268e+05
Df Residuals:                    9044   BIC:                                  1.268e+05
Df Model:                           2                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [25]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

def get_vif(features):
    vif = pd.DataFrame()
    vif['vif_score'] = [variance_inflation_factor(features.values, i) for i in range(features.shape[1])]
    vif['feature'] = features.columns
    return vif

In [26]:
get_vif(features)

Unnamed: 0,vif_score,feature
0,1.112592,성별
1,1.112592,종류


In [27]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

features, targets = pre_c_df.iloc[:, :-1], pre_c_df.iloc[:, -1]

X_train, X_test, y_train, y_test = \
train_test_split(features, targets, test_size=0.2, random_state=124)

l_r = LinearRegression()
l_r.fit(X_train, y_train)

In [28]:
from sklearn.metrics import mean_squared_log_error, mean_squared_error, r2_score

def get_evaluation(y_test, prediction):
    MSE = mean_squared_error(y_test, prediction)
    RMSE = np.sqrt(MSE)
    MSLE = mean_squared_log_error(y_test, prediction)
    RMSLE = np.sqrt(mean_squared_log_error(y_test, prediction))
    R2 = r2_score(y_test, prediction)
    print('MSE: {:.4f}, RMSE: {:.4f}, MSLE: {:.4f}, RMSLE: {:.4f}, R2: {:.4f}'\
          .format(MSE, RMSE, MSLE, RMSLE, R2))

In [37]:
import numpy as np

prediction = l_r.predict(X_test)

get_evaluation(y_test, prediction)

MSE: 14652.4119, RMSE: 121.0471, MSLE: 0.0598, RMSLE: 0.2446, R2: 0.1748


In [34]:
from sklearn.metrics import mean_squared_log_error, mean_squared_error, r2_score

def get_evaluation(y_test, prediction):
    MSE = mean_squared_error(y_test, prediction)
    RMSE = np.sqrt(MSE)
    MSLE = mean_squared_log_error(y_test, prediction)
    RMSLE = np.sqrt(mean_squared_log_error(y_test, prediction))
    R2 = r2_score(y_test, prediction)
    print('MSE: {:.4f}, RMSE: {:.4f}, MSLE: {:.4f}, RMSLE: {:.4f}, R2: {:.4f}'\
          .format(MSE, RMSE, MSLE, RMSLE, R2))

In [36]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

features, targets = pre_c_df.iloc[:, :-1], pre_c_df.iloc[:, -1]

X_train, X_test, y_train, y_test = \
train_test_split(features, targets, test_size=0.2, random_state=124)

dt_r = DecisionTreeRegressor(random_state=124)
rf_r = RandomForestRegressor(random_state=124)
gb_r = GradientBoostingRegressor(random_state=124)
xgb_r = XGBRegressor()
lgb_r = LGBMRegressor()

models = [dt_r, rf_r, gb_r, xgb_r, lgb_r]
for model in models:
    model.fit(X_train, y_train)
    prediction = model.predict(X_train)
    print(model.__class__.__name__)
    get_evaluation(y_test, prediction)

DecisionTreeRegressor


ValueError: Found input variables with inconsistent numbers of samples: [1810, 7236]

In [38]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

features, targets = pre_c_df.iloc[:, :-1], pre_c_df.iloc[:, -1]

X_train, X_test, y_train, y_test = \
train_test_split(features, targets, test_size=0.2, random_state=124)

rf_r = RandomForestRegressor(random_state=124)

parameters = {'max_depth': [4, 8, 12, 20], 'min_samples_split': [20, 30, 40, 50, 60]}
kfold = KFold(n_splits=10, random_state=124, shuffle=True)

# GridSearchCV(rf_r, param_grid=parameters, scoring='neg_mean_squared_error', cv=kfold)
grid_rf_r = GridSearchCV(rf_r, param_grid=parameters, scoring='r2', cv=kfold)
grid_rf_r.fit(X_train, y_train)

In [39]:
result_df = pd.DataFrame(grid_rf_r.cv_results_)[['params', 'mean_test_score', 'rank_test_score']]
display(result_df)

Unnamed: 0,params,mean_test_score,rank_test_score
0,"{'max_depth': 4, 'min_samples_split': 20}",0.254433,1
1,"{'max_depth': 4, 'min_samples_split': 30}",0.254433,1
2,"{'max_depth': 4, 'min_samples_split': 40}",0.254433,1
3,"{'max_depth': 4, 'min_samples_split': 50}",0.254433,1
4,"{'max_depth': 4, 'min_samples_split': 60}",0.254433,1
5,"{'max_depth': 8, 'min_samples_split': 20}",0.254433,1
6,"{'max_depth': 8, 'min_samples_split': 30}",0.254433,1
7,"{'max_depth': 8, 'min_samples_split': 40}",0.254433,1
8,"{'max_depth': 8, 'min_samples_split': 50}",0.254433,1
9,"{'max_depth': 8, 'min_samples_split': 60}",0.254433,1
