# 1. 회귀문제

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from lightgbm import LGBMClassifier, LGBMRegressor
from xgboost import XGBClassifier, XGBRegressor

from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, recall_score, f1_score, precision_score
from sklearn.feature_selection import RFE
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from boruta import BorutaPy

## 데이터 전처리

#### 1. 데이터 불러오기 및 Unnamed: 0 변수 제거

In [2]:
bicycle = pd.read_csv("./regression.csv")

In [3]:
bicycle.drop(labels="Unnamed: 0", axis=1, inplace=True)

#### 2. count를 y값, 나머지를 X값으로 처리

In [4]:
x = bicycle.drop(["count"], axis=1)
y = bicycle["count"]

#### 3. 학습, 테스트 데이터 8:2비율로 분할 
* random state=1 으로 지정

In [5]:
train_x, test_x, train_y, test_y = train_test_split(x, y, train_size=0.8, random_state=1)

#### 4. X 데이터 Standard Scale 진행

In [6]:
scaler = StandardScaler()
scaler.fit(train_x)

train_x = pd.DataFrame(scaler.transform(train_x), columns=train_x.columns)
test_x = pd.DataFrame(scaler.transform(test_x), columns=test_x.columns)

#### 5. 학습데이터 X 통계량 출력(describe 함수 사용)

In [7]:
train_x.describe()

Unnamed: 0,hour,hour_bef_temperature,hour_bef_precipitation,hour_bef_windspeed,hour_bef_humidity,hour_bef_visibility,hour_bef_ozone,hour_bef_pm10,hour_bef_pm2.5
count,1073.0,1073.0,1073.0,1073.0,1073.0,1073.0,1073.0,1073.0,1073.0
mean,9.301869000000001e-17,5.041013e-16,-3.212714e-17,-5.690798e-17,1.024344e-16,-4.4491700000000005e-17,-3.794728e-16,7.573935e-17,-2.5065380000000003e-17
std,1.000466,1.000466,1.000466,1.000466,1.000466,1.000466,1.000466,1.000466,1.000466
min,-1.754004,-2.664005,-0.1781313,-1.819309,-2.226817,-2.321415,-1.86106,-1.518514,-1.517693
25%,-0.8684045,-0.7304412,-0.1781313,-0.795803,-0.7741268,-0.878098,-0.6887126,-0.7015717,-0.7106802
50%,0.01719477,-0.02210593,-0.1781313,-0.06472713,-0.07282824,0.2835114,-0.02608163,-0.1988378,-0.3071738
75%,0.902794,0.6287968,-0.1781313,0.6663488,0.7787486,1.010387,0.6365494,0.3981586,0.4325879
max,1.640793,2.485784,5.613836,4.029298,2.381717,1.02082,4.357477,6.116756,3.996894


## 모델 학습 및 평가

#### 1. RandomForest, XGBoost, LightGBM 3가지 모델에 대해 학습데이터를 사용하여 최적의 하이퍼파라미터 탐색
- GridSearchCV cv=5를 통해 탐색
- Random Forest 하이퍼파라미터 후보 : max_depth=3,5,10 , n_estimators=100,200,300 , random_state=1 
- XGBoost 하이퍼파라미터 후보 : max_depth=3,5,10 , n_estimators=100,200,300 , learning_rate = 0.001,0.01,0.1,1 , gamma = 0.5,1,2  , random_state=1 
- LightGBM 하이퍼파라미터 후보 : max_depth=3,5,10 , n_estimators=100,200,300 , learning_rate = 0.001,0.01,0.1,1 , random_state=1 
- 평가지표 : R-Square

In [8]:
rfr = RandomForestRegressor()
xgboost = XGBRegressor()
lightgbm = LGBMRegressor()

param_rfr = {
    'max_depth': [3,5,10],
    'n_estimators': [100,200,300],
    'random_state':[1]
}
param_xgb = {
    'max_depth' : [3,5,10],
    'n_estimators' : [100,200,300],
    'learning_rate' : [0.001, 0.01, 0.1, 1],
    'gamma' : [0.5, 1, 2],
    'random_state' :[1]
}
param_lgbm = {
    'max_depth' : [3,5,10],
    'n_estimators' : [100,200,300],
    'learning_rate' : [0.001,0.01,0.1,1],
    'random_state' : [1]
}

gscv_rfr = GridSearchCV(estimator=rfr, param_grid = param_rfr, scoring='r2', cv=5)
gscv_xgb = GridSearchCV(estimator=xgboost, param_grid = param_xgb, scoring='r2', cv=5)
gscv_lgbm = GridSearchCV(estimator=lightgbm, param_grid = param_lgbm, scoring='r2', cv=5)

gscv_rfr.fit(train_x, train_y)
gscv_xgb.fit(train_x, train_y)
gscv_lgbm.fit(train_x, train_y)

print(f"[RandomForest] R2 score = {gscv_rfr.best_score_} / parameters = {gscv_rfr.best_params_}")
print(f"[XGBoost] R2 score = {gscv_xgb.best_score_} / parameters = {gscv_xgb.best_params_}")
print(f"[lightGBM] R2 score = {gscv_lgbm.best_score_} / parameters = {gscv_lgbm.best_params_}")

[RandomForest] R2 score = 0.7693585584046533 / parameters = {'max_depth': 10, 'n_estimators': 100, 'random_state': 1}
[XGBoost] R2 score = 0.7820826406316188 / parameters = {'gamma': 0.5, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 300, 'random_state': 1}
[lightGBM] R2 score = 0.78391999861395 / parameters = {'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 100, 'random_state': 1}


#### 2. 학습데이터로 평가했을 때 가장 좋은 성능을 보인 하이퍼 파라미터값의RandomForest, XGBoost, LightGBM모델을 테스트 데이터로 평가
* 평가지표 R-Square, MSE
* 반올림 하여 소수점 셋째짜리 까지 계산

In [9]:
rfr_best = RandomForestRegressor(max_depth=10, n_estimators=100, random_state=1)
xgboost_best = XGBRegressor(gamma=0.5, learning_rate=0.1, max_depth=5, n_estimators=300, random_state=1)
lightgbm_best = LGBMRegressor(learning_rate=0.1, max_depth=10, n_estimators=100, random_state=1)

rfr_best.fit(train_x, train_y)
xgboost_best.fit(train_x, train_y)
lightgbm_best.fit(train_x, train_y)

LGBMRegressor(max_depth=10, random_state=1)

In [10]:
print("[RandomForest] R-square : {:.3f}, MSE : {:.3f}".format(r2_score(test_y, rfr_best.predict(test_x)), mean_squared_error(test_y, rfr_best.predict(test_x))))
print("[XGBoost] R-square : {:.3f}, MSE : {:.3f}".format(r2_score(test_y, xgboost_best.predict(test_x)), mean_squared_error(test_y, xgboost_best.predict(test_x))))
print("[LightGBM] R-square : {:.3f}, MSE : {:.3f}".format(r2_score(test_y, lightgbm_best.predict(test_x)), mean_squared_error(test_y, lightgbm_best.predict(test_x))))

[RandomForest] R-square : 0.762, MSE : 1526.263
[XGBoost] R-square : 0.746, MSE : 1629.751
[LightGBM] R-square : 0.759, MSE : 1544.256


## 데이터 전처리

#### 1. 데이터 불러오기

In [11]:
wine = pd.read_csv("./classification.csv")

#### 2. type값의 white, red를 숫자로 변경. 이때 white은 0 red는 1로 변경


In [12]:
wine['type']=np.where(wine['type']=='white', 0, wine['type'])
wine['type']=np.where(wine['type']=='red', 1, wine['type'])

#### 3 type을 y, 나머지 변수를 X값으로 분할


In [13]:
wx = wine.drop(['type'], axis=1, inplace=False)
wy = wine['type'].astype('int')

#### 3 X, y 를 8: 2로 데이터 분할
- stratify를 통해 y값 클래스 비율 일정하게 분할
- random_state = 1로 지정

In [14]:
train_wx, test_wx, train_wy, test_wy = train_test_split(wx, wy, train_size=0.8, random_state=1, stratify=wy)

#### 4. X 데이터 Standard Scale 진행


In [15]:
scaler = StandardScaler()
scaler.fit(train_wx)

train_wx = pd.DataFrame(scaler.transform(train_wx), columns=train_wx.columns)
test_wx = pd.DataFrame(scaler.transform(test_wx), columns=test_wx.columns)

#### 5. 학습데이터 X 통계량 출력(describe 함수 사용)

In [16]:
train_wx.describe()

Unnamed: 0,quality,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
count,4397.0,4397.0,4397.0,4397.0,4397.0,4397.0,4397.0,4397.0,4397.0,4397.0,4397.0,4397.0
mean,-2.565355e-16,2.053874e-16,4.5651200000000006e-17,-1.730983e-16,1.457404e-16,9.065853000000001e-17,-5.650851e-17,-2.8885490000000004e-17,1.308361e-14,8.957028e-16,-4.13108e-16,-8.604039e-16
std,1.000114,1.000114,1.000114,1.000114,1.000114,1.000114,1.000114,1.000114,1.000114,1.000114,1.000114,1.000114
min,-3.222467,-2.598173,-1.590381,-2.241634,-1.026032,-1.378994,-1.655318,-1.94945,-2.546053,-2.980231,-2.084266,-1.756277
25%,-0.9372983,-0.6390549,-0.6574173,-0.4871902,-0.7686988,-0.5270247,-0.753084,-0.6657428,-0.8024685,-0.6798087,-0.6763825,-0.8397119
50%,0.2052857,-0.1688665,-0.2842318,-0.0661238,-0.5328097,-0.2626204,-0.07640875,0.04742779,0.05070277,-0.05807297,-0.1400458,-0.1731191
75%,0.2052857,0.3796867,0.3999417,0.4952981,0.5823022,0.23681,0.6002665,0.7071106,0.76845,0.6258363,0.4633329,0.701784
max,3.633038,6.805595,7.739257,6.390228,5.62175,11.98811,14.58489,5.788451,5.298383,4.915813,9.71514,3.65979


#### 6. PCA를 통한 변수 축약
- explained_variance_ratio_를 통해 90%가 넘어지는 시점을 기준으로 축약할 개수 선택
- 학습데이터를 이용해 진행

In [17]:
PCA_list=[]
for i in range(1,10):
    pca=sum(PCA(n_components=i).fit(train_wx).explained_variance_ratio_)
    print(i,'개의 잠재변수가 설명하는 분산은 전체 데이터의 {:.2f}%'.format(pca*100))

1 개의 잠재변수가 설명하는 분산은 전체 데이터의 25.35%
2 개의 잠재변수가 설명하는 분산은 전체 데이터의 47.31%
3 개의 잠재변수가 설명하는 분산은 전체 데이터의 61.00%
4 개의 잠재변수가 설명하는 분산은 전체 데이터의 69.87%
5 개의 잠재변수가 설명하는 분산은 전체 데이터의 76.75%
6 개의 잠재변수가 설명하는 분산은 전체 데이터의 82.41%
7 개의 잠재변수가 설명하는 분산은 전체 데이터의 87.10%
8 개의 잠재변수가 설명하는 분산은 전체 데이터의 91.54%
9 개의 잠재변수가 설명하는 분산은 전체 데이터의 95.32%


In [20]:
pca8 = PCA(n_components=8)

pca_train_wx=pca8.fit_transform(train_wx)
pca_test_wx = pca8.transform(test_wx)

#### 7. PCA 진행한 후의 학습, 테스트 데이터 X 평균 각각 계산
* 반올림 하여 소수점 셋째짜리 까지 계산

In [21]:
print("mean of train data : {:.3f}".format(np.mean(pca_train_wx)))
print("mean of test data : {:.3f}".format(np.mean(pca_test_wx)))

mean of train data : -0.000
mean of test data : -0.004


## 모델 학습 및 평가

#### 1. SVM, Decision Tree 2가지 모델에 대해 학습데이터를 사용하여 최적의 하이퍼파라미터 탐색
- GridSearchCV cv=5를 통해 탐색
- SVM 하이퍼파라미터 후보 : kernel=rbf, linear, poliy , C= 0.5, 1.5, 10, random_state=1
- Decision Tree 하이퍼파라미터 후보 : criterion : gini, entropy , max_depth=5,10,15,20,None , random_state=1
- 평가지표 : F1-Score

In [22]:
svm = SVC()
dt = tree.DecisionTreeClassifier()

param_svm = {
    'kernel' : ['rbf', 'linear', 'poly'],
    'C' : [0.5, 1.5, 10],
    'random_state': [1]
}
param_dt = {
    'criterion' : ['gini', 'entropy'],
    'max_depth' : [5, 10, 15, 20, None],
    'random_state' : [1]
}

gscv_svm = GridSearchCV(svm, param_svm, scoring='f1', cv=5)
gscv_dt = GridSearchCV(estimator=dt, param_grid=param_dt, scoring='f1', cv=5)

gscv_svm.fit(pca_train_wx, train_wy)
gscv_dt.fit(train_wx, train_wy)

print(f"[SVM] F1 score = {gscv_svm.best_score_} / parameters = {gscv_svm.best_params_}")
print(f"[DT] F1 score = {gscv_dt.best_score_} / parameters = {gscv_dt.best_params_}")

[SVM] F1 score = 0.9892247157830854 / parameters = {'C': 1.5, 'kernel': 'rbf', 'random_state': 1}
[DT] F1 score = 0.9696867280306971 / parameters = {'criterion': 'entropy', 'max_depth': 10, 'random_state': 1}


#### 2. 학습데이터로 평가했을 때 가장 좋은 성능을 보인 하이퍼 파라미터값의 SVM Decision Tree 모델을 테스트 데이터로 평가
* 평가지표 Recall, Precision ,F1-Score
* 반올림 하여 소수점 셋째짜리 까지 계산

In [23]:
svm_best = SVC(C=1.5, kernel='rbf', random_state=1)
dt_best = tree.DecisionTreeClassifier(criterion='entropy', max_depth=10, random_state=1)

svm_best.fit(pca_train_wx, train_wy)
dt_best.fit(pca_train_wx, train_wy)

DecisionTreeClassifier(criterion='entropy', max_depth=10, random_state=1)

In [24]:
print("[SVM] Recall : {:.3f}, Precision : {:.3f}, F1-Score : {:.3f}".format(recall_score(test_wy, svm_best.predict(pca_test_wx)), precision_score(test_wy, svm_best.predict(pca_test_wx)), f1_score(test_wy, svm_best.predict(pca_test_wx))))
print("[DT] Recall : {:.3f}, Precision : {:.3f}, F1-Score : {:.3f}".format(recall_score(test_wy, dt_best.predict(pca_test_wx)), precision_score(test_wy, dt_best.predict(pca_test_wx)), f1_score(test_wy, dt_best.predict(pca_test_wx))))


[SVM] Recall : 0.978, Precision : 0.992, F1-Score : 0.985
[DT] Recall : 0.974, Precision : 0.960, F1-Score : 0.967
