## 앙상블모델(Ensemble)

##### - 독립변수 : 특성을 자유롭게 조합해 가면서 검증해보기
##### ...(특성을 자유롭게 조합하여 적용해 가면서 정확도 차이 확인)
##### ...(전체 특성을 모두 적용도 해보세요)
##### - 사용한 모델별로, 예측(predict)을 통한 결과를, 데이터프레임에 col_predict 컬럼명으로 추가하기

In [197]:
# [문제] 5시 30분까지 [구글드라이브 > 산출물 > 02_분류(와인)] 제출
# 와인 데이터 사용
# 와인의 화학 조성을 사용하여 와인의 종류 예측 (자유롭게)

# ** 특성 이름을 담고 있는 key 값 = feature_names
# ** 특성 데이터를 담고 있는 key 값 = data
# ** 범주 와인의 종류를 담고 있는 key 값 = target_names
#   - 범주는 'class_0'과 'class_1'만 사용 (0과 1로 변경하여 사용)
#   - (0=레드와인, 1=화이트와인)

# 알콜(alcohol)
# 말산(malic_acid)
# 회분(ash)
# 회분의 알칼리도(alcalinity_of_ash)
# 마그네슘(magnesium)
# 총 폴리페놀(total_phenols)
# 플라보노이드 폴리페놀(flavanoids)
# 비 플라보노이드 폴리페놀(nonflavanoid_phenols)
# 프로안토시아닌(proanthocyanins)
# 색상의 강도(color_intensity)
# 색상(hue)
# 희석 와인의 OD280/OD315 비율 (od280/od315_of_diluted_wines)
# 프롤린 (proline)

## 앙상블 모델 종류
### 1. 랜덤포레스트(Random Forest)
### 2. 엑스트라 트리(Extra Trees)
### 3. 그레디언트 부스팅(Gradient Boosting)
### 4. 히스토그램 기반 그레디언트 부스팅(Histogram-based Gradient Boosting)

In [198]:
import numpy as np
import pandas as pd

## 1. 데이터 준비

In [199]:
# Load the wine dataset that ships with scikit-learn.
from sklearn.datasets import load_wine
wine_all = load_wine()
# Last expression of the cell: the notebook displays the keys the loaded
# object exposes (data, target, feature_names, ...).
wine_all.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names'])

In [200]:
# Build the feature table in one step: the raw measurement matrix becomes the
# rows, and the dataset's feature_names become the column labels.
wine = pd.DataFrame(wine_all["data"], columns=wine_all["feature_names"])
# Echo the frame so the notebook renders a preview.
wine

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,14.23,1.71,2.43,15.6,127.0,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065.0
1,13.20,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050.0
2,13.16,2.36,2.67,18.6,101.0,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185.0
3,14.37,1.95,2.50,16.8,113.0,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480.0
4,13.24,2.59,2.87,21.0,118.0,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,13.71,5.65,2.45,20.5,95.0,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740.0
174,13.40,3.91,2.48,23.0,102.0,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750.0
175,13.27,4.28,2.26,20.0,120.0,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835.0
176,13.17,2.59,2.37,20.0,120.0,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840.0


In [201]:
# Attach the dataset's integer class labels to the feature table as a
# "class" column (modifies `wine` in place).
wine["class"] = wine_all["target"]
# Echo the frame so the notebook renders it with the new column.
wine

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,class
0,14.23,1.71,2.43,15.6,127.0,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0
1,13.20,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050.0,0
2,13.16,2.36,2.67,18.6,101.0,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185.0,0
3,14.37,1.95,2.50,16.8,113.0,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480.0,0
4,13.24,2.59,2.87,21.0,118.0,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,13.71,5.65,2.45,20.5,95.0,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740.0,2
174,13.40,3.91,2.48,23.0,102.0,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750.0,2
175,13.27,4.28,2.26,20.0,120.0,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835.0,2
176,13.17,2.59,2.37,20.0,120.0,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840.0,2


In [202]:
# Restrict the task to two classes by dropping every row labelled class 2,
# leaving only labels 0 and 1.
# .copy() detaches the filtered frame from `wine`; without it, adding columns
# to `df` later (e.g. a col_predict column holding model predictions) makes
# pandas emit SettingWithCopyWarning because `df` is tracked as a derived slice.
df = wine[wine["class"] != 2].copy()

## 2. 전처리

In [203]:
# Model inputs: the 13 chemical measurements, selected by name and converted
# to a 2-D NumPy array (rows = wines, columns = features in the order listed).
data = df.loc[:, [
    'alcohol',
    'malic_acid',
    'ash',
    'alcalinity_of_ash',
    'magnesium',
    'total_phenols',
    'flavanoids',
    'nonflavanoid_phenols',
    'proanthocyanins',
    'color_intensity',
    'hue',
    'od280/od315_of_diluted_wines',
    'proline',
]].to_numpy()

# Model labels: the class column as a 1-D NumPy array aligned with `data`.
target = df.loc[:, "class"].to_numpy()

## 3. 훈련, 테스트 데이터 구분하기

In [209]:
from sklearn.model_selection import train_test_split

# Split features and labels into training and test parts. No test_size is
# given, so the library's default ratio applies; the fixed random_state makes
# the shuffle reproducible between runs.
(train_input, test_input,
 train_target, test_target) = train_test_split(data, target, random_state=42)

# Sanity check: each feature array must have as many rows as its label array.
print(f"{train_input.shape} {train_target.shape}")
print(f"{test_input.shape} {test_target.shape}")


(97, 13) (97,)
(33, 13) (33,)


## 1. 랜덤포레스트(Random Forest)

In [215]:
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestClassifier

# Random forest with a fixed seed for reproducible trees; n_jobs=-1 requests
# all available processors.
rf = RandomForestClassifier(random_state=42, n_jobs=-1)

# Cross-validate on the training split only, additionally recording the score
# each fold achieves on its own training portion.
scores = cross_validate(
    rf,
    train_input,
    train_target,
    return_train_score=True,
    n_jobs=-1,
)

# Mean training score vs. mean validation score across the folds; a large gap
# between the two would point to overfitting.
print(scores["train_score"].mean(), scores["test_score"].mean())

1.0 0.9800000000000001


In [217]:
# Fit the forest on the whole training split and display its feature
# importances: one value per input column, in the same order as the columns
# of train_input. fit() returns the estimator itself, so the attribute can be
# read directly from the call.
rf.fit(train_input, train_target).feature_importances_

array([0.25797416, 0.01796373, 0.02576245, 0.03770346, 0.09345008,
       0.02400221, 0.12840886, 0.01058505, 0.01141028, 0.13119712,
       0.01034972, 0.0185842 , 0.23260868])

In [220]:
# OOB (out-of-bag) score: with oob_score=True the forest estimates its
# generalization score from out-of-bag samples, so no separate validation
# split is needed.
rf = RandomForestClassifier(
    oob_score=True,
    n_jobs=-1,
    random_state=42,
).fit(train_input, train_target)

print(rf.oob_score_)

0.9896907216494846


## 2. 엑스트라 트리(Extra Trees)

In [232]:
from sklearn.ensemble import ExtraTreesClassifier

# Extra-trees ensemble with the same seed and parallelism settings as the
# random forest, so the two models are compared under equal conditions.
et = ExtraTreesClassifier(random_state=42, n_jobs=-1)

# Cross-validate on the training split, also recording per-fold training scores.
scores = cross_validate(
    et,
    train_input,
    train_target,
    return_train_score=True,
    n_jobs=-1,
)

# Mean training score vs. mean validation score across the folds.
print(scores["train_score"].mean(), scores["test_score"].mean())

1.0 0.9800000000000001


In [234]:
# Fit the extra-trees model on the whole training split and print its feature
# importances (one value per input column, in column order). fit() returns
# the estimator itself, which allows reading the attribute from the call.
print(et.fit(train_input, train_target).feature_importances_)

[0.1962303  0.03211827 0.02784845 0.03858197 0.06318405 0.06798185
 0.09340209 0.02839892 0.02447009 0.13508022 0.02039693 0.0327477
 0.23955915]
