### 정형데이터: 우리가 지금까지 다른 데이터 숫자로 되어 있는 계산할 수 있는 데이터
### 비정형데이터: 이미지, 사운드, 텍스트
### 정형데이터를 가지고 가장 뛰어난 성능을 내는 머신러닝 알고리즘: 앙상블
### 랜덤 포레스트: 앙상블의 대표주자
### tree: random data
### 부트스트랩 샘플: 데이터세트에서 한개식 랜덤하게 선택해서 훈련데이터셋을 만드는데 중복이 될 수 있고 훈련세트는 데이터세트와 같게 만든다

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [3]:
url = 'https://raw.githubusercontent.com/leekyuyoung20221226/python/main/data/wine_data'
wine = pd.read_csv(url)
wine.head()

Unnamed: 0,alcohol,sugar,pH,class
0,9.4,1.9,3.51,0.0
1,9.8,2.6,3.2,0.0
2,9.8,2.3,3.26,0.0
3,9.8,1.9,3.16,0.0
4,9.4,1.9,3.51,0.0


In [4]:
X = wine.iloc[:, :-1].to_numpy()
Y = wine.iloc[:, -1].to_numpy()

In [5]:
X.shape, Y.shape

((6497, 3), (6497,))

In [6]:
x_train, x_test, y_train, y_test = train_test_split(X,Y, test_size=0.2, random_state=0)

In [7]:
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_jobs=-1, random_state=0)

In [8]:
scores = cross_validate(rf,x_train,y_train, return_train_score=True)

In [9]:
scores

{'fit_time': array([6.17598319, 0.13730788, 0.1356461 , 0.14278889, 0.15642095]),
 'score_time': array([0.03966546, 0.03094363, 0.02910733, 0.0291357 , 0.03311038]),
 'test_score': array([0.88269231, 0.86826923, 0.88739172, 0.89027911, 0.88931665]),
 'train_score': array([0.99807554, 0.99831609, 0.9973545 , 0.9973545 , 0.9978355 ])}

In [10]:
np.mean(scores['train_score']), np.mean(scores['test_score'])

(0.9977872242245589, 0.883589805286148)

In [11]:
# 특성 중요도
rf.fit(x_train, y_train)
rf.feature_importances_, wine.columns[:-1]

(array([0.23029487, 0.5028204 , 0.26688473]),
 Index(['alcohol', 'sugar', 'pH'], dtype='object'))

In [12]:
# 랜덤포레스트 vs 디시전 트리
# 랜덤하게 피처(특성, 컬럼)를 선택하기때문에 한쪽에 쏠리는 현상을 방지한다 좀더 많은 특성에 기회를 부여
# 자체적으로 모델을 평가하는 기능, 중복을 허용해서 부트스트랩 샘플을 만들면 
rf = RandomForestClassifier(n_jobs=-1, random_state=0, oob_score=True)
rf.fit(x_train, y_train)
rf.oob_score_

0.8926303636713488

# 엑스트라 트리
    100개의 트리를 훈련
    전체특징중에 일부를 랜덤하게 선택해서 노드를 분할하는데 사용
    부트스트랩 샘플을 사용하지 않는다 - 전체 데이터를 사용, 노드분할을 할때 무작위(최적의 노드를 찾는 것이 아니다)
    특성을 무작위로 분할, 성능은 낮아지지만 과적합을 예방할 수 있음

In [13]:
from sklearn.ensemble import ExtraTreesClassifier
et = ExtraTreesClassifier(n_jobs=-1, random_state=0)
scores = cross_validate(et, x_train, y_train, return_train_score=True)
np.mean(scores['train_score']), np.mean(scores['test_score'])

(0.9978834474624707, 0.880317428000296)

In [14]:
# 수행속도가 빠르다
et.fit(x_train, y_train)
et.feature_importances_

array([0.20298962, 0.51938012, 0.27763026])

# 그레이던트 부스팅
    숲을 이루는 결정트리를 만들때, 깉이가 얕은 트리를 사용
    기본이 깊이가 3인 트리 100개를 사용
    과적합에 강하다
    경사하강법 사용
    분류: 로지스틱 손실 함수
    회귀: 평균 제곱 오차

In [15]:
from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier()
cross_validate(gbc, x_train, y_train, return_train_score=True, n_jobs=-1)
np.mean(scores['train_score']), np.mean(scores['test_score'])

(0.9978834474624707, 0.880317428000296)

In [16]:
gbc = GradientBoostingClassifier(n_estimators=500, learning_rate=0.2, random_state=0)
cross_validate(gbc, x_train, y_train, return_train_score=True, n_jobs=-1)
np.mean(scores['train_score']), np.mean(scores['test_score'])

(0.9978834474624707, 0.880317428000296)

In [17]:
gbc.fit(x_train, y_train)
gbc.feature_importances_

array([0.1550395 , 0.68721774, 0.15774276])

### 지금까지 숲을 구성할 때 사용한 트리는 원래 훈련 데이터 전부 사용
### subsample = 1.0 전체 데이터 사용
### 이 값을 1.0보다 작게 만들면 일부 데이터만 사용 - 일부 셈플만 랜덤하게 선택
### 확률적 경사하강법 or 미니배치 경사 하강법과 비슷하게 된다
### 랜점포레스트보다 느리다

# 히스토그램 기반 그레이디언트 부스팅
    그레이디언트 부스팅의 속도와 성능을 개선한 알고리즘

In [18]:
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
hbc = HistGradientBoostingClassifier(random_state=0)
scores=cross_validate(hbc,x_train,y_train,return_train_score=True)
np.mean(scores['train_score']), np.mean(scores['test_score'])



(0.9284684595245096, 0.8737748944991486)

In [20]:
# 중요도
from sklearn.inspection import permutation_importance
hbc.fit(x_train, y_train)
result = permutation_importance(hbc, x_train, y_train, n_repeats=10, random_state=0, n_jobs=-1)
result['importances_mean']

array([0.09184145, 0.24083125, 0.08324033])

In [1]:
import sklearn
print(sklearn.__version__)

1.2.1


In [28]:
# XGBoost 그레이디언트 부스팅 알고리즘을 사용한 모델(sklearn이 아니다)
from xgboost import XGBClassifier
xgb = XGBClassifier(random_state=0)
scores = cross_validate(xgb, x_train, y_train, return_train_score=True)
np.mean(scores['train_score']), np.mean(scores['test_score'])

(0.9517029349360356, 0.875314466572888)

In [29]:
from lightgbm import LGBMClassifier
lgb= LGBMClassifier(random_state=0)
scores = cross_validate(lgb, x_train, y_train, return_train_score=True)
np.mean(scores['train_score']), np.mean(scores['test_score'])

(0.9320281523553113, 0.876660250240616)

# 자전거 대여량 예측

## 1. 데이터 확보

In [30]:
url = 'https://raw.githubusercontent.com/leekyuyoung20221226/python/main/data/bike.csv'

In [32]:
bike = pd.read_csv(url)
bike.head()

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1.0,0.0,1.0,0.0,6.0,0.0,2,0.344167,0.363625,0.805833,0.160446,331,654,985
1,2,2011-01-02,1.0,0.0,1.0,0.0,0.0,0.0,2,0.363478,0.353739,0.696087,0.248539,131,670,801
2,3,2011-01-03,1.0,0.0,1.0,0.0,1.0,1.0,1,0.196364,0.189405,0.437273,0.248309,120,1229,1349
3,4,2011-01-04,1.0,0.0,1.0,0.0,2.0,1.0,1,0.2,0.212122,0.590435,0.160296,108,1454,1562
4,5,2011-01-05,1.0,0.0,1.0,0.0,3.0,1.0,1,0.226957,0.22927,0.436957,0.1869,82,1518,1600


## 2. 데이터 전처리
    결측치 확인 - 중간값이나 평균으로 대처

In [34]:
bike.isnull().sum()

instant       0
dteday        0
season        0
yr            1
mnth          1
holiday       0
weekday       0
workingday    0
weathersit    0
temp          1
atemp         1
hum           3
windspeed     5
casual        0
registered    0
cnt           0
dtype: int64

In [45]:
bike[bike.isnull().any(axis=1)]

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt,month
0,1,2011-01-01,1.0,0.0,1.0,0.0,6.0,0.0,2,0.344167,0.363625,0.805833,0.160446,331,654,985,
1,2,2011-01-02,1.0,0.0,1.0,0.0,0.0,0.0,2,0.363478,0.353739,0.696087,0.248539,131,670,801,
2,3,2011-01-03,1.0,0.0,1.0,0.0,1.0,1.0,1,0.196364,0.189405,0.437273,0.248309,120,1229,1349,
3,4,2011-01-04,1.0,0.0,1.0,0.0,2.0,1.0,1,0.200000,0.212122,0.590435,0.160296,108,1454,1562,
4,5,2011-01-05,1.0,0.0,1.0,0.0,3.0,1.0,1,0.226957,0.229270,0.436957,0.186900,82,1518,1600,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
725,726,2012-12-26,1.0,1.0,12.0,0.0,3.0,1.0,3,0.243333,0.220333,0.823333,0.316546,9,432,441,
726,727,2012-12-27,1.0,1.0,12.0,0.0,4.0,1.0,2,0.254167,0.226642,0.652917,0.350133,247,1867,2114,
727,728,2012-12-28,1.0,1.0,12.0,0.0,5.0,1.0,2,0.253333,0.255046,0.590000,0.155471,644,2451,3095,
728,729,2012-12-29,1.0,1.0,12.0,0.0,6.0,0.0,2,0.253333,0.242400,0.752917,0.124383,159,1182,1341,


In [39]:
# 중간값으로 채우기
bike['windspeed'].fillna(bike['windspeed'].median(),inplace=True)

In [42]:
bike.loc[730,'yr'] = 1.0
bike.loc[730, 'mnth'] = 12.0

In [44]:
bike['temp'].fillna( bike['temp'].median(), inplace=True)
bike['atemp'].fillna( bike['atemp'].median(), inplace=True)
bike['hum'].fillna( bike['hum'].median(), inplace=True)

In [46]:
bike.drop(columns=['dteday'],inplace=True)

In [47]:
bike.head()

Unnamed: 0,instant,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt,month
0,1,1.0,0.0,1.0,0.0,6.0,0.0,2,0.344167,0.363625,0.805833,0.160446,331,654,985,
1,2,1.0,0.0,1.0,0.0,0.0,0.0,2,0.363478,0.353739,0.696087,0.248539,131,670,801,
2,3,1.0,0.0,1.0,0.0,1.0,1.0,1,0.196364,0.189405,0.437273,0.248309,120,1229,1349,
3,4,1.0,0.0,1.0,0.0,2.0,1.0,1,0.2,0.212122,0.590435,0.160296,108,1454,1562,
4,5,1.0,0.0,1.0,0.0,3.0,1.0,1,0.226957,0.22927,0.436957,0.1869,82,1518,1600,
