# 랜덤포레스트

In [1]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
import pandas as pd

cancer = load_breast_cancer()

x = pd.DataFrame(cancer.data, columns=cancer.feature_names) # input data
y = pd.DataFrame(cancer.target, columns=['class']) # target data

In [7]:
from sklearn.ensemble import RandomForestClassifier

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2 ,random_state = 0)
forest = RandomForestClassifier(n_estimators = 100, random_state = 0)
forest.fit(x_train, y_train)

  """


RandomForestClassifier(random_state=0)

In [8]:
forest.score(x_train, y_train)

1.0

In [9]:
forest.score(x_test, y_test)

0.9649122807017544

### 랜덤포레스트_와인데이터셋

In [10]:
from sklearn import datasets
raw_wine = datasets.load_wine()

In [20]:
data = pd.DataFrame(raw_wine.data, columns = raw_wine.feature_names)
target = pd.DataFrame(raw_wine.target, columns = ['class'])
df = pd.concat([data, target], axis = 1)

#### (1) train, test 데이터 분리

In [35]:
x = df.iloc[:, :-1]
y = df['class']

In [36]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2 ,random_state = 0)

In [37]:
print(x.shape)
print(x_train.shape)
print(x_test.shape)

(178, 13)
(142, 13)
(36, 13)


#### (2) 데이터 변환(표준화, 정규화)

In [40]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(x_train)
x_train_std = scaler.transform(x_train)
x_test_std = scaler.transform(x_test)

print(x_train_std.shape)
print(x_test_std.shape)

(142, 13)
(36, 13)


#### (3) 모델에 데이터 넣어야 한다.

In [41]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(x_train_std, y_train)

RandomForestClassifier(max_depth=2, random_state=0)

In [48]:
y_pred = clf.predict(x_test_std)

#### (4) 모델이 제대로 학습 했는지 score 계산

In [47]:
train_score = clf.score(x_train_std, y_train)
test_score = clf.score(x_test_std, y_test)

print('train score: ', round(train_score,2))
print('test score: ', round(test_score,2))

train score:  0.99
test score:  0.94


#### (5) confusion matrix

In [49]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[14,  0,  0],
       [ 1, 14,  1],
       [ 0,  0,  6]])

#### (6) report

In [63]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.93      1.00      0.97        14
           1       1.00      0.88      0.93        16
           2       0.86      1.00      0.92         6

    accuracy                           0.94        36
   macro avg       0.93      0.96      0.94        36
weighted avg       0.95      0.94      0.94        36



# 그레디언트 부스팅

In [73]:
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import GradientBoostingClassifier

cancer = load_breast_cancer()

x = pd.DataFrame(cancer.data, columns=cancer.feature_names) # input data
y = pd.DataFrame(cancer.target, columns=['class']) # target data

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2 ,random_state = 0)

### learning_rate(default=0.1), n_estimators(default=100), max_depth(default=3)

In [75]:
gbrt = GradientBoostingClassifier(random_state = 0) 
gbrt.fit(x_train, y_train)

train_score = gbrt.score(x_train, y_train)
test_score = gbrt.score(x_test, y_test)

print('train score:', train_score)
print('test score:', test_score)

  y = column_or_1d(y, warn=True)


train score: 1.0
test score: 0.9649122807017544


#### learning_rate(default=0.1), n_estimators(default=100), max_depth(default=1)

In [81]:
gbrt = GradientBoostingClassifier(random_state = 0, max_depth = 1) 
gbrt.fit(x_train, y_train)

train_score = gbrt.score(x_train, y_train)
test_score = gbrt.score(x_test, y_test)

print('train score:', train_score)
print('test score:', test_score)

  y = column_or_1d(y, warn=True)


train score: 0.9912087912087912
test score: 0.9736842105263158


#### learning_rate(default=0.01), n_estimators(default=100), max_depth(default=3)

In [77]:
gbrt = GradientBoostingClassifier(random_state = 0, learning_rate = 0.01) 
gbrt.fit(x_train, y_train)

train_score = gbrt.score(x_train, y_train)
test_score = gbrt.score(x_test, y_test)

print('train score:', train_score)
print('test score:', test_score)

  y = column_or_1d(y, warn=True)


train score: 0.9846153846153847
test score: 0.9649122807017544


In [92]:
gbrt = GradientBoostingClassifier(random_state = 0, learning_rate = 0.01, max_depth = 3) 
gbrt.fit(x_train, y_train)

train_score = gbrt.score(x_train, y_train)
test_score = gbrt.score(x_test, y_test)

print('train score:', train_score)
print('test score:', test_score)

  y = column_or_1d(y, warn=True)


train score: 0.9846153846153847
test score: 0.9649122807017544


### Q. 붗꽃 데이터를 가지고 랜덤포레스트와 그레디언트 부스팅 모델을 만들어 성능을 비교해보자.

In [93]:
from sklearn.datasets import load_iris
import pandas as pd

iris_data = load_iris()

X = pd.DataFrame(iris_data.data, columns = iris_data.feature_names)
Y = pd.DataFrame(iris_data.target, columns = ['class'])

#### 랜덤포레스트(5시 까지)

In [100]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.3 ,random_state = 0)

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(x_train)
x_train_std = scaler.transform(x_train)
x_test_std = scaler.transform(x_test)

from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(x_train_std, y_train)

train_score = clf.score(x_train_std, y_train)
test_score = clf.score(x_test_std, y_test)

print('train score: ', round(train_score,2))
print('test score: ', round(test_score,2))

train score:  0.97
test score:  0.98


  # This is added back by InteractiveShellApp.init_path()


#### 그레디언트 부스팅(25m)

In [101]:
gbrt = GradientBoostingClassifier(random_state = 0, learning_rate = 0.01) 
gbrt.fit(x_train_std, y_train)

train_score = gbrt.score(x_train_std, y_train)
test_score = gbrt.score(x_test_std, y_test)

print('train score:', train_score)
print('test score:', test_score)

train score: 1.0
test score: 0.9777777777777777


  y = column_or_1d(y, warn=True)
