In [1]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
cancer = load_breast_cancer()

x_train, x_test, y_train, y_test = train_test_split(cancer['data'],
                                                    cancer['target'],
                                                    stratify=cancer['target'],
                                                    random_state=0)
cancer['target_names']

array(['malignant', 'benign'], dtype='<U9')

In [2]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

knn1 = KNeighborsClassifier(n_neighbors=5)
knn2 = KNeighborsClassifier(n_neighbors=3)
lr = LogisticRegression(max_iter=10000)
dt3 = DecisionTreeClassifier(max_depth=3)
dt5 = DecisionTreeClassifier(max_depth=5)

In [3]:
from sklearn.ensemble import VotingClassifier
hard = VotingClassifier([('knn1', knn1), ('knn2', knn2), ('lr', lr),
                         ('dt3', dt3), ('dt5', dt5)])

soft = VotingClassifier([('knn1', knn1), ('knn2', knn2), ('lr', lr),
                         ('dt3', dt3), ('dt5', dt5)], voting='soft')

In [4]:
from sklearn.ensemble import VotingRegressor
VotingRegressor?

In [5]:
names = ['hard', 'soft', 'knn1', 'knn2', 'lr', 'dt3', 'dt5']
for idx, model in enumerate([hard, soft, knn1, knn2, lr, dt3, dt5]):
    model.fit(x_train, y_train)
    name = names[idx]
    train_score = model.score(x_train, y_train) * 100
    test_score = model.score(x_test, y_test) * 100
    print(f'{name} Train Accuracy:{train_score:.2f}%')
    print(f'{name} Test Accuracy:{test_score:.2f}%')
    print()

hard Train Accuracy:98.12%
hard Test Accuracy:95.10%

soft Train Accuracy:99.53%
soft Test Accuracy:95.80%

knn1 Train Accuracy:94.60%
knn1 Test Accuracy:91.61%

knn2 Train Accuracy:95.77%
knn2 Test Accuracy:91.61%

lr Train Accuracy:96.71%
lr Test Accuracy:93.71%

dt3 Train Accuracy:97.65%
dt3 Test Accuracy:93.01%

dt5 Train Accuracy:100.00%
dt5 Test Accuracy:93.01%



In [6]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(max_depth=5).fit(x_train, y_train)
model.score(x_train, y_train), model.score(x_test, y_test)

(1.0, 0.951048951048951)

In [7]:
from sklearn.ensemble import GradientBoostingClassifier
model = GradientBoostingClassifier().fit(x_train, y_train)
model.score(x_train, y_train), model.score(x_test, y_test)

(1.0, 0.951048951048951)

In [8]:
from sklearn.ensemble import StackingClassifier

estimators = [('rf', RandomForestClassifier()), 
              ('gb', GradientBoostingClassifier())]

model = StackingClassifier(estimators=estimators,
                         final_estimator=LogisticRegression())

model.fit(x_train, y_train).score(x_test, y_test)

0.958041958041958

In [9]:
StackingClassifier?

In [11]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-1.3.3-py3-none-win_amd64.whl (95.2 MB)
Installing collected packages: xgboost
Successfully installed xgboost-1.3.3


In [12]:
import xgboost as xgb
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
    
boston = load_boston()

x_train, x_test, y_train, y_test = train_test_split(boston['data'],
                                                    boston['target'],
                                                    random_state=0)
model = xgb.XGBRegressor(objective ='reg:linear')
model.fit(x_train, y_train)

p_train = model.predict(x_train)
p_test = model.predict(x_test)

r2_score(y_train, p_train), r2_score(y_test, p_test)



(0.999999003030776, 0.7476326752660457)

In [14]:
!pip install lightgbm

Collecting lightgbm
  Downloading lightgbm-3.1.1-py2.py3-none-win_amd64.whl (754 kB)
Installing collected packages: lightgbm
Successfully installed lightgbm-3.1.1


In [15]:
import lightgbm as lgb
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

boston = load_boston()

x_train, x_test, y_train, y_test = train_test_split(boston['data'],
                                                    boston['target'],
                                                    random_state=0)

lgb_train = lgb.Dataset(x_train, y_train)
lgb_eval = lgb.Dataset(x_test, y_test, reference=lgb_train)

params = {
    'objective': 'regression',
}

model = lgb.train(params, lgb_train, valid_sets=lgb_eval,
                  early_stopping_rounds=5)

p_train = model.predict(x_train, num_iteration=model.best_iteration)
p_test = model.predict(x_test, num_iteration=model.best_iteration)

r2_score(y_train, p_train), r2_score(y_test, p_test)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 973
[LightGBM] [Info] Number of data points in the train set: 379, number of used features: 13
[LightGBM] [Info] Start training from score 22.608707
[1]	valid_0's l2: 70.5396
Training until validation scores don't improve for 5 rounds
[2]	valid_0's l2: 61.4369
[3]	valid_0's l2: 53.753
[4]	valid_0's l2: 47.6561
[5]	valid_0's l2: 42.6525
[6]	valid_0's l2: 38.518
[7]	valid_0's l2: 35.5033
[8]	valid_0's l2: 33.0258
[9]	valid_0's l2: 31.2533
[10]	valid_0's l2: 29.944
[11]	valid_0's l2: 28.4314
[12]	valid_0's l2: 27.6605
[13]	valid_0's l2: 26.7994
[14]	valid_0's l2: 26.3312
[15]	valid_0's l2: 25.3739
[16]	valid_0's l2: 25.03
[17]	valid_0's l2: 24.2976
[18]	valid_0's l2: 24.0748
[19]	valid_0's l2: 23.5555
[20]	valid_0's l2: 23.5175
[21]	valid_0's l2: 23.1308
[22]	valid_0's l2: 23.2182
[23]	valid_0's l2: 23.3182
[24]	valid_0's l2: 23.0598
[25]	v

(0.9524522112992748, 0.7369037475494296)

In [None]:
# 데이터 세트 준비
from sklearn.datasets import load_digits


# 데이터 분할


# 모델 평가 (가장 좋은 Classification 모델을 찾아보세요.)



In [16]:
from sklearn.datasets import load_boston
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
boston = load_boston()

model = GradientBoostingRegressor(random_state=0)

params = {
    'n_estimators' : [100, 300, 500],
    'learning_rate' : [0.01, 0.01, 0.1],
    'max_depth' : [3, 4, 5],
}

gs = GridSearchCV(model, params).fit(boston.data, boston.target)
gs

GridSearchCV(cv=None, error_score=nan,
             estimator=GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0,
                                                 criterion='friedman_mse',
                                                 init=None, learning_rate=0.1,
                                                 loss='ls', max_depth=3,
                                                 max_features=None,
                                                 max_leaf_nodes=None,
                                                 min_impurity_decrease=0.0,
                                                 min_impurity_split=None,
                                                 min_samples_leaf=1,
                                                 min_samples_split=2,
                                                 min_weight_fraction_leaf=0.0,
                                                 n_estimators=100,
                                                 n_iter_no_change=None,
              

In [17]:
import pandas as pd
report = pd.DataFrame(gs.cv_results_)
report

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.08577,0.001784,0.000598,0.0004886555,0.01,3,100,"{'learning_rate': 0.01, 'max_depth': 3, 'n_est...",0.62202,0.688112,0.220861,0.325642,-0.175309,0.336265,0.310029,24
1,0.25412,0.003796,0.000997,9.536743e-08,0.01,3,300,"{'learning_rate': 0.01, 'max_depth': 3, 'n_est...",0.756358,0.865449,0.691776,0.48463,0.314734,0.622589,0.197722,9
2,0.421473,0.0066,0.000997,3.16298e-07,0.01,3,500,"{'learning_rate': 0.01, 'max_depth': 3, 'n_est...",0.762956,0.867336,0.738485,0.535428,0.38054,0.656949,0.17511,4
3,0.105722,0.0019,0.000399,0.0004886361,0.01,4,100,"{'learning_rate': 0.01, 'max_depth': 4, 'n_est...",0.633338,0.626206,0.314555,0.374583,-0.252105,0.339315,0.322612,22
4,0.315756,0.004575,0.000997,1.907349e-07,0.01,4,300,"{'learning_rate': 0.01, 'max_depth': 4, 'n_est...",0.756735,0.807186,0.656688,0.506602,0.120894,0.569621,0.246721,13
5,0.527382,0.010449,0.001995,1.885443e-05,0.01,4,500,"{'learning_rate': 0.01, 'max_depth': 4, 'n_est...",0.768721,0.808021,0.693576,0.544797,0.222282,0.607479,0.212572,11
6,0.127469,0.004255,0.000993,9.066178e-06,0.01,5,100,"{'learning_rate': 0.01, 'max_depth': 5, 'n_est...",0.634823,0.557666,0.292459,0.377049,-0.240186,0.324362,0.307677,26
7,0.379379,0.011713,0.001397,0.0004892256,0.01,5,300,"{'learning_rate': 0.01, 'max_depth': 5, 'n_est...",0.741062,0.762564,0.582336,0.47783,0.031606,0.51908,0.26529,20
8,0.619743,0.010405,0.001995,0.0006306003,0.01,5,500,"{'learning_rate': 0.01, 'max_depth': 5, 'n_est...",0.75303,0.762199,0.622824,0.498162,0.105948,0.548433,0.241416,15
9,0.08597,0.001589,0.000798,0.0003990656,0.01,3,100,"{'learning_rate': 0.01, 'max_depth': 3, 'n_est...",0.62202,0.688112,0.220861,0.325642,-0.175309,0.336265,0.310029,24


In [18]:
gs.best_params_ 

{'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100}

In [19]:
gs.best_score_

0.6697600256867121

In [20]:
gs.best_estimator_

GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=100,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=0, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)