In [26]:
# california 집값 데이터 사용
from sklearn.datasets import fetch_california_housing

housing = fetch_california_housing()

X = housing.data
y = housing.target

print(X.shape, y.shape)

(20640, 8) (20640,)


### 여러 앙상블 기법으로 regression 진행

In [27]:
# Voting
# voting에 사용할 모델으로 linear regression, decision tree, k-neighbors regressor 사용
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.metrics import r2_score

fold = KFold(n_splits=5, shuffle=True)

for i, (idx_train, idx_test) in enumerate(fold.split(X,y)):
    x_train, y_train = X[idx_train], y[idx_train]
    x_test, y_test = X[idx_test], y[idx_test]

    scaler = StandardScaler()
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.transform(x_test)

    # model 들 각각 불러옴.
    LR = LinearRegression()
    DT = DecisionTreeRegressor()
    kNN = KNeighborsRegressor(n_neighbors=5)

    models = [('LR', LR), ('DT', DT), ('kNN', kNN)]
    Regressor = VotingRegressor(models) # Voting regressor 공식문서를 보면 list 안에 model 의 tuple 로 받도록 되어있음.
    Regressor.fit(x_train, y_train)
    y_pred = Regressor.predict(x_test)
    print(f"========= fold {i} =========")
    print("accuracy:", Regressor.score(x_test, y_test)) # [model].score() 함수를 사용하면 accuracy 계산 가능
    print("r2 score:", r2_score(y_pred, y_test))

    for name, model in models:
        model.fit(x_train, y_train)
        y_pred = model.predict(x_test)
        print(name, ":", r2_score(y_pred, y_test))

    print()

accuracy: 0.7284408880302589
r2 score: 0.5991950937908306
LR : 0.3528455651786059
DT : 0.5914820544213287
kNN : 0.5519345519469465

accuracy: 0.7388960454680915
r2 score: 0.6048908447942001
LR : 0.3367389069106439
DT : 0.611744152607385
kNN : 0.5746162708622577

accuracy: 0.7341269585241172
r2 score: 0.5982385322143579
LR : 0.314803004834594
DT : 0.6151927064039966
kNN : 0.5509517179429739

accuracy: 0.7517311947917237
r2 score: 0.6371260656324671
LR : 0.3869781593572367
DT : 0.622327091442426
kNN : 0.5920311304343409

accuracy: 0.7455347459244566
r2 score: 0.6227759388022472
LR : 0.3513465726227005
DT : 0.6287131221508351
kNN : 0.566667703024986



In [28]:
# AdaBoost
from sklearn.ensemble import AdaBoostRegressor

fold = KFold(n_splits=5, shuffle=True)

for i, (idx_train, idx_test) in enumerate(fold.split(X,y)):
    x_train, y_train = X[idx_train], y[idx_train]
    x_test, y_test = X[idx_test], y[idx_test]

    scaler = StandardScaler()
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.transform(x_test)

    AdaBoost = AdaBoostRegressor(n_estimators=50) # n_estimators : 훈련에서 사용할 model 개수
    AdaBoost.fit(x_train, y_train)
    y_pred = AdaBoost.predict(x_test)
    print(f"========= fold {i} =========")
    print("r2 score:", r2_score(y_pred, y_test))
    print()

r2 score: -1.4611150875451786

r2 score: -0.4720275600177055

r2 score: -0.6734678981490481

r2 score: -0.7534747451072303

r2 score: -1.3102766782953923



In [29]:
# GradientBoost
from sklearn.ensemble import GradientBoostingRegressor

fold = KFold(n_splits=5, shuffle=True)

for i, (idx_train, idx_test) in enumerate(fold.split(X,y)):
    x_train, y_train = X[idx_train], y[idx_train]
    x_test, y_test = X[idx_test], y[idx_test]

    scaler = StandardScaler()
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.transform(x_test)

    GradientBoost = GradientBoostingRegressor()
    GradientBoost.fit(x_train, y_train)
    y_pred = GradientBoost.predict(x_test)
    print(f"========= fold {i} =========")
    print("r2 score:", r2_score(y_pred, y_test))
    print()

r2 score: 0.7328238183417817

r2 score: 0.7120283673825718

r2 score: 0.6947183395328868

r2 score: 0.7350621777552143

r2 score: 0.7129894304649047



# 과제

In [30]:
# 당뇨 예측 데이터 사용
import pandas as pd

data = pd.read_csv("./data/diabetes_prediction_dataset.csv")
data.head(10)

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0
5,Female,20.0,0,0,never,27.32,6.6,85,0
6,Female,44.0,0,0,never,19.31,6.5,200,1
7,Female,79.0,0,0,No Info,23.86,5.7,85,0
8,Male,42.0,0,0,never,33.64,4.8,145,0
9,Female,32.0,0,0,never,27.32,5.0,100,0


### 여기서부터 과제

### Ensemble 기법으로 classification 성능 측정

In [31]:
# 숫자가 아닌 column이 존재하기 때문에 해당 column은 encoding 진행
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
# encoder를 사용해서 각 column에 대해 fit_transform() 적용
data['gender'] = encoder.fit_transform(data['gender'])
data['smoking_history'] = encoder.fit_transform(data['smoking_history'])

In [32]:
# X와 y로 나누기
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

### 세가지 Ensemble 기법 사용
### 각 기법에 대해 k-fold cross validation을 진행하고, StratifiedKFfold를 사용, k=5

#### Voting

In [33]:
# Voting
# voting에는 logistic regression, decision tree, k-neighbors classifier 모델 사용
# 성능 측정에는 accuracy를 사용
# decision tree 사용시 max_depth는 5로 고정
# knn 사용시 k값은 5로 고정
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score

fold = KFold(n_splits=5, shuffle=True)

for i, (idx_train, idx_test) in enumerate(fold.split(X,y)):
    x_train, y_train = X.iloc[idx_train], y.iloc[idx_train]
    x_test, y_test = X.iloc[idx_test], y.iloc[idx_test]

    scaler = StandardScaler()
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.transform(x_test)

    # model 들 각각 불러옴.
    LR = LogisticRegression()
    DT = DecisionTreeClassifier()
    kNN = KNeighborsClassifier(n_neighbors=5)

    models = [('LR', LR), ('DT', DT), ('kNN', kNN)]
    Classifier = VotingClassifier(models)
    Classifier.fit(x_train, y_train)
    y_pred = Regressor.predict(x_test)
    print(f"========= fold {i} =========")
    print("accuracy:", Classifier.score(x_test, y_test)) # [model].score() 함수를 사용하면 accuracy 계산 가능

    for name, model in models:
        model.fit(x_train, y_train)
        y_pred = model.predict(x_test)
        print(name, ":", model.score(x_test, y_test))

    print()

accuracy: 0.96495
LR : 0.95995
DT : 0.94975
kNN : 0.9598

accuracy: 0.96455
LR : 0.959
DT : 0.95355
kNN : 0.9603

accuracy: 0.96445
LR : 0.95895
DT : 0.9498
kNN : 0.9617

accuracy: 0.96745
LR : 0.96165
DT : 0.95255
kNN : 0.96265

accuracy: 0.9655
LR : 0.9612
DT : 0.9514
kNN : 0.96145



#### AdaBoost

In [34]:
# AdaBoost
# 각 fold의 accuracy가 96 이상이 나오도록 parameter 조정
from sklearn.ensemble import AdaBoostClassifier

fold = KFold(n_splits=5, shuffle=True)

for i, (idx_train, idx_test) in enumerate(fold.split(X,y)):
    x_train, y_train = X.iloc[idx_train], y.iloc[idx_train]
    x_test, y_test = X.iloc[idx_test], y.iloc[idx_test]

    scaler = StandardScaler()
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.transform(x_test)

    AdaBoost = AdaBoostClassifier(n_estimators=50) # n_estimators : 훈련에서 사용할 model 개수
    AdaBoost.fit(x_train, y_train)
    y_pred = AdaBoost.predict(x_test)
    print(f"========= fold {i} =========")
    print("accuracy:", AdaBoost.score(x_test, y_test))
    print()


accuracy: 0.9713

accuracy: 0.97165

accuracy: 0.97255

accuracy: 0.97245

accuracy: 0.9714



#### GradientBoost

In [35]:
# GradientBoost
# 각 fold의 accuracy가 96 이상이 나오도록 parameter 조정
from sklearn.ensemble import GradientBoostingClassifier

fold = KFold(n_splits=5, shuffle=True)

for i, (idx_train, idx_test) in enumerate(fold.split(X,y)):
    x_train, y_train = X.iloc[idx_train], y.iloc[idx_train]
    x_test, y_test = X.iloc[idx_test], y.iloc[idx_test]

    scaler = StandardScaler()
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.transform(x_test)

    GradientBoost = GradientBoostingClassifier()
    GradientBoost.fit(x_train, y_train)
    y_pred = GradientBoost.predict(x_test)
    print(f"========= fold {i} =========")
    print("accuracy:", GradientBoost.score(x_test, y_test))
    print()


accuracy: 0.97195

accuracy: 0.9705

accuracy: 0.97175

accuracy: 0.97265

accuracy: 0.97375

