# Advanced Machine Learning 

In [420]:
# Load dataset (iris), split into X_train, y_train, X_test, y_test!
# Write your code here 
from sklearn import datasets
from sklearn.model_selection import train_test_split, cross_val_score
import warnings
import pandas as pd
warnings.filterwarnings("ignore")

breast_cancer = datasets.load_breast_cancer()

X = breast_cancer.data
y = breast_cancer.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

In [373]:
df.head(5)

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019908,-0.017646
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.06833,-0.092204
2,0.085299,0.05068,0.044451,-0.005671,-0.045599,-0.034194,-0.032356,-0.002592,0.002864,-0.02593
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022692,-0.009362
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031991,-0.046641


## 1. Voting 

In [374]:
from sklearn.ensemble import VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Hint: 
# clf_voting = VotingClassifier( estimators=[('label1', clf_1),
#('label2', clf_2),
#('labelN', clf_N)]) 

# Create the individual models
clf_knn = KNeighborsClassifier(5)
clf_dt = DecisionTreeClassifier()
clf_lr = LogisticRegression()

# Create voting classifier
clf_voting = VotingClassifier(estimators=[
('knn', clf_knn),
('dt', clf_dt),
('lr', clf_lr)])

# Fit it to the training set and predict
clf_voting.fit(X_train, y_train)
y_pred = clf_voting.predict(X_test)

# Get the accuracy score
acc = accuracy_score(y_test, y_pred)
print("Accuracy: {:0.3f}".format(acc))

Accuracy: 0.942


In [375]:
print('Voting Classifier \n',classification_report(y_test, clf_voting.predict(X_test)))

Voting Classifier 
               precision    recall  f1-score   support

           0       0.95      0.89      0.92        64
           1       0.94      0.97      0.95       107

    accuracy                           0.94       171
   macro avg       0.94      0.93      0.94       171
weighted avg       0.94      0.94      0.94       171



In [376]:
clf_knn.fit(X_train, y_train)
y_pred = clf_knn.predict(X_test)

print('KNN Classifier \n',classification_report(y_test, clf_knn.predict(X_test)))

KNN Classifier 
               precision    recall  f1-score   support

           0       0.90      0.89      0.90        64
           1       0.94      0.94      0.94       107

    accuracy                           0.92       171
   macro avg       0.92      0.92      0.92       171
weighted avg       0.92      0.92      0.92       171



In [377]:
clf_dt.fit(X_train, y_train)
y_pred = clf_dt.predict(X_test)

print('Decision Tree Classifier \n',classification_report(y_test, clf_dt.predict(X_test)))

Decision Tree Classifier 
               precision    recall  f1-score   support

           0       0.92      0.89      0.90        64
           1       0.94      0.95      0.94       107

    accuracy                           0.93       171
   macro avg       0.93      0.92      0.92       171
weighted avg       0.93      0.93      0.93       171



In [378]:
clf_lr.fit(X_train, y_train)
y_pred = clf_lr.predict(X_test)

print('Linear Regression \n',classification_report(y_test, clf_lr.predict(X_test)))

Linear Regression 
               precision    recall  f1-score   support

           0       0.97      0.88      0.92        64
           1       0.93      0.98      0.95       107

    accuracy                           0.94       171
   macro avg       0.95      0.93      0.94       171
weighted avg       0.94      0.94      0.94       171



## 2. Averaging 

In [379]:
# Template for averaging Classifier 

from sklearn.ensemble import VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# clf_voting = VotingClassifier(
# estimators=[
#('label1', clf_1),
#('label2', clf_2),
#...
#('labelN', clf_N)],
#voting='soft',
#weights=[w_1, w_2, ..., w_N]
#)

In [380]:
# Initiate the individual models 

# Write your code here! 

clf_knn = KNeighborsClassifier(3)
clf_dt = DecisionTreeClassifier()
clf_lr = LogisticRegression()

clf_voting = VotingClassifier(estimators=[('knn', clf_knn),
                                          ('dt', clf_dt),
                                          ('lr', clf_lr)],
                             voting='soft',
                             weights=[1,1,2])

# Create averaging classifier

# Write your code here! 

# Fit it to the training set and predict
clf_voting.fit(X_train, y_train)
y_pred = clf_voting.predict(X_test)

# Get the accuracy score
acc = accuracy_score(y_test, y_pred)
print("Accuracy: {:0.3f}".format(acc))

Accuracy: 0.953


In [381]:
clf_voting.fit(X_train, y_train)
y_pred = clf_voting.predict(X_test)

print('Averaging Classifier \n',classification_report(y_test, clf_voting.predict(X_test)))

Averaging Classifier 
               precision    recall  f1-score   support

           0       1.00      0.88      0.93        64
           1       0.93      1.00      0.96       107

    accuracy                           0.95       171
   macro avg       0.97      0.94      0.95       171
weighted avg       0.96      0.95      0.95       171



## 3. Bagging

In [382]:
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
import numpy as np

bagging = BaggingClassifier(KNeighborsClassifier(3),
                           max_samples=0.5, max_features=0.5)

# Fit it to the training set and predict
bagging.fit(X_train, y_train)
y_pred = bagging.predict(X_test)

# Get the accuracy score
acc = accuracy_score(y_test, y_pred)
print("Accuracy: {:0.3f}".format(acc))

Accuracy: 0.918


In [383]:
bagging.fit(X_train, y_train)
y_pred = bagging.predict(X_test)

print('Bagging Classifier \n',classification_report(y_test, bagging.predict(X_test)))

Bagging Classifier 
               precision    recall  f1-score   support

           0       0.93      0.88      0.90        64
           1       0.93      0.96      0.94       107

    accuracy                           0.93       171
   macro avg       0.93      0.92      0.92       171
weighted avg       0.93      0.93      0.93       171



In [384]:
# Write your code here if base classifier = decision tree!
from sklearn.tree import DecisionTreeClassifier

bagging = BaggingClassifier(DecisionTreeClassifier(),
                           max_samples=0.5, max_features=0.5)

# Fit it to the training set and predict
bagging.fit(X_train, y_train)
y_pred = bagging.predict(X_test)

# Get the accuracy score
acc = accuracy_score(y_test, y_pred)
print("Accuracy: {:0.3f}".format(acc))

Accuracy: 0.942


In [385]:
bagging.fit(X_train, y_train)
y_pred = bagging.predict(X_test)

print('Averaging Classifier \n',classification_report(y_test, bagging.predict(X_test)))

Averaging Classifier 
               precision    recall  f1-score   support

           0       0.94      0.91      0.92        64
           1       0.94      0.96      0.95       107

    accuracy                           0.94       171
   macro avg       0.94      0.93      0.94       171
weighted avg       0.94      0.94      0.94       171



In [386]:
# Write your code here if you use RandomForest, compare with above!

from sklearn.ensemble import RandomForestClassifier

bagging = BaggingClassifier(RandomForestClassifier(),
                           max_samples=0.5, max_features=0.5)

# Fit it to the training set and predict
bagging.fit(X_train, y_train)
y_pred = bagging.predict(X_test)

# Get the accuracy score
acc = accuracy_score(y_test, y_pred)
print("Accuracy: {:0.3f}".format(acc))

Accuracy: 0.936


In [387]:
bagging.fit(X_train, y_train)
y_pred = bagging.predict(X_test)

print('Averaging Classifier \n',classification_report(y_test, bagging.predict(X_test)))

Averaging Classifier 
               precision    recall  f1-score   support

           0       0.92      0.91      0.91        64
           1       0.94      0.95      0.95       107

    accuracy                           0.94       171
   macro avg       0.93      0.93      0.93       171
weighted avg       0.94      0.94      0.94       171



## 4. Boosting
Source: https://scikit-learn.org/stable/modules/ensemble.html

In [388]:
from sklearn.ensemble import AdaBoostClassifier
# clf_ada = AdaBoostClassifier(
# base_estimator,
# n_estimators,
# learning_rate
# )

# base_estimator
# Default: Decision Tree (max_depth=1)
# n_estimators
# Default: 50
# learning_rate
# Default: 1.0
# Trade-off between n_estimators and
# learning_rate

### Create AdaBoost Classifier for iris dataset!

In [389]:
# Write your code here!
clf_ada = AdaBoostClassifier(
DecisionTreeClassifier(),
n_estimators=100,
learning_rate=0.2
)

In [390]:
# Fit it to the training set and predict
clf_ada.fit(X_train, y_train)
y_pred = clf_ada.predict(X_test)

# Get the accuracy score
acc = accuracy_score(y_test, y_pred)
print("Accuracy: {:0.3f}".format(acc),'\n')
print('AdaBoost Classifier \n',classification_report(y_test, clf_ada.predict(X_test)))

Accuracy: 0.924 

AdaBoost Classifier 
               precision    recall  f1-score   support

           0       0.92      0.88      0.90        64
           1       0.93      0.95      0.94       107

    accuracy                           0.92       171
   macro avg       0.92      0.91      0.92       171
weighted avg       0.92      0.92      0.92       171



In [391]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.linear_model import LogisticRegression

reg_ada = AdaBoostRegressor(
LogisticRegression(),
n_estimators=100,
learning_rate=0.1,
loss='square'
)

# base_estimator
# Default: Decision Tree (max_depth=3)
# loss
# linear (default)
# square
# exponential

In [392]:
diabetes = datasets.load_diabetes()

X = diabetes.data
y = diabetes.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [393]:
# Fit it to the training set and predict
reg_ada.fit(X_train, y_train)
y_pred = reg_ada.predict(X_test)

# Get the accuracy score
acc = accuracy_score(y_test, y_pred)
print("Accuracy: {:0.3f}".format(acc),'\n')
print('AdaBoost Regressor \n',classification_report(y_test, reg_ada.predict(X_test)))

Accuracy: 0.000 

AdaBoost Regressor 
               precision    recall  f1-score   support

        37.0       0.00      0.00      0.00       1.0
        42.0       0.00      0.00      0.00       1.0
        48.0       0.00      0.00      0.00       2.0
        52.0       0.00      0.00      0.00       3.0
        53.0       0.00      0.00      0.00       1.0
        59.0       0.00      0.00      0.00       1.0
        60.0       0.00      0.00      0.00       2.0
        61.0       0.00      0.00      0.00       2.0
        63.0       0.00      0.00      0.00       3.0
        64.0       0.00      0.00      0.00       1.0
        65.0       0.00      0.00      0.00       1.0
        67.0       0.00      0.00      0.00       1.0
        68.0       0.00      0.00      0.00       1.0
        69.0       0.00      0.00      0.00       1.0
        70.0       0.00      0.00      0.00       1.0
        71.0       0.00      0.00      0.00       1.0
        72.0       0.00      0.00      0.0

In [394]:
from sklearn.ensemble import GradientBoostingClassifier

grad_boo = GradientBoostingClassifier()

In [395]:
breast_cancer = datasets.load_breast_cancer()

X = breast_cancer.data
y = breast_cancer.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

In [396]:
# Fit it to the training set and predict
grad_boo.fit(X_train, y_train)
y_pred = grad_boo.predict(X_test)

# Get the accuracy score
acc = accuracy_score(y_test, y_pred)
print("Accuracy: {:0.3f}".format(acc),'\n')
print('GradientBoost Classifier \n',classification_report(y_test, grad_boo.predict(X_test)))

Accuracy: 0.942 

GradientBoost Classifier 
               precision    recall  f1-score   support

           0       0.95      0.89      0.92        64
           1       0.94      0.97      0.95       107

    accuracy                           0.94       171
   macro avg       0.94      0.93      0.94       171
weighted avg       0.94      0.94      0.94       171



### Create GradientBoostingClassifier for iris Dataset!

In [397]:
# Write your code here!

In [398]:
# import xgboost as xgb => ini mesti install dulu 
# import lightgbm as lgb
# import catboost as cb
# Cek di sini: 
# https://stackoverflow.com/questions/35139108/how-to-install-xgboost-in-anaconda-python-windows-platform

In [399]:
df_train = df_1.loc[:(len(df_1)*0.8)]

In [400]:
df_test = df_1.loc[(len(df_1)*0.8):]

In [422]:
from xgboost import XGBClassifier

xgbc = XGBClassifier()

In [423]:
xgbc.fit(X_train, y_train)
y_pred = xgbc.predict(X_test)
predictions = [round(value) for value in y_pred]

# Get the accuracy score
acc = accuracy_score(y_test, predictions)
print("Accuracy: {:0.3f}".format(acc),'\n')
print('XGBoost Regressor \n',classification_report(y_test, predictions))

Accuracy: 0.953 

XGBoost Regressor 
               precision    recall  f1-score   support

           0       0.97      0.91      0.94        64
           1       0.95      0.98      0.96       107

    accuracy                           0.95       171
   macro avg       0.96      0.94      0.95       171
weighted avg       0.95      0.95      0.95       171



In [403]:
import lightgbm as lgb

train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test)

parameters = {
    'application': 'binary',
    'objective': 'binary',
    'metric': 'auc',
    'is_unbalance': 'true',
    'boosting': 'gbdt',
    'num_leaves': 31,
    'feature_fraction': 0.5,
    'bagging_fraction': 0.5,
    'bagging_freq': 20,
    'learning_rate': 0.05,
    'verbose': 0
}

lgb_model = lgb.train(parameters,
                       train_data,
                       valid_sets=test_data,
                       num_boost_round=5000,
                       early_stopping_rounds=100)

y_pred = lgb_model.predict(X_test)

[1]	valid_0's auc: 0.935967
Training until validation scores don't improve for 100 rounds
[2]	valid_0's auc: 0.964369
[3]	valid_0's auc: 0.96123
[4]	valid_0's auc: 0.97021
[5]	valid_0's auc: 0.977439
[6]	valid_0's auc: 0.977731
[7]	valid_0's auc: 0.978315
[8]	valid_0's auc: 0.977293
[9]	valid_0's auc: 0.97868
[10]	valid_0's auc: 0.981162
[11]	valid_0's auc: 0.978242
[12]	valid_0's auc: 0.979264
[13]	valid_0's auc: 0.980286
[14]	valid_0's auc: 0.98087
[15]	valid_0's auc: 0.979702
[16]	valid_0's auc: 0.98087
[17]	valid_0's auc: 0.981893
[18]	valid_0's auc: 0.982623
[19]	valid_0's auc: 0.982039
[20]	valid_0's auc: 0.983645
[21]	valid_0's auc: 0.984375
[22]	valid_0's auc: 0.983937
[23]	valid_0's auc: 0.984521
[24]	valid_0's auc: 0.985397
[25]	valid_0's auc: 0.985981
[26]	valid_0's auc: 0.986711
[27]	valid_0's auc: 0.987442
[28]	valid_0's auc: 0.987442
[29]	valid_0's auc: 0.987442
[30]	valid_0's auc: 0.987588
[31]	valid_0's auc: 0.988464
[32]	valid_0's auc: 0.989194
[33]	valid_0's auc: 0.98

In [410]:
predictions = [round(value) for value in y_pred]
acc = accuracy_score(y_test, predictions)
print("Accuracy: {:0.3f}".format(acc),'\n')
print('LightBoost Classifier \n',classification_report(y_test, predictions))

Accuracy: 0.959 

GradientBoost Classifier 
               precision    recall  f1-score   support

           0       0.94      0.95      0.95        64
           1       0.97      0.96      0.97       107

    accuracy                           0.96       171
   macro avg       0.96      0.96      0.96       171
weighted avg       0.96      0.96      0.96       171



In [413]:
from catboost import CatBoostClassifier

cbc = CatBoostClassifier(iterations=2,
                          learning_rate=0.1,
                          depth=2)

In [414]:
cbc.fit(X_train, y_train)
y_pred = cbc.predict(X_test)
predictions = [round(value) for value in y_pred]

# Get the accuracy score
acc = accuracy_score(y_test, predictions)
print("Accuracy: {:0.3f}".format(acc),'\n')
print('CatBoost Regressor \n',classification_report(y_test, predictions))

0:	learn: 0.5842142	total: 87ms	remaining: 87ms
1:	learn: 0.4737043	total: 96.5ms	remaining: 0us
Accuracy: 0.918 

CatBoost Regressor 
               precision    recall  f1-score   support

           0       0.93      0.84      0.89        64
           1       0.91      0.96      0.94       107

    accuracy                           0.92       171
   macro avg       0.92      0.90      0.91       171
weighted avg       0.92      0.92      0.92       171



### Create XGBoost, lightgbm, catboost for iris Dataset

## 5. Stacking  

In [415]:
# from mlxtend.classifier import StackingClassifier => ini mesti install dulu 
from mlxtend.classifier import StackingClassifier

In [None]:
# Instantiate the 1st-layer classifiers
# clf1 = Classifier1(params1)
# clf2 = Classifier2(params2)
# ...
# clfN = ClassifierN(paramsN)

# Instantiate the 2nd-layer classifier
# clf_meta = ClassifierMeta(paramsMeta)

# Build the Stacking classifier
# clf_stack = StackingClassifier(
# classifiers=[clf1, clf2, ... clfN],
# meta_classifier=clf_meta,
# use_probas=False,
# use_features_in_secondary=False)

# Use the fit and predict methods
# like with scikit-learn estimators
# clf_stack.fit(X_train, y_train)
# pred = clf_stack.predict(X_test)

In [435]:
clf1 = KNeighborsClassifier(5)
clf2 = DecisionTreeClassifier()
clf3 = RandomForestClassifier()
clf4 = VotingClassifier(estimators=[('knn', clf_knn),('dt', clf_dt),('lr', clf_lr)])
clf5 = VotingClassifier(estimators=[('knn', clf_knn),
                                          ('dt', clf_dt),
                                          ('lr', clf_lr)],
                        voting='soft',
                        weights=[1,1,2])
clf6 = BaggingClassifier(RandomForestClassifier(),
                        max_samples=0.5, max_features=0.5)
clf7 = AdaBoostClassifier(DecisionTreeClassifier(),
                          n_estimators=100,
                          learning_rate=0.2)
clf8 = GradientBoostingClassifier()
clf9 = XGBClassifier()
clf10 = CatBoostClassifier(iterations=2,
                          learning_rate=0.1,
                          depth=2)

clf_meta = LogisticRegression()

sclf = StackingClassifier(classifiers=[clf1, clf2, clf3, clf4, clf5, clf6, clf7, clf8, clf9, clf10], 
                          meta_classifier=clf_meta)

In [436]:
sclf.fit(X_train, y_train)
y_pred = sclf.predict(X_test)
predictions = [round(value) for value in y_pred]

for clf, label in zip([clf1, clf2, clf3, clf4, clf5, clf6, clf7, clf8, clf9, clf10, sclf], 
                      ['KNN', 
                       'Decision Tree', 
                       'Random Forest',
                       'Voting Classifier',
                       'Averaging Claasifier',
                       'Bagging Classifier',
                       'AdaBoost Classifier',
                       'Gradient Boosting Classifier',
                       'XGBoost Classifier',
                       'CatBoost Classifier',
                       'StackingClassifier']):

    scores = cross_val_score(clf, X, y, 
                                              cv=3, scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" 
          % (scores.mean(), scores.std(), label))

# Get the accuracy score
acc = accuracy_score(y_test, predictions)
print("Accuracy: {:0.3f}".format(acc),'\n')
print('CatBoost Regressor \n',classification_report(y_test, predictions))

0:	learn: 0.5842142	total: 34.1ms	remaining: 34.1ms
1:	learn: 0.4737043	total: 45.2ms	remaining: 0us
Accuracy: 0.92 (+/- 0.02) [KNN]
Accuracy: 0.90 (+/- 0.03) [Decision Tree]
Accuracy: 0.96 (+/- 0.02) [Random Forest]
Accuracy: 0.94 (+/- 0.02) [Voting Classifier]
Accuracy: 0.95 (+/- 0.02) [Averaging Claasifier]
Accuracy: 0.95 (+/- 0.01) [Bagging Classifier]
Accuracy: 0.89 (+/- 0.04) [AdaBoost Classifier]
Accuracy: 0.96 (+/- 0.01) [Gradient Boosting Classifier]
Accuracy: 0.97 (+/- 0.01) [XGBoost Classifier]
0:	learn: 0.5794877	total: 25.3ms	remaining: 25.3ms
1:	learn: 0.5008387	total: 31.5ms	remaining: 0us
0:	learn: 0.5893465	total: 4.11ms	remaining: 4.11ms
1:	learn: 0.5179855	total: 8.82ms	remaining: 0us
0:	learn: 0.5971567	total: 8.76ms	remaining: 8.76ms
1:	learn: 0.4836895	total: 13.3ms	remaining: 0us
Accuracy: 0.91 (+/- 0.03) [CatBoost Classifier]
0:	learn: 0.5794877	total: 18.2ms	remaining: 18.2ms
1:	learn: 0.5008387	total: 23.7ms	remaining: 0us
0:	learn: 0.5893465	total: 16.3ms	rem