## 1. Majority Voting

In [6]:
import pandas as pd

bank_df = pd.read_csv('UniversalBank.csv')
bank_df.head()

Unnamed: 0,ID,Age,Experience,Income,ZIPCode,Family,CCAvg,Education,Mortgage,PersonalLoan,SecuritiesAccount,CDAccount,Online,CreditCard
0,1,25,1,49,91107,4,1.6,1,0,0,1,0,0,0
1,2,45,19,34,90089,3,1.5,1,0,0,1,0,0,0
2,3,39,15,11,94720,1,1.0,1,0,0,0,0,0,0
3,4,35,9,100,94112,1,2.7,2,0,0,0,0,0,0
4,5,35,8,45,91330,4,1.0,2,0,0,0,0,0,1


In [7]:
X = bank_df.drop (['ID','ZIPCode','PersonalLoan'], axis=1)
y = bank_df['PersonalLoan']

In [8]:
from sklearn.model_selection import train_test_split 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1, stratify=y)

In [9]:
from sklearn.tree import DecisionTreeClassifier     #결정 트리
from sklearn.neighbors import KNeighborsClassifier  #KNN
from sklearn.linear_model import LogisticRegression #로지스틱 회귀

tree = DecisionTreeClassifier(max_depth=None,
                              criterion='entropy',
                              random_state=1)

knn = KNeighborsClassifier(n_neighbors=1,
                            p=2,
                            metric='minkowski')

logistic = LogisticRegression(solver='liblinear',
                              penalty='l2',
                              C=0.001,
                              random_state=1)

In [10]:
from sklearn.ensemble import VotingClassifier #Majority Voting
voting_estimators = [('logistic', logistic), ('tree', tree), ('knn', knn)]
voting = VotingClassifier(estimators=voting_estimators, voting='soft')

In [11]:
from sklearn.model_selection import cross_val_score #교차검증

clf_labels = ['Logistic regression', 'Decision tree', 'KNN', 'Majority voting']
all_clf = [logistic, tree, knn, voting]

for clf, label in zip(all_clf, clf_labels):
    scores = cross_val_score(estimator=clf,X=X_train,y=y_train,cv=10,scoring='roc_auc')
    print("ROC AUC: %0.3f (+/- %0.3f) [%s]" % (scores.mean(), scores.std(), label))

ROC AUC: 0.928 (+/- 0.020) [Logistic regression]
ROC AUC: 0.950 (+/- 0.033) [Decision tree]
ROC AUC: 0.712 (+/- 0.047) [KNN]
ROC AUC: 0.972 (+/- 0.016) [Majority voting]


In [12]:
from sklearn.model_selection import GridSearchCV # 하이퍼파라미터 튜닝

params = {'logistic__C': [0.001, 0.1, 100.0],
          'tree__max_depth': [1, 3, 5],
          'knn__n_neighbors': [1, 3, 5]}

grid = GridSearchCV(estimator=voting,
                    param_grid=params,
                    cv=10,
                    scoring='roc_auc')
grid.fit(X_train, y_train)
   
print('최적의 파타미터: %s' % grid.best_params_)
print('ACU: %.3f' % grid.best_score_)    

최적의 파타미터: {'knn__n_neighbors': 3, 'logistic__C': 100.0, 'tree__max_depth': 5}
ACU: 0.986


## 2. 배깅(Bagging)

In [13]:
import pandas as pd

bank_df = pd.read_csv('UniversalBank.csv')
X = bank_df.drop (['ID','ZIPCode','PersonalLoan'], axis=1)
y = bank_df['PersonalLoan']

In [14]:
from sklearn.model_selection import train_test_split 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1, stratify=y)

In [15]:
from sklearn.tree import DecisionTreeClassifier #결정 트리

tree = DecisionTreeClassifier(max_depth=None,
                              criterion='entropy',
                              random_state=1)

In [16]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=1)

In [17]:
from sklearn.model_selection import cross_val_score #교차타당도

clf_labels = ['Decision tree', 'Bagging', 'rf']
all_clf = [tree, rf]

for clf, label in zip(all_clf, clf_labels):
    scores = cross_val_score(estimator=clf,X=X_train,y=y_train,cv=10,scoring='roc_auc')
    print("ROC AUC: %0.3f (+/- %0.3f) [%s]" % (scores.mean(), scores.std(), label))

ROC AUC: 0.950 (+/- 0.033) [Decision tree]
ROC AUC: 0.997 (+/- 0.002) [Bagging]


## 3. 아다부스트(AdaBoost)

In [18]:
import pandas as pd

bank_df = pd.read_csv('UniversalBank.csv')
X = bank_df.drop (['ID','ZIPCode','PersonalLoan'], axis=1)
y = bank_df['PersonalLoan']

In [19]:
from sklearn.model_selection import train_test_split 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1, stratify=y)

In [20]:
from sklearn.tree import DecisionTreeClassifier #결정 트리

tree = DecisionTreeClassifier(max_depth=1,
                              criterion='entropy',
                              random_state=1)

In [21]:
from sklearn.ensemble import AdaBoostClassifier #부스팅(Boosting) 

adaboost = AdaBoostClassifier(estimator=tree, 
                              n_estimators=500,
                              learning_rate = 0.1, 
                              random_state=1)

In [22]:
from sklearn.model_selection import cross_val_score #교차타당도

clf_labels = ['Decision tree', 'Ada boost']
all_clf = [tree, adaboost]
for clf, label in zip(all_clf, clf_labels):
    scores = cross_val_score(estimator=clf,X=X_train,y=y_train,cv=10,scoring='roc_auc')
    print("ROC AUC: %0.3f (+/- %0.3f) [%s]" % (scores.mean(), scores.std(), label))

ROC AUC: 0.883 (+/- 0.023) [Decision tree]
ROC AUC: 0.984 (+/- 0.011) [Ada boost]


## 4. GBM

In [23]:
from sklearn.datasets import load_breast_cancer
dataset = load_breast_cancer()
X = dataset.data
Y = dataset.target

In [24]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=1 )

In [25]:
from sklearn.ensemble import GradientBoostingClassifier
clf = GradientBoostingClassifier(n_estimators=40)
clf.fit(X_train,Y_train);

In [26]:
from sklearn.metrics import accuracy_score
Y_train_pred = clf.predict(X_train)
Y_test_pred = clf.predict(X_test)

print('학습데이터 정확도:', accuracy_score(Y_train, Y_train_pred))
print('시험데이터 정확도:', accuracy_score(Y_test, Y_test_pred))

학습데이터 정확도: 1.0
시험데이터 정확도: 0.9736842105263158


## 5. LightGBM

In [27]:
from sklearn.datasets import load_breast_cancer
dataset = load_breast_cancer()
X = dataset.data
Y = dataset.target

In [28]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=1 )

In [29]:
from lightgbm import LGBMClassifier
clf = LGBMClassifier(n_estimators=40)
clf.fit(X_train,Y_train);

In [30]:
from sklearn.metrics import accuracy_score
Y_train_pred = clf.predict(X_train)
Y_test_pred = clf.predict(X_test)

print('학습데이터 정확도:', accuracy_score(Y_train, Y_train_pred))
print('시험데이터 정확도:', accuracy_score(Y_test, Y_test_pred))

학습데이터 정확도: 0.9978021978021978
시험데이터 정확도: 0.9649122807017544
