In [44]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline

In [45]:
iris = datasets.load_iris()

In [46]:
X,y = iris.data[50:, [1,2]], iris.target[50:]

In [47]:
le = LabelEncoder()
y = le.fit_transform(y)

In [48]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.5,random_state=1,stratify=y)

In [49]:
clf1 = LogisticRegression(C=0.001, penalty='l2', solver='lbfgs', random_state=1)

In [50]:
clf2 = DecisionTreeClassifier(max_depth=1, criterion='entropy', random_state=1)

In [51]:
clf3 = KNeighborsClassifier(n_neighbors=1, p=2, metric='minkowski')

In [52]:
pipe1 = Pipeline([['sc', StandardScaler()], ['clf', clf1]])

In [53]:
pipe3 = Pipeline([['sc', StandardScaler()], ['clf', clf3]])

In [54]:
clf_labels = ['Logistic Regression', 'Decision Tree', 'KNN']

In [55]:
print('10-fold cross validation:\n')
for clf, label in zip([pipe1,clf2,pipe3],clf_labels):
    scores = cross_val_score(estimator=clf,X=X_train,y=y_train,cv=10,scoring='roc_auc')
    print("ROC AUC: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

10-fold cross validation:

ROC AUC: 0.92 (+/- 0.15) [Logistic Regression]
ROC AUC: 0.87 (+/- 0.18) [Decision Tree]
ROC AUC: 0.85 (+/- 0.13) [KNN]


In [56]:
from sklearn.base import BaseEstimator, ClassifierMixin, clone
from sklearn.pipeline import _name_estimators

In [57]:
import operator

In [61]:
class MajorityVoteClassifier(BaseEstimator,ClassifierMixin):
    def __init__(self,classifiers,vote='classlabel',weights=None):
        self.classifiers = classifiers
        self.named_classifiers = {key: value for key, value in _name_estimators(classifiers)}
        self.vote = vote
        self.weights = weights
    
    def fit(self,X,y):
        self.labelenc_ = LabelEncoder()
        self.labelenc_.fit(y)
        self.classes_ = self.labelenc_.classes_
        self.classifiers_ = []
        for clf in self.classifiers:
            fitted_clf = clone(clf).fit(X,self.labelenc_.transform(y))
            self.classifiers_.append(fitted_clf)
        return self
    
    def predict(self,X):
        if (self.vote=='probability'):
            probas = np.asarray([clf.predict_proba(X) for clf in self.classifiers_])
            avg_probas = np.average(probas, axis=0, weights=self.weights)
            maj_vote = np.argmax(avg_probas, axis=1)
        else: #'classlabel'
            predictions = np.asarray([clf.predict(X) for clf in self.classifiers_]).T
            maj_vote = np.apply_along_axis(lambda x: np.argmax(np.bincount(x,weights=self.weights)), axis=1, arr=predictions)
        maj_vote = self.labelenc_.inverse_transform(maj_vote)
    
    def predict_proba(self,X):
        probas = np.asarray([clf.predict_proba(X) for clf in self.classifiers_])
        avg_probas = np.average(probas, axis=0, weights=self.weights)
        return avg_probas    

In [62]:
mv_clf = MajorityVoteClassifier(classifiers=[pipe1, clf2, pipe3])
all_clf_labels = ['Logistic Regression', 'Decision Tree', 'KNN', 'Majority Classifier']
print('10-fold cross validation:\n')
for clf, label in zip([pipe1,clf2,pipe3,mv_clf],all_clf_labels):
    scores = cross_val_score(estimator=clf,X=X_train,y=y_train,cv=10,scoring='roc_auc')
    print("ROC AUC: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

10-fold cross validation:

ROC AUC: 0.92 (+/- 0.15) [Logistic Regression]
ROC AUC: 0.87 (+/- 0.18) [Decision Tree]
ROC AUC: 0.85 (+/- 0.13) [KNN]
ROC AUC: 0.98 (+/- 0.05) [Majority Classifier]


In [67]:
from sklearn.metrics import accuracy_score
print('Accuracy:\n')
for clf, label in zip([pipe1,clf2,pipe3],clf_labels):
    clf_fit = clf.fit(X_train,y_train)
    y_train_pred = clf.predict(X_train)
    if (label == pipe1):
        X_test_std = pipe1['sc'].transform(X_test)
    elif (label == pipe3):
        X_test_std = pipe3['sc'].transform(X_test)
    else:
        X_test_std = X_test
    y_test_pred = clf.predict(X_test_std)
    acc_train = accuracy_score(y_train,y_train_pred)
    acc_test = accuracy_score(y_test,y_test_pred)
    print("Training accuracy score: %2.2f [%s]" % (acc_train,label))
    print("Testing accuracy score: %2.2f [%s]" % (acc_test,label))

Accuracy:

Training accuracy score: 0.86 [Logistic Regression]
Testing accuracy score: 0.84 [Logistic Regression]
Training accuracy score: 0.94 [Decision Tree]
Testing accuracy score: 0.90 [Decision Tree]
Training accuracy score: 1.00 [KNN]
Testing accuracy score: 0.86 [KNN]


In [70]:
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score
bag = BaggingClassifier(base_estimator=clf2,n_estimators=100,max_samples=1,max_features=1,bootstrap=True,bootstrap_features=False,n_jobs=1,random_state=1)
bag.fit(X_train,y_train)
y_train_predict = bag.predict(X_train)
y_test_predict = bag.predict(X_test)
acc_train = accuracy_score(y_train,y_train_pred)
acc_test = accuracy_score(y_test,y_test_pred)
label = "Bag Classifier"
print("Training accuracy score: %2.2f [%s]" % (acc_train,label))
print("Testing accuracy score: %2.2f [%s]" % (acc_test,label))

Training accuracy score: 1.00 [Bag Classifier]
Testing accuracy score: 0.86 [Bag Classifier]


In [73]:
tree = DecisionTreeClassifier(criterion='gini',max_depth=1,random_state=1)
tree.fit(X_train,y_train)
y_train_predict = tree.predict(X_train)
y_test_predict = tree.predict(X_test)
acc_train = accuracy_score(y_train,y_train_pred)
acc_test = accuracy_score(y_test,y_test_pred)
label = "Stump Tree Classifier"
print("Training accuracy score: %2.2f [%s]" % (acc_train,label))
print("Testing accuracy score: %2.2f [%s]" % (acc_test,label))

Training accuracy score: 1.00 [Stump Tree Classifier]
Testing accuracy score: 0.86 [Stump Tree Classifier]


In [79]:
from sklearn.ensemble import AdaBoostClassifier
ada = AdaBoostClassifier(base_estimator=tree,n_estimators=100,learning_rate=0.1,random_state=1)
ada.fit(X_train,y_train)
y_train_predict = ada.predict(X_train)
y_test_predict = ada.predict(X_test)
acc_train = accuracy_score(y_train,y_train_pred)
acc_test = accuracy_score(y_test,y_test_pred)
label = "AdaBoost Classifier"
print("Training accuracy score: %2.2f [%s]" % (acc_train,label))
print("Testing accuracy score: %2.2f [%s]" % (acc_test,label))

Training accuracy score: 1.00 [AdaBoost Classifier]
Testing accuracy score: 0.86 [AdaBoost Classifier]


In [132]:
y_train_boost = np.where(y_train == 0,-1,1)
w = np.full((len(X_train)),1/len(X_train))
for i in range(100):
    tree.fit(X_train,y_train_boost,w)
    ypredict = tree.predict(X_train)
    arr1 = abs(ypredict - y_train_boost)/2
    error = np.dot(w,arr1.T)
    alpha = 0.5*np.log((1 - error)/error)
    arr2 = np.where(arr1 == 0,1,-1)
    w = w*np.exp(-alpha*arr2)
    w = w/np.sum(w)
    final_pred = final_pred + alpha*ypredict

In [133]:
final_pred_y = np.where(final_pred <0,0,1)