In [1]:
import itertools
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter('ignore', DeprecationWarning)
%matplotlib inline 
%load_ext memory_profiler
from sklearn.metrics import make_scorer
from scipy.special import expit
import time
import math
import random
from memory_profiler import memory_usage
from sklearn import metrics as mt
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import StratifiedKFold



from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

target_classifier = 'PC'
df = pd.read_csv('responses.csv', sep=",")

In [2]:
# remove rows whose target classfier value is NaN
df_cleaned_classifier = df[np.isfinite(df[target_classifier])]
# change NaN number values to the mean
df_imputed = df_cleaned_classifier.fillna(df.mean())
# get categorical features
object_features = list(df_cleaned_classifier.select_dtypes(include=['object']).columns)
# one hot encode categorical features
one_hot_df = pd.concat([pd.get_dummies(df_imputed[col],prefix=col) for col in object_features], axis=1)
# drop object features from imputed dataframe
df_imputed_dropped = df_imputed.drop(object_features, 1)
frames = [df_imputed_dropped, one_hot_df]
# concatenate both frames by columns
df_fixed = pd.concat(frames, axis=1)

In [3]:
# Research on Cost Matrix
# http://www.ibm.com/support/knowledgecenter/SSEPGG_11.1.0/com.ibm.im.model.doc/c_cost_matrix.html

cost_matrix = np.matrix([[0,1,2,3,4],
[1,0,1,2,3],
[3,1,0,1,2],
[5,3,1,0,1],
[7,5,2,1,0]])

def get_confusion_costTot(confusion_matrix, cost_matrix):
    score = np.sum(confusion_matrix*cost_matrix)
    return score

confusion_scorer = make_scorer(get_confusion_costTot, greater_is_better=False)
confusion_scorer

make_scorer(get_confusion_costTot, greater_is_better=False)

In [4]:
from sklearn.model_selection import ShuffleSplit

# we want to predict the X and y data as follows:
if target_classifier in df_fixed:
    y = df_fixed[target_classifier].values # get the labels we want
    del df_fixed[target_classifier] # get rid of the class label
    X = df_fixed.values # use everything else to predict!

X = X/5
num_folds = 10

cv_object = StratifiedKFold(n_splits= num_folds, random_state=None, shuffle=True)
cv_object.split(X,y)

print(cv_object)

StratifiedKFold(n_splits=10, random_state=None, shuffle=True)


In [5]:
for train_indices, test_indices in cv_object.split(X,y): 
        # I will create new variables here so that it is more obvious what 
        # the code is doing (you can compact this syntax and avoid duplicating memory,
        # but it makes this code less readable)
        X_train = (X[train_indices])
        y_train = y[train_indices]

    #     print(X_train)
    #     print(y_train)

        X_test = (X[test_indices])
        y_test = y[test_indices]

In [6]:
clf = MLPClassifier()

In [7]:
from sklearn.ensemble import BaggingClassifier
from statistics import mode

class MyEnsemble():
    
    def __init__(self, c, num_c, max_s, v):
        self.Ensemble = BaggingClassifier(base_estimator= c,
                                    n_estimators = num_c,
                                     max_samples = max_s,
                                     verbose = v)
    def predict(self, X):
        return self.Ensemble.predict(X)
    
    def fit(self, X,y):
        self.Ensemble.fit(X,y)
        
    def fit_random(self, X, y):
        self.fit(X,y) # just to get the ensemble estimators initialized
        for classifier in self.Ensemble.estimators_:
            row_indexes = np.random.randint(X.shape[0], size=self.Ensemble.max_samples) # gets row indexes
            column_indexes = np.random.randint(X.shape[1], size=math.sqrt(X.shape[1])) #gets column indexes
            classifier.fit(X[np.ix_(row_indexes,column_indexes)], y[row_indexes])
            
    def predict_random(self, x):
        predictions = []
        for classifier in self.Ensemble.estimators_:
            predictions.append[classifier.predict(x)]
        return(mode(predictions))
            
    
    def predict_proba(self, X):
        return self.Ensemble.predict_proba(X)

In [10]:
num_instances = 10


ensemble = MyEnsemble(clf, 10,y_train.shape[0],False)

ensemble.fit(X_train,y_train)
y_hat=ensemble.predict(X_test[1])
# print(y_hat)
print(mt.confusion_matrix(y_hat,y_test))

print(X_train.shape)



ValueError: Found input variables with inconsistent numbers of samples: [1, 98]

In [11]:
ensemble.predict_proba(X_test)

array([[ 0.00791057,  0.07464913,  0.08540451,  0.57106081,  0.26097498],
       [ 0.17690908,  0.09388587,  0.47283788,  0.22607983,  0.03028734],
       [ 0.03334851,  0.13876676,  0.10925232,  0.20273776,  0.51589465],
       [ 0.0070164 ,  0.03832066,  0.09122405,  0.53251585,  0.33092305],
       [ 0.11645371,  0.15774367,  0.19207562,  0.46976262,  0.06396439],
       [ 0.05869263,  0.1480663 ,  0.07848616,  0.15553635,  0.55921856],
       [ 0.02600128,  0.0852406 ,  0.48533033,  0.26535816,  0.13806963],
       [ 0.07152794,  0.56511451,  0.14878331,  0.14993491,  0.06463934],
       [ 0.00339992,  0.02214786,  0.05431817,  0.20191905,  0.71821501],
       [ 0.31253056,  0.30031308,  0.31707738,  0.04562797,  0.02445102],
       [ 0.18726383,  0.3530118 ,  0.32230404,  0.09990162,  0.03751871],
       [ 0.05014356,  0.05317366,  0.28557197,  0.22075782,  0.39035299],
       [ 0.1813485 ,  0.09721054,  0.24648868,  0.37570903,  0.09924324],
       [ 0.06814493,  0.48527787,  0.3

In [None]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [None]:
 acc = mt.accuracy_score(y_test,y_hat)
            #         lr_clf_accuracies.append(acc)
            #         cost_accuracies.append([acc])

conf = mt.confusion_matrix(y_test,y_hat)
plot_confusion_matrix(conf, classes=[0, 1, 2, 3, 4])

In [None]:
clf.fit(X_train, y_train)
y_hat = clf.predict(X_test)
plot_confusion_matrix(mt.confusion_matrix(y_test, y_hat), classes=[0, 1, 2, 3, 4])

In [None]:
score = get_confusion_costTot(conf, cost_matrix)

In [None]:
score

### Exceptional Work

#### Proof showing dependence of classifiers

* Collecting the average base classifier error rate

In [21]:
from sklearn.metrics import accuracy_score

sum_error = 0
for classifiers in ensemble.Ensemble.estimators_:
    y_hat = classifiers.predict(X_test)
    sum_error += (1 - accuracy_score(y_test, y_hat))
error_rate = sum_error/20
print(error_rate)

0.390816326531


* Collecting the ensemble error rate

In [22]:
ensemble.fit(X_train, y_train)
y_hat = ensemble.predict(X_test)
1 - accuracy_score(y_test, y_hat)

0.62244897959183676

* Collecting the theoretical ensemble error rate assuming classifiers are independent

In [23]:
from scipy.misc import comb
import math
def ensemble_error(n_classifier, error):
    k_start = math.ceil(n_classifier/2.0)
    probs = [comb(n_classifier, k) *
            error**k *
            (1-error)**(n_classifier - k) for k in range (k_start, n_classifier + 1)]
    return(sum(probs))

In [24]:
ensemble_error(n_classifier=20, error=error_rate)


0.21850639961887866

The probability of the theoretical ensemble error rate assuming classifiers are independent is significantly different than the probability of our actual ensemble error rate. So we know that the classifiers are not independent, meaning that the errors that the models make overlap.

Logic as follows:

#Assume classifiers are independent

    if classifiers are independent, then ensemble error rate is k
    
    ensemble error rate is > k
    
    ensemble error rate is not k
    
    PROOF BY CONTRADICTION

<b>Classifiers are not independent.</b>