In [1]:
'''
preprocessing
'''
import utils
from utils import *
importlib.reload(utils)
from utils import *

Using TensorFlow backend.


In [2]:
'''
load data
'''
# load_data() comes from utils; presumably it returns the training features,
# the held-out test features and the training labels -- TODO confirm.
X, X_test_original, y = load_data()
# Flatten the labels to a 1-D vector.
y = y.ravel()
scores = np.array([])
# Aliases read by the cross-validation cell further down.
xtrain, ytrain = X, y

# NN1 model 

In [3]:
class NN1():
    """Three-class MLP wrapped in a scikit-learn-like fit/predict interface.

    Inputs are standardized with a ``StandardScaler`` that is fitted on the
    training data inside :meth:`fit` and then only *applied* (``transform``)
    in :meth:`predict`, :meth:`predict_proba` and :meth:`score`, so training
    and inference data share the same scaling statistics.
    """

    # Width of the one-hot target and of the output layer.
    N_CLASSES = 3

    def __init__(self):
        self.mlp = KerasClassifier(build_fn=self.create_model,
                                   epochs=15, batch_size=35,
                                   verbose=1)
        # Set to a freshly fitted StandardScaler on every call to fit().
        self.scaler = None

    def _scale(self, X_):
        """Standardize ``X_`` using the statistics learned during fit()."""
        if getattr(self, 'scaler', None) is None:
            raise RuntimeError("NN1 must be fitted before it can predict or score")
        return self.scaler.transform(X_)

    def score(self, X_, y_):
        """Return the balanced accuracy of the predictions for ``X_``.

        ``y_`` may be a 1-D label vector, an (n, 1) column of labels, or a
        one-hot encoded (n, n_classes) matrix.
        """
        pred = self.predict(X_)
        y_arr = np.asarray(y_)
        if y_arr.ndim < 2 or y_arr.shape[1] < 2:
            # Plain labels (the original code crashed on 1-D input here).
            y_normal = y_arr.ravel()
        else:
            # One-hot rows -> class indices.
            y_normal = np.argmax(y_arr, axis=1)
        BMAC = balanced_accuracy_score(y_normal, pred)
        return BMAC

    def fit(self, X_, y_):
        """Fit the scaler and the network on ``X_`` with integer labels ``y_``.

        Labels are one-hot encoded into ``N_CLASSES`` columns and the network
        is trained with 'balanced' class weights computed from ``y_``.
        """
        labels = np.asarray(y_).ravel()
        n_samples = labels.shape[0]

        # One hot encode data
        y_enc = np.zeros((n_samples, self.N_CLASSES))
        y_enc[np.arange(n_samples), labels] = 1

        # Keep the fitted scaler so inference reuses the training statistics.
        self.scaler = preprocessing.StandardScaler()
        xscaled = self.scaler.fit_transform(X_)

        # Keyword arguments: recent scikit-learn releases no longer accept
        # `classes` / `y` positionally.
        classes = np.unique(labels)
        weights = sklearn.utils.class_weight.compute_class_weight(
            'balanced', classes=classes, y=labels)
        # Key by the classes actually present so a fold that lacks a class
        # does not raise an IndexError.
        class_weight = {int(c): w for c, w in zip(classes, weights)}
        self.mlp.fit(xscaled, y_enc, class_weight=class_weight)
        return

    def predict(self, X_):
        """Return the wrapped classifier's predictions for ``X_``."""
        return self.mlp.predict(self._scale(X_))

    def create_model(self):
        """Build and compile the Keras network used by ``KerasClassifier``."""
        # create model
        neurons = 1000
        dropout_rate = 0.5
        weight_constraint = 4
        model = Sequential()
        model.add(Dense(neurons, input_dim=1000, activation='relu', kernel_constraint=maxnorm(weight_constraint)))
        model.add(Dropout(dropout_rate))
        model.add(Dense(neurons, activation='relu', kernel_constraint=maxnorm(weight_constraint)))
        model.add(Dropout(dropout_rate*0.4))
        model.add(Dense(int(neurons/2), activation='relu', kernel_constraint=maxnorm(weight_constraint)))
        model.add(Dropout(dropout_rate*0.2))
        # NOTE(review): sigmoid + binary_crossentropy scores each class
        # independently, so predict_proba rows need not sum to 1. For one-hot
        # single-label targets softmax + categorical_crossentropy is the usual
        # choice; left unchanged here to preserve the tuned behaviour.
        model.add(Dense(self.N_CLASSES, activation='sigmoid'))
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
        return model

    def predict_proba(self, X_):
        """Return the wrapped classifier's per-class scores for ``X_``."""
        return self.mlp.predict_proba(self._scale(X_))

# nn = NN1()
# nn.fit(X[1:10], y[1:10])
# ypredy = nn.predict(X[1:10])
# probbb = nn.predict_proba(X[1:10])



# Ensemble 

In [None]:
# Define the ensemble voting classifier fn:

def ensemby(y1_prob, y2_prob, voting='soft'):
    """Combine two classifiers' class-probability matrices by soft voting.

    Parameters
    ----------
    y1_prob, y2_prob : array-like, 2-D, identical shapes
        Per-sample class scores from each model (rows are samples, columns
        are classes). Nested lists are accepted as well as arrays.
    voting : str, default 'soft'
        Kept for interface compatibility. Only soft voting is implemented and
        the value is currently ignored.

    Returns
    -------
    numpy.ndarray
        For every row, the column index with the largest summed score.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    """
    # Coerce first: `+` on plain lists would concatenate instead of adding.
    p1 = np.asarray(y1_prob, dtype=float)
    p2 = np.asarray(y2_prob, dtype=float)
    if p1.shape != p2.shape:
        # Refuse silent broadcasting between mismatched prediction sets.
        raise ValueError(
            "probability arrays must have the same shape, got %s and %s"
            % (p1.shape, p2.shape))
    # Return the argmax of the sum of the probabilities
    return np.argmax(p1 + p2, axis=1)

# Build an explicit hold-out split for this one-off experiment. Previously the
# cell read x_train / y_train / x_test / y_test left over from the
# cross-validation loop in a later cell, so it failed on a fresh kernel.
train_index, test_index = next(KFold(n_splits=5, shuffle=True, random_state=0).split(xtrain))
x_train, x_test = xtrain[train_index], xtrain[test_index]
y_train, y_test = ytrain[train_index], ytrain[test_index]

# Prepare the SVM
orig_svm = svm.SVC(C=0.5, class_weight='balanced', degree=1, gamma='auto', kernel='rbf', probability=True)
steps = [("scaler", preprocessing.StandardScaler()), ("classifier", orig_svm)]
svm_pipeline = Pipeline(steps = steps)

# Prepare the NN1
nn = NN1()

# Models to fit
print("Fitting SVM...")
svm_pipeline.fit(x_train, y_train.ravel())
print("Fitting NN...")
nn.fit(x_train, y_train.ravel())

# Predict and join the predictions
svm_pred = svm_pipeline.predict(x_test)
nn_pred = nn.predict(x_test)
svm_prob = svm_pipeline.predict_proba(x_test)
nn_prob = nn.predict_proba(x_test)
ensemble_pred = ensemby(svm_prob, nn_prob)

# Record scores
BMAC_ensemble = balanced_accuracy_score(y_test, ensemble_pred)
BMAC_svm = balanced_accuracy_score(y_test, svm_pred)
BMAC_nn = balanced_accuracy_score(y_test, nn_pred)

In [None]:
# Overlay the three prediction vectors, one point per test sample.
plt.scatter(range(0, len(svm_pred)), svm_pred, label="SVM")
plt.scatter(range(0, len(svm_pred)), nn_pred, label="NN")
plt.scatter(range(0, len(svm_pred)), ensemble_pred, label="Ensemble")
plt.xlabel("test sample index")
plt.ylabel("predicted class")
plt.legend()

# Find where ensemble pred is diff from theirs:
# Case 1: When both predictions are same, does the ensemble ever get it wrong?
# (Assigned and printed: a bare expression in the middle of a cell is not displayed.)
agree_but_ensemble_differs = ((svm_pred == nn_pred) & (svm_pred != ensemble_pred)).nonzero()[0]
print(agree_but_ensemble_differs)
# ^ This is nonempty, so the answer is yes.

# Check that the probability thing is acting correctly for svm
idx = (np.argmax(svm_prob, axis=1) != svm_pred).nonzero()[0]
# Yes, argmax(predict_proba) and predict sometimes disagree. The scikit-learn
# SVC docs note that predict_proba can be inconsistent with predict when
# probability=True, so this is not an error in the pipeline.
# The comparison assumes the class labels equal their column index (0, 1, 2) -- TODO confirm.
print(np.argmax((svm_prob[idx][0:5]), axis=1))
print(svm_prob[idx][0:5])
print(svm_pred[idx][0:5])

# Try to remove standardization:
# NOTE(review): orig_svm was fitted on standardized features inside the
# pipeline, yet it is fed raw x_test here -- confirm this is the intended check.
svm_prob2 = orig_svm.predict_proba(x_test)
svm_pred2 = orig_svm.predict(x_test)
# Compare the un-standardized probabilities with the matching un-standardized
# predictions (previously svm_pred2 was computed but svm_pred was used).
idx = (np.argmax(svm_prob2, axis=1) != svm_pred2).nonzero()[0]
print(np.argmax((svm_prob2[idx][0:5]), axis=1))
print(svm_prob2[idx][0:5])
print(svm_pred2[idx][0:5])


# Check that the probability thing is acting correctly for nn
# idx = (np.argmax(nn_prob, axis=1) != nn_pred).nonzero()[0]
# print("Idx nn:", idx)
# The above is empty, so nn is working fine


# Case 2: When both predictions are different, does the ensemble always
# follow the NN? (presumably the intent; the line below lists the samples where it does not)
# ((svm_pred != nn_pred) & (ensemble_pred != nn_pred)).nonzero()[0]

In [8]:
# Small fixed slices of the data for a quick pipeline-vs-bare-SVC comparison.
x_train2, y_train2 = X[1:100], y[1:100]
x_test2 = X[201:300]

# Variant 1: SVC behind a StandardScaler.
orig_svm = svm.SVC(kernel='rbf', C=0.5, gamma='auto', degree=1,
                   class_weight='balanced', probability=True)
steps = [("scaler", preprocessing.StandardScaler()), ("classifier", orig_svm)]
svm_pipeline = Pipeline(steps=steps)

print("Fitting SVM...")
svm_pipeline.fit(x_train2, y_train2.ravel())

# Hard labels first, then class probabilities, for the same test slice.
svm_pred = svm_pipeline.predict(x_test2)
svm_prob = svm_pipeline.predict_proba(x_test2)

# VERSUS

# Variant 2: the same SVC configuration fitted directly on the raw features.
svm2 = svm.SVC(kernel='rbf', C=0.5, gamma='auto', degree=1,
               class_weight='balanced', probability=True)

print("Fitting SVM...")
svm2.fit(x_train2, y_train2.ravel())

svm2_pred = svm2.predict(x_test2)
svm2_prob = svm2.predict_proba(x_test2)

# Pipeline: positions where argmax(predict_proba) differs from predict.
idx = np.flatnonzero(np.argmax(svm_prob, axis=1) != svm_pred)
print("idx 1:", idx)
# Answer: not consistent, because idx is non-empty

# Bare SVC: positions where argmax(predict_proba) differs from predict.
idx = np.flatnonzero(np.argmax(svm2_prob, axis=1) != svm2_pred)
print("idx 2:", idx)
# Answer: not consistent, because idx is non-empty

Fitting SVM...
Fitting SVM...
idx 1: [ 0  3  4  5  8  9 15 19 22 25 26 29 31 34 37 39 43 44 46 49 54 56 57 58
 65 67 68 69 73 77 82 84 90 92 94]
idx 2: [ 0  3  4  5  8  9 13 15 19 22 25 26 29 31 33 34 37 39 40 41 43 44 46 47
 49 52 53 54 55 56 57 58 65 67 68 69 70 73 75 77 80 82 84 85 90 92 94]


In [None]:
# 5-fold cross-validation of the SVM pipeline, NN1 and their soft-voting ensemble.
# Note: no SMOTE / resampling is applied in this cell; class imbalance is only
# addressed through the class weights both models are fitted with.

# Fixed random_state so the fold assignment is reproducible across runs
# (the NN training itself remains stochastic).
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Per-fold balanced accuracies, collected in lists and converted once at the end.
ensemble_fold_scores = []
svm_fold_scores = []
nn_fold_scores = []
for train_index, test_index in kf.split(xtrain):

    # Prepare the data
    x_train = xtrain[train_index]
    x_test = xtrain[test_index]
    y_train = ytrain[train_index]
    y_test = ytrain[test_index]

    # Prepare the SVM
    orig_svm = svm.SVC(C=0.5, class_weight='balanced', degree=1, gamma='auto', kernel='rbf', probability=True)
    steps = [("scaler", preprocessing.StandardScaler()), ("classifier", orig_svm)]
    svm_pipeline = Pipeline(steps = steps)

    # Prepare the NN1
    nn = NN1()

    # Models to fit
    print("Fitting SVM...")
    svm_pipeline.fit(x_train, y_train.ravel())
    print("Fitting NN...")
    nn.fit(x_train, y_train.ravel())

    # Predict and join the predictions
    svm_pred = svm_pipeline.predict(x_test)
    nn_pred = nn.predict(x_test)
    svm_prob = svm_pipeline.predict_proba(x_test)
    nn_prob = nn.predict_proba(x_test)
    ensemble_pred = ensemby(svm_prob, nn_prob)

    # Record scores
    BMAC_ensemble = balanced_accuracy_score(y_test, ensemble_pred)
    BMAC_svm = balanced_accuracy_score(y_test, svm_pred)
    BMAC_nn = balanced_accuracy_score(y_test, nn_pred)
    print("BMAC Ensemble Scores: ", BMAC_ensemble)
    print("BMAC SVM Scores: ", BMAC_svm)
    print("BMAC NN Scores: ", BMAC_nn)
    ensemble_fold_scores.append(BMAC_ensemble)
    svm_fold_scores.append(BMAC_svm)
    nn_fold_scores.append(BMAC_nn)

# Same names and array types as before, so later cells keep working.
BMAC_scores = np.array(ensemble_fold_scores)
svm_scores = np.array(svm_fold_scores)
nn_scores = np.array(nn_fold_scores)
BMAC_means = np.array([np.mean(BMAC_scores)])
BMAC_stds = np.array([np.std(BMAC_scores)])

print("Scores:", BMAC_scores)
print("SVM Scores:", svm_scores)
print("NN Scores:", nn_scores)
print("Mean Scores:", BMAC_means)
print("Std Scores:", BMAC_stds)