In [1]:
# Preamble: lets the same notebook run under Google Colab and under a native
# Jupyter installation by choosing where files are looked up.
#   notes_home -- path prefix of the course notes folder
#   user_home  -- path prefix of the user's own files
try:
    # This import only succeeds inside Colab; elsewhere it raises
    # ModuleNotFoundError and the Jupyter defaults below are used instead.
    from google.colab import drive
    import sys

    drive.mount('/content/drive')
    notes_home, user_home = (
        "/content/drive/Shared drives/CSC310/notes/",
        "/content/drive/My Drive/",
    )

    # Make modules stored in the notes folder importable from the notebook.
    sys.path.insert(1, notes_home)
except ModuleNotFoundError:
    # Native Jupyter: both the notes folder and the user directory are
    # assumed to be the directory the notebook itself lives in.
    notes_home = user_home = ""

# ANN (MLP) Code Examples

In [2]:
# set up
import pandas as pd
import numpy as np
np.set_printoptions(formatter={'float_kind':"{:3.2f}".format})
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from assets.confint import classification_confint
from sklearn.metrics import confusion_matrix

# Load the WDBC data; the 'ID' column is dropped before modelling,
# 'Diagnosis' is the target and all remaining columns are the predictors.
df = pd.read_csv(notes_home+"assets/wdbc.csv").drop(columns=['ID'])
X = df.drop(columns=['Diagnosis'])
y = df['Diagnosis']

# Neural network.
# Rule of thumb for a first MLP:
#   a) use a single hidden layer
#   b) give it 2 * (number of independent variables) nodes
# (60 nodes here -- presumably 2 * 30 predictor columns; confirm against wdbc.csv)
# NOTE(review): the predictors are passed to the network as-is, no scaling
# step is applied in this cell.
model = MLPClassifier(
    hidden_layer_sizes=(60,),
    activation='tanh',
    max_iter=1000,
    random_state=1,
)

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible across runs.
train_X, test_X, train_y, test_y = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=3,
)

# Fit on the training part, then score on the held-out part only.
predict_y = model.fit(train_X, train_y).predict(test_X)
acc = accuracy_score(test_y, predict_y)

# Bounds from the course helper, given the accuracy and the number of test rows.
lb, ub = classification_confint(acc, len(test_X))
print("Accuracy: {:3.2f} ({:3.2f}, {:3.2f})".format(acc, lb, ub))


Accuracy: 0.92 (0.87, 0.97)


## MLP Grid Search

We can also perform a grid search to find the optimal network.

BEWARE: a grid search over all possible parameters of an MLP is practically infeasible because of combinatorial explosion - there are far too many possible parameter combinations.

Here we only perform a grid search over the layer sizes of networks with one or two hidden layers and over the activation function.



In [3]:
# set up
import pandas as pd
import numpy as np
np.set_printoptions(formatter={'float_kind':"{:3.2f}".format})
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from assets.confint import classification_confint

# Load the WDBC data; the 'ID' column is dropped before modelling,
# 'Diagnosis' is the target and all remaining columns are the predictors.
df = pd.read_csv(notes_home+"assets/wdbc.csv").drop(columns=['ID'])
X = df.drop(columns=['Diagnosis'])
actual_y = df['Diagnosis']

# Base network; architecture and activation are supplied by the grid below.
model = MLPClassifier(max_iter=10000, random_state=1)

# Search space: every one-layer and two-layer architecture whose layer
# widths are drawn from 30, 60 and 120 nodes, crossed with three activations.
param_grid = {
    'hidden_layer_sizes': (
        [(n,) for n in (30, 60, 120)]
        + [(n1, n2) for n1 in (30, 60, 120) for n2 in (30, 60, 120)]
    ),
    'activation': ['logistic', 'tanh', 'relu'],
}

# 3-fold cross-validated grid search over the whole data set.
grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=3).fit(X, actual_y)
print("Grid Search: best parameters: {}".format(grid.best_params_))

# Evaluate the winning configuration.
# NOTE(review): best_estimator_ is refit on all of X by GridSearchCV (default
# refit=True), so the accuracy below is measured on the same rows the model
# was trained on, not on held-out data.
best_model = grid.best_estimator_
predict_y = best_model.predict(X)
acc = accuracy_score(actual_y, predict_y)

# Bounds from the course helper, given the accuracy and the number of rows scored.
lb, ub = classification_confint(acc, len(X))
print("Accuracy: {:3.2f} ({:3.2f},{:3.2f})".format(acc,lb,ub))

# Confusion matrix with a fixed class order: rows are the actual classes,
# columns the predicted ones.
labels = ['M', 'B']
cm = confusion_matrix(actual_y, predict_y, labels=labels)
cm_df = pd.DataFrame(data=cm, index=labels, columns=labels)
print("Confusion Matrix:\n{}".format(cm_df))

Grid Search: best parameters: {'activation': 'logistic', 'hidden_layer_sizes': (30, 30)}
Accuracy: 0.96 (0.94,0.97)
Confusion Matrix:
     M    B
M  200   12
B   13  344


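Notice that even though our first instinct is that the optimized MLP is much better than the straightforward MLP built with our rule of thumb, the difference in accuracy between these two models is not statistically significant because their confidence intervals overlap! Also keep in mind that the grid-search model above is scored on the same data it was fit on, while the rule-of-thumb model is scored on a held-out test set, so the grid-search accuracy is likely optimistic.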

# Team Exercise

Use the Crohn’s Disease dataset: [crohnd](https://raw.githubusercontent.com/lutzhamel/ds/master/notes/assets/crohnd.csv)

Do the following:

* Build a 1-hidden-layer MLP according to our rule of thumb.
* Build a multi-layer MLP using a grid search over layers and activation functions (see the [MLP documentation](https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html)).
* Build the best decision tree using grid-search for this data set

Evaluation:
* Which one of the models has the best accuracy?
* Are the differences in accuracy between the various models statistically significant?


# Teams

```
team 1:  Joshua D, Yeury, Zachary T
team 2:  Jake Adam, Joey, Cole
team 3:  Timothy, Cody Rithysan, Michael Russell
team 4:  Joshua Patrick, Korakot, Phidias
team 5:  Kenney A, Samantha N, Patrick M
team 6:  Jaeke R, Stephanie, Hennjer
team 7:  Timothy Terence, Camren Joseph, Emmely
team 8:  Sofia R, Julio, Luca G
team 9:  John Francis, Evan Jonathan, C.J.
team 10: Andrew Michael, William Jordan, Jared P
team 11: Ryan Richard, Giulia, Tony Levada
team 12: Shannon Patrice, Dan Steven
```