In [1]:
# Preamble: lets the same notebook run under Google Colab and under a native Jupyter kernel.
# It defines two path prefixes used by later cells: notes_home and user_home.
try:
    # this import raises ModuleNotFoundError when google.colab is not installed,
    # which is how the native Jupyter case is detected
    from google.colab import drive
    import sys

    # Colab: mount Google Drive and point both prefixes at folders on it
    drive.mount('/content/drive')
    notes_home = "/content/drive/Shared drives/CSC310/notes/"
    user_home = "/content/drive/My Drive/"

    # put the notes folder on the module search path so the notebook can import from it
    sys.path.insert(1, notes_home)

except ModuleNotFoundError:
    # Native Jupyter: the notes folder and the user directory are both assumed
    # to be the directory the notebook itself lives in, hence the empty prefixes.
    notes_home, user_home = "", ""

# ANN (MLP) Code Examples

In [2]:
# set up
import pandas as pd
import numpy as np
np.set_printoptions(formatter={'float_kind':"{:3.2f}".format})
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

# Fixed seed for the network's random number generation (weight initialization
# and batch sampling), so the fold accuracies are the same on every run.
RANDOM_SEED = 42

# get data: drop the ID column, then separate the features from the target column
df = pd.read_csv(notes_home+"assets/wdbc.csv")
df = df.drop(['ID'],axis=1)
X  = df.drop(['Diagnosis'],axis=1)
y = df['Diagnosis']


# neural network: two hidden layers with 60 and 30 nodes
# NOTE(review): the features are fed in unscaled; scikit-learn recommends
# scaling inputs for MLPs -- consider whether that belongs in this example.
model = MLPClassifier(hidden_layer_sizes=(60,30), max_iter=10000,
                      random_state=RANDOM_SEED)

# do the 5-fold cross validation
scores = cross_val_score(model, X, y, cv=5)
print("Fold Accuracies: {}".format(scores))
print("Accuracy: {:3.2f}".format(scores.mean()))

Fold Accuracies: [0.87 0.91 0.91 0.96 0.88]
Accuracy: 0.91


## MLP Grid Search

We can also perform a grid search to find the optimal network.

BEWARE: an exhaustive grid search over all parameters of an MLP is practically impossible because of combinatorial explosion: there are far too many possible parameter combinations.

Here we only perform a grid search over the number of nodes in the first hidden layer; the second hidden layer is fixed at 30 nodes.



In [3]:
# set up
import pandas as pd
import numpy as np
np.set_printoptions(formatter={'float_kind':"{:3.2f}".format})
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from assets.confint import classification_confint

# Fixed seed for the network's random number generation (weight initialization
# and batch sampling), so the grid search selects the same model on every run.
RANDOM_SEED = 42

# get data: drop the ID column, then separate the features from the target column
df = pd.read_csv(notes_home+"assets/wdbc.csv")
df = df.drop(['ID'],axis=1)
X  = df.drop(['Diagnosis'],axis=1)
actual_y = df['Diagnosis']

# neural network; hidden_layer_sizes is supplied by the grid below
model = MLPClassifier(max_iter=10000, random_state=RANDOM_SEED)

# grid search: vary the size of the first hidden layer (5, 10, 20, ..., 100)
# while the second hidden layer stays fixed at 30 nodes
first_layer_sizes = [5] + list(range(10, 101, 10))
param_grid = {'hidden_layer_sizes': [(n, 30) for n in first_layer_sizes]}
grid = GridSearchCV(model, param_grid, cv=5)
grid.fit(X, actual_y)
print("Grid Search: best parameters: {}".format(grid.best_params_))

# evaluate the best model
# NOTE: with GridSearchCV's default refit, best_estimator_ has been retrained on
# all of X, so the accuracy below is measured on the same data the model was fit
# on, not on held-out data.
best_model = grid.best_estimator_
predict_y = best_model.predict(X)
acc = accuracy_score(actual_y, predict_y)
lb,ub = classification_confint(acc,X.shape[0])
print("Accuracy: {:3.2f} ({:3.2f},{:3.2f})".format(acc,lb,ub))

# build the confusion matrix (rows: actual labels, columns: predicted labels)
labels = ['M', 'B']
cm = confusion_matrix(actual_y, predict_y, labels=labels)
cm_df = pd.DataFrame(cm, index=labels, columns=labels)
print("Confusion Matrix:\n{}".format(cm_df))

Grid Search: best parameters: {'hidden_layer_sizes': (20, 30)}
Accuracy: 0.93 (0.91,0.95)
Confusion Matrix:
     M    B
M  188   24
B   17  340


# Team Exercise

Use the Crohn’s Disease dataset: [CrohnD](https://vincentarelbundock.github.io/Rdatasets/datasets.html)

You will need to preprocess this before you can use it.  You will need to drop the 'ID' column and you will
need to rename the following values:

c1 -> 0, c2 -> 1, F -> 0, M -> 1

Build an ANN/MLP with the best cross-validated performance you can find.  Do a cross-validated grid search over the following:

* One layer MLP with i in [10,20] with parameter (i,)
* Two layer MLP with i in [10,20] with parameter (i,10)
* Different activation functions {'relu', 'logistic'} (see the [documentation](https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html#sklearn.neural_network.MLPClassifier)).

Compare your best MLP to either a best tree or a best KNN (or both).

Report whether the difference between the models is statistically significant (hint: compare their confidence intervals).


# Teams

```
team 1:  Joshua D, Yeury, Zachary T
team 2:  Jake Adam, Joey, Cole
team 3:  Timothy, Cody Rithysan, Michael Russell
team 4:  Joshua Patrick, Korakot, Phidias
team 5:  Kenney A, Samantha N, Patrick M
team 6:  Jaeke R, Stephanie, Hennjer
team 7:  Timothy Terence, Camren Joseph, Emmely
team 8:  Sofia R, Julio, Luca G
team 9:  John Francis, Evan Jonathan, C.J.
team 10: Andrew Michael, William Jordan, Jared P
team 11: Ryan Richard, Giulia, Tony Levada
team 12: Shannon Patrice, Dan Steven
```