In [1]:
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import GridSearchCV

In [2]:

###### Set Up #####
# Make sure the ds-assets folder (data files and helper modules) is present
# next to this notebook.
# If the folder already exists, pull the latest version of the repository.
!test -e ds-assets && cd ds-assets && git pull && cd ..
# If the folder does not exist yet, clone it from GitHub.
!test ! -e ds-assets && git clone https://github.com/IndraniMandal/ds-assets.git
# Relative path of the assets folder inside the cloned repository.
home = "ds-assets/assets/"
import sys
# Append the assets folder to the module search path so that Python files
# stored there can be imported by later cells.
sys.path.append(home)

Cloning into 'ds-assets'...
remote: Enumerating objects: 205, done.[K
remote: Counting objects: 100% (58/58), done.[K
remote: Compressing objects: 100% (8/8), done.[K
remote: Total 205 (delta 54), reused 50 (delta 50), pack-reused 147 (from 1)[K
Receiving objects: 100% (205/205), 12.58 MiB | 4.92 MiB/s, done.
Resolving deltas: 100% (80/80), done.


In [3]:
# List the files available in the cloned assets folder.
!ls ds-assets/assets


 2fold-xval.png		      mammals-missing.csv
 5fold-xval.png		     'messy_covid19_southamerica - covid19_southamerica.csv'
 abalone.csv		      mlp.py
 bootstrap.py		      mlp_regression2.py
 caesarian.csv		      mlp_regression.py
 candy-data.csv		      model-performance-curves.png
 cars.csv		      newsgroups.csv
 classification1.jpg	      newsgroups-noheaders.csv
 classification2.jpg	     'Pancreatic Cancer 2020.csv'
 classification3.jpg	      PandasPythonForDataScience.jpg
 colab-badge.afdesign	      PandasPythonForDataScience.pdf
 colab-icon.afdesign	      pdf-badge.png
 colab-icon.png		      perceptron-eq.jpg
 confint.py		      perceptron.jpg
 confusion1.png		      perceptron.r
 confusion2.png		      perceptron-search.png
 crohnd.csv		      perceptron-train.jpg
 cross-validated-curve.png    pipeline.png
 data-science.jpg	      regression1.jpg
 diamonds.csv		      rs.png
 divorce.csv		      shuttle.csv
 divorce-readme.txt	      shuttle.pdf
 elbow.py		      sobar-72.csv
 gapminder_all.c

In [4]:
# compute the 95% confidence interval for the accuracy of a
# classification model

def classification_confint(acc, n):
    '''
    Compute the 95% confidence interval for a classification problem.

    The interval is acc +/- 1.96*sqrt(acc*(1-acc)/n), with both ends
    clamped so that they stay inside [0, 1].

      acc -- classification accuracy, a fraction between 0 and 1
      n   -- number of observations used to compute the accuracy (> 0)
    Returns a tuple (lb,ub) of floats
    Raises ValueError if n is not positive or acc lies outside [0, 1]
    '''
    import math
    # n == 0 would otherwise surface as a bare ZeroDivisionError and a
    # negative n as an opaque "math domain error"; report both clearly.
    if n <= 0:
        raise ValueError(
            "n must be a positive number of observations, got {}".format(n))
    # written as 'not (0 <= acc <= 1)' so that NaN is rejected as well
    if not 0.0 <= acc <= 1.0:
        raise ValueError(
            "acc must be a fraction in [0, 1], got {}".format(acc))
    interval = 1.96*math.sqrt(acc*(1-acc)/n)
    # clamp with float literals on both sides so lb is never the int 0
    lb = max(0.0, acc - interval)
    ub = min(1.0, acc + interval)
    return (lb,ub)

In [5]:
# NOTE(review): redundant with the `import pandas as pd` in the first cell;
# only the `pd` alias is used below. Kept in case another cell refers to the
# full module name.
import pandas
# Location of crohnd.csv in the ds-assets GitHub repository (raw file view).
url = 'https://raw.githubusercontent.com/IndraniMandal/ds-assets/main/assets/crohnd.csv'
# Read the CSV into a DataFrame and preview its first rows.
df_filled = pd.read_csv(url)
df_filled.head()

Unnamed: 0,ID,nrAdvE,BMI,height,country,sex,age,weight,treat
0,19908,4,25.22,163,0,0,47,67,placebo
1,19909,4,23.8,164,0,0,53,64,d1
2,19910,1,23.05,164,0,0,68,62,placebo
3,20908,1,25.71,165,0,0,48,70,d2
4,20909,2,25.95,170,0,0,67,75,placebo


In [6]:
# Feature matrix: every column except 'ID' and the target column 'treat'.
X = df_filled.drop(columns=['ID', 'treat'])
# Target vector: the 'treat' column.
y = df_filled.loc[:, 'treat']

# Report (rows, columns) of the feature matrix.
print("Shape: {}".format(X.shape))

Shape: (117, 7)


In [7]:
# Baseline network: one hidden layer of 14 ReLU units, with a fixed
# random_state so repeated runs start from the same seed.
model = MLPClassifier(
    hidden_layer_sizes=(14,),
    activation='relu',
    max_iter=10000,
    random_state=1,
)

# NOTE(review): the model is scored on the same rows it was fitted on, so the
# figure printed below is a training accuracy, not a held-out estimate.
predict_y = model.fit(X, y).predict(X)
acc = accuracy_score(y_true=y, y_pred=predict_y)
# 95% confidence interval for the accuracy, based on the number of rows scored.
lb, ub = classification_confint(acc, len(X))
print("Accuracy: {:3.2f} ({:3.2f}, {:3.2f})".format(acc, lb, ub))

Accuracy: 0.33 (0.25, 0.42)


In [8]:
# Base estimator; the grid search supplies the tuned hyperparameters.
model = MLPClassifier(max_iter=10000, random_state=1)
# Candidates: a single hidden layer of 10 or 20 units, combined with either
# the logistic or the ReLU activation (4 combinations).
param_grid = dict(
    hidden_layer_sizes=[(10,), (20,)],
    activation=['logistic', 'relu'],
)

# Try every combination in the grid with 3-fold cross-validation.
grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=3)
grid.fit(X, y)
print("Grid Search: best parameters: {}".format(grid.best_params_))

# NOTE(review): the selected estimator is scored on the same X, y that were
# passed to the search, so this accuracy is not a held-out estimate.
best_model = grid.best_estimator_
predict_y = best_model.predict(X)
acc = accuracy_score(y_true=y, y_pred=predict_y)
# 95% confidence interval for the accuracy, based on the number of rows scored.
lb,ub = classification_confint(acc, len(X))
print("Accuracy: {:3.2f} ({:3.2f},{:3.2f})".format(acc,lb,ub))

Grid Search: best parameters: {'activation': 'logistic', 'hidden_layer_sizes': (20,)}
Accuracy: 0.91 (0.86,0.97)


In [9]:
# Base estimator; the grid search supplies the tuned hyperparameters.
model = MLPClassifier(max_iter=10000, random_state=1)
# Candidates: two hidden layers sized (10, 10) or (20, 10), combined with
# either the logistic or the ReLU activation (4 combinations).
param_grid = dict(
    hidden_layer_sizes=[(10, 10), (20, 10)],
    activation=['logistic', 'relu'],
)

# Try every combination in the grid with 3-fold cross-validation.
grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=3)
grid.fit(X, y)
print("Grid Search: best parameters: {}".format(grid.best_params_))

# NOTE(review): the selected estimator is scored on the same X, y that were
# passed to the search, so this accuracy is not a held-out estimate.
best_model = grid.best_estimator_
predict_y = best_model.predict(X)
acc = accuracy_score(y_true=y, y_pred=predict_y)
# 95% confidence interval for the accuracy, based on the number of rows scored.
lb,ub = classification_confint(acc, len(X))
print("Accuracy: {:3.2f} ({:3.2f},{:3.2f})".format(acc,lb,ub))

Grid Search: best parameters: {'activation': 'logistic', 'hidden_layer_sizes': (20, 10)}
Accuracy: 0.84 (0.77,0.90)
