#KNN Model

In [1]:
# basic data routines
import pandas as pd
from google.colab import files
import io
#splitting the data
from sklearn.model_selection import KFold,train_test_split

# models
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier


# model evaluation routines
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [22]:
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import GridSearchCV

In [2]:

###### Set Up #####
# verify our folder with the data and module assets is installed
# if it is installed make sure it is the latest
!test -e ds-assets && cd ds-assets && git pull && cd ..
# if it is not installed clone it
!test ! -e ds-assets && git clone https://github.com/IndraniMandal/ds-assets.git
# point to the folder with the assets
home = "ds-assets/assets/"
import sys
sys.path.append(home)

Cloning into 'ds-assets'...
remote: Enumerating objects: 205, done.[K
remote: Counting objects: 100% (58/58), done.[K
remote: Compressing objects: 100% (8/8), done.[K
remote: Total 205 (delta 54), reused 50 (delta 50), pack-reused 147 (from 1)[K
Receiving objects: 100% (205/205), 12.58 MiB | 4.00 MiB/s, done.
Resolving deltas: 100% (80/80), done.
Updating files: 100% (86/86), done.


In [3]:
!ls ds-assets/assets


 2fold-xval.png		      mammals-missing.csv
 5fold-xval.png		     'messy_covid19_southamerica - covid19_southamerica.csv'
 abalone.csv		      mlp.py
 bootstrap.py		      mlp_regression2.py
 caesarian.csv		      mlp_regression.py
 candy-data.csv		      model-performance-curves.png
 cars.csv		      newsgroups.csv
 classification1.jpg	      newsgroups-noheaders.csv
 classification2.jpg	     'Pancreatic Cancer 2020.csv'
 classification3.jpg	      PandasPythonForDataScience.jpg
 colab-badge.afdesign	      PandasPythonForDataScience.pdf
 colab-icon.afdesign	      pdf-badge.png
 colab-icon.png		      perceptron-eq.jpg
 confint.py		      perceptron.jpg
 confusion1.png		      perceptron.r
 confusion2.png		      perceptron-search.png
 crohnd.csv		      perceptron-train.jpg
 cross-validated-curve.png    pipeline.png
 data-science.jpg	      regression1.jpg
 diamonds.csv		      rs.png
 divorce.csv		      shuttle.csv
 divorce-readme.txt	      shuttle.pdf
 elbow.py		      sobar-72.csv
 gapminder_all.c

In [4]:
# compute 95% confidence intervals for classification and regression
# problems

def classification_confint(acc, n):
    '''
    Compute the 95% confidence interval for a classification problem.
      acc -- classification accuracy
      n   -- number of observations used to compute the accuracy
    Returns a tuple (lb,ub)
    '''
    import math
    interval = 1.96*math.sqrt(acc*(1-acc)/n)
    lb = max(0, acc - interval)
    ub = min(1.0, acc + interval)
    return (lb,ub)

In [5]:
import pandas
url = url = 'https://raw.githubusercontent.com/IndraniMandal/ds-assets/main/assets/crohnd.csv'
df = pd.read_csv(url)
df.head()

Unnamed: 0,ID,nrAdvE,BMI,height,country,sex,age,weight,treat
0,19908,4,25.22,163,0,0,47,67,placebo
1,19909,4,23.8,164,0,0,53,64,d1
2,19910,1,23.05,164,0,0,68,62,placebo
3,20908,1,25.71,165,0,0,48,70,d2
4,20909,2,25.95,170,0,0,67,75,placebo


In [6]:
features = df.drop(['ID', 'treat'], axis = 1 )
target = df['treat']

In [7]:
X_train, X_test, y_train, y_test = train_test_split(features, target, train_size=0.8, test_size=0.2, random_state=3)

In [8]:
df.shape

(117, 9)

In [9]:
model = KNeighborsClassifier(n_neighbors=3)

In [10]:
# some basic data stats
print("Shape: {}".format(df.shape))
print("Value Counts on the 'Treat' Field:")
print(df['treat'].value_counts())

Shape: (117, 9)
Value Counts on the 'Treat' Field:
treat
placebo    39
d1         39
d2         39
Name: count, dtype: int64


In [11]:
#train the model
model.fit(X_train, y_train)
predict_y = model.predict(X_test)

#test the model accuracy
acc = accuracy_score(y_test, predict_y)
lb, ub = classification_confint(acc, X_test.shape[0])
print("Accuracy: {:3.2f} ({:3.2f}, {:3.2f})".format(acc, lb, ub))

Accuracy: 0.38 (0.18, 0.57)


In [12]:
# KNN
model = KNeighborsClassifier()

# do the 5-fold cross validation and shuffle the data
cv = KFold(n_splits=5,  shuffle = True)

# grid search
param_grid = {'n_neighbors': list(range(1,26))}
grid = GridSearchCV(model, param_grid, cv=cv)

# performing grid search
grid.fit(X_train, y_train)
print("Grid Search: best parameters: {}".format(grid.best_params_))

# accuracy of best model with confidence interval
pred_test = grid.best_estimator_.predict(X_test)
acc = accuracy_score(y_test, pred_test)
lb,ub = classification_confint(acc,X_test.shape[0])
print("Accuracy: {:3.2f} ({:3.2f},{:3.2f})".format(acc,lb,ub))

Grid Search: best parameters: {'n_neighbors': 6}
Accuracy: 0.29 (0.11,0.47)


  _data = np.array(data, dtype=dtype, copy=copy,


In [13]:
# build the confusion matrix
labels = list(target.unique())
cm = confusion_matrix(y_true= y_test,y_pred= pred_test, labels=labels)
cm_df = pd.DataFrame(cm, index=labels, columns=labels)
print("Confusion Matrix:\n{}".format(cm_df))

Confusion Matrix:
         placebo  d1  d2
placebo        2   4   2
d1             2   3   3
d2             4   2   2


In [14]:
y_test.value_counts()

Unnamed: 0_level_0,count
treat,Unnamed: 1_level_1
d2,8
d1,8
placebo,8


The best k-NN model has an accuracy of 38% with a confidence interval of (18%, 57%)

In [16]:
# decision trees
model = DecisionTreeClassifier()

# grid search
param_grid = {'max_depth': list(range(1,21)), 'criterion': ['entropy','gini'] }
grid = GridSearchCV(model, param_grid, cv=cv)

# performing grid search
grid.fit(X_train, y_train)
print("Grid Search: best parameters: {}".format(grid.best_params_))

# accuracy of best model with confidence interval
pred_test = grid.best_estimator_.predict(X_test)
acc = accuracy_score(y_test, pred_test)
lb,ub = classification_confint(acc,X_test.shape[0])
print("Accuracy: {:3.2f} ({:3.2f},{:3.2f})".format(acc,lb,ub))

# build the confusion matrix
labels = list(target.unique())
cm = confusion_matrix(y_true= y_test,y_pred= pred_test, labels=labels)
cm_df = pd.DataFrame(cm, index=labels, columns=labels)
print("Confusion Matrix:\n{}".format(cm_df))

Grid Search: best parameters: {'criterion': 'entropy', 'max_depth': 13}
Accuracy: 0.25 (0.08,0.42)
Confusion Matrix:
         placebo  d1  d2
placebo        2   4   2
d1             5   1   2
d2             3   2   3


  _data = np.array(data, dtype=dtype, copy=copy,


In [17]:
confusion_matrix(y_true= y_test,y_pred= pred_test)


array([[1, 2, 5],
       [2, 3, 3],
       [4, 2, 2]])

I had to do mine like this due to my confusion matrix being a 3x3 rather than a 2x2.

In [49]:
from sklearn.metrics import confusion_matrix, multilabel_confusion_matrix

cm_multi = multilabel_confusion_matrix(y_true=y_test, y_pred=pred_test)

#accessing the confusion matrices for each class(placebo, d1, d2)
for i, cm in enumerate(cm_multi):
    print(f"Confusion matrix for class {i}:")
    print(cm)
    tn, fp, fn, tp = cm.ravel()
    print(f"TN: {tn}, FP: {fp}, FN: {fn}, TP: {tp}")
    print(" ")

Confusion matrix for class 0:
[[10  6]
 [ 7  1]]
TN: 10, FP: 6, FN: 7, TP: 1
 
Confusion matrix for class 1:
[[12  4]
 [ 5  3]]
TN: 12, FP: 4, FN: 5, TP: 3
 
Confusion matrix for class 2:
[[8 8]
 [6 2]]
TN: 8, FP: 8, FN: 6, TP: 2
 


In [35]:
from sklearn import tree # Import the tree submodule from sklearn

model = tree.DecisionTreeClassifier(criterion='entropy', max_depth=1)

model.fit(X_train, y_train)

y_train_model = model.predict(X_train)

y_test_model = model.predict(X_test)

#MLP Model

In [23]:
import pandas
url = url = 'https://raw.githubusercontent.com/IndraniMandal/ds-assets/main/assets/crohnd.csv'
df_filled = pd.read_csv(url)
df_filled.head()

Unnamed: 0,ID,nrAdvE,BMI,height,country,sex,age,weight,treat
0,19908,4,25.22,163,0,0,47,67,placebo
1,19909,4,23.8,164,0,0,53,64,d1
2,19910,1,23.05,164,0,0,68,62,placebo
3,20908,1,25.71,165,0,0,48,70,d2
4,20909,2,25.95,170,0,0,67,75,placebo


In [24]:
X  = df_filled.drop(['ID', 'treat'],axis=1)
y = df_filled['treat']

print("Shape: {}".format(X.shape))

Shape: (117, 7)


Relu Activation Function

In [25]:
model = MLPClassifier(hidden_layer_sizes=(14,), activation = 'relu', max_iter=10000, random_state=1)

model.fit(X, y)
predict_y = model.predict(X)
acc = accuracy_score(y, predict_y)
lb, ub = classification_confint(acc, X.shape[0])
print("Accuracy: {:3.2f} ({:3.2f}, {:3.2f})".format(acc, lb, ub))

Accuracy: 0.33 (0.25, 0.42)


Logistic Activation function

In [50]:
model = MLPClassifier(hidden_layer_sizes=(14,), activation = 'logistic', max_iter=10000, random_state=1)

model.fit(X, y)
predict_y = model.predict(X)
acc = accuracy_score(y, predict_y)
lb, ub = classification_confint(acc, X.shape[0])
print("Accuracy: {:3.2f} ({:3.2f}, {:3.2f})".format(acc, lb, ub))

Accuracy: 0.80 (0.73, 0.88)


In [38]:
model = MLPClassifier(max_iter=10000, random_state=1)
param_grid = {
    'hidden_layer_sizes':
      [
      (10,), (20,),
      ],
    'activation' : ['relu', 'logistic']
}

grid = GridSearchCV(model, param_grid, cv=3)
grid.fit(X, y)
print("Grid Search: best parameters: {}".format(grid.best_params_))

best_model = grid.best_estimator_
predict_y = best_model.predict(X)
acc = accuracy_score(y, predict_y)
lb,ub = classification_confint(acc,X.shape[0])
print("Accuracy: {:3.2f} ({:3.2f},{:3.2f})".format(acc,lb,ub))

Grid Search: best parameters: {'activation': 'logistic', 'hidden_layer_sizes': (20,)}
Accuracy: 0.91 (0.86,0.97)


In [39]:
model = MLPClassifier(max_iter=10000, random_state=1)
param_grid = {
    'hidden_layer_sizes':
      [
      (10,10), (20,10),
      ],
    'activation' : ['relu', 'logistic']
}

grid = GridSearchCV(model, param_grid, cv=3)
grid.fit(X, y)
print("Grid Search: best parameters: {}".format(grid.best_params_))

best_model = grid.best_estimator_
predict_y = best_model.predict(X)
acc = accuracy_score(y, predict_y)
lb,ub = classification_confint(acc,X.shape[0])
print("Accuracy: {:3.2f} ({:3.2f},{:3.2f})".format(acc,lb,ub))

Grid Search: best parameters: {'activation': 'logistic', 'hidden_layer_sizes': (20, 10)}
Accuracy: 0.84 (0.77,0.90)


#Answer to the HW

Task: Report if the difference between the models is statistically significant (hint: confidence intervals)

Answer: The difference in performance between the models are statistically significant. My best MLP model has a higher accuracy of 91% with confidence intervals(86%, 97%) as compared to my best KNN model with an accuracy of 29% and 95% confidence intervals(11%, 47%). The decision tree model had an accuracy of 25% with the 95% confidence intervals of (8%, 42%).

In summary, the lack of overlap confirms that the MLP model outperfoms both the KNN and Decision tree model by a huge margin and the difference is statistically significant. Therefor the MLP model would be the best model to select.

Task: Be sure to include a paragraph about my teamwork!

Answer: For this homework, working independently allowed me to develop a thorough understanding of each step of the machine learning workflow. From data preparation to model selection, tuning, and evaluation, I handled every component efficiently. This required careful attention to detail, especially when performing tasks like hyperparameter tuning and interpreting statistical significance through confidence intervals. I was also able to strengthen my problem-solving skills, as I had to troubleshoot and resolve issues along the way such as dealing with the 3x3 matrix rather than a 2x2 matrix. Although facing some challenges at times, this experience was highly rewarding and reinforced my ability to manage projects end-to-end with precision and insight.