In [54]:
import pandas as pd #data analysis library
import matplotlib.pyplot as plt #graphing
import seaborn as sns #graphing
import sklearn
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier

print("For this assignment we used the Abalone data set from the UCI Machine Learning Repository.\nhttps://archive.ics.uci.edu/ml/datasets/Abalone")
print("Our data set contains 9 features consisting of 8 quantitative features and 1 categorical feature (sex)")
df = pd.read_csv("Abalone.csv") #read in data
df.head()

For this assignment we used the Abalone data set from the UCI Machine Learning Repository.
https://archive.ics.uci.edu/ml/datasets/Abalone
Our data set contains 9 features consisting of 8 quantitative features and 1 categorical feature (sex)


Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [55]:
#preprocessing
print("We note that the sex column contains a third (intersex) class, so we eliminate rows of that class.")
df = df[df.Sex != 'I']
df.head()

We note that the sex column contains a third (intersex) class, so we eliminate rows of that class.


Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
6,F,0.53,0.415,0.15,0.7775,0.237,0.1415,0.33,20


In [56]:
#set independent and dependent variables
x = df.iloc[:,1:9] #all entries from column 1 to 3
leng = df.iloc[:,1]
dim = df.iloc[:,2]
rin = df.iloc[:,3]
print(x)


      Length  Diameter  Height  Whole weight  Shucked weight  Viscera weight  \
0      0.455     0.365   0.095        0.5140          0.2245          0.1010   
1      0.350     0.265   0.090        0.2255          0.0995          0.0485   
2      0.530     0.420   0.135        0.6770          0.2565          0.1415   
3      0.440     0.365   0.125        0.5160          0.2155          0.1140   
6      0.530     0.415   0.150        0.7775          0.2370          0.1415   
...      ...       ...     ...           ...             ...             ...   
4172   0.565     0.450   0.165        0.8870          0.3700          0.2390   
4173   0.590     0.440   0.135        0.9660          0.4390          0.2145   
4174   0.600     0.475   0.205        1.1760          0.5255          0.2875   
4175   0.625     0.485   0.150        1.0945          0.5310          0.2610   
4176   0.710     0.555   0.195        1.9485          0.9455          0.3765   

      Shell weight  Rings  
0          

In [57]:
y = df.iloc[:,0]
print(y)

0       M
1       M
2       F
3       M
6       F
       ..
4172    F
4173    M
4174    M
4175    F
4176    M
Name: Sex, Length: 2835, dtype: object


In [58]:
#train test split
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size = 0.2, random_state = 0) #20% testing

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(x_train) 
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)  # apply same transformation to test data

In [59]:
#training with Logistic Regression
clf = SGDClassifier(loss="log", penalty="l1", max_iter=100, alpha = .01, tol = .0001)
clf.fit(x_train, y_train)
SGDClassifier(max_iter=100000)
clf.predict(x_test)

array(['F', 'M', 'M', 'M', 'F', 'M', 'M', 'M', 'M', 'M', 'F', 'M', 'M',
       'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'F', 'M', 'M',
       'M', 'M', 'F', 'F', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M',
       'M', 'F', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M',
       'M', 'M', 'F', 'F', 'M', 'F', 'M', 'F', 'M', 'M', 'M', 'M', 'M',
       'M', 'M', 'M', 'F', 'M', 'M', 'F', 'F', 'F', 'M', 'M', 'M', 'M',
       'F', 'M', 'M', 'M', 'M', 'F', 'M', 'M', 'M', 'M', 'M', 'M', 'M',
       'F', 'M', 'M', 'M', 'M', 'M', 'F', 'M', 'F', 'F', 'M', 'F', 'M',
       'M', 'M', 'M', 'M', 'F', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'F',
       'M', 'M', 'M', 'M', 'M', 'M', 'F', 'F', 'M', 'F', 'M', 'M', 'M',
       'M', 'M', 'M', 'M', 'M', 'F', 'M', 'M', 'M', 'M', 'M', 'M', 'M',
       'M', 'M', 'F', 'M', 'F', 'M', 'M', 'F', 'M', 'M', 'M', 'F', 'M',
       'F', 'M', 'M', 'F', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'F',
       'M', 'M', 'M', 'F', 'M', 'F', 'M', 'M', 'F', 'M', 'M', 'F

In [60]:
print("Logistic Regression: ")
score = clf.score(x_test, y_test)
print("Training score: ", score) 
print(clf.coef_)
print("Intercept: ", clf.intercept_)
print("Iterations to Converge = ", clf.n_iter_)
print("Solution = w = ", 0.18529409, " - ", 0.30473596, "*x2 - ", 0.08525884,"*x3 + ", 0.1772645, "*x5" )

Logistic Regression: 
Training score:  0.5291005291005291
[[ 0.         -0.27561907 -0.04999171  0.          0.16457646  0.
   0.         -0.04466897]]
Intercept:  [0.18135436]
Iterations to Converge =  21
Solution = w =  0.18529409  -  0.30473596 *x2 -  0.08525884 *x3 +  0.1772645 *x5


In [61]:
print("Logistic Regression: ")
ypred = clf.predict(x_train)

#confusion matrix
cm = sklearn.metrics.confusion_matrix(y_train, ypred)
tn, fp, fn, tp = cm.ravel()
print(cm) 

Logistic Regression: 
[[ 214  822]
 [ 155 1077]]


In [62]:
print("Evaluation Metrics for Training Set: ")
cr = classification_report(y_train, ypred)
print(cr) 
print("Misclassification Rate = ", (fp + fn)/ (tp + tn + fp + fn))
print("Learning Rate = ", .01)

Evaluation Metrics for Training Set: 
              precision    recall  f1-score   support

           F       0.58      0.21      0.30      1036
           M       0.57      0.87      0.69      1232

    accuracy                           0.57      2268
   macro avg       0.57      0.54      0.50      2268
weighted avg       0.57      0.57      0.51      2268

Misclassification Rate =  0.43077601410934746
Learning Rate =  0.01


In [63]:
#confusion matrix
ypred = clf.predict(x_test)
cm = sklearn.metrics.confusion_matrix(y_test, ypred)
tn, fp, fn, tp = cm.ravel()
print(cm) 

print("Evaluation Metrics for Test Set: ")

cr = classification_report(y_test, ypred)
print(cr) 
print("Misclassification Rate = ", (fp + fn)/ (tp + tn + fp + fn))
print("Learning Rate = ", .01)

[[ 50 221]
 [ 46 250]]
Evaluation Metrics for Test Set: 
              precision    recall  f1-score   support

           F       0.52      0.18      0.27       271
           M       0.53      0.84      0.65       296

    accuracy                           0.53       567
   macro avg       0.53      0.51      0.46       567
weighted avg       0.53      0.53      0.47       567

Misclassification Rate =  0.4708994708994709
Learning Rate =  0.01


In [64]:
#training with KNN
neigh = KNeighborsClassifier(n_neighbors=100)
neigh.fit(x_train, y_train)
neigh.predict(x_test)

array(['F', 'F', 'M', 'M', 'F', 'M', 'M', 'F', 'F', 'M', 'M', 'F', 'M',
       'M', 'M', 'F', 'M', 'F', 'F', 'M', 'M', 'M', 'M', 'M', 'M', 'M',
       'M', 'M', 'M', 'M', 'F', 'M', 'M', 'M', 'M', 'M', 'F', 'M', 'M',
       'M', 'F', 'F', 'F', 'F', 'M', 'M', 'M', 'M', 'M', 'F', 'M', 'F',
       'F', 'M', 'F', 'M', 'M', 'F', 'M', 'F', 'M', 'M', 'M', 'M', 'M',
       'M', 'M', 'M', 'F', 'F', 'M', 'F', 'F', 'F', 'M', 'M', 'M', 'F',
       'F', 'M', 'M', 'F', 'M', 'M', 'F', 'M', 'M', 'M', 'M', 'M', 'F',
       'F', 'M', 'M', 'F', 'M', 'M', 'F', 'M', 'M', 'M', 'M', 'M', 'F',
       'M', 'M', 'M', 'F', 'F', 'F', 'F', 'M', 'M', 'F', 'F', 'M', 'M',
       'M', 'M', 'M', 'M', 'M', 'M', 'F', 'F', 'F', 'M', 'M', 'F', 'F',
       'M', 'F', 'F', 'M', 'M', 'F', 'M', 'M', 'M', 'F', 'M', 'M', 'M',
       'M', 'F', 'F', 'M', 'F', 'M', 'F', 'M', 'M', 'M', 'M', 'F', 'M',
       'F', 'M', 'M', 'F', 'M', 'M', 'M', 'F', 'M', 'M', 'M', 'M', 'F',
       'M', 'M', 'F', 'F', 'F', 'F', 'M', 'M', 'M', 'M', 'M', 'F

In [65]:
print("Evaluation Metrics:")

print("\nEvaluation Metrics for Training Set: ")

print("k = 100")
score = neigh.score(x_train, y_train)
print("Mean Accuracy = ", score)
print("Misclassificaiton Rate = ", 1 - score)

#confusion matrix
ypred = neigh.predict(x_train)
cm = sklearn.metrics.confusion_matrix(y_train, ypred)
tn, fp, fn, tp = cm.ravel()
print(cm) 

cr = classification_report(y_train, ypred)
print(cr) 
print("Misclassification Rate = ", (fp + fn)/ (tp + tn + fp + fn))

print("\nEvaluation Metrics for Test Set: ")

print("k = 100")
score = neigh.score(x_test, y_test)
print("Mean Accuracy = ", score)
print("Misclassificaiton Rate = ", 1 - score)



#confusion matrix
ypred = neigh.predict(x_test)
cm = sklearn.metrics.confusion_matrix(y_test, ypred)
tn, fp, fn, tp = cm.ravel()
print(cm) 

cr = classification_report(y_test, ypred)
print(cr) 
print("Misclassification Rate = ", (fp + fn)/ (tp + tn + fp + fn))

Evaluation Metrics:

Evaluation Metrics for Training Set: 
k = 100
Mean Accuracy =  0.5727513227513228
Misclassificaiton Rate =  0.4272486772486772
[[406 630]
 [339 893]]
              precision    recall  f1-score   support

           F       0.54      0.39      0.46      1036
           M       0.59      0.72      0.65      1232

    accuracy                           0.57      2268
   macro avg       0.57      0.56      0.55      2268
weighted avg       0.57      0.57      0.56      2268

Misclassification Rate =  0.42724867724867727

Evaluation Metrics for Test Set: 
k = 100
Mean Accuracy =  0.5485008818342152
Misclassificaiton Rate =  0.4514991181657848
[[ 98 173]
 [ 83 213]]
              precision    recall  f1-score   support

           F       0.54      0.36      0.43       271
           M       0.55      0.72      0.62       296

    accuracy                           0.55       567
   macro avg       0.55      0.54      0.53       567
weighted avg       0.55      0.55    

In [66]:
#testing other k values
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(x_train, y_train)

print("Evaluation Metrics:")

print("\nEvaluation Metrics for Training Set: ")

print("k = 3")
score = neigh.score(x_train, y_train)
print("Mean Accuracy = ", score)
print("Misclassificaiton Rate = ", 1 - score)

#confusion matrix
ypred = neigh.predict(x_train)
cm = sklearn.metrics.confusion_matrix(y_train, ypred)
tn, fp, fn, tp = cm.ravel()
print(cm) 

cr = classification_report(y_train, ypred)
print(cr) 
print("Misclassification Rate = ", (fp + fn)/ (tp + tn + fp + fn))

print("\nEvaluation Metrics for Test Set: ")

print("k = 3")
score = neigh.score(x_test, y_test)
print("Mean Accuracy = ", score)
print("Misclassificaiton Rate = ", 1 - score)



#confusion matrix
ypred = neigh.predict(x_test)
cm = sklearn.metrics.confusion_matrix(y_test, ypred)
tn, fp, fn, tp = cm.ravel()
print(cm) 

cr = classification_report(y_test, ypred)
print(cr) 
print("Misclassification Rate = ", (fp + fn)/ (tp + tn + fp + fn))

Evaluation Metrics:

Evaluation Metrics for Training Set: 
k = 3
Mean Accuracy =  0.767636684303351
Misclassificaiton Rate =  0.23236331569664903
[[750 286]
 [241 991]]
              precision    recall  f1-score   support

           F       0.76      0.72      0.74      1036
           M       0.78      0.80      0.79      1232

    accuracy                           0.77      2268
   macro avg       0.77      0.76      0.76      2268
weighted avg       0.77      0.77      0.77      2268

Misclassification Rate =  0.23236331569664903

Evaluation Metrics for Test Set: 
k = 3
Mean Accuracy =  0.5326278659611993
Misclassificaiton Rate =  0.46737213403880074
[[119 152]
 [113 183]]
              precision    recall  f1-score   support

           F       0.51      0.44      0.47       271
           M       0.55      0.62      0.58       296

    accuracy                           0.53       567
   macro avg       0.53      0.53      0.53       567
weighted avg       0.53      0.53      0

In [67]:
#testing other k values
neigh = KNeighborsClassifier(n_neighbors=5)
neigh.fit(x_train, y_train)

print("Evaluation Metrics:")

print("\nEvaluation Metrics for Training Set: ")

print("k = 5")
score = neigh.score(x_train, y_train)
print("Mean Accuracy = ", score)
print("Misclassificaiton Rate = ", 1 - score)

#confusion matrix
ypred = neigh.predict(x_train)
cm = sklearn.metrics.confusion_matrix(y_train, ypred)
tn, fp, fn, tp = cm.ravel()
print(cm) 

cr = classification_report(y_train, ypred)
print(cr) 
print("Misclassification Rate = ", (fp + fn)/ (tp + tn + fp + fn))

print("\nEvaluation Metrics for Test Set: ")

print("k = 5")
score = neigh.score(x_test, y_test)
print("Mean Accuracy = ", score)
print("Misclassificaiton Rate = ", 1 - score)



#confusion matrix
ypred = neigh.predict(x_test)
cm = sklearn.metrics.confusion_matrix(y_test, ypred)
tn, fp, fn, tp = cm.ravel()
print(cm) 

cr = classification_report(y_test, ypred)
print(cr) 
print("Misclassification Rate = ", (fp + fn)/ (tp + tn + fp + fn))

Evaluation Metrics:

Evaluation Metrics for Training Set: 
k = 5
Mean Accuracy =  0.7120811287477954
Misclassificaiton Rate =  0.2879188712522046
[[667 369]
 [284 948]]
              precision    recall  f1-score   support

           F       0.70      0.64      0.67      1036
           M       0.72      0.77      0.74      1232

    accuracy                           0.71      2268
   macro avg       0.71      0.71      0.71      2268
weighted avg       0.71      0.71      0.71      2268

Misclassification Rate =  0.2879188712522046

Evaluation Metrics for Test Set: 
k = 5
Mean Accuracy =  0.5220458553791887
Misclassificaiton Rate =  0.4779541446208113
[[110 161]
 [110 186]]
              precision    recall  f1-score   support

           F       0.50      0.41      0.45       271
           M       0.54      0.63      0.58       296

    accuracy                           0.52       567
   macro avg       0.52      0.52      0.51       567
weighted avg       0.52      0.52      0.5

In [68]:
print("Comparison of Logistic Regression and KNN")
print("Using k = 3, k = 100, and k = 5 for our KNN Classifier, we found that k = 3 had the highest accuracy rate for the Training dataset, but k = 100 had the highest accuracy for the test dataset.")

Comparison of Logistic Regression and KNN
Using k = 3, k = 100, and k = 5 for our KNN Classifier, we found that k = 3 had the highest accuracy rate for the Training dataset, but k = 100 had the highest accuracy for the test dataset.
