In [78]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


In [61]:
csv_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
# using the attribute information as the column names
col_names = ['Sepal_Length','Sepal_Width','Petal_Length','Petal_Width','Class']
iris_df =  pd.read_csv(csv_url, names = col_names)

In [62]:
iris_df.head()

Unnamed: 0,Sepal_Length,Sepal_Width,Petal_Length,Petal_Width,Class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [63]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

iris_df["Class"] = le.fit_transform(iris_df["Class"])

In [64]:
iris_df.head()

Unnamed: 0,Sepal_Length,Sepal_Width,Petal_Length,Petal_Width,Class
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [65]:
iris_df.describe()

Unnamed: 0,Sepal_Length,Sepal_Width,Petal_Length,Petal_Width,Class
count,150.0,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667,1.0
std,0.828066,0.433594,1.76442,0.763161,0.819232
min,4.3,2.0,1.0,0.1,0.0
25%,5.1,2.8,1.6,0.3,0.0
50%,5.8,3.0,4.35,1.3,1.0
75%,6.4,3.3,5.1,1.8,2.0
max,7.9,4.4,6.9,2.5,2.0


In [66]:
from sklearn.model_selection import train_test_split
X = iris_df.iloc[:,0:4]
Y = iris_df.iloc[:,4]

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(120, 4)
(120,)
(30, 4)
(30,)


In [67]:
train_size = x_train.shape[0]
class_priors = dict()
print(train_size)
for outcome in np.unique(y_train):
    # print(outcome)n
    outcome_count = sum(y_train==outcome)
    # print(outcome_count)
    # print(outcome)
    # print(class_priors)
    class_priors[outcome] = outcome_count / train_size
print(class_priors) 

120
{0: 0.3333333333333333, 1: 0.3416666666666667, 2: 0.325}


In [68]:
features = list(x_train.columns) 
likelihoods = dict()
# print(features)

for outcome in np.unique(y_train):
    outcome_count = sum(y_train==outcome)
    # print(outcome_count)
    for feature in features:
        # print('feature = ', feature)
        for feat_value in np.unique(x_train[feature]):
            # print('feat_value = ', feat_value)
            # print('outcome_count = ', outcome_count)
            count = 0 
            for i in x_train.index:

                if (x_train[feature][i]==feat_value and y_train[i]==outcome):
                    count+=1
            # print(count)
            likelihoods[(feature,feat_value,outcome)]=(count+1)/(outcome_count+(len(features)))


In [69]:
print(likelihoods)


{('Sepal_Length', 4.3, 0): 0.045454545454545456, ('Sepal_Length', 4.4, 0): 0.09090909090909091, ('Sepal_Length', 4.5, 0): 0.045454545454545456, ('Sepal_Length', 4.6, 0): 0.11363636363636363, ('Sepal_Length', 4.7, 0): 0.045454545454545456, ('Sepal_Length', 4.8, 0): 0.06818181818181818, ('Sepal_Length', 4.9, 0): 0.09090909090909091, ('Sepal_Length', 5.0, 0): 0.18181818181818182, ('Sepal_Length', 5.1, 0): 0.18181818181818182, ('Sepal_Length', 5.2, 0): 0.09090909090909091, ('Sepal_Length', 5.3, 0): 0.045454545454545456, ('Sepal_Length', 5.4, 0): 0.11363636363636363, ('Sepal_Length', 5.5, 0): 0.045454545454545456, ('Sepal_Length', 5.6, 0): 0.022727272727272728, ('Sepal_Length', 5.7, 0): 0.045454545454545456, ('Sepal_Length', 5.8, 0): 0.045454545454545456, ('Sepal_Length', 5.9, 0): 0.022727272727272728, ('Sepal_Length', 6.0, 0): 0.022727272727272728, ('Sepal_Length', 6.1, 0): 0.022727272727272728, ('Sepal_Length', 6.2, 0): 0.022727272727272728, ('Sepal_Length', 6.3, 0): 0.022727272727272728,

In [70]:
print(len(likelihoods))

357


In [71]:
print(likelihoods[('Sepal_Length', 4.4, 0)])
print(likelihoods[('Sepal_Length', 4.4, 1)])
print(likelihoods[('Sepal_Length', 4.4, 2)])

0.09090909090909091
0.022222222222222223
0.023255813953488372


In [72]:
a = len(np.unique(y_train))
prob =  np.ones((a, len(x_test)), dtype=float)

for outcome in (np.unique(y_train)):
    print('outcome = ', outcome)
    outcome_count = sum(y_train==outcome)
    print('outcome_count = ', outcome_count)
    for feature in features:
        # print('feature = ', feature)
        for i in x_test.index:
            
            if (feature, x_test[feature][i], outcome) in likelihoods.keys():
                try:
                    prob[outcome][i] = prob[outcome][i] * likelihoods[(feature, x_test[feature][i], outcome)]
                except:
                    continue
            else:
                try:
                    prob[outcome][i] = prob[outcome][i] * (1/(outcome_count+len(features)))
                except:
                    continue
        

outcome =  0
outcome_count =  40
outcome =  1
outcome_count =  41
outcome =  2
outcome_count =  39


In [73]:
for i in range(prob.shape[0]):
    prob[i][:] = prob[i][:] * class_priors[i]
# print(class_priors)
# print(prob[0][:])
# print(prob[1][:])
# print(prob[2][:])

y_label = np.zeros(len(y_test))
for i in range(len(x_test)):
    # print(prob[1,i])
    if (prob[0,i] >= prob[1,i]) and (prob[0,i] >= prob[2,i]):
        y_label[i] = 0

    if (prob[1,i] >= prob[0,i]) and (prob[1,i] >= prob[2,i]):
        y_label[i] = 1

    if (prob[2,i] >= prob[0,i]) and (prob[2,i] >= prob[1,i]):
        y_label[i] = 2

In [74]:
print(np.unique(y_label))

[0. 1.]


In [75]:
from sklearn import metrics
print(metrics.classification_report(y_test,y_label)) 
print(metrics.confusion_matrix(y_test,y_label))

              precision    recall  f1-score   support

           0       0.33      0.20      0.25        10
           1       0.29      0.78      0.42         9
           2       0.00      0.00      0.00        11

    accuracy                           0.30        30
   macro avg       0.21      0.33      0.22        30
weighted avg       0.20      0.30      0.21        30

[[2 8 0]
 [2 7 0]
 [2 9 0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [76]:
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf.fit(x_train, y_train)
y_label = clf.predict(x_test)

In [77]:
from sklearn import metrics
print(metrics.classification_report(y_test,y_label)) 
print(metrics.confusion_matrix(y_test,y_label))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30

[[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]


In [79]:
import numpy as np
import heapq
import scipy

y_label=[]
for i in x_test.index:
    x = np.array(x_test.loc[i]).reshape(1,-1)
    # print(np.array(x_test.loc[i]).reshape(1,-1))
    ary = scipy.spatial.distance.cdist(x_train, x, metric='euclidean') 
    # # print(ary)
    indx = heapq.nsmallest(11, range(len(ary)), ary.take) 
    # # print(indx)
    y_neighbors=[]
    for k in range(len(indx)):
        try:
            y_neighbors.append(y_train[indx[k]]) 
        except:
            continue
    y_label.append(max(set(y_neighbors), key = y_neighbors.count))
    
# print(len(ary))
# print(len(indx))


In [80]:
y_test.shape

(30,)

In [81]:
len(y_label)

30

In [82]:
# y_label

In [83]:
from sklearn import metrics
print(metrics.classification_report(y_test,y_label)) 
print(metrics.confusion_matrix(y_test,y_label))

              precision    recall  f1-score   support

           0       0.54      0.70      0.61        10
           1       0.31      0.56      0.40         9
           2       1.00      0.09      0.17        11

    accuracy                           0.43        30
   macro avg       0.62      0.45      0.39        30
weighted avg       0.64      0.43      0.38        30

[[7 3 0]
 [4 5 0]
 [2 8 1]]


In [84]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

In [85]:
knn = KNeighborsClassifier()
k_range = list(range(1, 31))
param_grid = dict(n_neighbors=k_range)
  
# defining parameter range
grid = GridSearchCV(knn, param_grid, cv=10, scoring='accuracy', return_train_score=False, verbose=1)
  
# fitting the model for grid search
grid_search=grid.fit(x_train, y_train)

Fitting 10 folds for each of 30 candidates, totalling 300 fits


In [86]:
print(grid_search.best_params_)

{'n_neighbors': 11}


In [87]:
accuracy = grid_search.best_score_ *100
print("Accuracy for our training dataset with tuning is : {:.2f}%".format(accuracy) )

Accuracy for our training dataset with tuning is : 95.83%


In [88]:
knn = KNeighborsClassifier(n_neighbors=11)
knn.fit(x_train, y_train)
y_label = knn.predict(x_test)

In [89]:
from sklearn import metrics
print(metrics.classification_report(y_test,y_label)) 
print(metrics.confusion_matrix(y_test,y_label))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30

[[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]
