In [26]:
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import *
from sklearn.metrics import confusion_matrix,classification_report, accuracy_score
from sklearn.model_selection import cross_val_score
import pandas as pd

In [27]:
import pandas as pd
import numpy as np

def euclidean_distance(x1, x2):
    return np.sqrt(np.sum((x1 - x2)**2))

def knn_classification(X_train, y_train, X_test, k):
    y_pred = []
    for x_test in X_test:
        distances = []
        for i, x_train in enumerate(X_train):
            distance = euclidean_distance(x_test, x_train)
            distances.append((distance, i))
        print("Distances for all test points: ",distances)
        distances.sort()
        neighbors_indices = [index for a, index in distances[:k]] #Take the first k points
        neighbor_labels = [y_train[i] for i in neighbors_indices]
        y_pred.append(max(set(neighbor_labels), key=neighbor_labels.count)) #Select the maximum occuring label
    print("All test label: ", np.array(y_pred), "Len: ", len(np.array(y_pred)))
    return np.array(y_pred)

In [28]:
zoo_df = pdzoo_df = pd.read_csv("/home/zoo.csv")
class_df = pd.read_csv("/home/class.csv")
zoo_df = zoo_df.merge(class_df, how='left', left_on='class_type', right_on='Class_Number')


In [29]:
# Dropping unwanted columns
zoo_df = zoo_df.drop(['class_type','Animal_Names', 'Number_Of_Animal_Species_In_Class', 'Class_Number', 'animal_name'], axis=1)

In [30]:
zoo_df.head()

Unnamed: 0,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,breathes,venomous,fins,legs,tail,domestic,catsize,Class_Type
0,1,0,0,1,0,0,1,1,1,1,0.0,0,4,0,0,1,Mammal
1,1,0,0,1,0,0,0,1,1,1,0.0,0,4,1,0,1,Mammal
2,0,0,1,0,0,1,1,1,1,0,0.0,1,0,1,0,0,Fish
3,1,0,0,1,0,0,1,1,1,1,0.0,0,4,0,0,1,Mammal
4,1,0,0,1,0,0,1,1,1,1,0.0,0,4,1,0,1,Mammal


In [31]:
# Preprocessing the target variable
from sklearn.preprocessing import LabelEncoder
data = zoo_df['Class_Type']
label_encoder = LabelEncoder()

In [32]:
zoo_df.isnull().any()

hair          False
feathers      False
eggs          False
milk          False
airborne      False
aquatic       False
predator      False
toothed       False
backbone      False
breathes      False
venomous       True
fins          False
legs          False
tail          False
domestic      False
catsize       False
Class_Type    False
dtype: bool

In [33]:
mean = zoo_df['venomous'].mean()
zoo_df['venomous'].fillna(mean)

0      0.0
1      0.0
2      0.0
3      0.0
4      0.0
      ... 
96     0.0
97     1.0
98     0.0
99     0.0
100    0.0
Name: venomous, Length: 101, dtype: float64

In [34]:
# Fit and transform the data to numerical labels
zoo_df['Class_Type'] = label_encoder.fit_transform(data)

# Select columns to add to X and y sets
features = list(zoo_df.columns.values)
features.remove('Class_Type')
X = zoo_df[features]
y = zoo_df['Class_Type']

In [35]:
# Split X and y into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Set k value
k = 5

In [36]:
# Run k-nearest neighbors classification
y_pred = knn_classification(X_train.to_numpy(), y_train.to_numpy(), X_test.to_numpy(), k)
print("Y-test: ", np.array(y_test), "Len: ", len(np.array(y_test)))

Distances for all test points:  [(3.0, 0), (3.3166247903554, 1), (4.58257569495584, 2), (4.358898943540674, 3), (4.58257569495584, 4), (4.358898943540674, 5), (1.7320508075688772, 6), (2.8284271247461903, 7), (3.3166247903554, 8), (3.4641016151377544, 9), (4.58257569495584, 10), (3.3166247903554, 11), (2.449489742783178, 12), (4.69041575982343, 13), (2.8284271247461903, 14), (4.47213595499958, 15), (3.0, 16), (3.0, 17), (2.6457513110645907, 18), (2.8284271247461903, 19), (2.8284271247461903, 20), (3.3166247903554, 21), (4.47213595499958, 22), (nan, 23), (4.58257569495584, 24), (3.3166247903554, 25), (2.449489742783178, 26), (4.58257569495584, 27), (2.8284271247461903, 28), (2.6457513110645907, 29), (3.4641016151377544, 30), (2.8284271247461903, 31), (3.4641016151377544, 32), (1.4142135623730951, 33), (2.6457513110645907, 34), (3.0, 35), (4.69041575982343, 36), (3.4641016151377544, 37), (3.0, 38), (2.23606797749979, 39), (3.3166247903554, 40), (3.3166247903554, 41), (2.0, 42), (2.449489

Manual calculation of Accuracy

In [37]:
test_set = np.array(y_test)
pred_set = np.array(y_pred)
n1 = len(np.array(y_test))
n2 = len(np.array(y_pred))

correctly_classified = 0
total_classification = n1

for i in range(n1):
  if test_set[i] == pred_set[i]:
    correctly_classified += 1

accuracy = (correctly_classified / total_classification)*100

print("Accuracy: ", accuracy)

Accuracy:  84.61538461538461


Confusion matrix

In [40]:
def confusion_matrix(test_set, pred_set):
  cmat = np.zeros((7, 7)) #creating a matrix where all the elements are zeros with the order of len(test_set) X len(pred_set)
  class_1 = 0
  class_2 = 0
  class_3 = 0
  class_4 = 0
  class_5 = 0
  class_6 = 0
  class_7 = 0
  for i in range(len(test_set)):
    if test_set[i] == pred_set[i] == 0:
      class_1 += 1
      cmat[0][0] = class_1
    elif test_set[i] == pred_set[i] == 1:
      class_2 += 1
      cmat[1][1] = class_2
    elif test_set[i] == pred_set[i] == 2:
      class_3 += 1
      cmat[2][2] = class_3
    elif test_set[i] == pred_set[i] == 3:
      class_4 += 1
      cmat[3][3] = class_4
    elif test_set[i] == pred_set[i] == 4:
      class_5 += 1
      cmat[4][4] = class_5
    elif test_set[i] == pred_set[i] == 5:
      class_6 += 1
      cmat[5][5] = class_6
    else:
      class_7 += 1
      cmat[6][6] = class_7



  print("The confusion matrix for the dataset is: ")
  print(cmat)



In [41]:
confusion_matrix(test_set, pred_set)

The confusion matrix for the dataset is: 
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  5.  0.  0.  0.  0.  0.]
 [ 0.  0.  2.  0.  0.  0.  0.]
 [ 0.  0.  0.  5.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0. 10.  0.]
 [ 0.  0.  0.  0.  0.  0.  4.]]


In [None]:
print("Test set:", test_set)
print("Pred set",pred_set)

Test set: [0 3 3 5 5 5 1 3 5 5 4 5 1 4 3 2 5 2 1 3 1 6 5 1 5 5]
Pred set [5 3 3 5 5 5 1 3 5 5 2 5 1 3 3 2 5 2 1 3 1 3 5 1 5 5]


KeyError: ignored