In [5]:
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler

#get the datasets directly from UCI's website
url_train = 'https://archive.ics.uci.edu/ml/machine-learning-databases/horse-colic/horse-colic.data'
dataframe_train = (pd.read_csv(url_train, delim_whitespace=True, header=None).replace("?", np.NaN))
url_test = 'https://archive.ics.uci.edu/ml/machine-learning-databases/horse-colic/horse-colic.test'
dataframe_test = (pd.read_csv(url_test, delim_whitespace=True, header=None).replace("?", np.NaN))

#drop useless data in both training and testing datasets
dataframe_train.drop(index=[132], axis=1, inplace=True)
dataframe_test.drop(index=[8], axis=1, inplace=True)
#assign information to x, and answers to y in both datasets
train_x = dataframe_train.drop(dataframe_train.columns[[2,22,23,24,25,26,27]], axis=1)
train_y = dataframe_train[22]
test_x = dataframe_test.drop(dataframe_test.columns[[2,22,23,24,25,26,27]], axis=1)
test_y = dataframe_test[22]

#convert answers from dataframe into numpy array 
train_y = train_y.to_numpy()
test_y = test_y.to_numpy()

#since we group "euthanized" with "died", we might as well treat "euthanized" as "died"
for i in range(train_y.size):
  if train_y[i] == "3":
    train_y[i] = "2"
for i in range(test_y.size):
  if test_y[i] == "3":
    test_y[i] = "2"

#implement KNNImputer for missing data in the information part of both training and testing datasets, both using the same parameters
imputer = KNNImputer(n_neighbors=5)
train_x = imputer.fit_transform(train_x)
test_x = imputer.fit_transform(test_x)

#use min-max scaler for normalization in the information part of both training and testing datasets
scaler = MinMaxScaler(feature_range=(0, 1))
train_x = scaler.fit_transform(train_x)
test_x = scaler.fit_transform(test_x)

#use KNN
knn = KNeighborsClassifier(n_neighbors=5, weights='distance', algorithm='auto')
knn.fit(train_x, train_y)

#print accuracy score for test dataset
score = accuracy_score(knn.predict(test_x), test_y)
print(score)

0.7910447761194029
