In [2]:
import numpy as np
import matplotlib.pyplot as plt
import scipy.io as sio
import scipy.stats as stats
from scipy.stats import mode
from sklearn.model_selection import train_test_split
% matplotlib inline

In [3]:
# Load the Wisconsin breast cancer data set (UCI ML repository).
#
# NOTE: this cell assigns the shared names data, X, Y, posLabel, negLabel and
# dataset_name. The other data-loading cells of this notebook assign the same
# names, so whichever loader cell ran last decides what the cells below use.
data = np.genfromtxt('breast_cancer_wisconsin_dataset.txt', dtype ='float', delimiter=",")
print(np.shape(data))

#  The wisconsin breast cancer data set is from the UCI ML repository
#  Its 9 features are as follows:
#  Clump Thickness, Uniformity of Cell Size, Uniformity of Cell Shape,
#  Marginal Adhesion, Single Epithelial Cell Size, Bare Nuclei,
#  Bland Chromatin, Normal Nucleoli, and Mitoses.
#  Each is an integer between 1 and 10.
#  The label is whether the tumor is malignant or benign.
# https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.names

# The slices below need exactly 11 columns: column 0 is skipped (presumably
# the sample id -- confirm against the .names file), columns 1..9 are the
# features and column 10 is the class code.  Checking the column count keeps
# a malformed file from silently producing a wrong feature matrix, while the
# number of rows is no longer hard-coded.
if data.ndim != 2 or data.shape[1] != 11:
    raise ValueError("expected a 2-D table with 11 columns, got shape %s" % (np.shape(data),))

X = data[:, 1:10]
Y = data[:, 10].reshape(-1, 1)   # column vector, as np.hstack((X, Y)) needs later
# Class code 2 becomes the positive label (1); every other code becomes 0.
Y = np.where(Y == 2 , 1, 0)
posLabel = '2: benign'
negLabel = '4: malignant'
dataset_name = 'breast_cancer_wisconsin_dataset'

print(X[1:5,:])
print(np.shape(X))
print(np.shape(Y))

(699, 11)
[[  5.   4.   4.   5.   7.  10.   3.   2.   1.]
 [  3.   1.   1.   1.   2.   2.   3.   1.   1.]
 [  6.   8.   8.   1.   3.   4.   3.   7.   1.]
 [  4.   1.   1.   3.   2.   1.   3.   1.   1.]]
(699, 9)
(699, 1)


In [4]:
# Load the UCI wine data set.
#
# NOTE: this cell assigns the shared names data, X, Y, posLabel, negLabel and
# dataset_name. The other data-loading cells of this notebook assign the same
# names, so whichever loader cell ran last decides what the cells below use.
data = np.genfromtxt('wine.txt', dtype ='float', delimiter=",")
print(np.shape(data))
# Print floats in fixed-point rather than scientific notation (global NumPy setting).
np.set_printoptions(suppress=True)

#      "These data are the results of a chemical analysis of
#      wines grown in the same region in Italy but derived from three
#      different cultivars.
#      The analysis determined the quantities of 13 constituents
#      found in each of the three types of wines." - UCI repository
#      The features are as follows:
#      Alcohol, Malic acid, Ash, Alcalinity of ash, Magnesium
#      Total phenols, Flavanoids, Nonflavanoid phenols, Proanthocyanins
#      Color intensity, Hue, OD280/OD315 of diluted wines, Proline
#      https://archive.ics.uci.edu/ml/datasets/wine

# Column 0 is the class code and the remaining 13 columns are the features.
# The column count is validated explicitly; the row count is taken from the
# file instead of being hard-coded.
if data.ndim != 2 or data.shape[1] != 14:
    raise ValueError("expected a 2-D table with 14 columns, got shape %s" % (np.shape(data),))

X = data[:, 1:]
Y = data[:, 0].reshape(-1, 1)    # column vector, as np.hstack((X, Y)) needs later
# One-vs-rest labels: class code 1 becomes 1, codes 2 and 3 become 0.
Y = np.where(Y == 1 , 1, 0)
posLabel = '1: location1'
negLabel = '2 & 3: location2 and location3'
dataset_name = 'wine_location'

print(np.shape(X))
print(np.shape(Y))

(178, 14)
(178, 13)
(178, 1)


In [5]:
# Load the UCI white-wine quality data set (semicolon-separated).
#
# NOTE: this cell assigns the shared names data, X, Y, posLabel, negLabel and
# dataset_name. The other data-loading cells of this notebook assign the same
# names, so whichever loader cell ran last decides what the cells below use.
data = np.genfromtxt('winequality_white_tab.txt', dtype ='float', delimiter=";")
print(np.shape(data))

# "Two datasets are included, related to red and white vinho verde wine samples,
# from the north of Portugal. The goal is to model wine quality based
# on physicochemical tests (see [Cortez et al., 2009])" - UCI repository
# Its 11 features are as follows:
# fixed acidity, volatile acidity, citric acid, residual sugar, chlorides
# free sulfur dioxide, total sulfur dioxide, density
# pH, sulphates, alcohol.
# http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality.names

# Columns 0..10 are the features and column 11 is the quality score.
X = data[:,0:11]
# reshape(-1, 1) derives the row count from the file instead of hard-coding it.
Y = data[:,11].reshape(-1, 1)

# Binarise the score: quality >= 5 is the positive class (1), anything lower is 0.
Y = np.where(Y >=5 , 1, 0)
posLabel = 'high quality'
negLabel = 'low quality'
dataset_name = 'winequality_white'

print(np.shape(X))
print(np.shape(Y))

(4898, 12)
(4898, 11)
(4898, 1)


In [6]:
# Mean-impute missing values: every NaN in X is overwritten, in place, with
# the mean of the non-NaN entries of its column.
# np.nanmean is used because scipy.stats.nanmean is no longer available in
# newer SciPy releases; both ignore NaNs when averaging.
col_mean = np.nanmean(X, axis=0)
inds = np.where(np.isnan(X))           # (row_indices, column_indices) of the NaN cells
X[inds] = np.take(col_mean, inds[1])   # each NaN cell receives its own column's mean

In [7]:
def euclidDistance(Xset, point_being_classified):
    """Return the Euclidean distance from each row of ``Xset`` to one point.

    ``point_being_classified`` is subtracted from ``Xset`` (NumPy
    broadcasting), the differences are squared and summed along axis 1, and
    the square root of each sum is returned -- one distance per row.
    """
    offsets = Xset - point_being_classified
    squared_lengths = (offsets ** 2).sum(axis=1)
    return np.sqrt(squared_lengths)

In [8]:
def getNearestK(Xset, point_being_classified, k):
    """Return the row indices of the ``k`` rows of ``Xset`` nearest to a point.

    Distances come from ``euclidDistance``. Indices are ordered from nearest
    to farthest; if ``Xset`` has fewer than ``k`` rows, all row indices are
    returned.

    Rows at exactly the same distance are kept in their original row order
    (stable sort), so the selected neighbours are deterministic even when
    ties straddle the k-th position.

    Raises:
        ValueError: if ``k`` is smaller than 1 (a zero or negative ``k``
            would otherwise slice to an empty or wrong set of neighbours).
    """
    if k < 1:
        raise ValueError("k must be at least 1, got %r" % (k,))
    distance = euclidDistance(Xset, point_being_classified)
    # 'mergesort' requests a stable sort; the default sort kind gives no
    # guarantee about the relative order of equal distances.
    indecies_of_nearest_k = distance.argsort(kind='mergesort')[:k]
    return indecies_of_nearest_k

In [9]:
def getPointLabel(Xset, Yset, point_being_classified, k):
    """Classify one point by majority vote among its ``k`` nearest rows.

    Args:
        Xset: training feature rows, passed through to ``getNearestK``.
        Yset: training labels, one per row of ``Xset``; either a flat array
            or an (n, 1) column vector.
        point_being_classified: the feature vector to label.
        k: number of neighbours that vote.

    Returns:
        The most frequent label among the ``k`` nearest rows, as a scalar.
        When several labels are equally frequent the smallest one wins.
    """
    indecies_of_nearest_k = getNearestK(Xset, point_being_classified, k)
    # Flatten so that flat and column-vector label arrays vote identically.
    labels_of_nearest_k = np.ravel(np.asarray(Yset)[indecies_of_nearest_k])
    # np.unique returns the distinct labels in ascending order with their
    # counts; argmax picks the first maximum, i.e. the smallest label on a
    # tie.  This avoids indexing into scipy.stats.mode's result, whose shape
    # differs between SciPy versions.
    candidate_labels, votes = np.unique(labels_of_nearest_k, return_counts=True)
    classification_of_point = candidate_labels[np.argmax(votes)]
    return classification_of_point

In [10]:
def KNN(Xtrain,Xtest,ytrain,ytest,k):
    """Return the k-nearest-neighbour misclassification rate on a test set.

    Every row of ``Xtest`` is labelled with ``getPointLabel`` using
    ``Xtrain``/``ytrain`` as the reference set, and the fraction of rows
    whose predicted label differs from ``ytest`` is returned.

    Args:
        Xtrain: training feature rows.
        Xtest: test feature rows (2-D, indexed as ``Xtest[row, :]``).
        ytrain: training labels, flat or (n, 1).
        ytest: test labels, flat or (n, 1), one per row of ``Xtest``.
        k: number of neighbours that vote.

    Returns:
        float in [0, 1]: the share of misclassified test rows.

    Raises:
        ZeroDivisionError: if ``ytest`` is empty.
    """
    # Flatten the label arrays.  Comparing the flat prediction vector with an
    # (n, 1) column would broadcast to an (n, n) matrix and count every
    # prediction against every true label, yielding a meaningless error rate.
    ytrain = np.ravel(ytrain)
    ytest = np.ravel(ytest)
    n_test = len(ytest)

    predicted_labels = np.zeros(n_test)
    for row in range(n_test):
        point_being_classified = Xtest[row,:]
        predicted_labels[row] = getPointLabel(Xtrain, ytrain, point_being_classified, k)

    errors_list = np.where(predicted_labels != ytest, 1, 0)
    error = (1.0/float(n_test))*np.sum(errors_list)
    return error

This is KNN with kfold cross-validation

In [86]:
# K-fold cross-validation of the KNN classifier for several neighbour counts.
#
# Reads X, Y, dataset_name, posLabel and negLabel as left by whichever
# data-loading cell ran last, and calls KNN() defined above.
# np.random.shuffle is not seeded in this cell, so the reported errors change
# from run to run.
kfold = 5
knn = np.array([4,5,6])   # candidate neighbour counts; append 7..20 for a wider sweep

# One row per candidate k: [k, mean validation error, accuracy].
output_table = np.zeros((len(knn),3))
output_table[:,0] = knn

error = np.zeros((1,kfold))   # per-fold error of the current repetition

# Number of independent shuffle-and-cross-validate repetitions per k.
number_of_error_Calculations = 1
errorCalcAverage = np.zeros((1,number_of_error_Calculations))
errorCalcVar = np.zeros((1,number_of_error_Calculations))   # allocated but never filled in this cell

# The data set geometry does not change inside the loops, so compute it once.
datasetDimensions = np.shape(X)
datasetLength = datasetDimensions[0]
datasetWidth = datasetDimensions[1]
if kfold > datasetLength:
    raise ValueError("kfold (%s) exceeds the number of samples (%s)" % (kfold, datasetLength))
# Floor division keeps the fold size an integer on Python 2 and Python 3 alike.
batchSize = datasetLength // kfold

# The loop variable has its own name so that the `knn` array is not
# overwritten by its own elements while it is being iterated.
for k_iter, k_neighbors in enumerate(knn):

    for m in range(number_of_error_Calculations):

        # Shuffle features and labels together so the rows stay aligned.
        full_training_dataset_with_labels = np.hstack((X,Y))
        np.random.shuffle(full_training_dataset_with_labels)
        CV_set = full_training_dataset_with_labels

        for i in range(kfold):

            index1 = batchSize*i
            # The last fold also takes the rows left over by the floor division.
            index2 = datasetLength if i == kfold - 1 else batchSize*(i+1)

            CV_set_train = np.delete(CV_set, np.s_[index1:index2], 0)
            CV_set_test = CV_set[index1:index2,:]

            # The label is the last column of the stacked array.
            CV_X_train = CV_set_train[:,0:datasetWidth]
            CV_y_train = CV_set_train[:,datasetWidth]

            CV_X_test = CV_set_test[:,0:datasetWidth]
            CV_y_test = CV_set_test[:,datasetWidth]

            error[0,i] = KNN(CV_X_train, CV_X_test, CV_y_train, CV_y_test, k_neighbors)

        avgCVError = np.mean(error)

        errorCalcAverage[0,m] = avgCVError

    errorCalcAverageTot = np.mean(errorCalcAverage)

    output_table[k_iter, 1] = errorCalcAverageTot
    output_table[k_iter, 2] = 1 - errorCalcAverageTot

# Single-argument print(...) calls produce the same text on Python 2 and 3.
print("Dataset: %s" % dataset_name)
print("This is KNN with CV with kfold =  %s" % kfold)
print("The positive label is %s and the negative label is %s" % (posLabel, negLabel))
print("The number of error calculations is %s" % number_of_error_Calculations)
print("The table below lists each k with their associated validation error and accuracy")
print(np.around(output_table, 3))

Dataset: winequality_white
This is KNN with CV with kfold =  5
The positive label is high quality and the negative label is low quality
The number of error calculations is 1
The table below lists each k with their associated validation error and accuracy
[[ 4.     0.045  0.955]
 [ 5.     0.039  0.961]
 [ 6.     0.043  0.957]]


This is KNN with variable training/testing ratios

In [11]:
# KNN test error for several training/testing split ratios.
#
# Reads X, Y, dataset_name, posLabel and negLabel as left by whichever
# data-loading cell ran last, and calls KNN() defined above.
# np.random.shuffle is not seeded in this cell, so the reported errors change
# from run to run.
ratio_list = np.array([0.6,0.7,0.8])   # other ratios tried: 0.1,0.2,0.3,0.4,0.5,0.9,0.95
# One row per ratio: [training ratio, mean test error, mean test accuracy].
testing_error_for_ratios = np.zeros((len(ratio_list),3))
testing_error_for_ratios[:,0] = ratio_list

knn = 5

# Number of independent shuffle-and-split repetitions per ratio.
number_of_error_Calculations = 5

# The data set geometry does not change inside the loops, so compute it once.
datasetDimensions = np.shape(X)
datasetLength = datasetDimensions[0]
datasetWidth = datasetDimensions[1]
trainingSetWidth = datasetWidth

for q_iter, trainingRatio in enumerate(ratio_list):

    # Not used by the split itself: the test set is simply every row after
    # the first trainingSetLength rows.
    testingRatio = 1.00 - trainingRatio

    error = np.zeros((1,number_of_error_Calculations))

    errorCalcAverage = np.zeros((1,number_of_error_Calculations))   # allocated but never filled in this cell
    errorCalcVar = np.zeros((1,number_of_error_Calculations))       # allocated but never filled in this cell

    trainingSetLength = int(datasetLength*trainingRatio)

    for i in range(number_of_error_Calculations):

        # Shuffle features and labels together so the rows stay aligned.
        dataset_with_labels = np.hstack((X,Y))
        np.random.shuffle(dataset_with_labels)

        Xy_train = dataset_with_labels[0:trainingSetLength,:]
        Xy_test = dataset_with_labels[trainingSetLength:,:]

        # The label is the last column of the stacked array.
        X_train = Xy_train[:,0:trainingSetWidth]
        y_train = Xy_train[:,trainingSetWidth]

        X_test = Xy_test[:,0:trainingSetWidth]
        y_test = Xy_test[:,trainingSetWidth]

        error[0,i] = KNN(X_train, X_test, y_train, y_test, knn)

    # Average the test error over the repetitions.  Every repetition
    # reshuffles the full data set before splitting it, so each KNN run sees
    # a different training and testing set.  Only the average is computed
    # here; no variance is calculated.
    average_of_errors = np.mean(error)

    testing_error_for_ratios[q_iter,1] = average_of_errors
    testing_error_for_ratios[q_iter,2] = 1 - average_of_errors

# Single-argument print(...) calls produce the same text on Python 2 and 3.
print("Dataset: %s" % dataset_name)
print("This is KNN with k =  %s" % knn)
print("The positive label is %s and the negative label is %s" % (posLabel, negLabel))
print("The number of error calculations is %s" % number_of_error_Calculations)
print("The training/testing ratio is the first column")
print("and the testing error average is the second column")
print("and the testing accuracy average is the third column")
print(np.around(testing_error_for_ratios, 3))

Dataset: winequality_white
This is KNN with k =  5
The positive label is high quality and the negative label is low quality
The number of error calculations is 5
The training/testing ratio is the first column
and the testing error average is the second column
and the testing accuracy average is the third column
[[ 0.6    0.04   0.96 ]
 [ 0.7    0.042  0.958]
 [ 0.8    0.043  0.957]]
