# K Nearest Neighbours & K Fold CV

In [5]:
# Euclidean distance between two attribute vectors of equal length.
dist <- function(v1, v2) {
  squared_diff <- (v1 - v2)^2
  sqrt(sum(squared_diff))
}
# Most frequent label in `lab`, returned as a character string.
# Ties go to the label that comes first in table() order.
# `dist` is accepted for interface compatibility but is not used.
mode <- function(lab, dist) {
  counts <- table(lab)
  winner <- which.max(counts)
  names(winner)
}

In [6]:
# CalcAccus - Performance summary for a two-class prediction.
# Input  - 1) pred -> predicted labels (vector, factor or one-column matrix)
#          2) true -> true labels, same length as pred
# Output - numeric vector c(overall accuracy, positive accuracy,
#          negative accuracy, MCC). The "positive" class is the first
#          label level (the first level of `true`, i.e. the smallest label
#          in sort order unless `true` is a factor with custom levels).
#          MCC is reported as 0 when its denominator is 0.
CalcAccus <- function(pred, true){
    # Build rows and columns from one shared set of levels so the confusion
    # matrix is always 2x2, even when a class never shows up in `pred`.
    lv = union(levels(factor(true)), levels(factor(pred)))
    if (length(lv) != 2L) {
        stop("CalcAccus needs exactly two distinct labels, found ",
             length(lv), call. = FALSE)
    }
    tab = table(factor(true, levels = lv), factor(pred, levels = lv))

    # table() counts are integers; convert to double so the four-way product
    # in the MCC denominator cannot overflow to NA on larger data sets.
    TP = as.numeric(tab[1,1])
    FP = as.numeric(tab[2,1])
    FN = as.numeric(tab[1,2])
    TN = as.numeric(tab[2,2])

    accu_o = (TP+TN)/(TP+TN+FP+FN)
    accu_P = TP/(TP+FN)
    accu_N = TN/(TN+FP)

    denom  = sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN))
    # A zero denominator means a whole row or column of the table is empty;
    # use the conventional value 0 instead of NaN.
    MCC    = if (denom == 0) 0 else (TP*TN - FP*FN)/denom

    return(c(accu_o, accu_P, accu_N, MCC))
}

In [7]:
# knn    - Predict the label of one or more data points by majority vote
#          among their K nearest training points (Euclidean distance).
# Input  - 1) data -> data frame containing attributes and labels
#                     (Labels in last column)
#          2) obj  -> New data point given as a plain vector of attributes,
#                     or a data frame / matrix with one data point per row.
#          3) K    -> Algorithm parameter. Single value or list of different Ks.

# Output - Character matrix with one row per data point in obj and one
#          column per value of K, holding the predicted labels.
#          Vote ties go to the label that comes first in table() order.

knn <- function(data, obj, K){
    label_col = ncol(data)
    x_data = as.matrix(data[ , -label_col, drop = FALSE])
    y_data = data[ , label_col]

    # A single data point passed as a bare vector has no rows; make it one row.
    if (is.null(dim(obj))) {
        obj = matrix(unlist(obj), nrow = 1L)
    }
    x_new = as.matrix(obj)

    K = unlist(K)
    stopifnot(
        is.numeric(x_data), is.numeric(x_new),
        nrow(x_data) > 0L,
        ncol(x_new) == ncol(x_data),
        length(K) > 0L, all(K >= 1)
    )

    n_train = nrow(x_data)
    # Preallocate the result instead of growing it with rbind/cbind.
    label = matrix(NA_character_, nrow = nrow(x_new), ncol = length(K))

    for(each in seq_len(nrow(x_new))){
        # Distance from this point to every training row in one vectorised step.
        distance = sqrt(rowSums(sweep(x_data, 2L, x_new[each, ])^2))
        sort_order = order(distance)

        for(j in seq_along(K)){
            # Never ask for more neighbours than there are training rows.
            nearest = sort_order[seq_len(min(K[j], n_train))]
            label[each, j] = names(which.max(table(y_data[nearest])))
        }
    }
    return(label)
}

In [8]:
# kFold  - k-fold cross-validation of the knn classifier.
# Input  - 1) data  -> data frame containing attributes and labels
#                      (Labels in last column)
#          2) k     -> Number of folds (at least 2, at most nrow(data)).
#          3) knn_k -> Algorithm parameter for prediction algorithm used.
#                      Single value or list of different Ks.

# Output - Matrix with one row per value of knn_k and the columns
#          K, Overall Accuracy, Positive Accuracy, Negative Accuracy
#          & MCC, each metric averaged over the k folds.
# Note   - Rows are shuffled with sample(), so results depend on the RNG
#          state. Each fold holds nrow(data) %/% k rows; any leftover rows
#          are never used for testing and stay in every training set.

kFold <- function(data, k, knn_k){
    if (length(k) != 1L || k < 2) {
        stop("k must be a single number >= 2", call. = FALSE)
    }
    n = nrow(data) %/% k
    if (n < 1) {
        stop("k must not exceed the number of rows in data", call. = FALSE)
    }
    knn_k = unname(unlist(knn_k))
    m = length(knn_k)

    data = data[sample(nrow(data)),]
    label_col = ncol(data)

    # Running sum of the four metrics, one row per value of knn_k.
    MetricSum = matrix(0,m,4)
    for(fold in seq_len(k)){ #loop for creating different folds
        test_rows = seq((1+(fold-1)*n),(n*fold))
        test = data[test_rows, ]
        train = data[-test_rows, ]
        # drop = FALSE keeps a data frame even with a single attribute column
        x_test = test[,-label_col, drop = FALSE]
        y_test = test[,label_col]

        y_pred = knn(train, x_test, knn_k)

        for(j in seq_len(m)){ #loop over predictions for different values of knn_k
            MetricSum[j,] = MetricSum[j,] + CalcAccus(y_pred[,j], y_test)
        }
    }
    # Divide the summed metrics by the number of folds to get the averages;
    # the K column is set directly rather than summed and divided.
    PerfMat = cbind(knn_k, MetricSum/k)
    colnames(PerfMat) = c("K","Overall Accuracy", "Positive Accuracy", 
                          "Negative Accuracy", "MCC")
    return(PerfMat)
}

In [28]:
# data pre-processing - reading data, removing unnecessary cols, moving labels to last col
data <- read.csv("data.csv", na.strings = c("", "NA"))
data <- data[, c(-1, -ncol(data))]   # drop the first and the last column
data <- data[, c(2:ncol(data), 1)]   # move column 1 (the labels) to the end

# data normalisation: min-max scale every attribute column to [0, 1]
# NOTE(review): the min/max are taken over all rows, including the rows that
# later become the test set - confirm this is intended.
for (i in seq_len(ncol(data) - 1)) {
    col_min <- min(data[, i])
    col_range <- max(data[, i]) - col_min
    if (isTRUE(col_range == 0)) {
        # A constant column would give 0/0 = NaN; map it to 0 instead.
        data[, i] <- 0
    } else {
        data[, i] <- (data[, i] - col_min) / col_range
    }
}

# shuffling the data points and splitting in test and train data at random
# (each row goes to train with probability 0.9, to test with probability 0.1)
set.seed(123)
shuf  <- sample(2, nrow(data), replace = TRUE, prob = c(0.9, 0.1))
train <- data[which(shuf == 1), ]
test  <- data[which(shuf != 1), ]
x_test <- test[, -ncol(test)]
y_test <- test[, ncol(test)]

In [15]:
# Cross-validate (5 folds) over the odd neighbour counts 1, 3, ..., 17
# to choose the best value of the KNN parameter K.
k <- seq(from = 1, to = 17, by = 2)
mat <- kFold(data = train, k = 5, knn_k = k)

In [17]:
# Display the cross-validation results (one row per candidate K).
mat

K,Overall Accuracy,Positive Accuracy,Negative Accuracy,MCC
1,0.9533981,0.963658,0.9363905,0.9010002
3,0.9631068,0.9819539,0.9314277,0.9207649
5,0.9631068,0.9819539,0.9314277,0.9207649
7,0.9669903,0.9849389,0.9369753,0.9296568
9,0.9650485,0.9879692,0.9260144,0.9250973
11,0.9592233,0.9848937,0.9156969,0.9132436
13,0.9650485,0.9909091,0.9209601,0.9255295
15,0.9669903,0.9969697,0.9156969,0.9299374
17,0.9669903,0.9939394,0.9212525,0.9298599


In [30]:
# MCC is the selection metric: keep the K whose cross-validated MCC is
# highest (which.max picks the first such row if several tie).
best_K <- mat[, "K"][which.max(mat[, "MCC"])]

In [31]:
# Predict the labels of the held-out test data with the selected best_K,
# recording the wall-clock time just before and just after the call.
st <- Sys.time()
y_pred <- knn(data = train, obj = x_test, K = best_K)
ed <- Sys.time()

In [36]:
# Overall / positive / negative accuracy and MCC on the held-out test data.
Performance <- CalcAccus(pred = y_pred, true = y_test)

In [38]:
# Report the selected K together with the test-set performance.
# R vectors are 1-indexed, so Performance[0] is empty; K comes from best_K.
print(paste("K =",best_K,"Overall Accuracy =",Performance[1], 
            "Positive Accuracy =",Performance[2], 
            "Negative Accuracy =",Performance[3], "MCC =",Performance[4]))

[1] "K =  Overall Accuracy = 0.981481481481482 Positive Accuracy = 1 Negative Accuracy = 0.956521739130435 MCC = 0.962616452582548"
