# K Nearest Neighbours

In [2]:
# Helper functions: Euclidean distance between two attribute vectors, and the mode (most frequent label)
# Euclidean distance between two attribute vectors of matching shape:
# the square root of the summed squared element-wise differences.
# Note: defining this masks stats::dist for the rest of the session.
dist <- function(v1, v2) {
  delta <- v1 - v2
  squared_total <- sum(delta^2)
  sqrt(squared_total)
}
# Most frequent value in `lab`, returned as a character string.
# Ties go to whichever value comes first in the frequency table.
# `dist` is accepted for interface compatibility but is not used.
# Note: defining this masks base::mode for the rest of the session.
mode <- function(lab, dist) {
  counts <- table(lab)
  winner <- which.max(counts)
  names(winner)
}

In [3]:
#' Overall, positive-class and negative-class accuracy plus the Matthews
#' correlation coefficient (MCC) for a binary classifier.
#'
#' The "positive" class is the first class: the first factor level of `true`
#' when `true` is a factor, otherwise the alphabetically first label seen in
#' either `true` or `pred`. Both vectors are tabulated against the same set of
#' classes, so a class that never appears in one of them (for example, a
#' classifier that predicts a single class) contributes zero counts instead of
#' producing a confusion table with a missing row or column.
#'
#' @param pred Predicted labels (character or factor).
#' @param true True labels (character or factor), same length as `pred`.
#' @param k    Value of k that produced `pred`; copied into the result.
#' @return One-row data frame with columns `k`, `overall`, `positive`,
#'   `negative` and `MCC`. A ratio whose denominator is zero is `NaN`.
CalcAccus <- function(pred, true, k) {
    true_chr <- as.character(true)
    pred_chr <- as.character(pred)

    # Shared class list keeps rows and columns aligned and always present.
    classes <- if (is.factor(true)) {
        union(levels(true), sort(unique(pred_chr)))
    } else {
        sort(unique(c(true_chr, pred_chr)))
    }
    tab <- table(
        factor(true_chr, levels = classes),
        factor(pred_chr, levels = classes)
    )

    # Counts are converted to double: the MCC denominator multiplies four
    # marginal totals, which overflows R's 32-bit integers (giving NA) on
    # moderately sized test sets. Cells outside the table count as zero.
    cell <- function(i, j) {
        if (i <= nrow(tab) && j <= ncol(tab)) as.numeric(tab[i, j]) else 0
    }
    TP <- cell(1, 1)
    FP <- cell(2, 1)
    FN <- cell(1, 2)
    TN <- cell(2, 2)

    accu_o <- (TP + TN) / (TP + TN + FP + FN)
    accu_P <- TP / (TP + FN)
    accu_N <- TN / (TN + FP)
    MCC    <- (TP * TN - FP * FN) /
        sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN))

    data.frame(
        "k" = k, "overall" = accu_o, "positive" = accu_P,
        "negative" = accu_N, "MCC" = MCC
    )
}

### Function for KNN
#### Input
- data: input data in form of data frame with labels in last col  
- obj: objects (sets of attributes) for which predictions are to be made; also a data frame, with one or more rows of attributes and no label column  
- k: number of nearest neighbours that vote on each prediction

#### Output
- Vector of predicted labels, one per row of `obj`

In [4]:
#' k-nearest-neighbour classification with Euclidean distance.
#'
#' @param data Training data frame: attribute columns followed by the label
#'   in the last column. Attributes are assumed to be numeric.
#' @param obj  Data frame of objects to classify, holding the same attribute
#'   columns as `data` (in the same order) and no label column.
#' @param k    Number of nearest neighbours that vote; values larger than
#'   `nrow(data)` use every training row.
#' @return Character vector with one predicted label per row of `obj`
#'   (`character(0)` when `obj` has no rows). Voting ties go to the label that
#'   comes first in the frequency table of the k nearest labels.
knn <- function(data, obj, k) {
    stopifnot(is.numeric(k), length(k) == 1L, k >= 1)
    n_train <- nrow(data)
    if (n_train == 0L) {
        stop("`data` must contain at least one training row", call. = FALSE)
    }

    label_col <- ncol(data)
    # drop = FALSE keeps a single attribute column two-dimensional.
    x_data <- as.matrix(data[, -label_col, drop = FALSE])
    y_data <- data[, label_col]
    queries <- as.matrix(obj)
    if (ncol(queries) != ncol(x_data)) {
        stop("`obj` must have the same attribute columns as `data`",
             call. = FALSE)
    }

    k_used <- min(as.integer(k), n_train)

    vapply(seq_len(nrow(queries)), function(i) {
        # Distance from query i to every training row in one vectorised step.
        distance <- sqrt(rowSums(sweep(x_data, 2L, queries[i, ])^2))
        nearest <- y_data[order(distance)][seq_len(k_used)]
        # Majority vote among the k nearest labels.
        names(which.max(table(nearest)))
    }, character(1))
}

In [5]:
# Data pre-processing: read the CSV (empty strings and "NA" become NA),
# drop the first and last columns, then rotate the new first column
# (the labels) to the last position.
data <- read.csv("data.csv", na.strings = c("", "NA"))
data <- data[, -c(1, ncol(data))]
data <- data[, c(seq(2, ncol(data)), 1)]

In [6]:
# Randomly assign each data point to the train (~80%) or test (~20%) set.
# Membership is drawn independently per row, so the exact split sizes vary
# between runs; call set.seed() beforehand for a reproducible split.
shuf  <- sample(2, nrow(data), replace = TRUE, prob = c(0.8, 0.2))
train <- data[shuf == 1, ]
test  <- data[shuf != 1, ]

# The label is the last column. drop = FALSE keeps x_test a data frame even
# when there is only a single attribute column.
x_test <- test[, -ncol(test), drop = FALSE]
y_test <- test[, ncol(test)]

In [7]:
# Evaluate the classifier for each candidate k and collect one row of
# performance metrics per k in k_accus. After the loop, y_pred holds the
# predictions for the last k tried.
k_set <- seq(from = 1, to = 3, by = 2)
k_accus <- data.frame()
for (k in k_set) {
    y_pred <- knn(train, x_test, k)
    accus <- CalcAccus(y_pred, y_test, k)
    k_accus <- rbind(k_accus, accus)
}

In [8]:
# Display the performance metrics collected for each k
k_accus

k,overall,positive,negative,MCC
1,0.9191919,0.9491525,0.875,0.8315855
3,0.9292929,0.9830508,0.85,0.8550736


In [9]:
# Confusion table: true test labels (rows) against the predictions currently
# held in y_pred (columns)
table(y_test, y_pred)

      y_pred
y_test  B  M
     B 58  1
     M  6 34