# Algoritmul KNN cu optimizari

Mai jos avem 3 versiuni ale algoritmului de clasificare KNN. Prima foloseste un k cu o valoare "default".
Este eficient ca timp dar sufera din punct de vedere al acuratetii.
A doua implementare gaseste o valoare optima pentru k si creste acuratetea dar are un numar foarte mare de iteratii pentru k.
Ultima implementare gaseste o valoare optima pentru k mult mai eficient decat cea precedenta, cu un numar mult mai mic de iteratii.

In [1]:
# Import:

import numpy as np
import pandas as pd
import copy
from math import floor
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import scale
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Input and output file names:

# In:
trainingFile = "training.csv"
trainingLabelsFile = "trainingLabels.csv"
dataFile = "data.csv"
dataLabelsFile = "dataLabels.csv"
# Out:
outptDefault = "outptDefault.csv"
outptWithOptimizationFile = "outptWithOptimization.csv"
outptWithoutOptimizationFile = "outptWithoutOptimization.csv"
scoreFile = "score.txt"

In [3]:
# Define the default k value:

defaultK = 3

In [4]:
# Read/write:

df1 = pd.read_csv(trainingFile, index_col=None)
df2 = pd.read_csv(trainingLabelsFile, index_col=None)
data = pd.read_csv(dataFile, index_col=None)
dataLabels = pd.read_csv(dataLabelsFile, index_col=None)
fullTrainData = copy.deepcopy(df1)
fullTrainLabels = copy.deepcopy(df2)

# Split the data (testing is random 10% of data, training is the rest):
training, test, trainingLabels, testLabels = train_test_split(df1, df2, test_size=0.1, random_state=42)

# Normalize the data:
scale(fullTrainData)
scale(training)
scale(test)
scale(data)

def write_data(data, fileName):
    df = pd.DataFrame(data)
    df = df.replace(np.nan, '', regex=True)
    df.index += 1
    df.columns = ['Classification']
    df.to_csv(fileName, sep=',', index_label='Id')

In [5]:
# KNN functions:

# Return KNN predictions:
def KNN(XData, XLabels, YData, k):
    # Create KNN with given k:
    knn = KNeighborsClassifier(n_neighbors=k)

    # Fit the model:
    knn.fit(XData, XLabels)

    # Make predictions:
    pred = knn.predict(YData)
    return pred

# Score model accuracy:
def score_accuracy(pred, YLabels):
    # Evaluate the accuracy:
    sc = accuracy_score(YLabels, pred, normalize = True)
    return sc

# Score k using the test data on KNN:
def score_k(k):
    pred = KNN(training, trainingLabels, test, k)
    score = score_accuracy(pred, testLabels)
    return score

# Do KNN for the chosen k values and write the data:
def do_KNN(k, optimizedK):
    # Do KNN and score accuracy:
    defaultPred = KNN(fullTrainData, fullTrainLabels, data, defaultK)
    pred = KNN(fullTrainData, fullTrainLabels, data, k)
    predOptimized = KNN(fullTrainData, fullTrainLabels, data, optimizedK)
    defaultScore = score_accuracy(defaultPred, dataLabels) * 100
    score = score_accuracy(pred, dataLabels) * 100
    scoreOptimized = score_accuracy(predOptimized, dataLabels) * 100

    # Write data to output files:
    write_data(defaultPred, outptDefault)
    write_data(pred, outptWithoutOptimizationFile)
    write_data(predOptimized, outptWithOptimizationFile)

    # Write accuracy scores:
    text = ["Score for default k is: " + str(defaultScore) + "\n", "Score for normal parameter tuning is: " + str(score) + "\n", "Score for optimized parameter tuning is: " + str(scoreOptimized) + "\n"]
    fl = open(scoreFile,"w")
    fl.writelines(text)
    fl.close()
    print(text[0])
    print(text[1])
    print(text[2])

In [6]:
# The simple parameter tuning optimization:
def parameter_tuning():
    maxK = len(training.index)
    r = np.arange(1, maxK)
    maxScore = 0
    k = 1
    for i in r:
        s = score_k(i)
        if(s > maxScore):
            maxScore = s
            k = i
    return k

# The advanced parameter tuning optimization:
def optimized_parameter_tuning():
    start = 1
    end = len(training.index)
    while(start != end):
        middle = floor((start + end)/2)
        s1 = score_k(start)
        s2 = score_k(end)
        if(s1 > s2):
            end = middle
        else:
            start = middle
    return start

In [7]:
# Run 3 versions of KNN:
k = parameter_tuning()
optimizedK = optimized_parameter_tuning()

do_KNN(k, optimizedK)

Score for default k is: 57.70925110132159

Score for normal parameter tuning is: 62.55506607929515

Score for optimized parameter tuning is: 62.55506607929515

