In [2]:
# ---
# importing required libraries
import random
import csv
import math
import statistics
import copy

# Seed the module-level `random` generator with a fixed string so that the
# shuffles and random draws made through `random` are repeatable from run to run.
random.seed('iris dataset')


In [10]:
def _load_csv(filename):
	with open(filename, 'r') as file:
		csv_reader = csv.reader(file)
		return [row for row in csv_reader if row]

In [11]:
# Leftover IPython introspection (`obj??` displays an object's source/docstring);
# this line is not valid plain Python.
# NOTE(review): development aid only - consider removing it from the finished notebook.
csv.reader??

In [12]:
def _clean_features(dataset):
    num_columns = len(dataset[0])

    for row in dataset:
        for column in range(num_columns-1):
            row[column] = float(row[column].strip())

In [13]:
def _map_classes(dataset):
    class_mappings = {}
    for row in dataset:
        _specie = row[-1]
        if _specie not in class_mappings.keys():
            class_mappings[_specie] = len(class_mappings)
        row[-1] = class_mappings[_specie]

    return class_mappings

In [14]:
def _normalize_data(dataset):
    num_features = len(dataset[0])-1
    for i in range(num_features):
        column_values = [row[i] for row in dataset]
        column_min = min(column_values)
        column_max = max(column_values)
        
        for row in dataset:
            row[i] = (row[i] - column_min) / (column_max - column_min)

In [15]:
def DataLoader(filename):
    """Load a CSV file and prepare it for the kNN routines.

    The rows are read with ``_load_csv`` and then processed in place by
    ``_clean_features``, ``_map_classes`` and ``_normalize_data``, in that order.

    Args:
        filename: Path of the CSV file to load.

    Returns:
        Tuple ``(rows, label_to_index)``: the processed rows and the mapping
        returned by ``_map_classes``.
    """
    rows = _load_csv(filename)

    _clean_features(rows)
    label_to_index = _map_classes(rows)
    _normalize_data(rows)

    return rows, label_to_index

In [17]:
#kNN Algorithm 

In [18]:
def _euclidean_distance(row1, row2):
    distance = 0.0
    num_features = len(row1)-1

    for i in range(num_features):
        distance += (row1[i] - row2[i])**2
    return math.sqrt(distance)

In [19]:
def _get_k_neighbours(test_row, train_data, num_neighbours):
    """Return the ``num_neighbours`` training rows closest to ``test_row``.

    Distances come from ``_euclidean_distance``; rows are ranked by ascending
    distance, with equally distant rows keeping their order from ``train_data``.

    Args:
        test_row: Row to find neighbours for.
        train_data: Iterable of candidate rows.
        num_neighbours: How many rows to return.

    Returns:
        List of training rows, nearest first.

    Raises:
        IndexError: if ``num_neighbours`` exceeds the number of training rows.
    """
    scored = [
        (candidate, _euclidean_distance(test_row, candidate))
        for candidate in train_data
    ]
    ranked = sorted(scored, key=lambda pair: pair[1])

    neighbours = []
    for position in range(num_neighbours):
        neighbours.append(ranked[position][0])
    return neighbours

In [20]:
def _predict_classification(test_row, train_data, num_neighbours):
    """Predict the class of ``test_row`` by majority vote of its neighbours.

    The neighbours come from ``_get_k_neighbours``; each contributes the value
    in its last field, and the value occurring most often is returned.

    Args:
        test_row: Row to classify.
        train_data: Rows to pick neighbours from.
        num_neighbours: Number of neighbours that vote.

    Returns:
        The most frequent class value among the neighbours.
    """
    votes = []
    for neighbour in _get_k_neighbours(test_row, train_data, num_neighbours):
        votes.append(neighbour[-1])

    candidates = set(votes)
    return max(candidates, key=lambda label: votes.count(label))

In [21]:
def kNN_Algorithm(test_data, train_data, num_neighbours):
    """Classify every row of ``test_data`` with ``_predict_classification``.

    Args:
        test_data: Iterable of rows to classify.
        train_data: Rows the neighbours are drawn from.
        num_neighbours: Number of neighbours used per prediction.

    Returns:
        List of predicted classes, in the order of ``test_data``.
    """
    predictions = []
    for query_row in test_data:
        predictions.append(
            _predict_classification(query_row, train_data, num_neighbours)
        )
    return predictions

In [22]:
#Evaluate kNN Algorithm

def _test_train_split(dataset, test_ratio):
    _dataset = copy.deepcopy(dataset)
    random.shuffle(_dataset)

    split_index = int(len(dataset) * test_ratio)
    # Training data
    test_sample = _dataset[0:split_index]
    #Testing data
    train_sample = _dataset[split_index:]

    return test_sample, train_sample

In [23]:
def _cross_validation_split(dataset, num_groups):
    dataset_groups = []
    _dataset = copy.deepcopy(dataset)
    group_size = int(len(_dataset) / num_groups)

    for i in range(num_groups):
        group = []
        while len(group) < group_size:
            idx = random.randrange(len(_dataset))
            group.append(_dataset.pop(idx))
        dataset_groups.append(group)

    return dataset_groups

In [24]:
def _get_accuracy(test_sample, algorithm_predictions, class_mappings):
    test_classes = [row[-1] for row in test_sample]
    num_test_classes = len(test_classes)
    test_labels = list(class_mappings.keys())

    if len(test_classes) != len(algorithm_predictions):
        raise IndexError("The count of test classes is not equal to the count of algorithm predictions!")

    num_correct_predictions = sum([actual == predicted for actual, predicted 
                                                        in zip(test_classes, algorithm_predictions)])

    wrong_predictions = [f'A:{test_labels[actual]} | P:{test_labels[predicted]}'
                                                            for actual, predicted 
                                                            in zip(test_classes, algorithm_predictions)
                                                            if actual != predicted]
                        
    accuracy = (num_correct_predictions / num_test_classes) * 100
    return accuracy, wrong_predictions

In [25]:
def tts_Evaluate_kNN_Algorithm(dataset, class_mappings, test_ratio=0.25,
                               num_neighbours=3, num_iterations=100):
    """Evaluate kNN with repeated random test/train splits and print a report.

    Each iteration splits a copy of ``dataset`` with ``_test_train_split``,
    predicts the test rows with ``kNN_Algorithm`` and scores them with
    ``_get_accuracy``. Afterwards the mean and maximum accuracy are printed,
    followed by up to 20 randomly chosen wrong predictions.

    Args:
        dataset: Preprocessed rows.
        class_mappings: Class mapping passed on to ``_get_accuracy``.
        test_ratio: Fraction of rows held out for testing in each iteration.
        num_neighbours: Number of neighbours used by kNN.
        num_iterations: How many random splits to evaluate.

    Returns:
        None; the results are printed.
    """
    accuracy_log = []
    mistake_log = []

    for _ in range(num_iterations):
        working_copy = copy.deepcopy(dataset)
        held_out, fitted_on = _test_train_split(working_copy, test_ratio)

        predictions = kNN_Algorithm(held_out, fitted_on, num_neighbours)
        score, mistakes = _get_accuracy(held_out, predictions, class_mappings)

        accuracy_log.append(score)
        mistake_log.extend(mistakes)

    # Shuffle so the 20 mistakes shown are a random pick across all iterations.
    random.shuffle(mistake_log)

    mean_accuracy = round(statistics.mean(accuracy_log), ndigits=4)
    best_accuracy = max(accuracy_log)
    print('kNN algorithm evaluation using the Test/Train Split method:', '\n\t',
          'Average Accuracy:', mean_accuracy, '\n\t',
          'Maximum Accuracy:', best_accuracy, '\n')

    print('A: Actual | P: Predicted')
    print('\n'.join(mistake_log[:20]))

In [29]:
def cvs_Evaluate_kNN_Algorithm(dataset, class_mappings, num_groups=5,
                               num_neighbours=3, num_iterations=100):
    """Evaluate kNN with repeated cross-validation and print a report.

    Each iteration partitions a copy of ``dataset`` with
    ``_cross_validation_split``. Every group is used once as the test sample
    while copies of the remaining groups are concatenated into the training
    sample. Afterwards the mean and maximum accuracy over all folds are
    printed, followed by up to 20 randomly chosen wrong predictions.

    Args:
        dataset: Preprocessed rows.
        class_mappings: Class mapping passed on to ``_get_accuracy``.
        num_groups: Number of cross-validation groups per iteration.
        num_neighbours: Number of neighbours used by kNN.
        num_iterations: How many random partitions to evaluate.

    Returns:
        None; the results are printed.
    """
    accuracy_log = []
    mistake_log = []

    for _ in range(num_iterations):
        working_copy = copy.deepcopy(dataset)
        folds = _cross_validation_split(working_copy, num_groups)

        for held_out_idx, held_out in enumerate(folds):
            # Training rows: copies of every fold except the held-out one,
            # flattened into a single list.
            fitted_on = [
                row
                for fold_idx, fold in enumerate(copy.deepcopy(folds))
                if fold_idx != held_out_idx
                for row in fold
            ]

            predictions = kNN_Algorithm(held_out, fitted_on, num_neighbours)
            score, mistakes = _get_accuracy(held_out, predictions, class_mappings)

            accuracy_log.append(score)
            mistake_log.extend(mistakes)

    # Shuffle so the 20 mistakes shown are a random pick across all folds.
    random.shuffle(mistake_log)

    mean_accuracy = round(statistics.mean(accuracy_log), ndigits=4)
    best_accuracy = max(accuracy_log)
    print('kNN algorithm evaluation using the Cross Validation Split method:', '\n\t',
          'Average Accuracy:', mean_accuracy, '\n\t',
          'Maximum Accuracy:', best_accuracy, '\n')

    print('A: Actual | P: Predicted')
    print('\n'.join(mistake_log[:20]))

In [30]:
# Evaluate kNN Algorithm: Using Test-Train Split Method
# Loads and preprocesses the iris CSV, then runs the test/train-split evaluation
# with its default settings (test_ratio=0.25, num_neighbours=3, num_iterations=100).
# NOTE(review): the path is absolute and specific to one Windows machine; the cell
# will not run elsewhere without editing it.
# NOTE(review): the output recorded for this cell lists whole ';'-joined rows as
# class labels and an average accuracy close to 0, which suggests the file is
# ';'-delimited and its rows were never split into columns - confirm the
# delimiter and re-run.
dataset, class_mappings = DataLoader(r"C:\Users\dofla\Documents\Python Scripts\datasets\iris.csv")
tts_Evaluate_kNN_Algorithm(dataset, class_mappings)

kNN algorithm evaluation using the Test/Train Split method: 
	 Average Accuracy: 0.027 
	 Maximum Accuracy: 2.7027027027027026 

A: Actual | P: Predicted
A:6.7;3.1;5.6;2.4;Iris-virginica | P:5.4;3.7;1.5;0.2;Iris-setosa
A:6.8;3.0;5.5;2.1;Iris-virginica | P:5.5;2.4;3.7;1.0;Iris-versicolor
A:7.2;3.2;6.0;1.8;Iris-virginica | P:6.3;2.7;4.9;1.8;Iris-virginica
A:7.2;3.2;6.0;1.8;Iris-virginica | P:5.0;3.6;1.4;0.2;Iris-setosa
A:5.8;4.0;1.2;0.2;Iris-setosa | P:5.7;2.8;4.1;1.3;Iris-versicolor
A:6.7;3.0;5.0;1.7;Iris-versicolor | P:6.7;3.1;4.4;1.4;Iris-versicolor
A:4.9;2.5;4.5;1.7;Iris-virginica | P:5.5;2.5;4.0;1.3;Iris-versicolor
A:6.2;3.4;5.4;2.3;Iris-virginica | P:6.1;3.0;4.6;1.4;Iris-versicolor
A:7.7;2.8;6.7;2.0;Iris-virginica | P:7.2;3.6;6.1;2.5;Iris-virginica
A:6.3;2.9;5.6;1.8;Iris-virginica | P:7.3;2.9;6.3;1.8;Iris-virginica
A:5.2;4.1;1.5;0.1;Iris-setosa | P:5.2;2.7;3.9;1.4;Iris-versicolor
A:5.4;3.9;1.3;0.4;Iris-setosa | P:4.9;3.1;1.5;0.1;Iris-setosa
A:5.5;2.5;4.0;1.3;Iris-versicolor | P:5.6

In [31]:
# Evaluate kNN Algorithm: Using Cross-Validation Split Method
# Reloads and preprocesses the iris CSV, then runs the cross-validation evaluation
# with its default settings (num_groups=5, num_neighbours=3, num_iterations=100).
# NOTE(review): the path is absolute and specific to one Windows machine; the cell
# will not run elsewhere without editing it.
# NOTE(review): the output recorded for this cell shows whole ';'-joined rows as
# class labels and an average accuracy close to 0 - the rows were presumably not
# split into columns; confirm the file's delimiter and re-run.
dataset, class_mappings = DataLoader(r"C:\Users\dofla\Documents\Python Scripts\datasets\iris.csv")
cvs_Evaluate_kNN_Algorithm(dataset, class_mappings)

kNN algorithm evaluation using the Cross Validation Split method: 
	 Average Accuracy: 0.0467 
	 Maximum Accuracy: 6.666666666666667 

A: Actual | P: Predicted
A:5.7;2.8;4.1;1.3;Iris-versicolor | P:5.7;4.4;1.5;0.4;Iris-setosa
A:6.5;2.8;4.6;1.5;Iris-versicolor | P:6.1;3.0;4.6;1.4;Iris-versicolor
A:5.6;2.9;3.6;1.3;Iris-versicolor | P:6.1;3.0;4.6;1.4;Iris-versicolor
A:7.4;2.8;6.1;1.9;Iris-virginica | P:5.2;4.1;1.5;0.1;Iris-setosa
A:5.1;3.5;1.4;0.2;Iris-setosa | P:6.1;3.0;4.6;1.4;Iris-versicolor
A:5.1;3.7;1.5;0.4;Iris-setosa | P:5.4;3.4;1.5;0.4;Iris-setosa
A:6.0;2.2;4.0;1.0;Iris-versicolor | P:7.7;2.8;6.7;2.0;Iris-virginica
A:6.2;3.4;5.4;2.3;Iris-virginica | P:5.7;3.8;1.7;0.3;Iris-setosa
A:4.8;3.0;1.4;0.1;Iris-setosa | P:5.4;3.7;1.5;0.2;Iris-setosa
A:5.4;3.7;1.5;0.2;Iris-setosa | P:6.5;3.0;5.2;2.0;Iris-virginica
A:5.9;3.2;4.8;1.8;Iris-versicolor | P:6.4;2.9;4.3;1.3;Iris-versicolor
A:5.8;2.7;5.1;1.9;Iris-virginica | P:6.3;3.4;5.6;2.4;Iris-virginica
A:5.0;3.4;1.5;0.2;Iris-setosa | P:6.5;3.0;