In [1]:
import numpy as np

# Load the raw CSV matrices. Handing np.loadtxt the path (rather than an
# already-open file object) lets NumPy open and close each file itself, so no
# file handle is left dangling.
training_spam = np.loadtxt("data/training_spam.csv", delimiter=",")
testing_spam = np.loadtxt("data/testing_spam.csv", delimiter=",")

def format_training_data(data=None):
    """Split a spam data matrix into integer labels and integer features.

    Args:
        data: Optional 2-D array-like whose first column holds the label and
            whose remaining columns hold the features. When omitted, the
            notebook-level ``training_spam`` array is used (looked up at call
            time, so re-loading that variable is picked up automatically).

    Returns:
        A ``(labels, features)`` tuple of NumPy arrays: ``labels`` is column 0
        cast to ``int`` (1-D) and ``features`` is every remaining column cast
        to ``int`` (2-D).
    """
    if data is None:
        data = training_spam
    data = np.asarray(data)

    # astype(int) produces fresh integer copies; the source matrix is untouched.
    training_labels = data[:, 0].astype(int)
    training_features = data[:, 1:].astype(int)

    return training_labels, training_features


class SpamClassifier:
    """k-nearest-neighbours classifier that labels an email as spam (1) or ham (0).

    Every query is compared against each training row using Euclidean
    distance. The ``k`` closest rows vote, and the email is called spam only
    when strictly more than half of those votes are spam.
    """

    def __init__(self, k, training_labels=None, training_features=None):
        """Store the neighbour count and, optionally, an explicit training set.

        Args:
            k: Number of nearest neighbours that take part in the vote.
            training_labels: Optional 1-D array-like of 0/1 labels, one per
                training row.
            training_features: Optional 2-D array-like with one feature row
                per label.

        If either array is omitted, the training set is obtained from
        ``format_training_data()`` whenever a prediction is requested.
        """
        self.k = k
        self._training_labels = (
            None if training_labels is None else np.asarray(training_labels)
        )
        self._training_features = (
            None if training_features is None else np.asarray(training_features)
        )

    def _get_training_data(self):
        """Return ``(labels, features)`` from the injected data or the notebook helper."""
        if self._training_labels is None or self._training_features is None:
            # Deliberately not cached: re-running the data-loading cell is
            # reflected in the next prediction without rebuilding the classifier.
            return format_training_data()
        return self._training_labels, self._training_features

    def _classify(self, training_labels, training_features, email_features):
        """Vote among the ``k`` training rows closest to ``email_features``.

        Returns:
            ``1`` when more than half of the ``k`` nearest labels equal 1,
            otherwise ``0`` (so an even-``k`` tie resolves to ham).
        """
        # Broadcasting subtracts the query from every training row at once.
        offsets = training_features - np.asarray(email_features)
        distances = np.sqrt(np.sum(offsets ** 2, axis=1))

        # Indices of the k smallest distances, then the labels at those rows.
        k_closest_points = np.argsort(distances)[:self.k]
        k_closest_labels = training_labels[k_closest_points]

        votes_for_spam = np.count_nonzero(k_closest_labels == 1)
        return 1 if votes_for_spam > self.k / 2 else 0

    def _make_prediction(self, email_features):
        """Classify a single email given its 1-D feature vector.

        Args:
            email_features: Feature vector with one entry per training feature
                column.

        Returns:
            ``1`` for spam, ``0`` for ham.
        """
        training_labels, training_features = self._get_training_data()
        return self._classify(training_labels, training_features, email_features)

    def predict(self, test_data):
        """Classify every row of ``test_data``.

        Args:
            test_data: 2-D array-like with one email per row.

        Returns:
            A list of ``0``/``1`` predictions, in row order.
        """
        # Fetch the training set once per batch instead of once per email.
        training_labels, training_features = self._get_training_data()
        test_data = np.asarray(test_data)

        return [
            self._classify(training_labels, training_features, test_data[i, :])
            for i in range(test_data.shape[0])
        ]


def create_classifier():
    """Build the notebook's default classifier: a SpamClassifier with k=3."""
    three_nearest = SpamClassifier(k=3)
    return three_nearest


# Notebook-level classifier instance built by the create_classifier() factory.
classifier = create_classifier()


In [2]:

# Reload the held-out set as integers and split it by column: column 0 holds
# the labels, the remaining columns hold the features. Passing the path lets
# np.loadtxt open and close the file itself.
testing_spam = np.loadtxt("data/testing_spam.csv", delimiter=",").astype(int)
test_data = testing_spam[:, 1:]
test_labels = testing_spam[:, 0]

predictions = classifier.predict(test_data)
# predict() returns a plain Python list, so convert it explicitly before the
# element-wise comparison against the label array.
accuracy = np.count_nonzero(np.asarray(predictions) == test_labels) / test_labels.shape[0]
print(f"Accuracy on test data is: {accuracy}")

Accuracy on test data is: 0.9
