In [None]:
import data_prep
import pandas as pd
import numpy as np

# Load the preprocessed dataset through the data_prep helper module.
# NOTE(review): presumably a pandas DataFrame (it exposes .dtypes) — confirm in data_prep.
df = data_prep.get_cleaned_dataset()

# Print the dtype of every column as a quick sanity check on the loaded data.
print(df.dtypes)

In [None]:
#####################
### FUNCTION DEFS ###
#####################

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

def prep_dataset(df, test_size=0.2, random_state=None, label_col='Class'):
    """
    Splits a dataset into one-hot-encoded train/test features and labels.

    :param df: DataFrame holding the feature columns plus the label column
    :param test_size: forwarded to train_test_split (size of the test partition)
    :param random_state: seed forwarded to train_test_split; the default None
        keeps the original behaviour of a different random split per call
    :param label_col: name of the column holding the class labels
    :returns X_train, X_test, y_train, y_test
    NOTE: X_train/X_test are whatever OneHotEncoder produces (a scipy sparse
          matrix with the encoder's default settings); call .toarray() for
          estimators that require dense input
    """
    # Separate features & labels
    X = df.drop(label_col, axis=1).values   # Features
    y = df[label_col].values                # Class labels

    # Every feature column is treated as categorical. The encoder is fitted on
    # the full feature matrix before splitting, so train and test share the
    # same encoded columns.
    X = OneHotEncoder().fit_transform(X)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    return X_train, X_test, y_train, y_test

from sklearn.metrics import confusion_matrix # TODO confusion matrix

def basic_predict(X_train, X_test, y_train, y_test, classifier):
    """
    Fits the given classifier on the training data and scores it on the test data.

    :param X_train: training features (must expose .shape)
    :param X_test: test features (must expose .shape)
    :param y_train: training labels
    :param y_test: true test labels
    :param classifier: object with fit(X, y) and predict(X) methods
    :returns double indicating performance (fraction of test labels predicted correctly)
    NOTE: Random train/test split means performance is not consistent
    """
    classifier.fit(X_train, y_train)

    y_test_predict = classifier.predict(X_test)

    print(f"X_train shape: {X_train.shape}")
    print(f"X_test shape: {X_test.shape}")

    # Vectorised element-wise comparison instead of a Python-level index loop
    result = np.asarray(y_test_predict) == np.asarray(y_test)

    # count / len (rather than np.mean) keeps the return value a plain Python float
    performance = np.count_nonzero(result)/len(result)

    return performance

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

# Dataset
df = data_prep.get_cleaned_dataset()

# NOTE(review): test_size=0.9966 leaves only ~0.34% of the rows for training —
# confirm this tiny training set is intentional.
X_train, X_test, y_train, y_test = prep_dataset(df, test_size=0.9966)

# K-Nearest Neighbours
knn = KNeighborsClassifier(n_neighbors=3)
knn_perf = basic_predict(X_train, X_test, y_train, y_test, knn)

# Decision Tree 
# TODO use something other than decision tree
dt = DecisionTreeClassifier()
dt_perf = basic_predict(X_train, X_test, y_train, y_test, dt)

# Naive Bayes
gnb = GaussianNB()
# GaussianNB requires dense input, while the one-hot encoded features are
# sparse by default, so densify them for this model only (no-op if the
# features are already dense).
X_train_dense = X_train.toarray() if hasattr(X_train, "toarray") else X_train
X_test_dense = X_test.toarray() if hasattr(X_test, "toarray") else X_test
# Score the Naive Bayes model itself (previously `dt` was passed here, so the
# "gnb performance" line actually reported a second decision-tree run).
gnb_perf = basic_predict(X_train_dense, X_test_dense, y_train, y_test, gnb)

print(f"knn performance: {knn_perf}")
print(f"dt performance: {dt_perf}")
print(f"gnb performance: {gnb_perf}")


In [None]:
###########
### OLD ###
###########

# Try some knn
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

train_test_ratio = 0.8

# performance_arr[i] holds the accuracy obtained with n_neighbors = i + 1
performance_arr = []

# Loop-invariant setup, hoisted out of the sweep.
# NOTE: last_train_row is not used by the sweep below (train_test_split does
# the splitting); kept only because it was defined here originally.
last_train_row = int(len(df) * train_test_ratio)

X = df.drop('Class', axis=1).values     # Features
y = df['Class'].values                  # Class labels

X = OneHotEncoder().fit_transform(X)

for n in range(1, 101):
    # A fresh random split is drawn for every candidate n (as before), so the
    # scores are noisy and differ between runs.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

    # k-NN cannot use more neighbours than there are training samples
    if n > X_train.shape[0]:
        break

    # Use the loop variable as the neighbour count (was hard-coded to 1, which
    # made every iteration evaluate the same model).
    knn = KNeighborsClassifier(n_neighbors=n)

    knn.fit(X_train, y_train)

    y_predict = knn.predict(X_test)

    # Vectorised element-wise comparison of predictions against the truth
    result = np.asarray(y_predict) == np.asarray(y_test)

    performance = np.count_nonzero(result)/len(result)

    performance_arr.append(performance)

# The sweep starts at n = 1, so the neighbour count is the list index + 1
best_idx = performance_arr.index(max(performance_arr))
best_n = best_idx + 1
print(best_n)

print(performance_arr[best_idx])