In [30]:
!pip install pandas
!pip install scikit-learn

import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans, Birch
from pandas.plotting import table



In [10]:
# Read the banknote authentication CSV into a DataFrame and preview its first rows.
dataset_path = 'project_files/openML_banknote_authentication_dataset.csv'
data = pd.read_csv(dataset_path)
data.head()

Unnamed: 0,id,V1,V2,V3,V4,Class
0,1,3.6216,8.6661,-2.8073,-0.44699,1
1,2,4.5459,8.1674,-2.4586,-1.4621,1
2,3,3.866,-2.6383,1.9242,0.10645,1
3,4,3.4566,9.5228,-4.0112,-3.5944,1
4,5,0.32924,-4.4552,4.5718,-0.9888,1


In [16]:
# Box plot of V1 and V2 on a dedicated figure; the figure is written to disk
# and then closed so it does not stay open after the cell.
boxplot_image = plt.figure()
boxplot = data.boxplot(column=['V1', 'V2'], ax=boxplot_image.gca())
boxplot.set_title("V1 and V2 boxplot")
boxplot_image.savefig("images/boxplot_image.png")
plt.close()

### Box plot of the two features V1 and V2, used to spot any outliers in the dataset

### Create an image to save the description of the simplified table

In [66]:
# Min-max scale every column of the frame: (value - column min) / (column max - column min).
# NOTE(review): this rescales `id` and `Class` as well as V1-V4, and the loop
# below compares the scaled `Class` against 0 and 1 -- confirm that is intended.
column_min, column_max = data.min(), data.max()
normed_data = (data - column_min) / (column_max - column_min)
normed_data.head()  # not rendered: only the last expression of a cell is displayed

# True classes: a label's position in this list is the scaled `Class` value it names.
label_name = ['genuine', 'fake']

for class_value in range(len(label_name)):
    class_rows = normed_data.loc[normed_data['Class'] == class_value]
    plt.scatter(class_rows['V1'], class_rows['V2'],
                label=label_name[class_value], alpha=0.4)

axes = plt.gca()
axes.legend(loc="lower right")
axes.set_xlabel('V1')
axes.set_ylabel('V2')
axes.set_title('Banknote authentication correct labels')
plt.savefig("images/banknote_authentication_correct_labels.png")
plt.close()

### Scatter plot of the dataset (V1 against V2), with each point labelled as a genuine or fake banknote

In [69]:
# Cluster the scaled V1/V2 features into two groups.
# random_state pins the centroid initialisation so the clustering, the accuracy
# printed below and every figure built from `Clusters` are the same on each run.
# n_init is spelled out because its default differs between scikit-learn versions.
kmeans_res = KMeans(n_clusters=2, n_init=10, random_state=1)
cluster_groups = kmeans_res.fit_predict(normed_data[['V1', 'V2']])

# K-means numbers its clusters arbitrarily, so cluster 0 need not be class 0.
# If the raw ids disagree with `Class` on most rows, swap 0 and 1 so that the
# comparison measures the clustering rather than the luck of the numbering.
# (After a swap, `Clusters` ids no longer match the order of
# kmeans_res.labels_ / kmeans_res.cluster_centers_.)
if (cluster_groups == normed_data['Class'].values).mean() < 0.5:
    cluster_groups = 1 - cluster_groups

normed_data['Clusters'] = cluster_groups

# Rows whose (aligned) cluster id differs from the true class.
inaccurate = normed_data[normed_data['Class'] != normed_data['Clusters']]
# Share of rows where cluster id and class agree, as a percentage.
accuracy = (len(normed_data) - len(inaccurate)) / len(normed_data) * 100
print(f'accuracy: {accuracy}')

accuracy: 87.09912536443149


In [135]:
# Scatter V1 against V2 with one colour per K-means cluster id.
# A label's position in this list is the cluster id it is shown for.
label_name_pred = ['predicted genuine', 'predicted fake']

fig, ax = plt.subplots()
for cluster_id, cluster_label in enumerate(label_name_pred):
    members = normed_data[normed_data['Clusters'] == cluster_id]
    ax.scatter(members['V1'], members['V2'], label=cluster_label, alpha=0.4)

ax.legend(loc="lower right")
ax.set_xlabel('V1')
ax.set_ylabel('V2')
ax.set_title('Prediction of K-means algorithm')

fig.tight_layout()
fig.savefig("images/Kmeans_prediction.png")
plt.close(fig)

In [136]:
# Highlight the rows where the K-means cluster id disagrees with the true class,
# drawn over the full dataset coloured by true class.
kmeans_mismatch = normed_data['Class'] != normed_data['Clusters']
inaccurate_kmean = normed_data[kmeans_mismatch]

# Background layer: every row, coloured by its true class.
plt.scatter(normed_data['V1'], normed_data['V2'],
            c=normed_data['Class'], cmap="coolwarm", alpha=0.4, label=None)
# Foreground layer: only the disagreeing rows, in cyan.
plt.scatter(inaccurate_kmean['V1'], inaccurate_kmean['V2'],
            c='c', alpha=0.7, label="inaccuracies")

axes = plt.gca()
axes.legend(loc="lower right")
axes.set(xlabel='V1', ylabel='V2',
         title=f'K-means comparison with actual classification, accuracy: {round(accuracy,2)}%')

plt.savefig("images/Kmeans_inaccuracies_over_accurate.png")
plt.close()

In [74]:
# Same mismatch set as the comparison figure, but shown on its own.
inaccurate_kmean = normed_data.loc[normed_data['Class'] != normed_data['Clusters']]

fig, ax = plt.subplots()
# The full dataset is plotted with alpha=0, i.e. fully transparent.
# NOTE(review): presumably kept so the axes cover the same range as the
# comparison figure -- confirm.
ax.scatter(normed_data['V1'], normed_data['V2'],
           alpha=0, c=normed_data['Class'], cmap="coolwarm", label=None)
ax.scatter(inaccurate_kmean['V1'], inaccurate_kmean['V2'],
           c='c', alpha=0.7, label="inaccuracies")

ax.legend(loc="lower right")
ax.set_xlabel('V1')
ax.set_ylabel('V2')
ax.set_title('K-means inaccuracies')

fig.savefig("images/Kmeans_inaccuracies.png")
plt.close(fig)

In [79]:
# Cluster all four scaled features (V1-V4) into two groups.
# random_state pins the centroid initialisation so the clustering, the accuracy
# printed below and the figures built from `Clusters_curse` are reproducible.
# n_init is spelled out because its default differs between scikit-learn versions.
curse_of_dimensionality_res = KMeans(n_clusters=2, n_init=10, random_state=1)
cluster_groups_curse = curse_of_dimensionality_res.fit_predict(
    normed_data[['V1', 'V2', 'V3', 'V4']]
)

# K-means numbers its clusters arbitrarily, so cluster 0 need not be class 0.
# If the raw ids disagree with `Class` on most rows, swap 0 and 1 so that the
# comparison measures the clustering rather than the luck of the numbering.
# (After a swap, `Clusters_curse` ids no longer match the order of the fitted
# model's labels_ / cluster_centers_.)
if (cluster_groups_curse == normed_data['Class'].values).mean() < 0.5:
    cluster_groups_curse = 1 - cluster_groups_curse

normed_data['Clusters_curse'] = cluster_groups_curse

# Rows whose (aligned) cluster id differs from the true class.
inaccurate_curse = normed_data[normed_data['Class'] != normed_data['Clusters_curse']]
# Share of rows where cluster id and class agree, as a percentage.
accuracy_curse = (len(normed_data) - len(inaccurate_curse)) / len(normed_data) * 100
print(f'accuracy: {accuracy_curse}')

accuracy: 57.5801749271137


In [138]:
# Scatter V1 against V2, coloured by the cluster id from the 4-feature K-means run.
# Only V1 and V2 are drawn even though the clustering used V1-V4.
predicted_labels = dict(enumerate(label_name_pred))
for cluster_id, cluster_label in predicted_labels.items():
    members = normed_data.loc[normed_data['Clusters_curse'] == cluster_id]
    plt.scatter(members['V1'], members['V2'], label=cluster_label, alpha=0.4)

axes = plt.gca()
axes.legend(loc="lower right")
axes.set(xlabel='V1', ylabel='V2',
         title='Prediction of K-means algorithm with 4 features')

plt.tight_layout()
plt.savefig("images/Kmeans_curse_of_dimensionality.png")
plt.close()

In [140]:
# Rows where the 4-feature cluster id disagrees with the true class.
# This rebinds `inaccurate_kmean`, replacing the 2-feature mismatch set.
curse_mismatch = normed_data['Class'] != normed_data['Clusters_curse']
inaccurate_kmean = normed_data[curse_mismatch]

fig, ax = plt.subplots()
# Background layer: every row, coloured by its true class.
ax.scatter(normed_data['V1'], normed_data['V2'],
           alpha=0.4, c=normed_data['Class'], cmap="coolwarm", label=None)
# Foreground layer: the disagreeing rows, in cyan.
ax.scatter(inaccurate_kmean['V1'], inaccurate_kmean['V2'],
           c='c', alpha=0.7, label="inaccuracies")

ax.legend(loc="lower right")
ax.set_xlabel('V1')
ax.set_ylabel('V2')
ax.set_title(f'K-means with 4 features comparison over actual classifications, accuracy: {round(accuracy_curse,2)}%')

fig.savefig("images/Kmeans_curse_inaccuracies_comparison.png")
plt.close(fig)

In [110]:
# Mismatches of the 4-feature K-means run on their own.
# Relies on `inaccurate_kmean` as last assigned; in the cell order shown here
# that is the 4-feature mismatch set.
hidden_background = dict(alpha=0, c=normed_data['Class'], cmap="coolwarm", label=None)
mismatch_markers = dict(c='c', alpha=0.7, label="inaccuracies")

# alpha=0 makes the full-dataset layer fully transparent.
# NOTE(review): presumably kept so the axes cover the same range as the
# comparison figure -- confirm.
plt.scatter(normed_data['V1'], normed_data['V2'], **hidden_background)
plt.scatter(inaccurate_kmean['V1'], inaccurate_kmean['V2'], **mismatch_markers)

axes = plt.gca()
axes.legend(loc="lower right")
axes.set_xlabel('V1')
axes.set_ylabel('V2')
axes.set_title('K-means with 4 features inaccuracies')

plt.savefig("images/Kmeans_curse_inaccuracies.png")
plt.close()

In [115]:
# KNearest Neighbour
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Feature matrix for the 2-feature model: the scaled V1 and V2 columns.
# The columns are selected by name rather than by dropping everything else, so
# this cell does not need the K-means cells to have added their `Clusters*`
# columns first, and a column added to `normed_data` later cannot silently
# become a model feature.
X = normed_data[['V1', 'V2']]
X.head()

Unnamed: 0,V1,V2
0,0.769004,0.839643
1,0.835659,0.820982
2,0.786629,0.416648
3,0.757105,0.871699
4,0.531578,0.348662


In [116]:
# Target vector: the scaled `Class` value of every row, as an array.
y = normed_data['Class'].values

# Hold out 20% of the rows for testing; the split is seeded and stratified on y.
split_options = dict(test_size=0.2, random_state=1, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [117]:
# KNN classifier using the 2 nearest neighbours, fitted on the training split.
knn = KNeighborsClassifier(n_neighbors=2).fit(X_train, y_train)

# Predicted class for every row of the test split.
knn_res = knn.predict(X_test)

# Score of the model on the test split; displayed as the cell's output.
knn.score(X_test, y_test)

0.9236363636363636

split the dataset into training and testing data
- the training data is the data the model will learn from
- the testing data will be used to see how well the model performs on unseen data

test_size=0.2
- this means that 20% of all the data will be used for testing
- the remaining 80% will be used as training data for the model to learn from

random_state=1
- ensures we get the same split each time, so that we can reproduce our results

stratify=y
- makes sure each split preserves the proportion of each value in the y variable
- if the dataset is 25% genuine and 75% fake:
    - stratify=y will ensure that the random split also has 25% genuine and 75% fake

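n_neighbors = 3 (example):
- if 2 of the 3 nearest points are fake, then the new data point will be labelled as fake.
- the new data point is labelled by majority vote among its 3 nearest points.
- note: the classifiers in this notebook are created with n_neighbors = 2.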

In [141]:
# Predict a class for every row of the scaled dataset with the 2-feature KNN
# model. The features are selected by name so the frame passed to predict()
# has exactly the V1/V2 columns, independent of what else `normed_data` holds.
dropped_col = normed_data[['V1', 'V2']]
test = normed_data.copy()
test['KNN'] = knn.predict(dropped_col)

# One scatter series per predicted class value.
for index, label in enumerate(label_name_pred):
    temp = test[test['KNN'] == index]
    plt.scatter(temp['V1'], temp['V2'], label=label, alpha=0.4)

# Score on the held-out test split, as a percentage; used in later figure titles.
knn_accuracy = knn.score(X_test, y_test) * 100

plt.legend()
plt.xlabel('V1')
plt.ylabel('V2')
plt.title('Prediction of KNN')

# tight_layout must run before savefig: it only adjusts the figure from the
# moment it is called, so saving first wrote the image without the adjustment.
plt.tight_layout()
plt.savefig("images/KNN_prediction.png")
plt.close()

In [142]:
# Rows of the full dataset where the KNN prediction differs from the true class.
knn_mismatch = test['Class'] != test['KNN']
inaccurate_knn = test[knn_mismatch]

fig, ax = plt.subplots()
# Background layer: every row, coloured by its true class.
ax.scatter(normed_data['V1'], normed_data['V2'],
           alpha=0.4, c=normed_data['Class'], cmap="coolwarm", label=None)
# Foreground layer: the mispredicted rows, in cyan.
ax.scatter(inaccurate_knn['V1'], inaccurate_knn['V2'],
           c='c', alpha=0.7, label="inaccuracies")

ax.legend(loc="lower right")
# NOTE(review): `knn_accuracy` is the score on the test split only, while the
# highlighted rows come from the whole dataset -- confirm the title is intended.
ax.set(xlabel='V1', ylabel='V2',
       title=f'KNN comparison with actual classification, accuracy: {round(knn_accuracy,2)}%')
fig.savefig("images/KNN_inaccuracies_comparison.png")
plt.close(fig)

In [125]:
# KNN mispredictions on their own; uses `inaccurate_knn` from the comparison cell.
hidden_background = dict(alpha=0, c=normed_data['Class'], cmap="coolwarm", label=None)

# alpha=0 makes the full-dataset layer fully transparent.
# NOTE(review): presumably kept so the axes cover the same range as the
# comparison figure -- confirm.
plt.scatter(normed_data['V1'], normed_data['V2'], **hidden_background)
plt.scatter(inaccurate_knn['V1'], inaccurate_knn['V2'],
            c='c', alpha=0.7, label="inaccuracies")

axes = plt.gca()
axes.legend(loc="lower right")
axes.set_xlabel('V1')
axes.set_ylabel('V2')
axes.set_title('KNN inaccuracies')
plt.savefig("images/KNN_inaccuracies.png")
plt.close()

In [151]:
# KNN with more features
# Feature matrix for the 4-feature model: the scaled V1-V4 columns, selected by
# name so this cell does not depend on the K-means cells having added their
# `Clusters*` columns, and so no later column can slip in as a feature.
X_more = normed_data[['V1', 'V2', 'V3', 'V4']]
# Target vector: the scaled `Class` value of every row.
y_more = normed_data['Class'].values
# Hold out 20% of the rows for testing; the split is seeded and stratified on y_more.
X_more_train, X_more_test, y_more_train, y_more_test = train_test_split(
    X_more, y_more,
    test_size=0.2,
    random_state=1,
    stratify=y_more,
)
knn_more = KNeighborsClassifier(n_neighbors=2)

# Fit the classifier on the training split.
knn_more.fit(X_more_train, y_more_train)

# Predicted class for every row of the test split.
knn_more_res = knn_more.predict(X_more_test)
# Score of the model on the test split; displayed as the cell's output.
knn_more.score(X_more_test, y_more_test)

1.0

In [153]:
# Predict a class for every row of the scaled dataset with the 4-feature KNN
# model. The features are selected by name so the frame passed to predict()
# has exactly the V1-V4 columns, independent of what else `normed_data` holds.
dropped_col_less = normed_data[['V1', 'V2', 'V3', 'V4']]
test['KNN_more'] = knn_more.predict(dropped_col_less)

# One scatter series per predicted class value (only V1 and V2 are drawn).
for index, label in enumerate(label_name_pred):
    temp = test[test['KNN_more'] == index]
    plt.scatter(temp['V1'], temp['V2'], label=label, alpha=0.4)

# Score on the held-out test split, as a percentage; used in later figure titles.
knn_more_accuracy = knn_more.score(X_more_test, y_more_test) * 100

plt.legend()
plt.xlabel('V1')
plt.ylabel('V2')
plt.title('Prediction of KNN with 4 features')
plt.tight_layout()
plt.savefig('images/KNN_4_features_prediction.png')
plt.close()

In [154]:
# Rows of the full dataset where the 4-feature KNN prediction differs from the true class.
inaccurate_knn_more = test.loc[test['Class'] != test['KNN_more']]

# Background layer: every row, coloured by its true class.
plt.scatter(normed_data['V1'], normed_data['V2'],
            c=normed_data['Class'], cmap="coolwarm", alpha=0.4, label=None)
# Foreground layer: the mispredicted rows, in cyan.
plt.scatter(inaccurate_knn_more['V1'], inaccurate_knn_more['V2'],
            c='c', alpha=0.7, label="inaccuracies")

axes = plt.gca()
axes.legend(loc="lower right")
axes.set_xlabel('V1')
axes.set_ylabel('V2')
# NOTE(review): `knn_more_accuracy` is the score on the test split only, while
# the highlighted rows come from the whole dataset -- confirm the title is intended.
axes.set_title(f'KNN 4 features comparison with actual classification, accuracy: {round(knn_more_accuracy,2)}%')
plt.savefig('images/KNN_4_features_inaccuracies_comparison.png')
plt.close()

In [155]:
# 4-feature KNN mispredictions on their own; uses `inaccurate_knn_more` from
# the comparison cell.
fig, ax = plt.subplots()
# alpha=0 makes the full-dataset layer fully transparent.
# NOTE(review): presumably kept so the axes cover the same range as the
# comparison figure -- confirm.
ax.scatter(normed_data['V1'], normed_data['V2'],
           alpha=0, c=normed_data['Class'], cmap="coolwarm", label=None)
ax.scatter(inaccurate_knn_more['V1'], inaccurate_knn_more['V2'],
           c='c', alpha=0.7, label="inaccuracies")

ax.legend(loc="lower right")
ax.set(xlabel='V1', ylabel='V2', title='KNN 4 features inaccuracies')
fig.savefig('images/KNN_4_features_inaccuracies.png')
plt.close(fig)