In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
#read file in
knn_data = pd.read_csv('knn-training-data.txt', sep='\t')

In [3]:
#change string values to numerical values
label_encoder = LabelEncoder()
knn_data['most_popular_category'] = label_encoder.fit_transform(knn_data['most_popular_category'])

In [5]:
#Split into testing and training

x = knn_data.iloc[:,1:4].values
y = knn_data.iloc[:, 5].values

In [6]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20)

In [43]:
#normalize data
scaler = StandardScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

In [8]:
#train model
classifier = KNeighborsClassifier(n_neighbors=12)
classifier.fit(x_train, y_train)

In [9]:
#Use model to predict
y_pred = classifier.predict(x_test)

In [10]:
#Evaluate model
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

      cheapo       0.97      0.98      0.98       790
    explorer       0.96      0.97      0.96       489
  loyal core       0.99      0.96      0.97       897
      newbie       0.99      0.99      0.99      1044

    accuracy                           0.98      3220
   macro avg       0.97      0.98      0.97      3220
weighted avg       0.98      0.98      0.98      3220

[[ 778    1    8    3]
 [   0  472    5   12]
 [  22   15  860    0]
 [   3    6    0 1035]]


In [11]:
#read in test data
knn_data_test = pd.read_csv('knn-testing-data.txt', sep='\t')

In [13]:
#change string values to numerical values
label_encoder = LabelEncoder()
knn_data_test['most_popular_category'] = label_encoder.fit_transform(knn_data_test['most_popular_category'])

In [15]:
#build test data set
x_new = knn_data_test.iloc[:,1:4].values


In [22]:
#normalize data
scaler.fit(x_new)
x_new = scaler.transform(x_new)

In [23]:
#Run model on test data
knn_pred = classifier.predict(x_new)

In [44]:
#add predicted segment to test data
knn_data_test['predicted_segment'] = knn_pred

In [28]:
#Read in actual values
true_values = pd.read_csv('testing-true-values.txt', sep='\t')

In [35]:
#compare predicted values to actual values
comparison = knn_data_test['predicted_segment'] == (true_values['segment'])

In [36]:
comparison.value_counts()

True     1725
False      64
Name: count, dtype: int64

In [42]:
#calculate accuracy
correct = comparison.sum()
total = len(comparison)
accuracy = correct/total
accuracy

0.9642258244829514