In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import datasets
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the sample table; index_col=False stops pandas from using the first
# CSV column as the row index.
df = pd.read_csv(
    filepath_or_buffer='../Resources/weighted_samples_df.csv',
    index_col=False,
)

In [None]:
# Keep the frame's column labels and echo them for inspection.
# Note: this is every column of df, including 'disease' and 'sample_id',
# which are dropped from the model inputs further down.
features = df.columns
print(features)

In [None]:
# Model inputs: every column except the label ('disease', used as y below)
# and 'sample_id' (presumably a row identifier -- confirm against the CSV).
data = df.drop(columns=['disease', 'sample_id'])
data.head()

In [None]:
# Convert the feature frame and the 'disease' label column to NumPy arrays
# for scikit-learn.
X = data.values
y = df['disease'].values

# Sanity check: X and y must have the same number of rows.
print("shape: ", X.shape, y.shape)

In [None]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both splits; random_state pins the shuffle so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=0,
)

In [None]:
# Learn the standardization statistics from the training split only, so no
# information from the test split leaks into the scaling.
X_scaler = StandardScaler().fit(X_train)
X_scaler

In [None]:
# Apply the scaler fitted on the training split to both feature splits.
# Only the features are scaled; the labels are left untouched.
X_train_scaled, X_test_scaled = map(X_scaler.transform, (X_train, X_test))

In [None]:
# Baseline KNN with scikit-learn's default number of neighbours.
# KNN classifies by distance between feature vectors, so features on larger
# numeric scales would dominate the neighbour search if left unscaled.
# Bundling the scaler with the classifier means fit() learns the
# standardization from the training data, and predict()/score() apply that
# same transform to whatever raw features they are given.
classifier = make_pipeline(StandardScaler(), KNeighborsClassifier())
classifier.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out X_test split with the baseline classifier.
# Note: y_pred is reassigned in a later cell from the `knn` model.
y_pred = classifier.predict(X_test)

In [None]:
# Mean accuracy of the baseline classifier on the held-out split,
# printed as a percentage.
acc = classifier.score(X_test, y_test)
print(acc * 100)

In [None]:
# Sweep odd neighbour counts (odd k avoids tied votes between two classes)
# and record train/test accuracy on the standardized features for each k.
k_values = list(range(1, 40, 2))
train_scores = []
test_scores = []
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_scaled, y_train)
    train_score = knn.score(X_train_scaled, y_train)
    test_score = knn.score(X_test_scaled, y_test)
    train_scores.append(train_score)
    test_scores.append(test_score)
    print((f"k: {k}, Train/Test Score: {train_score:.3f}/{test_score:.3f}"))

# Leave `knn` bound to the best-scoring model instead of whichever k the loop
# happened to finish on (k=39), so the evaluation cells that follow describe
# the selected model. max() returns the first maximum, so ties go to the
# smallest k.
# NOTE(review): picking k on the test split gives an optimistic estimate;
# consider a validation split or cross-validation for the selection step.
best_k, best_test_score = max(zip(k_values, test_scores), key=lambda pair: pair[1])
knn = KNeighborsClassifier(n_neighbors=best_k)
knn.fit(X_train_scaled, y_train)
print(f"Best k: {best_k} (test accuracy {best_test_score:.3f})")

# Both curves share one axis, so label each line and keep the y-axis label
# neutral rather than calling it the testing score.
plt.plot(k_values, train_scores, marker='o', label="Training")
plt.plot(k_values, test_scores, marker="x", label="Testing")
plt.title('Testing and Training Scores')
plt.xlabel("k neighbors")
plt.ylabel("Accuracy score")
plt.legend()
plt.show()

In [None]:
# Report the accuracy of the model currently bound to `knn` (set by the
# k-sweep cell) on the standardized training split...
print(f"Accuracy of K-NN classifier on training set: {knn.score(X_train_scaled, y_train):.2f}")
# ...and on the standardized test split.
print(f"Accuracy of K-NN classifier on test set: {knn.score(X_test_scaled, y_test):.2f}")

In [None]:
# Ground-truth labels paired with the predictions of the model currently
# bound to `knn`, both for the standardized test split.
y_true, y_pred = y_test, knn.predict(X_test_scaled)

In [None]:
# Per-class precision, recall and F1 for the `knn` predictions.
print(classification_report(y_true=y_true, y_pred=y_pred))

In [None]:
# Weighted F1 score on the test split for every k from 1 to 39.
# The models are trained and evaluated on the standardized features, matching
# the accuracy sweep; KNN is distance-based, so raw feature scales would
# otherwise distort the neighbour search.
f1s = []

for i in range(1, 40):
    # A sweep-local name keeps the global `knn` used by the evaluation cells
    # from being overwritten.
    knn_i = KNeighborsClassifier(n_neighbors=i)
    knn_i.fit(X_train_scaled, y_train)
    pred_i = knn_i.predict(X_test_scaled)
    # average='weighted' averages the per-class F1 scores weighted by the
    # number of true samples in each class.
    f1s.append(f1_score(y_test, pred_i, average='weighted'))

In [None]:
# Plot the weighted F1 score collected for each k in 1..39.
plt.figure(figsize=(12, 6))
plt.plot(
    range(1, 40),
    f1s,
    color='blue',
    linestyle='dashed',
    marker='o',
    markerfacecolor='orange',
    markersize=10,
)
plt.xlabel('K Value')
plt.title('F1 Score K Value')
plt.ylabel('F1 Score')

In [None]:
# KNN model with k=10, trained and evaluated on the standardized features.
# KNN compares samples by distance, so it should see the same scaled inputs
# that were prepared for it rather than the raw feature values.
# NOTE(review): k=10 is hard-coded; confirm it still matches the k suggested
# by the sweeps above.
classifier10 = KNeighborsClassifier(n_neighbors=10)
classifier10.fit(X_train_scaled, y_train)
y_pred10 = classifier10.predict(X_test_scaled)
print(classification_report(y_test, y_pred10))