In [21]:
# Import libraries
import os 
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold as sk
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn import datasets
from sklearn.metrics import accuracy_score as acs
from sklearn.tree import export_graphviz
from matplotlib.colors import ListedColormap
from sklearn.model_selection import KFold, cross_val_score
from sklearn.tree import DecisionTreeClassifier


# Load the cleaned dataset (path is relative to the notebook's working directory)
df = pd.read_csv('cleaneddata.csv')

# Split the frame into the feature matrix and the target vector
X = df.drop('Condition', axis=1)  # Features: every column except the target 'Condition'
y = df['Condition']               # Target column

# Get feature names from DataFrame columns
feature_names = X.columns.tolist()

# Class names as strings, in SORTED label order.
# scikit-learn orders classes by sorted label value (estimator.classes_), and
# helpers such as export_graphviz(class_names=...) or
# classification_report(target_names=...) expect names in that same order.
# Series.unique() returns labels in order of first appearance, which would
# attach the wrong name to a class whenever the rows are not already sorted.
# Sorting happens on the original label values (before the str conversion) so
# that numeric labels sort numerically rather than lexicographically.
class_names = np.unique(y).astype(str).tolist()

# Initialize the model; the fixed random_state keeps tree construction reproducible
model = DecisionTreeClassifier(random_state=42)

# Sweep the number of cross-validation folds and record the mean and spread
# of the fold scores for each setting.
# NOTE(review): k here is the fold count of the evaluation, not a model
# hyperparameter; the "best" k is simply the one with the highest mean score.
results = []

# Fold counts 2 through 20 inclusive
for k in range(2, 21):
    scores = cross_val_score(model, X, y, cv=k)
    results.append((k, np.mean(scores), np.std(scores)))

# Pick the first k whose mean score is strictly higher than every earlier one
best_k, best_score = None, -np.inf
for candidate_k, candidate_mean, _candidate_std in results:
    if candidate_mean > best_score:
        best_k, best_score = candidate_k, candidate_mean

# Tabulate the sweep for a clear summary
results_df = pd.DataFrame(
    results,
    columns=["k", "Average Score", "Standard Deviation"],
)

# Report the winner first, then the full table
print(f"The optimal k-value is {best_k} with an average score of {best_score:.4f}.")
print("\nSummary of scores for each k-value:")
print(results_df)

The optimal k-value is 3 with an average score of 0.8882.

Summary of scores for each k-value:
     k  Average Score  Standard Deviation
0    2       0.853448            0.043103
1    3       0.888214            0.031265
2    4       0.844828            0.071088
3    5       0.803261            0.091227
4    6       0.864035            0.127712
5    7       0.846113            0.074584
6    8       0.871429            0.071943
7    9       0.878917            0.053755
8   10       0.862879            0.054361
9   11       0.870248            0.083688
10  12       0.870370            0.076084
11  13       0.869658            0.076566
12  14       0.864087            0.072584
13  15       0.863095            0.071627
14  16       0.872768            0.087225
15  17       0.845938            0.114658
16  18       0.865079            0.117181
17  19       0.845865            0.123445
18  20       0.846667            0.139204
