In [3]:
import pandas as pd
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Load the dataset
file_path = 'DB_Voice_Features.csv'
data = pd.read_csv(file_path)

# Separate features and labels: 'status' is the target; 'name' and 'status'
# are both excluded from the feature matrix.
X = data.drop(['name', 'status'], axis=1)
y = data['status']

# Split the data into training (80%) and testing (20%) sets; the fixed
# random_state keeps the split reproducible across runs.
# NOTE(review): the split is not stratified. If 'status' is imbalanced,
# consider passing stratify=y so both splits keep the class ratio -- confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features for k-NN. The scaler is fitted on the training
# split only and then applied unchanged to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Part (a): Apply k-NN without optimization
knn = KNeighborsClassifier(n_neighbors=5)  # Starting with k=5
knn.fit(X_train_scaled, y_train)
y_pred = knn.predict(X_test_scaled)

# Evaluate performance
print("Performance before optimization:")
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Part (b): Hyperparameter tuning using GridSearchCV
# The scaler is placed in a Pipeline with the classifier so that it is
# re-fitted on the training portion of every CV fold. Searching directly on
# X_train_scaled would let each validation fold influence the scaling
# statistics (data leakage) and bias the cross-validated scores.
# NOTE: output saved from a run that searched on X_train_scaled may differ
# from what this version prints; re-run the cell to refresh it.
knn_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier()),
])
# Pipeline step parameters are addressed as '<step name>__<parameter>'.
param_grid = {'knn__n_neighbors': range(1, 31)}
grid_search = GridSearchCV(knn_pipeline, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Best k value after optimization
best_k = grid_search.best_params_['knn__n_neighbors']
print(f"Best k value: {best_k}")

# Re-train k-NN with the best parameter on the full scaled training split.
# knn_optimized stays a plain KNeighborsClassifier that expects scaled
# features, consistent with `knn` from Part (a).
knn_optimized = KNeighborsClassifier(n_neighbors=best_k)
knn_optimized.fit(X_train_scaled, y_train)
y_pred_optimized = knn_optimized.predict(X_test_scaled)

# Part (c): Evaluate performance after optimization
print("Performance after optimization:")
print(confusion_matrix(y_test, y_pred_optimized))
print(classification_report(y_test, y_pred_optimized))


Performance before optimization:
[[ 5  2]
 [ 0 32]]
              precision    recall  f1-score   support

           0       1.00      0.71      0.83         7
           1       0.94      1.00      0.97        32

    accuracy                           0.95        39
   macro avg       0.97      0.86      0.90        39
weighted avg       0.95      0.95      0.95        39

Best k value: 1
Performance after optimization:
[[ 7  0]
 [ 1 31]]
              precision    recall  f1-score   support

           0       0.88      1.00      0.93         7
           1       1.00      0.97      0.98        32

    accuracy                           0.97        39
   macro avg       0.94      0.98      0.96        39
weighted avg       0.98      0.97      0.98        39

