# In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.impute import SimpleImputer
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Location of the worksheet CSV; change this to wherever the file lives
file_path = '/content/filled Worksheet csv .csv'
# Read the worksheet into a DataFrame for the preprocessing steps below
data = pd.read_csv(filepath_or_buffer=file_path)

# Feature/target preparation: the listed columns are dropped below and the target is separated;
# this helper coerces the remaining columns to numeric (unparseable entries become NaN)
def safe_convert_to_numeric(series):
    """Return *series* parsed as numbers, with unparseable entries set to NaN.

    The input is handed to ``pandas.to_numeric`` with ``errors='coerce'``,
    so values that cannot be interpreted as numbers do not raise; they are
    replaced by NaN in the result.
    """
    numeric_values = pd.to_numeric(series, errors='coerce')
    return numeric_values

# Build the feature matrix: drop the identifier column, the target column and the
# stray unnamed column (errors='ignore' tolerates any of them being absent), then
# coerce every remaining column to numeric so unparseable cells become NaN.
X = data.drop(columns=['HEA combinations', 'Phases', 'Unnamed: 20'], errors='ignore')
X = X.apply(safe_convert_to_numeric)
y = data['Phases']  # Target variable

# Rows without a phase label cannot be used for supervised training. Left in place,
# pd.factorize would encode them as -1, i.e. an extra "class" that has no entry in
# unique_labels and breaks the label/name correspondence used during evaluation.
has_label = y.notna()
X = X.loc[has_label]
y = y.loc[has_label]

# Relabel target values to sequential integers (0 .. n_classes - 1)
y_mapped, unique_labels = pd.factorize(y)

# Decide the 80/20 split on row positions FIRST, so that the imputation means and
# the scaling factors below are learned from training rows only (no test-set
# leakage). For a given number of rows, random_state=42 yields the same partition
# as splitting the feature matrix itself.
row_positions = np.arange(len(X))
train_rows, test_rows = train_test_split(row_positions, test_size=0.2, random_state=42)

# Impute missing values with the per-column mean of the training rows
imputer = SimpleImputer(strategy='mean')
imputer.fit(X.iloc[train_rows])
X_imputed = imputer.transform(X)

# Normalize each column by its largest absolute training value. For non-negative
# columns this equals dividing by the column maximum; using the absolute value
# keeps columns containing negative numbers within [-1, 1] on the training rows,
# and the zero guard avoids 0/0 -> NaN for columns that are entirely zero.
column_scale = np.max(np.abs(X_imputed[train_rows]), axis=0)
column_scale[column_scale == 0] = 1.0
X_imputed = X_imputed / column_scale

# Materialize the training and testing sets (80% training, 20% testing)
X_train, X_test = X_imputed[train_rows], X_imputed[test_rows]
y_train, y_test = y_mapped[train_rows], y_mapped[test_rows]

# Define the KNN hyperparameter search space
param_grid = {
    'n_neighbors': [3, 5, 7, 10],        # Number of neighbors
    'weights': ['uniform', 'distance'],  # Weighting methods
    # Distance metrics. 'minkowski' is intentionally not listed: with the default
    # p=2 it is the same distance as 'euclidean', so it only added duplicate
    # candidates (a third of all fits) that could at best tie with 'euclidean'.
    'metric': ['euclidean', 'manhattan'],
}

# 5-fold cross-validated grid search scored on accuracy, using all available cores
knn_model = KNeighborsClassifier()
grid_search = GridSearchCV(knn_model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Perform hyperparameter tuning on the training split only
grid_search.fit(X_train, y_train)

# Keep the best estimator found by the search and report its parameters
best_knn_model = grid_search.best_estimator_
print("Best Parameters:", grid_search.best_params_)

# Make predictions on the held-out test set
y_pred_knn = best_knn_model.predict(X_test)

# Evaluate against the FULL set of encoded classes. Without an explicit `labels`
# argument the metrics only cover classes that occur in y_test / y_pred_knn, so a
# class missing from the test split would shrink the confusion matrix (misaligning
# the tick labels) and make classification_report reject `target_names` because
# the counts differ.
class_ids = np.arange(len(unique_labels))
# Display names must be strings for the report's column formatting
class_names = [str(label) for label in unique_labels]

# Calculate the confusion matrix (rows: true class, columns: predicted class)
conf_matrix_knn = confusion_matrix(y_test, y_pred_knn, labels=class_ids)

# Plot the confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix_knn, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names)
plt.title('Confusion Matrix - KNN')
plt.xlabel('Predicted Phases')
plt.ylabel('True Phases')
plt.show()

# Print accuracy score
accuracy_knn = accuracy_score(y_test, y_pred_knn)
print(f'KNN Accuracy: {accuracy_knn:.2f}')

# Print detailed classification report; zero_division=0 reports 0.0 (without a
# warning) for classes that have no true or no predicted samples in the test set
print("\nClassification Report:")
print(classification_report(y_test, y_pred_knn, labels=class_ids,
                            target_names=class_names, zero_division=0))

