# In [None]:
# Step 1: Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Step 2: Load the Dataset
file_path = 'Diabetes-dataset.csv'

# Only the read itself is guarded; the success messages live in the `else`
# branch so the `try` body stays limited to the call that can actually fail.
try:
    df = pd.read_csv(file_path)
except FileNotFoundError:
    # Report the path that was really attempted (the previous message
    # hard-coded a differently-cased file name) and stop with a non-zero
    # status so callers/shells can detect the failure.
    print(f"Error: The file '{file_path}' was not found. Please check the file path.")
    raise SystemExit(1) from None
else:
    # Display the first few rows as a quick sanity check of the load.
    print("Dataset Loaded Successfully!")
    print("First 5 Rows of the Dataset:")
    print(df.head())

# Step 3: Data Preprocessing
# Report how many values are missing in each column before any rows are removed.
print("\nMissing Values in Each Column:")
print(df.isnull().sum())

# Drop rows with any missing values (optional: can replace with imputation if necessary)
df = df.dropna()
print("\nAfter Dropping Missing Values, Dataset Shape:", df.shape)

# Show the header names exactly as read, so naming problems are visible in the output.
print("\nDataset Columns:", df.columns)

# Normalise header names (trim surrounding whitespace, upper-case) so the
# checks and column lookups below do not depend on the CSV's exact spelling.
df.columns = df.columns.str.strip().str.upper()
expected_columns = {'ID', 'NO_PATION', 'GENDER', 'AGE', 'UREA', 'CR', 'HBA1C',
                    'CHOL', 'TG', 'HDL', 'LDL', 'VLDL', 'BMI', 'CLASS'}

# Stop early if any required column is absent, naming the missing ones and
# exiting with a non-zero status (the site-provided exit() reported success).
missing_columns = expected_columns - set(df.columns)
if missing_columns:
    print("Error: Input file has inconsistent column names. Exiting...")
    print("Missing columns:", sorted(missing_columns))
    raise SystemExit(1)

# Drop irrelevant columns like 'ID' and 'NO_PATION'
df = df.drop(columns=['ID', 'NO_PATION'])

# Encode categorical variables
# LabelEncoder assigns codes in sorted order of the distinct values it sees,
# so case/whitespace variants (e.g. 'f', 'M ') would each get their own code.
# Normalising the text first keeps one code per gender.
# NOTE(review): the 0 = Female / 1 = Male mapping used by the input prompt
# assumes values such as F/M or Female/Male — confirm against the dataset.
label_encoder = LabelEncoder()
df['GENDER'] = label_encoder.fit_transform(
    df['GENDER'].astype(str).str.strip().str.upper()
)

# Trim stray whitespace from textual class labels so that e.g. 'N' and 'N '
# are counted (and learned) as the same class; non-string labels are untouched.
df['CLASS'] = df['CLASS'].map(
    lambda label: label.strip() if isinstance(label, str) else label
)

# Verify class distribution
class_counts = df['CLASS'].value_counts()
print("\nClass Distribution:")
print(class_counts)

# Handle classes with less than 2 samples: a stratified split needs at least
# two members per class, so rows of rarer classes are removed.
if (class_counts < 2).any():
    print("Warning: Dropping classes with fewer than 2 samples.")
    df = df[df['CLASS'].map(class_counts) >= 2]

# Define features (X) and target (y)
X = df.drop(columns=['CLASS'])
y = df['CLASS']

# Step 4: Split the Data
# The raw features are split BEFORE scaling so that the scaler's mean and
# variance are learned from the training partition only; fitting it on the
# full dataset would leak test-set statistics into training.
try:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
except ValueError:
    # train_test_split raises ValueError when stratification is impossible;
    # fall back to a plain random split with the same seed and test size.
    print("\nStratification failed due to insufficient class samples. Splitting without stratification.")
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale numerical features: fit on the training rows, then apply the same
# transformation to the held-out rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Kept so any later code that expects the fully scaled feature matrix still
# finds it; it now uses the training-fitted scaler.
X_scaled = scaler.transform(X)

print("\nData Split Successfully!")
print(f"Training Samples: {X_train.shape[0]}, Testing Samples: {X_test.shape[0]}")

# Step 5: Train the KNN Model
# fit() returns the estimator itself, so construction and training are chained.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# Step 6: Evaluate the Model
# Predict the held-out rows once and reuse the predictions for every metric.
y_pred = knn.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy: {accuracy:.2f}")

# Each remaining metric is printed as a heading followed by its result.
for section_title, metric_fn in (
    ("\nClassification Report:", classification_report),
    ("Confusion Matrix:", confusion_matrix),
):
    print(section_title)
    print(metric_fn(y_test, y_pred))

# Step 7: Predict for New User Input
print("\n--- Diabetes Detection Prediction ---")
print("Enter the following details to predict diabetes:")
print("\nInstructions:")
print("For Gender, enter '1' for Male and '0' for Female.")

# Collect user input dynamically
user_input = {}

try:
    # Gender must be one of the two codes the prompt advertises.
    gender = int(input("Gender (1 = Male, 0 = Female): "))
    if gender not in (0, 1):
        raise ValueError(f"Gender must be 0 or 1, got {gender}.")
    user_input['GENDER'] = gender

    # Every remaining feature is a free-form numeric measurement.
    for feature_name, prompt in (
        ('AGE', "Age: "),
        ('UREA', "Urea Level: "),
        ('CR', "Creatinine Level: "),
        ('HBA1C', "HbA1c Level: "),
        ('CHOL', "Cholesterol Level: "),
        ('TG', "Triglycerides Level: "),
        ('HDL', "HDL Level: "),
        ('LDL', "LDL Level: "),
        ('VLDL', "VLDL Level: "),
        ('BMI', "BMI: "),
    ):
        user_input[feature_name] = float(input(prompt))
except (ValueError, EOFError) as e:
    # Non-numeric text, an out-of-range gender code, or a closed input stream.
    print("Error in input. Please ensure correct numeric values are entered.")
    print("Details:", e)
else:
    try:
        # Convert input into a one-row DataFrame whose columns follow the
        # training feature order, so the scaler sees each value in the same
        # position it was fitted on (a missing feature raises KeyError here).
        user_data = pd.DataFrame([user_input])[list(X.columns)]

        # Scale input features with the already-fitted scaler, then predict.
        user_data_scaled = scaler.transform(user_data)
        prediction = knn.predict(user_data_scaled)
    except Exception as e:
        # Script-level boundary: report the failure instead of a traceback,
        # and do not mislabel it as an input-format problem.
        print("Error: prediction failed for the supplied values.")
        print("Details:", e)
    else:
        # CLASS is never encoded in this script, so the predicted label has
        # whatever form the dataset uses. Comparing it only to the integer 1
        # would report every string-labelled prediction as negative.
        # NOTE(review): the positive/negative label spellings below are
        # assumptions — confirm against the dataset's CLASS values.
        predicted_label = prediction[0]
        label_text = str(predicted_label).strip().upper()

        if label_text in {"1", "1.0", "Y", "YES"}:
            print("\nPrediction: The patient is likely to have diabetes (Positive Diagnosis).")
        elif label_text in {"0", "0.0", "N", "NO"}:
            print("\nPrediction: The patient is NOT likely to have diabetes (Negative Diagnosis).")
        else:
            # Unknown label: show it verbatim rather than guessing its meaning.
            print(f"\nPrediction: The model assigned class '{predicted_label}'.")
