In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB, ComplementNB, CategoricalNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

# Load dataset
df = pd.read_csv("diabetes.csv")  # Change path if needed

# Separate features and target
X = df.drop("Outcome", axis=1)
y = df["Outcome"]

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Scaling for models that need it
scaler = MinMaxScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X.columns)

# Binarized for BernoulliNB
X_train_bin = (X_train_scaled > 0.5).astype(int)
X_test_bin = (X_test_scaled > 0.5).astype(int)

# Categorical for CategoricalNB
X_train_cat = X_train_scaled.apply(lambda col: pd.qcut(col, q=4, labels=False, duplicates="drop"))
X_test_cat = X_test_scaled.apply(lambda col: pd.qcut(col, q=4, labels=False, duplicates="drop"))

# Classifiers to compare
models = {
    "GaussianNB": (GaussianNB(), X_train, X_test),
    "MultinomialNB": (MultinomialNB(), X_train_scaled, X_test_scaled),
    "BernoulliNB": (BernoulliNB(), X_train_bin, X_test_bin),
    "ComplementNB": (ComplementNB(), X_train_scaled, X_test_scaled),
    "CategoricalNB": (CategoricalNB(), X_train_cat, X_test_cat),
    "LogisticRegression": (LogisticRegression(max_iter=1000), X_train_scaled, X_test_scaled),
    "DecisionTree": (DecisionTreeClassifier(random_state=42), X_train, X_test),
    "RandomForest": (RandomForestClassifier(random_state=42), X_train, X_test),
    "KNN": (KNeighborsClassifier(), X_train_scaled, X_test_scaled),
    "SVM": (SVC(), X_train_scaled, X_test_scaled)
}

# Store metrics
results = []

for name, (model, Xtr, Xte) in models.items():
    model.fit(Xtr, y_train)
    preds = model.predict(Xte)
    acc = accuracy_score(y_test, preds) * 100
    prec = precision_score(y_test, preds) * 100
    rec = recall_score(y_test, preds) * 100
    f1 = f1_score(y_test, preds) * 100
    results.append([name, acc, prec, rec, f1])

# Convert results to DataFrame
df_results = pd.DataFrame(results, columns=["Model", "Accuracy (%)", "Precision (%)", "Recall (%)", "F1-Score (%)"])

# Round to 2 decimal places
df_results = df_results.round(2)

# Sort by Accuracy
df_results = df_results.sort_values(by="Accuracy (%)", ascending=False)

print(df_results)

# Save comparison table
df_results.to_csv("classifier_comparison_percentage.csv", index=False)
print("\nComparison saved to 'classifier_comparison_percentage.csv'")


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                Model  Accuracy (%)  Precision (%)  Recall (%)  F1-Score (%)
0          GaussianNB         76.62          66.10       70.91         68.42
5  LogisticRegression         75.32          68.09       58.18         62.75
9                 SVM         75.32          68.89       56.36         62.00
6        DecisionTree         74.68          62.50       72.73         67.23
4       CategoricalNB         73.38          63.46       60.00         61.68
7        RandomForest         72.08          60.71       61.82         61.26
8                 KNN         68.18          55.77       52.73         54.21
2         BernoulliNB         64.94          50.85       54.55         52.63
1       MultinomialNB         64.29           0.00        0.00          0.00
3        ComplementNB         59.09          44.29       56.36         49.60

Comparison saved to 'classifier_comparison_percentage.csv'


In [None]:
# Comparison table and graph for all classifiers (custom metrics)
import numpy as np

def custom_accuracy(y_true, y_pred):
    # Accuracy = (TP + TN) / ALL
    tp = np.sum((y_true == 1) & (y_pred == 1))
    tn = np.sum((y_true == 0) & (y_pred == 0))
    fp = np.sum((y_true == 0) & (y_pred == 1))
    fn = np.sum((y_true == 1) & (y_pred == 0))
    return (tp + tn) / (tp + tn + fp + fn)

def custom_precision(y_true, y_pred):
    # Precision = TP / (TP + FP)
    tp = np.sum((y_true == 1) & (y_pred == 1))
    fp = np.sum((y_true == 0) & (y_pred == 1))
    return tp / (tp + fp) if (tp + fp) > 0 else 0

def custom_recall(y_true, y_pred):
    # Recall = TP / (TP + FN)
    tp = np.sum((y_true == 1) & (y_pred == 1))
    fn = np.sum((y_true == 1) & (y_pred == 0))
    return tp / (tp + fn) if (tp + fn) > 0 else 0

def custom_f1(y_true, y_pred):
    # F1 Score = 2 * (Precision * Recall) / (Precision + Recall)
    prec = custom_precision(y_true, y_pred)
    rec = custom_recall(y_true, y_pred)
    return 2 * prec * rec / (prec + rec) if (prec + rec) > 0 else 0

models = [
    ("KNN", knn_preds),
    ("Naive Bayes", nb_preds),
    ("MultinomialNB", multinb_preds),
    ("BernoulliNB", bernoullinb_preds)
]

results = []
for name, preds in models:
    acc = custom_accuracy(y_test.values, preds)
    prec = custom_precision(y_test.values, preds)
    rec = custom_recall(y_test.values, preds)
    f1 = custom_f1(y_test.values, preds)
    results.append({
        "Model": name,
        "Accuracy": acc,
        "Precision": prec,
        "Recall": rec,
        "F1 Score": f1
    })

results_df = pd.DataFrame(results)
display(results_df)

# Bar plot for comparison
metrics = ["Accuracy", "Precision", "Recall", "F1 Score"]
results_df.set_index("Model")[metrics].plot(kind="bar", figsize=(10,6))
plt.title("Classifier Performance Comparison (Custom Metrics)")
plt.ylabel("Score")
plt.ylim(0, 1)
plt.xticks(rotation=0)
plt.legend(loc="lower right")
plt.show()

NameError: name 'knn_preds' is not defined