# Iris Flower Classification

This notebook trains **KNN**, **Logistic Regression**, and **Naive Bayes** models on the Iris dataset using a stratified 50/50 train/test split, and compares their performance using accuracy, confusion matrices, and macro-averaged one-vs-rest ROC-AUC.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, label_binarize
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, ConfusionMatrixDisplay, roc_auc_score
from sklearn.metrics import roc_curve, auc

DATA_PATH = '../data/Iris.csv'
df = pd.read_csv(DATA_PATH)

# Fail fast if the file is not the expected Iris table: later cells index the
# 'Species' column directly, so a wrong or empty CSV would otherwise surface
# as a KeyError / split error far from its cause.
assert not df.empty, f'No rows loaded from {DATA_PATH}'
assert 'Species' in df.columns, (
    f"'Species' column missing from {DATA_PATH}; found columns: {list(df.columns)}"
)

# Preview the first rows as the cell output.
df.head()

In [None]:
# Basic info
# Dimensions of the loaded frame as (rows, columns).
print('Shape:', df.shape)

# Number of samples per class in the target column.
species_counts = df['Species'].value_counts()
print(species_counts)

# Missing-value count per column, shown as the cell output.
df.isna().sum()

In [None]:
# Prepare features/target
TEST_SIZE = 0.5     # As per assignment note: train 50% and test 50%
RANDOM_STATE = 42   # fixed seed so the split is reproducible across runs

# Derive X without rebinding `df`, so the frame displayed and summarised by
# the earlier cells stays unchanged and those cells give the same output when
# re-run. The 'Id' column is dropped only if present (errors='ignore'); a
# missing 'Species' column still raises.
X = df.drop(columns=['Id'], errors='ignore').drop(columns=['Species'])
y = df['Species']

# Stratify on the label so both halves keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y
)

# Sanity checks: the two halves partition the data and stay row-aligned.
assert len(X_train) + len(X_test) == len(X)
assert len(X_train) == len(y_train) and len(X_test) == len(y_test)

print('Train size:', X_train.shape)
print('Test size :', X_test.shape)

In [None]:
# 1) KNN (with tuning)
# Two-step pipeline: standardise the features, then fit a k-nearest-neighbours
# classifier. Scaling lives inside the pipeline so it is re-fit on each CV fold.
knn_pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('model', KNeighborsClassifier()),
])

# Search grid: odd neighbourhood sizes 3..11 crossed with both weighting schemes.
knn_params = {
    'model__n_neighbors': list(range(3, 12, 2)),
    'model__weights': ['uniform', 'distance'],
}

# 5-fold cross-validated grid search on the training half, scored by accuracy.
knn_grid = GridSearchCV(
    estimator=knn_pipe,
    param_grid=knn_params,
    scoring='accuracy',
    cv=5,
)
knn_grid.fit(X_train, y_train)

# Keep the best pipeline found by the search for later evaluation.
best_knn = knn_grid.best_estimator_
print('Best KNN Params:', knn_grid.best_params_)

In [None]:
# 2) Logistic Regression (with tuning)
# Standardise the features, then fit logistic regression with a raised
# iteration cap (max_iter=2000).
lr_steps = [
    ('scaler', StandardScaler()),
    ('model', LogisticRegression(max_iter=2000)),
]
lr_pipe = Pipeline(steps=lr_steps)

# Candidate values for the `C` parameter of the logistic-regression step.
lr_params = {'model__C': [0.1, 0.5, 1, 2, 5, 10]}

# 5-fold cross-validated grid search on the training half, scored by accuracy.
lr_grid = GridSearchCV(
    estimator=lr_pipe,
    param_grid=lr_params,
    scoring='accuracy',
    cv=5,
)
lr_grid.fit(X_train, y_train)

# Keep the best pipeline found by the search for later evaluation.
best_lr = lr_grid.best_estimator_
print('Best LR Params:', lr_grid.best_params_)

In [None]:
# 3) Naive Bayes
# Gaussian Naive Bayes wrapped in a single-step pipeline (no scaler, no
# hyper-parameter search), fitted directly on the training half.
nb_steps = [('model', GaussianNB())]
best_nb = Pipeline(steps=nb_steps)

# `fit` is the last expression, so the fitted pipeline is shown as the cell output.
best_nb.fit(X_train, y_train)

In [None]:
# Registry of the fitted estimators, keyed by the display name of each model.
# Insertion order (KNN, Logistic Regression, Naive Bayes) is preserved.
models = dict([
    ('KNN (tuned)', best_knn),
    ('Logistic Regression (tuned)', best_lr),
    ('Naive Bayes', best_nb),
])

# Show the registry as the cell output.
models

In [None]:
# Evaluation helper

def evaluate_model(name, model, X_eval=None, y_eval=None):
    """Report hold-out metrics for a fitted classifier and plot its diagnostics.

    Prints the accuracy and the classification report, draws the confusion
    matrix, prints the macro-averaged ROC-AUC, and draws one ROC curve per
    class (one-vs-rest) together with the chance diagonal.

    Parameters
    ----------
    name : str
        Label used in the printed header and in the figure titles.
    model : fitted classifier
        Must provide ``predict``, ``predict_proba`` and ``classes_``.
    X_eval, y_eval : optional
        Features and true labels to evaluate on. Pass both or neither; when
        omitted, the notebook-level ``X_test`` / ``y_test`` split is used, so
        existing two-argument calls behave as before.

    Returns
    -------
    tuple
        ``(acc, macro_auc)``: the accuracy and the macro-averaged ROC-AUC.

    Raises
    ------
    ValueError
        If only one of ``X_eval`` / ``y_eval`` is supplied.
    """
    if (X_eval is None) != (y_eval is None):
        raise ValueError('Pass both X_eval and y_eval, or neither.')
    if X_eval is None:
        # Backward-compatible default: the hold-out split defined in the notebook.
        X_eval, y_eval = X_test, y_test

    y_pred = model.predict(X_eval)
    acc = accuracy_score(y_eval, y_pred)

    print('\n' + '='*50)
    print(name)
    print('='*50)
    print('Accuracy:', round(acc, 4))
    print('\nClassification Report:\n', classification_report(y_eval, y_pred))

    # Confusion Matrix (drawn on an explicit Axes rather than pyplot's current figure)
    _, ax_cm = plt.subplots()
    ConfusionMatrixDisplay.from_predictions(y_eval, y_pred, cmap='viridis', ax=ax_cm)
    ax_cm.set_title(f'{name} - Confusion Matrix')
    plt.show()

    # ROC-AUC (OvR multiclass)
    y_prob = model.predict_proba(X_eval)
    # Take the label order from the fitted model: scikit-learn documents the
    # columns of predict_proba as following `classes_`, so binarising with the
    # same order keeps column i of both arrays on the same class. This also
    # removes the dependence on the notebook-global `y`.
    classes = model.classes_
    y_eval_bin = label_binarize(y_eval, classes=classes)

    macro_auc = roc_auc_score(y_eval_bin, y_prob, multi_class='ovr', average='macro')
    print('Macro ROC-AUC (OvR):', round(macro_auc, 4))

    # ROC Curve for each class: class i versus the rest
    _, ax_roc = plt.subplots()
    for i, cls in enumerate(classes):
        fpr, tpr, _ = roc_curve(y_eval_bin[:, i], y_prob[:, i])
        ax_roc.plot(fpr, tpr, label=f'{cls} (AUC={auc(fpr,tpr):.2f})')

    # Diagonal reference line labelled 'Chance'.
    ax_roc.plot([0,1],[0,1],'k--',label='Chance')
    ax_roc.set_xlabel('False Positive Rate')
    ax_roc.set_ylabel('True Positive Rate')
    ax_roc.set_title(f'{name} - ROC Curve (OvR)')
    ax_roc.legend(fontsize=8)
    plt.show()

    return acc, macro_auc


In [None]:
# Evaluate every registered model in order, collecting one
# [name, accuracy, macro ROC-AUC] row per model. Each evaluate_model call
# also prints its report and draws its figures.
results = [
    [name, *evaluate_model(name, model)]
    for name, model in models.items()
]

results_df = pd.DataFrame(results, columns=['Model','Accuracy','Macro ROC-AUC'])

# Show the comparison table with the highest macro ROC-AUC first
# (the sorted view is displayed only; results_df itself keeps insertion order).
results_df.sort_values(by='Macro ROC-AUC', ascending=False)

## Final Conclusion

- All three models achieved high accuracy and macro ROC-AUC on the held-out test half of the Iris dataset.
- **Logistic Regression (tuned)** achieved the best macro ROC-AUC.
- KNN and Naive Bayes performed similarly, which suggests the dataset is clean and its classes are largely separable.
