In [52]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# 1. DATA PREPARATION

In [53]:
# Load the iris dataset as one DataFrame holding the features and the target
iris = load_iris(as_frame=True)
df = iris.frame  # 150 rows, 5 columns: 4 features + target

print(f"Dataset shape: {df.shape}")
print(f"\nFeature columns: {iris.feature_names}")
print(f"Target classes: {iris.target_names.tolist()}")

# Check for missing values
print(f"\nMissing values per column:\n{df.isnull().sum()}")

# Feature ranges per class.
# The columns are taken from iris.feature_names instead of being spelled out
# one print statement at a time, so the report cannot drift from the dataset.
print("\nFeature ranges by class:")
for class_id, class_name in enumerate(iris.target_names):
    class_rows = df[df['target'] == class_id]
    print(f"{class_name}:")
    for column in iris.feature_names:
        # Strip the unit suffix from the column name; the unit is printed
        # once after the range instead.
        label = column.replace(' (cm)', '') + ':'
        lowest = class_rows[column].min()
        highest = class_rows[column].max()
        # Pad to the width of the longest label ("sepal length:" = 13 chars)
        # so the numeric ranges line up in a column.
        print(f"  {label:<13} {lowest:.1f} - {highest:.1f} cm")

# Separate features and target
X = df.drop('target', axis=1)
y = df['target']

print(f"\nFeatures shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"Class distribution:\n{y.value_counts().sort_index()}")

Dataset shape: (150, 5)

Feature columns: ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
Target classes: ['setosa', 'versicolor', 'virginica']

Missing values per column:
sepal length (cm)    0
sepal width (cm)     0
petal length (cm)    0
petal width (cm)     0
target               0
dtype: int64

Feature ranges by class:
setosa:
  sepal length: 4.3 - 5.8 cm
  sepal width:  2.3 - 4.4 cm
  petal length: 1.0 - 1.9 cm
  petal width:  0.1 - 0.6 cm
versicolor:
  sepal length: 4.9 - 7.0 cm
  sepal width:  2.0 - 3.4 cm
  petal length: 3.0 - 5.1 cm
  petal width:  1.0 - 1.8 cm
virginica:
  sepal length: 4.9 - 7.9 cm
  sepal width:  2.2 - 3.8 cm
  petal length: 4.5 - 6.9 cm
  petal width:  1.4 - 2.5 cm

Features shape: (150, 4)
Target shape: (150,)
Class distribution:
target
0    50
1    50
2    50
Name: count, dtype: int64


# 2. TRAIN/TEST SPLIT

In [54]:
# Hold out 20% of the rows for testing and train on the remaining 80%.
# Stratifying on y keeps the class distribution the same in both partitions.
split_kwargs = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_kwargs)

# Partition sizes, reported both as counts and as a share of the full dataset
n_total = len(X)
n_train = len(X_train)
n_test = len(X_test)

print(f"Training set size: {n_train} samples ({n_train/n_total*100:.1f}%)")
print(f"Test set size: {n_test} samples ({n_test/n_total*100:.1f}%)")

# Per-class counts, ordered by class id, to confirm the stratification
train_class_counts = y_train.value_counts().sort_index()
test_class_counts = y_test.value_counts().sort_index()

print(f"\nTraining class distribution:\n{train_class_counts}")
print(f"\nTest class distribution:\n{test_class_counts}")


Training set size: 120 samples (80.0%)
Test set size: 30 samples (20.0%)

Training class distribution:
target
0    40
1    40
2    40
Name: count, dtype: int64

Test class distribution:
target
0    10
1    10
2    10
Name: count, dtype: int64


# 3. MODEL TRAINING

In [55]:
# Both classifiers are fitted on the same training split so that their
# test-set scores can be compared directly.

# Logistic Regression
# NOTE(review): max_iter=1000 is presumably there to give the solver enough
# iterations to converge on unscaled features - confirm.
print("\nTraining Logistic Regression model...")
lr_model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
print("Logistic Regression training complete.")

# Decision Tree Classifier (random_state pinned so the fitted tree is reproducible)
print("\nTraining Decision Tree model...")
dt_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
print("Decision Tree training complete.")


Training Logistic Regression model...
Logistic Regression training complete.

Training Decision Tree model...
Decision Tree training complete.


# 4. MODEL EVALUATION

In [56]:
# Evaluate both fitted classifiers on the held-out test split: accuracy,
# per-class precision/recall/F1, and confusion matrices (printed and saved
# as a two-panel heatmap image).

# Predictions
lr_predictions = lr_model.predict(X_test)
dt_predictions = dt_model.predict(X_test)

target_names = iris.target_names.tolist()
# Pin the class order explicitly. Without `labels`, scikit-learn derives the
# classes from the values present in y_test / the predictions, so a class
# missing from both would make the report and the matrix disagree with
# `target_names` and with the heatmap tick labels.
class_labels = list(range(len(target_names)))

# --- Accuracy ---
print("\n--- a. ACCURACY ---")
lr_accuracy = accuracy_score(y_test, lr_predictions)
dt_accuracy = accuracy_score(y_test, dt_predictions)

print(f"Logistic Regression Accuracy: {lr_accuracy:.4f} ({lr_accuracy*100:.2f}%)")
print(f"Decision Tree Accuracy:       {dt_accuracy:.4f} ({dt_accuracy*100:.2f}%)")

# --- Classification Reports ---
print("\n--- b. CLASSIFICATION REPORTS ---")

for model_label, predictions in (
    ("Logistic Regression", lr_predictions),
    ("Decision Tree", dt_predictions),
):
    print(f"\n>> {model_label} Classification Report:")
    print(classification_report(y_test, predictions,
                                labels=class_labels, target_names=target_names))

# --- Confusion Matrices ---
print("\n--- c. CONFUSION MATRICES ---")

# Rows are true classes, columns are predicted classes, both in class_labels order
lr_cm = confusion_matrix(y_test, lr_predictions, labels=class_labels)
dt_cm = confusion_matrix(y_test, dt_predictions, labels=class_labels)

print("\nLogistic Regression Confusion Matrix:")
print(lr_cm)

print("\nDecision Tree Confusion Matrix:")
print(dt_cm)

# Create and save confusion matrix heatmaps: one panel per model, drawn by
# the same loop so both panels are guaranteed to share formatting.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

heatmap_panels = (
    ("Logistic Regression", lr_cm, 'Blues'),
    ("Decision Tree", dt_cm, 'Greens'),
)
for ax, (model_label, matrix, colormap) in zip(axes, heatmap_panels):
    sns.heatmap(matrix, annot=True, fmt='d', cmap=colormap,
                xticklabels=target_names, yticklabels=target_names,
                ax=ax, cbar=True)
    ax.set_title(f'{model_label}\nConfusion Matrix', fontsize=12, fontweight='bold')
    ax.set_xlabel('Predicted Label', fontsize=10)
    ax.set_ylabel('True Label', fontsize=10)

# Work through the explicit figure handle rather than pyplot's implicit
# "current figure", so this cell cannot affect or be affected by other figures.
fig.tight_layout()
fig.savefig('confusion_matrix.png', dpi=150, bbox_inches='tight')
# The figure is closed after saving, so it is written to disk but not
# rendered inline below this cell.
plt.close(fig)

print("\nConfusion matrix heatmap saved as 'confusion_matrix.png'")


--- a. ACCURACY ---
Logistic Regression Accuracy: 0.9667 (96.67%)
Decision Tree Accuracy:       0.9333 (93.33%)

--- b. CLASSIFICATION REPORTS ---

>> Logistic Regression Classification Report:
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        10
  versicolor       1.00      0.90      0.95        10
   virginica       0.91      1.00      0.95        10

    accuracy                           0.97        30
   macro avg       0.97      0.97      0.97        30
weighted avg       0.97      0.97      0.97        30


>> Decision Tree Classification Report:
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        10
  versicolor       0.90      0.90      0.90        10
   virginica       0.90      0.90      0.90        10

    accuracy                           0.93        30
   macro avg       0.93      0.93      0.93        30
weighted avg       0.93      0.93      0.93        30


-