<a href="https://colab.research.google.com/github/Mmabatho/AI-For-Software-Engineeering-Week-3/blob/main/Task1_Iris_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
"""
Classical ML with Scikit-learn: Iris Species Classification
===========================================================

This script demonstrates a complete machine learning pipeline using the Iris dataset:
1. Data loading and exploration
2. Data preprocessing
3. Model training (Decision Tree)
4. Model evaluation with multiple metrics
"""

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')

# Plot styling: matplotlib's default style sheet plus seaborn's "husl" palette
plt.style.use('default')
sns.set_palette("husl")

# Title banner framed above and below by a 60-character rule
print("=" * 60, "IRIS SPECIES CLASSIFICATION WITH DECISION TREE", "=" * 60, sep="\n")

# Step 1: Load the Iris Dataset
print("\n1. LOADING THE IRIS DATASET")
print("-" * 30)

# Load the iris dataset from scikit-learn
iris = load_iris()
X = iris.data  # Features: sepal length, sepal width, petal length, petal width
y = iris.target  # Target: integer class codes that index into iris.target_names

# Create a DataFrame for easier manipulation
df = pd.DataFrame(X, columns=iris.feature_names)
df['species'] = y
# Build the code -> name lookup from the dataset's own name table instead of
# hard-coding it, so the labels cannot drift from iris.target_names.
# .tolist() yields plain Python strings rather than NumPy string scalars.
species_lookup = dict(enumerate(iris.target_names.tolist()))
df['species_name'] = df['species'].map(species_lookup)

print(f"Dataset shape: {df.shape}")
print(f"Features: {list(iris.feature_names)}")
# .tolist() (not list()) so the names print as 'setosa' rather than
# np.str_('setosa') under NumPy 2's scalar repr.
print(f"Target classes: {iris.target_names.tolist()}")
print("\nFirst 5 rows:")
print(df.head())

# Step 2: Data Exploration
print("\n\n2. DATA EXPLORATION")
print("-" * 20)

# Headline counts: rows in the frame, feature columns, target classes
print("Dataset info:")
overview = (
    ("Total samples", len(df)),
    ("Features", len(iris.feature_names)),
    ("Classes", len(iris.target_names)),
)
for caption, amount in overview:
    print(f"- {caption}: {amount}")

# Number of rows per species name
print("\nClass distribution:")
species_counts = df['species_name'].value_counts()
print(species_counts)

# Summary statistics as produced by DataFrame.describe()
print("\nBasic statistics:")
print(df.describe())

# Per-column count of null entries
print("\nMissing values per column:")
nulls_per_column = df.isna().sum()
print(nulls_per_column)

# Step 3: Data Preprocessing
print("\n\n3. DATA PREPROCESSING")
print("-" * 22)

# Check for missing values (Iris dataset is clean, but we'll demonstrate the process)
missing_values = df.isnull().sum().sum()
print(f"Total missing values: {missing_values}")

if missing_values > 0:
    print("Handling missing values...")
    # Mean-impute every feature column; nanmean ignores the gaps when
    # computing each column's mean. Only feature gaps are filled here --
    # missing labels are not repaired.
    column_means = np.nanmean(X, axis=0)
    X_final = np.where(np.isnan(X), column_means, X)
    # Keep the exploratory DataFrame in sync with the imputed features.
    df[iris.feature_names] = X_final
else:
    print("✓ No missing values found - dataset is clean!")
    X_final = X  # Features used as loaded

# Label encoding (though not needed for iris as it's already encoded)
print("\nLabel encoding demonstration:")
le = LabelEncoder()
y_encoded = le.fit_transform(df['species_name'])
print(f"Original labels: {df['species_name'].unique()}")
print(f"Encoded labels: {np.unique(y_encoded)}")
# Convert to built-in str/int so the mapping prints as {'setosa': 0, ...}
# instead of showing NumPy scalar reprs such as np.int64(0).
label_mapping = {
    str(name): int(code)
    for name, code in zip(le.classes_, le.transform(le.classes_))
}
print(f"Label mapping: {label_mapping}")

# For this example, we'll use the original numeric targets
y_final = y  # Target (already encoded as 0, 1, 2)

print("\nFinal dataset shape:")
print(f"Features (X): {X_final.shape}")
print(f"Target (y): {y_final.shape}")

# Step 4: Train-Test Split
print("\n\n4. TRAIN-TEST SPLIT")
print("-" * 19)

# Split the data into training and testing sets (80-20 split).
# random_state pins the shuffle so the split is reproducible; stratify keeps
# the class proportions of y_final in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X_final, y_final,
    test_size=0.2,
    random_state=42,
    stratify=y_final,
)

print(f"Training set: {X_train.shape[0]} samples")
print(f"Testing set: {X_test.shape[0]} samples")
print("Training set class distribution:")
# np.unique returns the distinct class codes alongside how often each occurs
unique, counts = np.unique(y_train, return_counts=True)
for cls, count in zip(unique, counts):
    print(f"  Class {cls} ({iris.target_names[cls]}): {count} samples")

# Step 5: Model Training
print("\n\n5. MODEL TRAINING")
print("-" * 16)

# Hyperparameters gathered in one place, then unpacked into the estimator
tree_params = {
    'random_state': 42,
    'max_depth': 5,  # cap the tree depth to limit overfitting
    'min_samples_split': 2,
    'min_samples_leaf': 1,
}
dt_classifier = DecisionTreeClassifier(**tree_params)

print("Training Decision Tree Classifier...")
print(f"Model parameters: {dt_classifier.get_params()}")

# Fit the tree on the training partition only
dt_classifier.fit(X_train, y_train)
print("✓ Model training completed!")

# Step 6: Model Prediction
print("\n\n6. MODEL PREDICTION")
print("-" * 17)

# Predict a class code for every held-out sample
y_pred = dt_classifier.predict(X_test)

print(f"Predictions made on {len(y_test)} test samples")
print("Sample predictions vs actual:")
# Show at most the first ten predicted/actual pairs side by side
for predicted, actual in zip(y_pred[:10], y_test[:10]):
    mark = "✓" if predicted == actual else "✗"
    print(f"  {mark} Predicted: {iris.target_names[predicted]}, Actual: {iris.target_names[actual]}")

# Step 7: Model Evaluation
print("\n\n7. MODEL EVALUATION")
print("-" * 17)

# Overall fraction of test samples classified correctly
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")

# Multiclass precision/recall need an averaging strategy; compute both the
# macro and the weighted variant of each metric.
precision_macro, precision_weighted = (
    precision_score(y_test, y_pred, average=strategy)
    for strategy in ('macro', 'weighted')
)
recall_macro, recall_weighted = (
    recall_score(y_test, y_pred, average=strategy)
    for strategy in ('macro', 'weighted')
)

averaged_metrics = (
    ("Macro-averaged", precision_macro, recall_macro),
    ("Weighted-averaged", precision_weighted, recall_weighted),
)
for heading, precision_value, recall_value in averaged_metrics:
    print(f"\n{heading} metrics:")
    print(f"Precision: {precision_value:.4f}")
    print(f"Recall: {recall_value:.4f}")

# Per-class text report labelled with the species names
print("\nDetailed Classification Report:")
print("-" * 40)
report = classification_report(y_test, y_pred, target_names=iris.target_names)
print(report)

# Raw confusion matrix
print("\nConfusion Matrix:")
print("-" * 16)
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Same matrix wrapped in a DataFrame so rows and columns carry species names
row_labels = list(map('Actual {}'.format, iris.target_names))
column_labels = list(map('Pred {}'.format, iris.target_names))
cm_df = pd.DataFrame(cm, index=row_labels, columns=column_labels)
print("\nConfusion Matrix (with labels):")
print(cm_df)

# Step 8: Feature Importance
print("\n\n8. FEATURE IMPORTANCE")
print("-" * 19)

feature_names = iris.feature_names
feature_importance = dt_classifier.feature_importances_

# Scores in the dataset's original column order
print("Feature importance scores:")
for label, score in zip(feature_names, feature_importance):
    print(f"  {label}: {score:.4f}")

# Tabulate the scores and order them from most to least important
importance_table = pd.DataFrame({
    'feature': feature_names,
    'importance': feature_importance,
})
importance_df = importance_table.sort_values('importance', ascending=False)

print("\nFeatures ranked by importance:")
for ranked in importance_df.itertuples(index=False):
    print(f"  {ranked.feature}: {ranked.importance:.4f}")

# Step 9: Model Performance Summary
print("\n\n9. PERFORMANCE SUMMARY")
print("-" * 21)

# importance_df is sorted descending, so its first/last rows are the extremes
ranked_features = importance_df['feature']
top_feature = ranked_features.iloc[0]
bottom_feature = ranked_features.iloc[-1]

print("🎯 FINAL RESULTS:")
print(f"   • Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")
print(f"   • Precision (macro): {precision_macro:.4f}")
print(f"   • Recall (macro): {recall_macro:.4f}")
print(f"   • Most important feature: {top_feature}")
print(f"   • Least important feature: {bottom_feature}")

# Score each class on just the test samples whose true label is that class;
# classes with no test samples are skipped.
print("\nPer-class performance:")
for class_id, class_name in enumerate(iris.target_names):
    in_class = y_test == class_id
    if not in_class.any():
        continue
    class_accuracy = accuracy_score(y_test[in_class], y_pred[in_class])
    print(f"   • {class_name}: {class_accuracy:.4f} accuracy")

# Closing banner: blank line, rule, message, rule
print("\n" + "=" * 60, "ANALYSIS COMPLETE!", "=" * 60, sep="\n")

# Additional insights
print("\n💡 KEY INSIGHTS:")
print(f"   • The Decision Tree achieved {accuracy*100:.1f}% accuracy on the test set")
print(f"   • {importance_df.iloc[0]['feature']} is the most discriminative feature")

# Rate the accuracy once so the rating line and the closing remark agree.
if accuracy > 0.95:
    performance = 'excellent'
elif accuracy > 0.85:
    performance = 'good'
else:
    performance = 'moderate'
print(f"   • The model shows {performance} performance")

# Check the class balance instead of asserting it: the dataset is balanced
# only when every class has the same number of samples.
class_counts = np.unique(y_final, return_counts=True)[1]
if class_counts.min() == class_counts.max():
    print("   • All classes are well-balanced in the dataset")
else:
    print(f"   • Classes are imbalanced in the dataset (sample counts: {class_counts.tolist()})")

if accuracy == 1.0:
    print("   • Perfect classification achieved! This is expected for the Iris dataset.")
elif accuracy > 0.95:
    print("   • Near-perfect classification - excellent model performance!")
else:
    # Reuse the rating so a 'moderate' result is not described as 'Good'.
    print(f"   • {performance.capitalize()} performance, but there's room for improvement with feature engineering or different algorithms.")


IRIS SPECIES CLASSIFICATION WITH DECISION TREE

1. LOADING THE IRIS DATASET
------------------------------
Dataset shape: (150, 6)
Features: ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
Target classes: [np.str_('setosa'), np.str_('versicolor'), np.str_('virginica')]

First 5 rows:
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                5.1               3.5                1.4               0.2   
1                4.9               3.0                1.4               0.2   
2                4.7               3.2                1.3               0.2   
3                4.6               3.1                1.5               0.2   
4                5.0               3.6                1.4               0.2   

   species species_name  
0        0       setosa  
1        0       setosa  
2        0       setosa  
3        0       setosa  
4        0       setosa  


2. DATA EXPLORATION
--------------------
Datase