# 🌸 Iris Flower Classifier

**Project**: Classification of Iris flower species  
**Level**: Beginner  
**Dataset**: Iris Dataset (Scikit-learn built-in)  

## 📋 Project Overview

In this project, we'll build a machine learning classifier to predict the species of iris flowers based on their physical characteristics. This is a classic beginner project that covers:

- Data loading and exploration
- Exploratory Data Analysis (EDA)
- Data preprocessing
- Multiple classification algorithms
- Model evaluation and comparison

Let's get started! 🚀

## 1. Import Libraries

First, let's import all the necessary libraries for our analysis.

In [None]:
# Data manipulation and analysis
import pandas as pd
import numpy as np

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine learning
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Warnings
import warnings

# Deliberately silence library warnings so the tutorial output stays readable.
warnings.filterwarnings('ignore')

# Set style for better plots.
# The 'seaborn-v0_8' style name only exists in matplotlib >= 3.6; older
# releases ship the same style sheet as 'seaborn', so fall back to that name
# instead of failing when the lookup raises OSError.
try:
    plt.style.use('seaborn-v0_8')
except OSError:
    plt.style.use('seaborn')
sns.set_palette("husl")

print("✅ All libraries imported successfully!")

## 2. Data Loading and Initial Exploration

Let's load the famous Iris dataset and explore its structure.

In [None]:
# Load the iris dataset (a Bunch exposing data, target, feature_names, target_names)
iris = load_iris()

# Create a DataFrame for easier manipulation: one column per feature,
# plus the integer class label and its human-readable name.
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
df['species'] = iris.target

# Build the code -> name lookup from the dataset's own metadata rather than a
# hand-written dict, so the labels can never drift from iris.target_names.
# str() keeps the column values plain Python strings.
species_lookup = {code: str(label) for code, label in enumerate(iris.target_names)}
df['species_name'] = df['species'].map(species_lookup)

print("📊 Dataset loaded successfully!")
print(f"Dataset shape: {df.shape}")
print(f"Features: {list(iris.feature_names)}")
# .tolist() yields plain str items; list() would keep NumPy scalars, which
# NumPy >= 2 renders as np.str_('setosa') inside the printed list.
print(f"Target classes: {iris.target_names.tolist()}")

In [None]:
# Preview the top of the table; the trailing expression is rendered by the notebook
print("🔍 First 5 rows of the dataset:")
df.head(n=5)

In [None]:
# Headline facts about the dataset: size, feature count, missing cells, class balance
n_samples = len(df)
# Two of the DataFrame columns ('species', 'species_name') are labels, not features
n_features = df.shape[1] - 2
n_missing = df.isnull().sum().sum()
class_counts = df['species_name'].value_counts()

print("📈 Dataset Information:")
print(f"Total samples: {n_samples}")
print(f"Features: {n_features}")
print(f"Missing values: {n_missing}")
print("\n📊 Class distribution:")
print(class_counts)

In [None]:
# Statistical summary of the measured features only.
# The integer-encoded 'species' column is a class label, so its mean/std/quartiles
# carry no meaning and are left out of the table.
print("📊 Statistical Summary:")
df[iris.feature_names].describe()

## 3. Exploratory Data Analysis (EDA)

Let's visualize the data to understand the relationships between features and classes.

In [None]:
# Set up the plotting area: one panel per feature on a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('🌸 Distribution of Iris Features by Species', fontsize=16, fontweight='bold')

# Plot distributions for each feature
features = iris.feature_names
for i, feature in enumerate(features):
    ax = axes[i // 2, i % 2]

    # Compute the bin edges once from the whole column so that all species
    # share identical bins; per-species binning would give each histogram its
    # own bar width and make the overlaid frequencies incomparable.
    bin_edges = np.histogram_bin_edges(df[feature], bins=15)

    # Create histogram with species overlay
    for species in df['species_name'].unique():
        species_data = df.loc[df['species_name'] == species, feature]
        ax.hist(species_data, alpha=0.7, label=species, bins=bin_edges)

    ax.set_title(f'{feature.title()}', fontweight='bold')
    ax.set_xlabel(feature)
    ax.set_ylabel('Frequency')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Create a pairplot to see relationships between features.
# sns.pairplot builds its own figure, so no plt.figure() call is made here
# (one would only leave an empty extra figure in the output).
# vars= restricts the grid to the four measurements; without it the numeric
# 'species' label column would be plotted as if it were a feature.
pair_grid = sns.pairplot(
    df,
    vars=iris.feature_names,
    hue='species_name',
    markers=['o', 's', 'D'],
)
# Title the figure owned by the grid explicitly instead of relying on
# whichever figure happens to be current.
pair_grid.figure.suptitle('🔍 Pairwise Relationships Between Features', y=1.02,
                          fontsize=16, fontweight='bold')
plt.show()

In [None]:
# Correlation matrix of the four features (Pearson, the DataFrame.corr default)
plt.figure(figsize=(10, 8))
correlation_matrix = df[iris.feature_names].corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0,
            square=True, linewidths=0.5)
plt.title('🔥 Feature Correlation Matrix', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

# Read the quoted coefficients from the matrix itself so the printed numbers
# always agree with the heatmap instead of being typed in by hand.
petal_corr = correlation_matrix.loc['petal length (cm)', 'petal width (cm)']
sepal_petal_corr = correlation_matrix.loc['sepal length (cm)', 'petal length (cm)']

print("🔍 Key Observations:")
print(f"• Petal length and petal width are highly correlated ({petal_corr:.2f})")
print(f"• Sepal length and petal length are moderately correlated ({sepal_petal_corr:.2f})")
print("• Sepal width has weak correlation with other features")

## 4. Data Preprocessing

Let's prepare our data for machine learning by splitting it and scaling the features.

In [None]:
# Separate the model inputs from the label to predict
feature_columns = iris.feature_names
X = df[feature_columns]   # the four measurement columns
y = df['species']         # integer-encoded class label

print("📊 Data prepared:")
for caption, value in (
    ("Features shape", X.shape),
    ("Target shape", y.shape),
    ("Feature names", list(X.columns)),
):
    print(f"{caption}: {value}")

In [None]:
# Hold out 20% of the rows for testing; stratify=y keeps the class
# proportions of y in both parts, and random_state makes the split repeatable
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

train_class_counts = y_train.value_counts().sort_index()

print("✂️ Data split completed:")
print(f"Training set: {len(X_train)} samples")
print(f"Testing set: {len(X_test)} samples")
print("Training set class distribution:")
print(train_class_counts)

In [None]:
# Standardize the features. The scaler learns its mean/std from the training
# split only and the same transform is then applied to both splits, so no
# test-set statistics are used.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

original_means = X_train.mean().round(2).tolist()
scaled_means = X_train_scaled.mean(axis=0).round(2).tolist()
scaled_stds = X_train_scaled.std(axis=0).round(2).tolist()

print("⚖️ Feature scaling completed!")
print(f"Original feature means: {original_means}")
print(f"Scaled feature means: {scaled_means}")
print(f"Scaled feature stds: {scaled_stds}")
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5. Model Building and Training\n",
    "\n",
    "Let's train multiple classification algorithms and compare their performance."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Classifiers to compare, keyed by display name (seeded for repeatable runs)\n",
    "models = dict([\n",
    "    ('Logistic Regression', LogisticRegression(random_state=42)),\n",
    "    ('Decision Tree', DecisionTreeClassifier(random_state=42)),\n",
    "    ('Random Forest', RandomForestClassifier(n_estimators=100, random_state=42)),\n",
    "    ('Support Vector Machine', SVC(random_state=42)),\n",
    "])\n",
    "\n",
    "print(\"🤖 Models initialized:\")\n",
    "for model_name in models:\n",
    "    print(f\"• {model_name}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Train all models and store results\n",
    "from sklearn.pipeline import make_pipeline\n",
    "\n",
    "results = {}          # display name -> accuracy, CV mean/std, test predictions\n",
    "trained_models = {}   # display name -> estimator fitted on the full training split\n",
    "\n",
    "print(\"🏋️ Training models...\\n\")\n",
    "\n",
    "for name, model in models.items():\n",
    "    print(f\"Training {name}...\")\n",
    "    \n",
    "    # Train the model on the scaled training split\n",
    "    model.fit(X_train_scaled, y_train)\n",
    "    \n",
    "    # Make predictions on the held-out test split\n",
    "    y_pred = model.predict(X_test_scaled)\n",
    "    \n",
    "    # Calculate accuracy\n",
    "    accuracy = accuracy_score(y_test, y_pred)\n",
    "    \n",
    "    # Cross-validation score.\n",
    "    # The scaler is placed inside a pipeline and fed the *unscaled* training\n",
    "    # data, so every fold fits its own scaler on that fold's training part.\n",
    "    # Cross-validating X_train_scaled instead would let each validation fold\n",
    "    # influence the scaling statistics (data leakage).\n",
    "    # cross_val_score works on clones, so the fitted `model` above is untouched.\n",
    "    cv_pipeline = make_pipeline(StandardScaler(), model)\n",
    "    cv_scores = cross_val_score(cv_pipeline, X_train, y_train, cv=5)\n",
    "    \n",
    "    # Store results\n",
    "    results[name] = {\n",
    "        'accuracy': accuracy,\n",
    "        'cv_mean': cv_scores.mean(),\n",
    "        'cv_std': cv_scores.std(),\n",
    "        'predictions': y_pred\n",
    "    }\n",
    "    \n",
    "    trained_models[name] = model\n",
    "    \n",
    "    print(f\"✅ {name} - Accuracy: {accuracy:.4f}, CV Score: {cv_scores.mean():.4f} (±{cv_scores.std():.4f})\")\n",
    "\n",
    "print(\"\\n🎉 All models trained successfully!\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6. Model Evaluation and Comparison\n",
    "\n",
    "Let's evaluate our models using various metrics and visualizations."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One row per model, ranked by test accuracy (best first)\n",
    "comparison_rows = [\n",
    "    {\n",
    "        'Model': model_name,\n",
    "        'Test Accuracy': scores['accuracy'],\n",
    "        'CV Mean': scores['cv_mean'],\n",
    "        'CV Std': scores['cv_std'],\n",
    "    }\n",
    "    for model_name, scores in results.items()\n",
    "]\n",
    "comparison_df = pd.DataFrame(\n",
    "    comparison_rows,\n",
    "    columns=['Model', 'Test Accuracy', 'CV Mean', 'CV Std'],\n",
    ")\n",
    "\n",
    "comparison_df = comparison_df.sort_values('Test Accuracy', ascending=False)\n",
    "print(\"📊 Model Performance Comparison:\")\n",
    "print(comparison_df.round(4))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Visualize model performance: test accuracy (left) and CV score (right)\n",
    "fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))\n",
    "bar_colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4']\n",
    "\n",
    "# Test Accuracy comparison\n",
    "bars1 = ax1.bar(comparison_df['Model'], comparison_df['Test Accuracy'],\n",
    "                color=bar_colors)\n",
    "ax1.set_title('🎯 Test Accuracy Comparison', fontweight='bold', fontsize=14)\n",
    "ax1.set_ylabel('Accuracy')\n",
    "ax1.set_ylim(0.8, 1.05)\n",
    "ax1.grid(True, alpha=0.3)\n",
    "\n",
    "# Cross-validation scores with error bars (one CV standard deviation)\n",
    "bars2 = ax2.bar(comparison_df['Model'], comparison_df['CV Mean'],\n",
    "                yerr=comparison_df['CV Std'], capsize=5,\n",
    "                color=bar_colors)\n",
    "ax2.set_title('📈 Cross-Validation Scores', fontweight='bold', fontsize=14)\n",
    "ax2.set_ylabel('CV Score')\n",
    "ax2.set_ylim(0.8, 1.05)\n",
    "ax2.grid(True, alpha=0.3)\n",
    "\n",
    "# Add value labels on bars and tilt the model names on BOTH panels.\n",
    "# The rotation is set per axes because plt.xticks() only acts on the current\n",
    "# axes (the last one created), which would leave the left panel's long model\n",
    "# names horizontal and overlapping.\n",
    "for ax, bars in ((ax1, bars1), (ax2, bars2)):\n",
    "    for bar in bars:\n",
    "        height = bar.get_height()\n",
    "        ax.text(bar.get_x() + bar.get_width()/2., height + 0.01,\n",
    "                f'{height:.3f}', ha='center', va='bottom', fontweight='bold')\n",
    "    ax.tick_params(axis='x', labelrotation=45)\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}