# Exploratory Data Analysis (EDA) - Iris Dataset

This notebook provides a comprehensive exploratory data analysis of the Iris dataset.

## Objectives
1. Load and understand the dataset
2. Perform data quality checks
3. Generate descriptive statistics
4. Create visualizations
5. Identify patterns and relationships
6. Detect outliers and anomalies

## Dataset Information
The Iris dataset contains measurements of 150 iris flowers from three different species:
- Setosa
- Versicolor
- Virginica

Columns:
- Sepal length (cm) — `sepal.length`
- Sepal width (cm) — `sepal.width`
- Petal length (cm) — `petal.length`
- Petal width (cm) — `petal.width`
- Variety (target variable, not a feature) — `variety`


In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from pathlib import Path
import os

# Silence every warning category so cell output stays focused on the analysis.
warnings.filterwarnings('ignore')

# Global plot appearance: seaborn grid style plus default figure and font size.
sns.set_style("whitegrid")
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 10,
})

# Seed NumPy's global random state so any random draws are repeatable.
np.random.seed(42)

# Record which library versions produced this run.
print("Libraries imported successfully!")
for version_line in (
    f"Pandas version: {pd.__version__}",
    f"NumPy version: {np.__version__}",
    f"Matplotlib version: {plt.matplotlib.__version__}",
    f"Seaborn version: {sns.__version__}",
):
    print(version_line)


## 1. Data Loading


In [None]:
# Read the Iris CSV; the relative path is resolved against the kernel's working directory.
data_path = Path('../../data/Iris.csv')
df = pd.read_csv(data_path)

# Report the dimensions of the loaded frame.
n_rows, n_cols = df.shape
print("Dataset loaded successfully!")
print(f"\nDataset shape: {df.shape}")
print(f"Number of rows: {n_rows}")
print(f"Number of columns: {n_cols}")


## 2. Data Overview


In [None]:
# Preview the first rows (DataFrame.head() shows five by default) to confirm
# the columns were parsed as expected.
df.head()


In [None]:
# Preview the last rows (DataFrame.tail() shows five by default) to check the
# end of the file was read cleanly.
df.tail()


In [None]:
# Banner followed by the frame's structural summary (df.info() prints directly).
print("Dataset Info:", "=" * 80, sep="\n")
df.info()


In [None]:
# Check for missing values
print("Missing Values:")
print("=" * 80)
missing = df.isnull().sum()  # per-column count of null cells
print(missing)

total_missing = missing.sum()
print(f"\nTotal missing values: {total_missing}")

# The share is taken over every cell (rows x columns), not over the row count;
# dividing by len(df) would overstate it by a factor of the column count.
# An empty frame reports 0% instead of dividing by zero.
missing_pct = (total_missing / df.size) * 100 if df.size else 0.0
print(f"\nPercentage of missing values: {missing_pct:.2f}%")


In [None]:
# Count rows that exactly repeat an earlier row (first occurrence is not counted).
duplicates = df.duplicated().sum()

print("Duplicate Rows:", "=" * 80, sep="\n")
print(f"Number of duplicate rows: {duplicates}")


## 3. Descriptive Statistics


In [None]:
# Basic descriptive statistics (count, mean, std, min, quartiles, max).
# With default arguments describe() summarises only the numeric columns.
df.describe()


In [None]:
# Class balance of the target column: absolute counts first, then relative shares.
variety_column = df['variety']

print("Target Variable Distribution:")
print("=" * 80)
print(variety_column.value_counts())
print("\nProportions:")
print(variety_column.value_counts(normalize=True))


## 4. Visualizations


In [None]:
# Pair plot: scatter plots for every feature pair, histograms on the diagonal,
# coloured by species.
pair_grid = sns.pairplot(df, hue='variety', diag_kind='hist', height=2.5)
plt.suptitle('Pair Plot of Iris Dataset', y=1.02, fontsize=16)
# Use the grid's own tight_layout(): it keeps the right-hand margin seaborn
# reserved for the figure-level legend. plt.tight_layout() ignores that margin
# and can stretch the axes underneath the legend.
pair_grid.tight_layout()
plt.show()


In [None]:
# Distribution of each feature by species
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
# Measurement columns; the later plotting and outlier cells reuse this list.
features = ['sepal.length', 'sepal.width', 'petal.length', 'petal.width']

# axes.flat walks the 2x2 grid row by row, matching the order of `features`.
for ax, feature in zip(axes.flat, features):
    # Compute the bin edges once over the whole column so every species is
    # binned on the same grid; per-species bins=20 would give each histogram a
    # different bin width and make the overlaid bar heights incomparable.
    bin_edges = np.histogram_bin_edges(df[feature].dropna(), bins=20)
    for variety in df['variety'].unique():
        data = df.loc[df['variety'] == variety, feature]
        ax.hist(data, alpha=0.6, label=variety, bins=bin_edges, edgecolor='black')
    ax.set_xlabel(feature, fontsize=12)
    ax.set_ylabel('Frequency', fontsize=12)
    ax.set_title(f'Distribution of {feature}', fontsize=14)
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()


In [None]:
# Correlation matrix: pairwise correlations between the numeric columns only.
numeric_columns = df.select_dtypes(include=[np.number])
correlation_matrix = numeric_columns.corr()

plt.figure(figsize=(10, 8))
heatmap_options = dict(
    annot=True,
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
    cbar_kws={"shrink": 0.8},
)
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Correlation Matrix of Numerical Features', fontsize=16, pad=20)
plt.tight_layout()
plt.show()


In [None]:
# Violin plots: one panel per feature, one violin per species.
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# axes.flat walks the 2x2 grid row by row, matching the order of `features`.
for ax, feature in zip(axes.flat, features):
    sns.violinplot(data=df, x='variety', y=feature, ax=ax)
    ax.set_xlabel('Variety', fontsize=12)
    ax.set_ylabel(feature, fontsize=12)
    ax.set_title(f'Violin Plot of {feature} by Variety', fontsize=14)
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()


## 5. Outlier Detection


In [None]:
# Detect outliers using IQR method
def detect_outliers_iqr(data, column, factor=1.5):
    """Flag rows whose value in ``column`` lies outside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where Q1
    and Q3 are the 25th and 75th percentiles of ``data[column]``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the column to inspect.
    column : str
        Name of a numeric column in ``data``.
    factor : float, default 1.5
        Multiplier applied to the IQR when placing the fences. The default
        reproduces the previous hard-coded behaviour.

    Returns
    -------
    tuple
        ``(outliers, lower_bound, upper_bound)`` where ``outliers`` is the
        subset of ``data`` strictly outside the fences (original index kept).

    Raises
    ------
    ValueError
        If ``factor`` is negative.
    """
    if factor < 0:
        raise ValueError("factor must be non-negative")

    values = data[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - factor * iqr
    upper_bound = q3 + factor * iqr

    # Strict comparisons: values exactly on a fence are not outliers, and NaN
    # compares False on both sides so missing values are never flagged.
    outside = (values < lower_bound) | (values > upper_bound)
    outliers = data[outside]
    return outliers, lower_bound, upper_bound

# Report the IQR fences and flagged rows for each feature, computed over the
# pooled data (all species together).
print("Outlier Detection (IQR Method):")
print("=" * 80)
for feature in features:
    outliers, lower, upper = detect_outliers_iqr(df, feature)
    n_outliers = len(outliers)
    print(f"\n{feature}:")
    print(f"  Lower bound: {lower:.2f}")
    print(f"  Upper bound: {upper:.2f}")
    print(f"  Number of outliers: {n_outliers}")
    if n_outliers:
        # Only list indices when there is something to show.
        print(f"  Outlier indices: {outliers.index.tolist()}")

In [None]:
# Visualize outliers using box plots, one panel per feature grouped by species.
# Points drawn beyond the whiskers are the per-species outlier candidates.
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# axes.flat walks the 2x2 grid row by row, matching the order of `features`.
for ax, feature in zip(axes.flat, features):
    # The return value is not needed; the plot is drawn onto `ax`.
    df.boxplot(column=feature, by='variety', ax=ax)
    ax.set_title(f'Outliers in {feature} by Variety', fontsize=14)
    ax.set_xlabel('Variety', fontsize=12)
    ax.set_ylabel(feature, fontsize=12)

# pandas adds a figure-level "Boxplot grouped by ..." title whenever `by` is
# used; clear it once after the last panel is drawn.
fig.suptitle('')

plt.tight_layout()
plt.show()


## 6. Data Quality Assessment


In [None]:
# Data quality summary: size, completeness, duplication, dtypes and memory.
quality_rows, quality_cols = df.shape
quality_missing = df.isnull().sum().sum()
quality_duplicates = df.duplicated().sum()
dtype_counts = df.dtypes.value_counts().to_dict()
memory_kb = df.memory_usage(deep=True).sum() / 1024

print("DATA QUALITY SUMMARY")
print("=" * 80)
print(f"Total rows: {quality_rows}")
print(f"Total columns: {quality_cols}")
print(f"Missing values: {quality_missing}")
print(f"Duplicate rows: {quality_duplicates}")
print(f"Data types: {dtype_counts}")
print(f"\nMemory usage: {memory_kb:.2f} KB")
print("\nUnique values per column:")
# DataFrame.nunique() yields one distinct-value count per column.
for col, n_unique in df.nunique().items():
    print(f"  {col}: {n_unique}")


## 7. Key Insights and Conclusions


In [None]:
# Summary of key findings.
# NOTE(review): these statements are fixed text, not computed from `df`;
# re-check them against the cells above if the input data changes.
insight_lines = (
    "KEY INSIGHTS",
    "=" * 80,
    "\n1. Dataset is clean with no missing values",
    "2. Balanced dataset with 50 samples per class",
    "3. All features are numerical and continuous",
    "4. Strong correlations exist between petal measurements",
    "5. Setosa class appears to be linearly separable from other classes",
    "\nNext steps:",
    "- Perform statistical analysis",
    "- Conduct univariate, bivariate, and multivariate analysis",
    "- Build machine learning models",
)
for insight_line in insight_lines:
    print(insight_line)


In [None]:
# Box plots: one panel per feature, grouped by species.
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# ravel() flattens the 2x2 grid row by row, matching the order of `features`.
for ax, feature in zip(axes.ravel(), features):
    df.boxplot(column=feature, by='variety', ax=ax)
    ax.set_xlabel('Variety', fontsize=12)
    ax.set_ylabel(feature, fontsize=12)
    ax.set_title(f'Box Plot of {feature} by Variety', fontsize=14)

# Remove the default figure title pandas adds for grouped box plots.
fig.suptitle('')

plt.tight_layout()
plt.show()
