# Iris Dataset Analysis

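This notebook explores the classic Iris dataset: it loads the data, summarizes and visualizes the four flower measurements, trains a random forest classifier to predict the species, and inspects which features the model relies on most.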

In [None]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

# Fetch the bundled Iris data and arrange it as one labelled DataFrame:
# the measurement columns first, then the integer class code ('species'),
# then a human-readable label derived from that code ('species_name').
iris = load_iris()
species_labels = {0: 'setosa', 1: 'versicolor', 2: 'virginica'}
df = pd.DataFrame(iris.data, columns=iris.feature_names).assign(
    species=iris.target,
    # Evaluated after 'species' has been added, so the lambda can read it.
    species_name=lambda frame: frame['species'].map(species_labels),
)

print("Dataset loaded successfully!")
print(f"Shape: {df.shape}")
# Bare last expression so the notebook renders the first rows as a table.
df.head()

## Dataset Overview

In [None]:
# Summarize the DataFrame: per-column dtypes and non-null counts,
# descriptive statistics, and the number of rows for each species.
print("Dataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()
print("\nBasic Statistics:")
print(df.describe())
print("\nSpecies Distribution:")
print(df['species_name'].value_counts())

## Data Visualization

In [None]:
# Build a 2x2 exploratory figure: feature histograms, feature correlations,
# a sepal-vs-petal scatter coloured by species, and per-species box plots.
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('Iris Dataset Exploratory Data Analysis', fontsize=16)

# 1. Distribution of features
# DataFrame.plot.hist overlays every column on the single Axes it is given.
# DataFrame.hist must NOT be used here: it draws one subplot per column, and
# when handed a single Axes for several columns it clears the whole figure,
# which would wipe the suptitle and detach the other three panels.
df.iloc[:, :4].plot.hist(bins=20, alpha=0.7, ax=axes[0, 0])
axes[0, 0].set_title('Feature Distributions')
axes[0, 0].set_xlabel('Measurement (cm)')

# 2. Correlation heatmap of the four measurement columns
correlation_matrix = df.iloc[:, :4].corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', ax=axes[0, 1])
axes[0, 1].set_title('Feature Correlation Heatmap')

# 3. Scatter plot for species
sns.scatterplot(data=df, x='sepal length (cm)', y='petal length (cm)',
                hue='species_name', ax=axes[1, 0])
axes[1, 0].set_title('Sepal Length vs Petal Length')

# 4. Box plot for species comparison
# Reshape to long form (one row per species/feature/value) so seaborn can
# group the boxes by feature on the x axis and by species via hue.
df_melted = df.melt(id_vars=['species_name'],
                    value_vars=['sepal length (cm)', 'sepal width (cm)',
                                'petal length (cm)', 'petal width (cm)'])
sns.boxplot(data=df_melted, x='variable', y='value', hue='species_name', ax=axes[1, 1])
axes[1, 1].set_title('Feature Comparison by Species')
# Feature names are long; tilt them so they do not overlap.
axes[1, 1].tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

## Machine Learning Model

In [None]:
# Feature matrix and integer class labels, taken straight from the loader.
X, y = iris.data, iris.target

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
# NOTE(review): the split is not stratified, so class counts in the test set
# may be slightly uneven -- consider stratify=y if balanced support matters.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a random forest with default hyperparameters (seeded for repeatability).
# fit() returns the estimator itself, so construction and training are chained.
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Score the held-out rows.
predictions = model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)

# Report overall accuracy followed by the per-class precision/recall table.
print(f"Model Accuracy: {accuracy:.2f}")
print("\nClassification Report:")
print(classification_report(y_test, predictions, target_names=iris.target_names))

## Feature Importance

In [None]:
# Pair each feature name with the fitted forest's importance score and rank
# them from most to least important.
feature_importance = pd.DataFrame(
    {'feature': iris.feature_names, 'importance': model.feature_importances_}
)
feature_importance = feature_importance.sort_values(by='importance', ascending=False)

# Horizontal bar chart of the ranking, drawn on an explicit figure/axes pair.
importance_fig, importance_ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=feature_importance, x='importance', y='feature', ax=importance_ax)
importance_ax.set_title('Feature Importance in Random Forest Model')
importance_ax.set_xlabel('Importance Score')
importance_fig.tight_layout()
plt.show()

# Also print the ranking as a table so the exact scores are visible.
print("Feature Importance:")
print(feature_importance)