# Iris Dataset Statistical Analysis

In [None]:

from itertools import combinations

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.datasets import load_iris

# -----------------------------
# Load and Prepare Dataset
# -----------------------------
# Load the Iris dataset through scikit-learn and wrap the four measurement
# columns in a DataFrame with snake_case names.
iris = load_iris()
feature_columns = ["sepal_length", "sepal_width", "petal_length", "petal_width"]
data = pd.DataFrame(iris.data, columns=feature_columns)

# Attach the class labels as a categorical column built from the integer
# target codes and their matching species names.
species_column = pd.Categorical.from_codes(
    codes=iris.target,
    categories=iris.target_names,
)
data["species"] = species_column

# Show the head of the assembled table as a quick sanity check.
print("First five rows of dataset:")
print(data.head(5))

# -----------------------------
# 1. Descriptive Statistics
# -----------------------------
# Summarise sepal length within each species.
# "species" is a categorical column, so `observed=True` is passed explicitly:
# relying on the implicit default triggers a FutureWarning on recent pandas
# releases, and the explicit flag keeps the result limited to species that
# actually occur in the data.
sepal_by_species = data.groupby("species", observed=True)["sepal_length"]
summary_stats = sepal_by_species.agg(
    count="count",
    mean="mean",
    median="median",
    std="std",
    min="min",
    max="max",
)
# Range = distance between the largest and smallest value per species.
summary_stats["range"] = summary_stats["max"] - summary_stats["min"]

print("\nSepal Length Summary Statistics by Species:\n")
print(summary_stats)

# The species whose sepal lengths have the largest standard deviation.
most_variable = summary_stats["std"].idxmax()
print(f"\nSpecies with greatest sepal length variation: {most_variable}")

# -----------------------------
# 2. Petal Length Distribution Measures
# -----------------------------
# Build the per-species petal-length grouping once and reuse it for every
# statistic below. `observed=True` is explicit because "species" is
# categorical and the implicit default is deprecated in recent pandas.
petal_by_species = data.groupby("species", observed=True)["petal_length"]
petal_stats = petal_by_species.agg(["mean", "median", "var", "std"])

# Interquartile range: 75th percentile minus 25th percentile per species.
upper_quartile = petal_by_species.quantile(0.75)
lower_quartile = petal_by_species.quantile(0.25)
petal_stats["iqr"] = upper_quartile - lower_quartile

print("\nPetal Length Statistics with IQR:\n")
print(petal_stats)

# -----------------------------
# 3. Visualizations
# -----------------------------
# Overlay one semi-transparent petal-width histogram per species, in the
# order the species first appear in the data.
for species_name in data["species"].unique():
    is_species = data["species"] == species_name
    widths = data.loc[is_species, "petal_width"]
    plt.hist(widths, alpha=0.5, label=species_name)

plt.title("Petal Width Distribution by Species")
plt.xlabel("Petal Width")
plt.ylabel("Frequency")
plt.legend()
plt.grid(alpha=0.3)
plt.show()

# Grouped boxplot of sepal length, one box per species. The axes title is
# replaced and the figure-level suptitle is cleared so only one heading shows.
data.boxplot(by="species", column="sepal_length")
plt.suptitle("")
plt.title("Sepal Length by Species")
plt.grid(alpha=0.3)
plt.show()

# -----------------------------
# 4. Skewness
# -----------------------------
# Skewness of petal length within each species, computed by applying
# scipy.stats.skew to every group. `observed=True` is explicit because
# "species" is categorical and the implicit default is deprecated in recent
# pandas; it also guarantees skew is only evaluated on non-empty groups.
petal_by_species = data.groupby("species", observed=True)["petal_length"]
skew_vals = petal_by_species.apply(stats.skew)
print("\nSkewness of Petal Length by Species:\n")
print(skew_vals)

# -----------------------------
# 5. Central Limit Theorem Demo
# -----------------------------
# Seed NumPy's global random state before resampling.
np.random.seed(42)

# Repeat 500 times: draw 30 sepal lengths with replacement and record the
# mean of that draw.
sample_means = []
for _ in range(500):
    resample = data["sepal_length"].sample(n=30, replace=True)
    sample_means.append(resample.mean())

# Histogram of the collected means, i.e. the empirical sampling distribution.
plt.hist(sample_means, bins=20)
plt.xlabel("Sample Mean")
plt.ylabel("Frequency")
plt.title("Sampling Distribution of Mean Sepal Length (n=30)")
plt.grid(alpha=0.3)
plt.show()

# -----------------------------
# 6. Correlation Analysis
# -----------------------------
# The two variables compared throughout this section.
sepal_length = data["sepal_length"]
petal_length = data["petal_length"]

# Only the correlation coefficients are reported; the p-values are discarded.
pearson_r, _ = stats.pearsonr(sepal_length, petal_length)
spearman_r, _ = stats.spearmanr(sepal_length, petal_length)

print(f"\nPearson correlation: {pearson_r:.3f}")
print(f"Spearman correlation: {spearman_r:.3f}")

# Scatter plot with a degree-1 least-squares fit drawn over the same x values.
slope, intercept = np.polyfit(sepal_length, petal_length, 1)
fitted_petal_length = slope * sepal_length + intercept

plt.scatter(sepal_length, petal_length)
plt.plot(sepal_length, fitted_petal_length)
plt.title("Sepal vs Petal Length with Regression Line")
plt.xlabel("Sepal Length")
plt.ylabel("Petal Length")
plt.grid(alpha=0.3)
plt.show()

# -----------------------------
# 7. Welch's t-Test
# -----------------------------
# Petal lengths of the two species being compared.
is_setosa = data["species"] == "setosa"
is_versicolor = data["species"] == "versicolor"
setosa_petal = data.loc[is_setosa, "petal_length"]
versicolor_petal = data.loc[is_versicolor, "petal_length"]

# equal_var=False requests the unequal-variance (Welch) form of the test.
welch_result = stats.ttest_ind(setosa_petal, versicolor_petal, equal_var=False)
t_stat, p_val = welch_result
print(f"\nWelch t-test (Setosa vs Versicolor Petal Length): t = {t_stat:.3f}, p = {p_val:.3e}")

# -----------------------------
# 8. One-Way ANOVA
# -----------------------------
# One array of petal widths per species. `observed=True` is explicit because
# "species" is categorical: the implicit default is deprecated in recent
# pandas, and an empty group for an unobserved category would break the ANOVA.
groups = [
    widths.to_numpy()
    for _, widths in data.groupby("species", observed=True)["petal_width"]
]
f_stat, p_val = stats.f_oneway(*groups)

print(f"\nANOVA for Petal Width Across Species: F = {f_stat:.3f}, p = {p_val:.3e}")

# Post-hoc comparison of every unordered pair of species using Welch t-tests.
species_list = data["species"].unique()
species_pairs = list(combinations(species_list, 2))

# Bonferroni correction multiplies each p-value by the number of comparisons
# actually performed (derived from the data rather than hard-coded) and caps
# the result at 1.0, because a probability cannot exceed one.
n_comparisons = len(species_pairs)

print("\nPairwise t-tests with Bonferroni correction:")
for first, second in species_pairs:
    g1 = data.loc[data["species"] == first, "petal_width"]
    g2 = data.loc[data["species"] == second, "petal_width"]
    t, p = stats.ttest_ind(g1, g2, equal_var=False)
    p_adj = min(p * n_comparisons, 1.0)
    print(f"{first} vs {second}: t={t:.3f}, adjusted p={p_adj:.3e}")

print("\nAnalysis complete.")
