# Statistical Analysis - Iris Dataset

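This notebook provides a statistical analysis of the Iris dataset, including:
- Descriptive Statistics (overall and per variety)
- Inferential Statistics (normality tests, one-way ANOVA, pairwise t-tests, confidence intervals for the means)
- Exploratory Statistical Analysis (Pearson correlation between features)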

## 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import ttest_ind, f_oneway, normaltest, shapiro, levene, kruskal
from itertools import combinations
import warnings
from pathlib import Path

# --- Global notebook configuration -----------------------------------------
# NOTE(review): this silences *every* warning category for the whole kernel
# session, including deprecation and numerical warnings from scipy/pandas.
warnings.filterwarnings(action='ignore')

# Plot appearance: seaborn grid theme and a default 12x8 inch figure.
sns.set_style(style='whitegrid')
plt.rcParams.update({'figure.figsize': (12, 8)})

# Seed NumPy's legacy global random number generator.
np.random.seed(seed=42)

## 2. Load Data

In [None]:
# Location of the Iris CSV, relative to the notebook's working directory.
data_path = Path('../../data/Iris.csv')
df = pd.read_csv(data_path)

# Numeric measurement columns analysed by every later cell.
features = ['sepal.length', 'sepal.width', 'petal.length', 'petal.width']

# Fail fast if the file does not have the expected header: every later cell
# indexes these columns (plus the 'variety' grouping column) by name, and a
# CSV with a different header would otherwise only fail further down with a
# bare KeyError.
missing_columns = [col for col in features + ['variety'] if col not in df.columns]
if missing_columns:
    raise ValueError(
        f'{data_path} is missing expected column(s) {missing_columns}; '
        f'found columns: {list(df.columns)}'
    )

print('Data loaded successfully!')
print(f'Dataset shape: {df.shape}')

## 3. Descriptive Statistics

In [None]:
# Banner followed by pandas' summary table (count, mean, std, min,
# quartiles, max) for the numeric feature columns, pooled over all rows.
print('=' * 80, 'DESCRIPTIVE STATISTICS', '=' * 80, sep='\n')
print(df.loc[:, features].describe())

In [None]:
# Same summary table as above, but computed separately for each variety.
# Varieties are visited in order of first appearance in the data.
print('', '=' * 80, 'DESCRIPTIVE STATISTICS BY VARIETY', '=' * 80, sep='\n')
for variety_name in df['variety'].unique():
    variety_rows = df.loc[df['variety'] == variety_name, features]
    print(f'\n{variety_name}:')
    print(variety_rows.describe())

In [None]:
# Extended per-feature summary: one row per feature, one column per statistic.
# Shared intermediates are computed once and reused in the table below.
measurements = df[features]
feature_mean = measurements.mean()
feature_std = measurements.std()
feature_min = measurements.min()
feature_max = measurements.max()
lower_quartile = measurements.quantile(0.25)
upper_quartile = measurements.quantile(0.75)

descriptive_stats = pd.DataFrame({
    'Mean': feature_mean,
    'Median': measurements.median(),
    'Std Dev': feature_std,
    'Variance': measurements.var(),
    'Skewness': measurements.skew(),
    'Kurtosis': measurements.kurtosis(),
    'Min': feature_min,
    'Max': feature_max,
    'Range': feature_max - feature_min,
    # Interquartile range: spread of the middle 50% of observations.
    'IQR': upper_quartile - lower_quartile,
    # Coefficient of variation, expressed as a percentage of the mean.
    'CV': feature_std.div(feature_mean).mul(100),
})
print(descriptive_stats.round(3))

## 4. Normality Tests

In [None]:
# Shapiro-Wilk normality test for every (feature, variety) combination.
# A sample is labelled 'Normal' when its p-value exceeds 0.05.
print('=' * 80, 'NORMALITY TESTS (Shapiro-Wilk Test)', '=' * 80, sep='\n')

variety_labels = df['variety'].unique()
normality_results = []
for feature_name in features:
    for label in variety_labels:
        sample = df.loc[df['variety'] == label, feature_name]
        w_stat, p_val = shapiro(sample)
        if p_val > 0.05:
            verdict = 'Yes'
        else:
            verdict = 'No'
        normality_results.append({
            'Feature': feature_name,
            'Variety': label,
            'Statistic': w_stat,
            'P-value': p_val,
            'Normal': verdict,
        })

# One row per test, rounded for display only.
normality_df = pd.DataFrame(normality_results)
print(normality_df.round(4))

## 5. Inferential Statistics - ANOVA

In [None]:
# One-way ANOVA per feature: do the variety groups share a common mean?
print('=' * 80, 'ONE-WAY ANOVA TEST', '=' * 80, sep='\n')
print('H0: All groups have the same mean')
print('H1: At least one group has a different mean\n')

variety_labels = df['variety'].unique()
anova_results = []
for feature_name in features:
    # One array of observations per variety, in order of first appearance.
    samples = []
    for label in variety_labels:
        samples.append(df.loc[df['variety'] == label, feature_name].values)
    f_value, p_val = f_oneway(*samples)
    if p_val < 0.05:
        verdict = 'Yes'
    else:
        verdict = 'No'
    anova_results.append({
        'Feature': feature_name,
        'F-statistic': f_value,
        'P-value': p_val,
        'Significant': verdict,
    })

# One row per feature, rounded for display only.
anova_df = pd.DataFrame(anova_results)
print(anova_df.round(6))

## 6. Pairwise t-tests

In [None]:
# Pairwise post-hoc comparisons: for every feature, compare each pair of
# varieties with an independent two-sample t-test.
varieties = df['variety'].unique()
pairs = list(combinations(varieties, 2))
# Number of tests run per feature; this is the family size used for the
# Bonferroni correction below.
n_comparisons = len(pairs)

print('=' * 80)
print('PAIRWISE T-TESTS')
print('=' * 80)
print(f'P-adjusted: Bonferroni correction over {n_comparisons} comparisons per feature\n')

ttest_results = []
for feature in features:
    for pair in pairs:
        group1 = df[df['variety'] == pair[0]][feature]
        group2 = df[df['variety'] == pair[1]][feature]
        # Levene's test selects the t-test variant: pooled-variance (Student)
        # when variances look equal, Welch's otherwise.
        _, levene_p = levene(group1, group2)
        equal_var = levene_p > 0.05
        t_stat, p_value = ttest_ind(group1, group2, equal_var=equal_var)
        # Running several tests at alpha = 0.05 each inflates the chance of at
        # least one false positive. Bonferroni multiplies each raw p-value by
        # the number of comparisons in the family, capped at 1.
        p_adjusted = min(p_value * n_comparisons, 1.0)
        ttest_results.append({
            'Feature': feature,
            'Group1': pair[0],
            'Group2': pair[1],
            'T-statistic': t_stat,
            # Raw (uncorrected) p-value and verdict, kept for reference.
            'P-value': p_value,
            'Significant': 'Yes' if p_value < 0.05 else 'No',
            # Multiplicity-corrected p-value and verdict.
            'P-adjusted': p_adjusted,
            'Significant (adj.)': 'Yes' if p_adjusted < 0.05 else 'No'
        })

ttest_df = pd.DataFrame(ttest_results)
print(ttest_df.round(6))

## 7. Correlation Analysis

In [None]:
# Pairwise Pearson correlation between the numeric features, pooled over
# all varieties, printed as a table and drawn as an annotated heatmap.
print('=' * 80, 'PEARSON CORRELATION COEFFICIENTS', '=' * 80, sep='\n')
corr_matrix = df[features].corr(method='pearson')
print(corr_matrix.round(4))

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    corr_matrix,
    annot=True,
    cmap='coolwarm',
    center=0,  # centre the diverging colormap on zero correlation
    square=True,
    linewidths=1,
    cbar_kws={'shrink': 0.8},
    ax=ax,
)
ax.set_title('Pearson Correlation Matrix', fontsize=16)
fig.tight_layout()
plt.show()

## 8. Confidence Intervals

In [None]:
def confidence_interval(data, confidence=0.95):
    """Two-sided Student-t confidence interval for the mean of ``data``.

    Missing values (NaN) are dropped before any statistic is computed, so the
    sample size, the mean and the standard error all refer to the same
    observations.

    Parameters
    ----------
    data : array-like
        One-dimensional collection of numeric observations (list, NumPy
        array, pandas Series, ...).
    confidence : float, default 0.95
        Confidence level, strictly between 0 and 1.

    Returns
    -------
    tuple of float
        ``(lower, upper)`` bounds of the interval. ``(nan, nan)`` is returned
        when fewer than two non-missing observations are available, because
        the standard error is undefined in that case.

    Raises
    ------
    ValueError
        If ``confidence`` is not strictly between 0 and 1.
    """
    if not 0 < confidence < 1:
        raise ValueError(
            f'confidence must be strictly between 0 and 1, got {confidence!r}'
        )

    values = np.asarray(data, dtype=float)
    values = values[~np.isnan(values)]
    n = values.size
    if n < 2:
        # The standard error needs n - 1 > 0 degrees of freedom.
        return np.nan, np.nan

    mean = values.mean()
    std_err = stats.sem(values)
    # Half-width: standard error times the upper-tail critical t value
    # with n - 1 degrees of freedom.
    h = std_err * stats.t.ppf((1 + confidence) / 2, n - 1)
    return mean - h, mean + h

# 95% confidence interval of the mean for every (feature, variety)
# combination, collected into a single table.
print('=' * 80, '95% CONFIDENCE INTERVALS FOR MEANS', '=' * 80, sep='\n')

variety_labels = df['variety'].unique()
ci_results = []
for feature_name in features:
    for label in variety_labels:
        sample = df.loc[df['variety'] == label, feature_name]
        lower_bound, upper_bound = confidence_interval(sample)
        ci_results.append({
            'Feature': feature_name,
            'Variety': label,
            'Mean': sample.mean(),
            'CI Lower': lower_bound,
            'CI Upper': upper_bound,
        })

# One row per combination, rounded for display only.
ci_df = pd.DataFrame(ci_results)
print(ci_df.round(4))