In [6]:
import pandas as pd
import numpy as np
from scipy import stats

# Load the dataset
df = pd.read_csv("diabetes.csv")

# NOTE(review): the recorded output of this cell shows Min: 0.00 for Glucose
# and BloodPressure. Zeros may be placeholders for missing readings in this
# file -- confirm before interpreting the statistics below.

# Whether a one-way ANOVA against 'Outcome' is possible depends only on the
# 'Outcome' column, so decide it once instead of once per column.
can_run_anova = 'Outcome' in df.columns and df['Outcome'].nunique() > 1

# Print descriptive statistics for every numeric column
for col in df.columns:
    if not pd.api.types.is_numeric_dtype(df[col]):
        continue

    # Drop missing values once so that every statistic is computed on the
    # same observations. (np.percentile on the raw column yields NaN as soon
    # as one value is missing, whereas mean/median/std skip NaN.)
    data = df[col].dropna()

    # mode() is empty when the column has no non-missing values; report NaN
    # in that case instead of raising IndexError.
    modes = data.mode()
    mode_value = modes.iloc[0] if not modes.empty else np.nan

    # Series.quantile uses linear interpolation by default, the same default
    # as np.percentile, and returns NaN for empty data.
    q25, q50, q75 = data.quantile([0.25, 0.50, 0.75])

    print(f"\n📊 Descriptive Statistics for: {col}")
    print("-" * 50)
    print(f"Mean: {data.mean():.2f}")
    print(f"Mode: {mode_value:.2f}")
    print(f"Median: {data.median():.2f}")
    print(f"Standard Deviation: {data.std():.2f}")
    print(f"Variance: {data.var():.2f}")
    # Series.sem defaults to ddof=1, the same default as scipy.stats.sem.
    print(f"Standard Error: {data.sem():.2f}")
    print(f"Min: {data.min():.2f}")
    print(f"Max: {data.max():.2f}")
    print(f"25th Percentile: {q25:.2f}")
    print(f"50th Percentile: {q50:.2f}")
    print(f"75th Percentile: {q75:.2f}")
    print(f"Skewness: {data.skew():.2f}")
    print(f"Kurtosis: {data.kurt():.2f}")

    # ANOVA based on 'Outcome' grouping if available
    if can_run_anova and col != 'Outcome':
        groups = [values.dropna() for _, values in df.groupby('Outcome')[col]]
        # A group can become empty after dropping NaN; the test needs at
        # least two non-empty groups to be meaningful.
        groups = [values for values in groups if len(values) > 0]
        if len(groups) > 1:
            f_val, p_val = stats.f_oneway(*groups)
            print(f"ANOVA F-value: {f_val:.2f}")
            print(f"ANOVA p-value: {p_val:.4f}")




📊 Descriptive Statistics for: Pregnancies
--------------------------------------------------
Mean: 3.85
Mode: 1.00
Median: 3.00
Standard Deviation: 3.37
Variance: 11.35
Standard Error: 0.12
Min: 0.00
Max: 17.00
25th Percentile: 1.00
50th Percentile: 3.00
75th Percentile: 6.00
Skewness: 0.90
Kurtosis: 0.16
ANOVA F-value: 39.67
ANOVA p-value: 0.0000

📊 Descriptive Statistics for: Glucose
--------------------------------------------------
Mean: 120.89
Mode: 99.00
Median: 117.00
Standard Deviation: 31.97
Variance: 1022.25
Standard Error: 1.15
Min: 0.00
Max: 199.00
25th Percentile: 99.00
50th Percentile: 117.00
75th Percentile: 140.25
Skewness: 0.17
Kurtosis: 0.64
ANOVA F-value: 213.16
ANOVA p-value: 0.0000

📊 Descriptive Statistics for: BloodPressure
--------------------------------------------------
Mean: 69.11
Mode: 70.00
Median: 72.00
Standard Deviation: 19.36
Variance: 374.65
Standard Error: 0.70
Min: 0.00
Max: 122.00
25th Percentile: 62.00
50th Percentile: 72.00
75th Percentile: 80.0

In [None]:
import pandas as pd
import numpy as np
from scipy import stats

# Load dataset
df = pd.read_csv("diabetes.csv")


def summarize_column(data):
    """Return descriptive statistics for one numeric column.

    Parameters
    ----------
    data : pandas.Series
        Numeric values with missing entries already removed.

    Returns
    -------
    dict
        Maps the statistic name ("Mean", "Mode", ..., "Kurtosis") to its
        value. Statistics that cannot be computed (for example on an empty
        series) are NaN rather than raising.
    """
    # mode() is empty when there are no values; fall back to NaN instead of
    # indexing into an empty result.
    modes = data.mode()
    # Series.quantile uses linear interpolation by default, the same default
    # as np.percentile, and returns NaN for empty data.
    q25, q50, q75 = data.quantile([0.25, 0.50, 0.75])
    return {
        "Mean": data.mean(),
        "Mode": modes.iloc[0] if not modes.empty else np.nan,
        "Median": data.median(),
        "Std Dev": data.std(),
        "Variance": data.var(),
        # Series.sem defaults to ddof=1, the same default as scipy.stats.sem.
        "Std Error": data.sem(),
        "Min": data.min(),
        "Max": data.max(),
        "25%": q25,
        "50%": q50,
        "75%": q75,
        "Skewness": data.skew(),
        "Kurtosis": data.kurt(),
    }


# Whether a one-way ANOVA against 'Outcome' is possible depends only on the
# 'Outcome' column, so decide it once instead of once per column.
can_run_anova = 'Outcome' in df.columns and df['Outcome'].nunique() > 1

# Create an empty list to collect each column's stats as a dict
stats_list = []

# Loop through all numeric columns
for col in df.columns:
    if not pd.api.types.is_numeric_dtype(df[col]):
        continue

    stats_dict = {"Column": col}
    stats_dict.update(summarize_column(df[col].dropna()))

    # ANOVA columns default to NaN so every row has the same keys and the
    # resulting DataFrame columns stay numeric.
    stats_dict["ANOVA_F"] = np.nan
    stats_dict["ANOVA_p"] = np.nan

    # Add ANOVA if 'Outcome' is available and not the current column
    if can_run_anova and col != 'Outcome':
        groups = [values.dropna() for _, values in df.groupby('Outcome')[col]]
        # A group can become empty after dropping NaN; the test needs at
        # least two non-empty groups to be meaningful.
        groups = [values for values in groups if len(values) > 0]
        if len(groups) > 1:
            f_val, p_val = stats.f_oneway(*groups)
            stats_dict["ANOVA_F"] = f_val
            stats_dict["ANOVA_p"] = p_val

    stats_list.append(stats_dict)

# Convert to DataFrame
stats_df = pd.DataFrame(stats_list)

# Display the stats DataFrame. round(2) returns a rounded copy for display
# only; stats_df itself keeps full precision.
print(stats_df.round(2))


                     Column    Mean   Mode  Median  Std Dev  Variance  \
0               Pregnancies    3.85   1.00    3.00     3.37     11.35   
1                   Glucose  120.89  99.00  117.00    31.97   1022.25   
2             BloodPressure   69.11  70.00   72.00    19.36    374.65   
3             SkinThickness   20.54   0.00   23.00    15.95    254.47   
4                   Insulin   79.80   0.00   30.50   115.24  13281.18   
5                       BMI   31.99  32.00   32.00     7.88     62.16   
6  DiabetesPedigreeFunction    0.47   0.25    0.37     0.33      0.11   
7                       Age   33.24  22.00   29.00    11.76    138.30   
8                   Outcome    0.35   0.00    0.00     0.48      0.23   

   Std Error    Min     Max    25%     50%     75%  Skewness  Kurtosis  \
0       0.12   0.00   17.00   1.00    3.00    6.00      0.90      0.16   
1       1.15   0.00  199.00  99.00  117.00  140.25      0.17      0.64   
2       0.70   0.00  122.00  62.00   72.00   80