# Pima Indians Diabetes Dataset - Statistical Analysis

## Descriptive, Inferential, and Exploratory Statistical Analysis

This notebook performs comprehensive statistical analysis on the Pima Indians Diabetes dataset.


In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import ttest_ind, chi2_contingency, normaltest, shapiro
import warnings
# NOTE(review): with no category given, this suppresses every warning raised from
# here on, including deprecation notices from pandas/scipy/seaborn.
warnings.filterwarnings('ignore')

# Plot defaults used by the figures below: seaborn "whitegrid" style and a
# default matplotlib figure size of (12, 6).
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)
%matplotlib inline


In [None]:
# Column labels, in file order, taken from the dataset description
columns = [
    'Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
    'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome',
]

# Read the raw CSV: the first nine lines are skipped and no row is treated as a
# header, so the labels above are attached explicitly afterwards.
df = pd.read_csv('../../data/pima-indians-diabetes.csv', skiprows=9, header=None)
df.columns = columns

# Quick sanity overview of what was loaded
print(f"Dataset Shape: {df.shape}")
print(f"\nColumn Names: {list(df.columns)}")
print(f"\nData Types:\n{df.dtypes}")
df.head()


## 1. Descriptive Statistics


In [None]:
# Print pandas' built-in per-column summary under a banner heading
banner = "=" * 60
print(banner, "DESCRIPTIVE STATISTICS", banner, sep="\n")
print(df.describe())


In [None]:
# Extended per-column statistics (spread and shape measures) for numeric columns
print("\n" + "=" * 60)
print("ADDITIONAL STATISTICS")
print("=" * 60)

numeric_cols = df.select_dtypes(include=[np.number]).columns
numeric_data = df[numeric_cols]

# Computed once because both the extremes and the quartiles are reused below
col_min = numeric_data.min()
col_max = numeric_data.max()
first_quartile = numeric_data.quantile(0.25)
third_quartile = numeric_data.quantile(0.75)

# One row per column, one column per statistic
desc_stats = pd.DataFrame({
    'Mean': numeric_data.mean(),
    'Median': numeric_data.median(),
    'Std Dev': numeric_data.std(),
    'Variance': numeric_data.var(),
    'Skewness': numeric_data.skew(),
    'Kurtosis': numeric_data.kurtosis(),
    'Min': col_min,
    'Max': col_max,
    'Range': col_max - col_min,
    'IQR': third_quartile - first_quartile,
})

print(desc_stats)


In [None]:
# Report explicit nulls first, then zeros in columns where 0 is suspicious
rule = "=" * 60
n_rows = len(df)

print("\n" + rule)
print("MISSING VALUES ANALYSIS")
print(rule)

missing = df.isnull().sum()
missing_pct = missing / n_rows * 100
missing_df = pd.DataFrame({
    'Missing Count': missing,
    'Missing Percentage': missing_pct,
})
# Only list columns that actually contain nulls
has_missing = missing_df['Missing Count'] > 0
print(missing_df[has_missing])

# Check for zeros (which might represent missing values in this dataset)
print("\n" + rule)
print("ZERO VALUES (Potential Missing Data)")
print(rule)
zero_check_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
zero_counts = (df[zero_check_cols] == 0).sum()
for col, zero_count in zero_counts.items():
    print(f"{col}: {zero_count} zeros ({zero_count/len(df)*100:.2f}%)")


In [None]:
# Class balance of the target: counts and percentages, then a bar chart
print("\n" + "=" * 60)
print("OUTCOME DISTRIBUTION")
print("=" * 60)

outcome = df['Outcome']
outcome_counts = outcome.value_counts()
outcome_pct = outcome.value_counts(normalize=True) * 100
outcome_df = pd.DataFrame({'Count': outcome_counts, 'Percentage': outcome_pct})
print(outcome_df)

# Bar chart of the class counts, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(8, 6))
outcome_counts.plot(kind='bar', color=['skyblue', 'salmon'], ax=ax)
ax.set_title('Distribution of Diabetes Outcomes')
ax.set_xlabel('Outcome (0 = No Diabetes, 1 = Diabetes)')
ax.set_ylabel('Count')
# Keep the 0/1 tick labels horizontal
ax.tick_params(axis='x', labelrotation=0)
plt.show()


## 2. Inferential Statistics


In [None]:
# Two-sample t-tests: compare each feature's mean between the diabetic and
# non-diabetic groups.
print("=" * 60)
print("INDEPENDENT T-TESTS (Diabetic vs Non-Diabetic)")
print("=" * 60)

diabetic = df[df['Outcome'] == 1]
non_diabetic = df[df['Outcome'] == 0]

numeric_features = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
                    'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']

# NOTE(review): zeros in Glucose/BloodPressure/SkinThickness/Insulin/BMI are still
# included in the samples; if they are placeholders for missing data they bias the
# group means. No multiple-comparison correction is applied across the features.
ttest_results = []
for col in numeric_features:
    diabetic_values = diabetic[col]
    non_diabetic_values = non_diabetic[col]
    diabetic_mean = diabetic_values.mean()
    non_diabetic_mean = non_diabetic_values.mean()

    # equal_var=False selects Welch's t-test. The default pooled-variance test
    # assumes both groups share one variance, which is not checked here and is
    # risky because the two groups differ in size.
    stat, p_value = ttest_ind(diabetic_values, non_diabetic_values, equal_var=False)

    ttest_results.append({
        'Feature': col,
        'T-statistic': stat,
        'P-value': p_value,
        'Significant': 'Yes' if p_value < 0.05 else 'No',
        'Diabetic Mean': diabetic_mean,
        'Non-Diabetic Mean': non_diabetic_mean,
        'Difference': diabetic_mean - non_diabetic_mean
    })

ttest_df = pd.DataFrame(ttest_results)
print(ttest_df)


In [None]:
# Pairwise correlations between the features and the outcome
print("\n" + "=" * 60)
print("CORRELATION ANALYSIS")
print("=" * 60)

corr_columns = numeric_features + ['Outcome']
correlation_matrix = df[corr_columns].corr()

# Rank every column by its correlation with the target, strongest positive first
outcome_corr = correlation_matrix['Outcome'].sort_values(ascending=False)
print(outcome_corr)

# Annotated heatmap of the full matrix, colour scale centred on zero
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
    ax=ax,
)
ax.set_title('Correlation Matrix')
fig.tight_layout()
plt.show()


In [None]:
# Chi-square tests of independence between binned features and Outcome
print("\n" + "=" * 60)
print("CHI-SQUARE TESTS")
print("=" * 60)


def report_chi_square(title, table):
    """Run a chi-square test of independence on a contingency table and print it.

    Parameters:
        title: Heading printed above the result (without the trailing colon).
        table: Contingency table of observed counts, e.g. from pd.crosstab.

    Returns:
        The (chi2, p_value, dof, expected) tuple from chi2_contingency.
    """
    chi2, p_value, dof, expected = chi2_contingency(table)
    print(f"{title}:")
    print(f"Chi-square statistic: {chi2:.4f}")
    print(f"P-value: {p_value:.4f}")
    print(f"Significant: {'Yes' if p_value < 0.05 else 'No'}")
    # The chi-square approximation is unreliable when expected counts are small
    if (expected < 5).any():
        print("Warning: some expected counts are below 5; interpret with caution")
    print("\nContingency Table:")
    print(table)
    return chi2, p_value, dof, expected


# Create age groups. right=False makes each bin closed on the left, giving
# [0, 30), [30, 40), [40, 50), [50, inf) so the intervals agree with their labels.
# The default right-closed bins would label age 30 as '<30' and age 50 as '40-50'.
df['AgeGroup'] = pd.cut(df['Age'], bins=[0, 30, 40, 50, np.inf], right=False,
                        labels=['<30', '30-40', '40-50', '50+'])

# Create BMI categories with left-closed bins: <18.5, [18.5, 25), [25, 30), >=30.
# A BMI of 0 is masked to NaN first so it stays uncategorised (as it was with the
# previous right-closed bins) instead of being counted as Underweight.
bmi_positive = df['BMI'].where(df['BMI'] > 0)
df['BMICategory'] = pd.cut(bmi_positive, bins=[0, 18.5, 25, 30, np.inf], right=False,
                           labels=['Underweight', 'Normal', 'Overweight', 'Obese'])

# Chi-square test: Age Group vs Outcome
age_outcome = pd.crosstab(df['AgeGroup'], df['Outcome'])
chi2, p_value, dof, expected = report_chi_square("Age Group vs Outcome", age_outcome)

# Chi-square test: BMI Category vs Outcome (the category was previously built but
# never tested)
print()
bmi_outcome = pd.crosstab(df['BMICategory'], df['Outcome'])
bmi_chi2, bmi_p_value, bmi_dof, bmi_expected = report_chi_square(
    "BMI Category vs Outcome", bmi_outcome)


In [None]:
# Calculate 95% confidence intervals for the mean of each numeric feature
print("=" * 60)
print("95% CONFIDENCE INTERVALS")
print("=" * 60)

confidence_intervals = []
for col in numeric_features:
    values = df[col]
    mean = values.mean()
    std = values.std()
    # count() excludes NaN, matching the observations mean() and std() actually
    # use; len() would also count NaN rows and understate the standard error.
    n = values.count()
    std_error = std / np.sqrt(n)

    # 95% CI from the t-distribution with n-1 degrees of freedom; the interval is
    # symmetric around the mean, so the margin is the distance to either bound.
    ci_lower, ci_upper = stats.t.interval(0.95, n - 1, loc=mean, scale=std_error)
    margin_error = ci_upper - mean

    confidence_intervals.append({
        'Feature': col,
        'Mean': mean,
        'CI Lower': ci_lower,
        'CI Upper': ci_upper,
        'Margin of Error': margin_error
    })

ci_df = pd.DataFrame(confidence_intervals)
print(ci_df)


## 3. Summary of Statistical Findings


In [None]:
# Closing summary of the analysis.
# The record and feature counts are read from the loaded data so the text cannot
# drift from what was actually analysed.
# NOTE(review): the remaining bullet points are fixed text, not derived from the
# results above; re-check them whenever the data or the tests change.
print("=" * 60)
print("STATISTICAL ANALYSIS SUMMARY")
print("=" * 60)
print("\n1. Descriptive Statistics:")
print(f"   - Dataset contains {len(df)} records with {len(numeric_features)} features")
print("   - Outcome distribution shows class imbalance")
print("   - Several features contain zeros that may represent missing data")

print("\n2. Inferential Statistics:")
print("   - T-tests reveal significant differences between diabetic and non-diabetic groups")
print("   - Glucose, BMI, and Age show strong correlations with Outcome")
print("   - Chi-square tests indicate associations between categorical variables and outcome")

print("\n3. Key Insights:")
print("   - Higher glucose levels are strongly associated with diabetes")
print("   - BMI and age are important predictors")
print("   - Data quality issues (zeros) need to be addressed")
print("=" * 60)
