# Statistical Analysis: Descriptive, Inferential, and Exploratory

## Breast Cancer Diagnosis Dataset

This notebook contains:
- Descriptive Statistics
- Inferential Statistics
- Exploratory Statistical Analysis


In [None]:
# Import necessary libraries
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from scipy.stats import (
    chi2_contingency,
    f_oneway,
    mannwhitneyu,
    normaltest,
    shapiro,
    ttest_ind,
)
warnings.filterwarnings('ignore')

# Set style
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)
%matplotlib inline


In [None]:
# Read the CSV into `df`, report its dimensions and column names,
# then display the first rows as the cell's output.
data_path = '../../data/breast_cancer.csv'
df = pd.read_csv(data_path)
print(
    f"Dataset shape: {df.shape}",
    f"\nColumns: {df.columns.tolist()}",
    sep="\n",
)
df.head()


## 1. Descriptive Statistics


In [None]:
# --- Dataset overview: size, total missing cells, fully duplicated rows ---
n_samples, n_features = df.shape
overview_lines = [
    "=== DATASET INFORMATION ===",
    f"Total samples: {n_samples}",
    f"Total features: {n_features}",
    f"\nMissing values:\n{df.isna().sum().sum()}",
    f"\nDuplicate rows: {df.duplicated().sum()}",
]
print("\n".join(overview_lines))

# --- Summary statistics for every numeric column except the 'id' column ---
numerical_cols = [
    name
    for name in df.select_dtypes(include=[np.number]).columns
    if name != 'id'
]

print("\n=== DESCRIPTIVE STATISTICS ===")
desc_stats = df[numerical_cols].describe()
print(desc_stats)

# --- Class balance of the 'diagnosis' column: raw counts and percentages ---
print("\n=== DIAGNOSIS DISTRIBUTION ===")
diagnosis_col = df['diagnosis']
diagnosis_counts = diagnosis_col.value_counts()
diagnosis_props = diagnosis_col.value_counts(normalize=True).mul(100)
print(diagnosis_counts)
print(f"\nProportions:\n{diagnosis_props}")


## 2. Inferential Statistics


In [None]:
# Split the samples by diagnosis label: rows labelled 'M' go to `malignant`,
# rows labelled 'B' go to `benign`.
malignant = df[df['diagnosis'] == 'M']
benign = df[df['diagnosis'] == 'B']

print(f"Malignant samples: {len(malignant)}")
print(f"Benign samples: {len(benign)}")

# Fail loudly if either group is empty (e.g. the labels are coded differently);
# otherwise every test below would quietly produce NaN.
if malignant.empty or benign.empty:
    raise ValueError(
        "Expected both 'M' and 'B' labels in df['diagnosis'], "
        f"found: {df['diagnosis'].unique().tolist()}"
    )

# Compare the group means of the first ten numerical features, one test per feature.
print("\n=== INDEPENDENT T-TESTS ===")
key_features = numerical_cols[:10]

# NOTE(review): the significance stars are based on unadjusted p-values even
# though len(key_features) tests are run; apply a multiple-comparison
# correction (e.g. Bonferroni) before drawing formal conclusions.
for feature in key_features:
    # Drop missing values explicitly: with scipy's default NaN handling a
    # single NaN would make the whole test return NaN.
    values_m = malignant[feature].dropna()
    values_b = benign[feature].dropna()

    # Welch's t-test (equal_var=False) does not assume the two groups share a
    # variance, unlike the pooled-variance default.
    t_stat, p_value = ttest_ind(values_m, values_b, equal_var=False)
    mean_m = values_m.mean()
    mean_b = values_b.mean()

    if p_value < 0.001:
        significance = "***"
    elif p_value < 0.01:
        significance = "**"
    elif p_value < 0.05:
        significance = "*"
    else:
        significance = ""

    # Scientific notation keeps very small p-values readable; fixed-point
    # formatting would print them as a misleading 0.0000.
    print(f"{feature}: t={t_stat:.4f}, p={p_value:.3e} {significance}")
    print(f"  Malignant mean: {mean_m:.4f}, Benign mean: {mean_b:.4f}")


## 3. Exploratory Statistical Analysis


In [None]:
# Correlation analysis: heatmap of all numerical features, followed by a
# listing of the most strongly correlated feature pairs.
CORR_THRESHOLD = 0.9   # |r| above which a pair is reported as highly correlated
MAX_PAIRS_SHOWN = 10   # upper bound on the number of pairs printed

results_dir = '../../results'
os.makedirs(results_dir, exist_ok=True)

correlation_matrix = df[numerical_cols].corr()

# Explicit figure/axes objects keep the plot independent of pyplot's global state.
fig, ax = plt.subplots(figsize=(20, 16))
sns.heatmap(correlation_matrix, annot=False, cmap='coolwarm', center=0,
            square=True, linewidths=0.5, cbar_kws={"shrink": 0.8}, ax=ax)
ax.set_title('Correlation Matrix of Numerical Features', fontsize=16, fontweight='bold', pad=20)
fig.tight_layout()
fig.savefig(f'{results_dir}/correlation_matrix.png', dpi=300, bbox_inches='tight')
plt.show()

# Find highly correlated features
print(f"=== HIGHLY CORRELATED FEATURES (|r| > {CORR_THRESHOLD}) ===")
corr_values = correlation_matrix.to_numpy()
feature_names = correlation_matrix.columns

# Keep only the strict upper triangle (k=1) so each unordered pair appears once
# and the diagonal is skipped. NaN correlations compare as False and are
# therefore left out. np.where walks the mask row by row, so the pairs come out
# in column order.
pair_rows, pair_cols = np.where(np.triu(np.abs(corr_values) > CORR_THRESHOLD, k=1))
high_corr_pairs = [
    (feature_names[i], feature_names[j], corr_values[i, j])
    for i, j in zip(pair_rows, pair_cols)
]

if not high_corr_pairs:
    print("No feature pairs exceed the threshold.")
else:
    # Show the strongest relationships first, so that truncating the listing
    # drops the weakest pairs rather than whichever happen to come last in
    # column order. `high_corr_pairs` itself keeps its column order.
    strongest_pairs = sorted(
        high_corr_pairs, key=lambda pair: abs(pair[2]), reverse=True
    )[:MAX_PAIRS_SHOWN]
    for name_a, name_b, r_value in strongest_pairs:
        print(f"{name_a} <-> {name_b}: {r_value:.4f}")
    if len(high_corr_pairs) > MAX_PAIRS_SHOWN:
        print(f"(showing the {MAX_PAIRS_SHOWN} strongest of {len(high_corr_pairs)} pairs)")
