# Descriptive, Inferential, and Exploratory Statistical Analysis

## Cancer Incidence Data Analysis

This notebook covers:
- Descriptive Statistics
- Inferential Statistics
- Exploratory Data Analysis


In [None]:
# Import necessary libraries
import warnings

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
# Alias that keeps pointing at scipy.stats even if a later cell rebinds `stats`
from scipy import stats as sp_stats
from scipy.stats import normaltest, shapiro, kstest
warnings.filterwarnings('ignore')

# Set style
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)
%matplotlib inline


## 1. Data Loading and Initial Exploration


In [None]:
# Read the incidence CSV, trying UTF-8 first and falling back to Latin-1
csv_path = '../data/incd.csv'
try:
    df = pd.read_csv(csv_path, encoding='utf-8')
except UnicodeDecodeError:
    # The file is not valid UTF-8; Latin-1 maps every byte to a character,
    # so this second attempt cannot fail on decoding
    df = pd.read_csv(csv_path, encoding='latin-1')

# Quick overview: dimensions, column names, then a preview of the first rows
print("Dataset Shape:", df.shape)
print("\nColumn Names:", df.columns.tolist(), sep="\n")
print("\nFirst few rows:")
df.head()


In [None]:
# Clean column names and prepare data
# (clean_data is a project helper; its exact behaviour is defined in data_loader)
import sys

# Make the project's helper modules importable. The membership check keeps
# sys.path free of duplicate entries when this cell is re-run.
scripts_dir = '../../scripts/python'
if scripts_dir not in sys.path:
    sys.path.append(scripts_dir)
from data_loader import clean_data

# Keep the raw frame `df` untouched and bind the cleaned result to a new name
df_clean = clean_data(df)
print(f"Clean dataset shape: {df_clean.shape}")
df_clean.head()


## 2. Descriptive Statistics


In [None]:
# Import descriptive statistics functions
from descriptive_stats import calculate_descriptive_stats

# Calculate descriptive statistics.
# The result is bound to `desc_stats` rather than `stats`: `stats` is the
# scipy.stats module imported in the first cell and must stay bound to it for
# the scipy calls made elsewhere in this notebook.
desc_stats = calculate_descriptive_stats(df_clean)


## 3. Inferential Statistics


In [None]:
# Normality tests and a 95% confidence interval for the mean incidence rate.
# Missing values are dropped once so every statistic below is based on the same
# observations (pandas' mean()/std() skip NaN, whereas len() counts them).
incidence = df_clean['Incidence_Rate'].dropna()
sample_size = len(incidence)

# SciPy documents that the Shapiro-Wilk p-value may be inaccurate for N > 5000,
# so larger datasets are tested on a random subsample of 5000 observations.
# The fixed random_state makes the subsample reproducible across runs.
shapiro_sample = incidence.sample(min(5000, sample_size), random_state=42)
shapiro_stat, shapiro_p = shapiro(shapiro_sample)
print(f"Shapiro-Wilk Test: statistic={shapiro_stat:.4f}, p-value={shapiro_p:.4f}")

# D'Agostino-Pearson omnibus test on the full set of observations
dago_stat, dago_p = normaltest(incidence)
print(f"D'Agostino Test: statistic={dago_stat:.4f}, p-value={dago_p:.4f}")

# Confidence intervals
mean_rate = incidence.mean()
std_rate = incidence.std()
n = sample_size
se = std_rate / np.sqrt(n)  # standard error of the mean
# Student's t interval with n-1 degrees of freedom. `sp_stats` is the
# scipy.stats alias from the import cell; the bare name `stats` may have been
# rebound by an earlier cell.
ci_95 = sp_stats.t.interval(0.95, n - 1, loc=mean_rate, scale=se)
print(f"95% CI for mean: [{ci_95[0]:.2f}, {ci_95[1]:.2f}]")


## 4. Exploratory Visualizations


In [None]:
# Distribution plots: histogram, box plot, normal Q-Q plot and density estimate.
# Missing values are dropped once so all four panels show the same observations
# and NaN cannot break the histogram range or blank out the box plot.
incidence = df_clean['Incidence_Rate'].dropna()

fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# Top-left: histogram with the sample mean marked
axes[0, 0].hist(incidence, bins=50, edgecolor='black', alpha=0.7)
axes[0, 0].set_title('Distribution of Incidence Rate')
axes[0, 0].set_xlabel('Incidence Rate')
axes[0, 0].set_ylabel('Frequency')
axes[0, 0].axvline(incidence.mean(), color='r', linestyle='--', label='Mean')
axes[0, 0].legend()

# Top-right: box plot
axes[0, 1].boxplot(incidence)
axes[0, 1].set_title('Box Plot of Incidence Rate')
axes[0, 1].set_ylabel('Incidence Rate')

# Bottom-left: Q-Q plot against the normal distribution. `sp_stats` is the
# scipy.stats alias from the import cell; the bare name `stats` may have been
# rebound by an earlier cell.
sp_stats.probplot(incidence, dist="norm", plot=axes[1, 0])
axes[1, 0].set_title('Q-Q Plot')

# Bottom-right: density estimate
incidence.plot(kind='density', ax=axes[1, 1])
axes[1, 1].set_title('Density Plot')
axes[1, 1].set_xlabel('Incidence Rate')

plt.tight_layout()
plt.show()
