# Statistical Analysis: Descriptive, Inferential, and Exploratory

This notebook performs comprehensive statistical analysis on the treatment starts dataset.

## Objectives
- Descriptive Statistics: Summary statistics, distributions, central tendencies
- Inferential Statistics: Hypothesis testing, confidence intervals
- Exploratory Statistics: Data patterns, relationships, anomalies


In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import normaltest, shapiro, ttest_ind, mannwhitneyu
import warnings
# Suppress every warning for the rest of the session. Note that this also
# hides warnings raised by libraries used in later cells.
warnings.filterwarnings(action='ignore')

# Plot appearance: seaborn grid theme and a wide default figure size
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})

# Read the mock treatment-starts CSV (path is relative to this notebook)
data_path = '../../data/mock_treatment_starts_2016.csv'
df = pd.read_csv(data_path)
print(f"Dataset shape: {df.shape}")
print("\nFirst few rows:")
df.head()


In [None]:
# Data preprocessing: parse the start date once, then derive the calendar
# columns from the parsed values.
start_dates = pd.to_datetime(df['TreatmentStart'], format='%m/%d/%y')
df['TreatmentStart'] = start_dates
df['Year'] = start_dates.dt.year
df['Month'] = start_dates.dt.month
df['Day'] = start_dates.dt.day
df['MonthName'] = start_dates.dt.strftime('%B')  # full month name in the current locale

# Report missing values per column, followed by the frame summary
print("Missing values:")
print(df.isna().sum())
print("\nDataset info:")
df.info()


## 1. Descriptive Statistics


In [None]:
# Dosage summary statistics: whole dataset first, then one row per drug
banner = "=" * 60

print(banner)
print("DESCRIPTIVE STATISTICS - DOSAGE")
print(banner)
print(df['Dosage'].describe())

print(f"\n{banner}")
print("Summary by Drug")
print(banner)
dosage_by_drug = df.groupby('Drug')['Dosage']
print(dosage_by_drug.describe())


In [None]:
# Central tendency and dispersion of the Dosage column
dosage = df['Dosage']
rule = "-" * 40

# mode() returns every tied value; report the first, or a placeholder if none
modes = dosage.mode()
mode_text = modes.values[0] if len(modes) > 0 else 'No mode'
q25, q75 = dosage.quantile(0.25), dosage.quantile(0.75)

print("Measures of Central Tendency - Dosage")
print(rule)
print(f"Mean: {dosage.mean():.2f}")
print(f"Median: {dosage.median():.2f}")
print(f"Mode: {mode_text}")

print("\nMeasures of Dispersion")
print(rule)
print(f"Standard Deviation: {dosage.std():.2f}")
print(f"Variance: {dosage.var():.2f}")
print(f"Range: {dosage.max() - dosage.min():.2f}")
print(f"IQR: {q75 - q25:.2f}")
print(f"CV (Coefficient of Variation): {(dosage.std() / dosage.mean()) * 100:.2f}%")


In [None]:
# Skewness and Kurtosis
# Compute each statistic once so the printed value and the interpretation
# below always refer to the same number.
dosage_skew = df['Dosage'].skew()
dosage_kurtosis = df['Dosage'].kurtosis()  # pandas uses Fisher's definition (normal == 0)

print("Shape Statistics - Dosage")
print("-" * 40)
print(f"Skewness: {dosage_skew:.4f}")
print(f"Kurtosis: {dosage_kurtosis:.4f}")

# Interpretation (rule of thumb: |skew| < 0.5 is treated as symmetric).
# Outside that band the sign alone decides the direction, so a value of
# exactly +0.5 is reported as right-skewed instead of falling through to the
# left-skewed branch. A NaN skew is reported as undefined rather than being
# mislabelled as left-skewed.
if pd.isna(dosage_skew):
    print("Skewness is undefined for this sample")
elif abs(dosage_skew) < 0.5:
    print("Distribution is approximately symmetric")
elif dosage_skew > 0:
    print("Distribution is right-skewed (positive skew)")
else:
    print("Distribution is left-skewed (negative skew)")


In [None]:
# Categorical descriptive statistics
print("=" * 60)
print("DESCRIPTIVE STATISTICS - CATEGORICAL VARIABLES")
print("=" * 60)
print("\nDrug Distribution:")
print(df['Drug'].value_counts())
print(f"\nProportions:\n{df['Drug'].value_counts(normalize=True) * 100}")

print("\n" + "=" * 60)
print("Treatment Starts by Month")
print("=" * 60)
# Sorting the index of month *names* gives alphabetical order (April, August,
# December, ...). Derive calendar order from the numeric Month column instead
# and reindex the counts with it so the table reads chronologically.
month_order = df.sort_values('Month')['MonthName'].dropna().unique()
print(df['MonthName'].value_counts().reindex(month_order))


## 2. Inferential Statistics


In [None]:
# Test for normality
# Per-drug dosage samples; later cells reuse these two variables.
cisplatin_dosage = df[df['Drug'] == 'Cisplatin']['Dosage']
nivolumab_dosage = df[df['Drug'] == 'Nivolumab']['Dosage']

ALPHA = 0.05  # significance level behind the "Normal: Yes/No" verdict


def run_normality_test(test_func, test_name, group, sample, min_n, max_n=None):
    """Run one normality test on a sample and print the outcome.

    Parameters
    ----------
    test_func : callable
        Function that takes the sample and returns ``(statistic, p_value)``,
        e.g. ``shapiro`` or ``normaltest``.
    test_name : str
        Label printed in the result header.
    group : str
        Name of the group the sample belongs to (printed in the header).
    sample : pandas.Series
        Observations to test; missing values are dropped before testing.
    min_n : int
        Smallest sample size for which the test is run.
    max_n : int, optional
        Largest sample size for which the test is run (no upper limit if None).

    Returns
    -------
    tuple
        ``(statistic, p_value)``, or ``(nan, nan)`` when the test was skipped
        because the sample size is outside ``[min_n, max_n]``.
    """
    values = sample.dropna()
    n = len(values)
    print(f"\n{group} - {test_name}:")

    too_small = n < min_n
    too_large = max_n is not None and n > max_n
    if too_small or too_large:
        # Say so explicitly instead of silently printing nothing.
        print(f"  Skipped: n={n} is outside the supported sample size range")
        return np.nan, np.nan

    stat, p = test_func(values)
    print(f"  Statistic: {stat:.4f}, p-value: {p:.4f}")
    # p > ALPHA means normality is not rejected; it is not proof of normality.
    print(f"  Normal: {'Yes' if p > ALPHA else 'No'}")
    return stat, p


print("=" * 60)
print("NORMALITY TESTS")
print("=" * 60)

# Shapiro-Wilk: SciPy requires at least 3 observations and documents that the
# p-value may be inaccurate above 5000, so the test is run inside that range
# rather than being dropped for every sample larger than 50.
stat_cis, p_cis = run_normality_test(
    shapiro, "Shapiro-Wilk Test", "Cisplatin", cisplatin_dosage, min_n=3, max_n=5000
)
stat_niv, p_niv = run_normality_test(
    shapiro, "Shapiro-Wilk Test", "Nivolumab", nivolumab_dosage, min_n=3, max_n=5000
)

# D'Agostino's K^2: the skewness component is not valid for fewer than 8
# observations (the kurtosis component is only considered reliable for n >= 20).
stat_dag_cis, p_dag_cis = run_normality_test(
    normaltest, "D'Agostino's K^2 Test", "Cisplatin", cisplatin_dosage, min_n=8
)
stat_dag_niv, p_dag_niv = run_normality_test(
    normaltest, "D'Agostino's K^2 Test", "Nivolumab", nivolumab_dosage, min_n=8
)


In [None]:
# Hypothesis Testing: Compare mean dosages between drugs
ALPHA = 0.05  # significance level used for both tests below

print("=" * 60)
print("HYPOTHESIS TESTING: COMPARING DOSAGE MEANS")
print("=" * 60)

# H0: Mean dosage of Cisplatin = Mean dosage of Nivolumab
# H1: Mean dosage of Cisplatin ≠ Mean dosage of Nivolumab

# Welch's t-test (parametric): equal_var=False means the two groups are NOT
# assumed to share a common variance.
t_stat, p_value_ttest = ttest_ind(cisplatin_dosage, nivolumab_dosage, equal_var=False)
print(f"\nIndependent t-test (Welch's):")
print(f"  t-statistic: {t_stat:.4f}")
print(f"  p-value: {p_value_ttest:.4f}")
print(f"  Significant difference: {'Yes' if p_value_ttest < ALPHA else 'No'} (α={ALPHA})")

# Mann-Whitney U test (non-parametric): a rank-based comparison of the two
# samples that does not rely on normality; two-sided to match H1 above.
u_stat, p_value_mw = mannwhitneyu(cisplatin_dosage, nivolumab_dosage, alternative='two-sided')
print(f"\nMann-Whitney U test (non-parametric):")
print(f"  U-statistic: {u_stat:.4f}")
print(f"  p-value: {p_value_mw:.4f}")
print(f"  Significant difference: {'Yes' if p_value_mw < ALPHA else 'No'} (α={ALPHA})")


In [None]:
# Confidence Intervals
CONFIDENCE = 0.95  # confidence level for the intervals reported below


def mean_confidence_interval(sample, confidence=0.95):
    """Compute a t-based confidence interval for the mean of a sample.

    Missing values are dropped first so that the sample size used for the
    standard error and the degrees of freedom matches the observations that
    actually enter the mean and standard deviation.

    Parameters
    ----------
    sample : pandas.Series
        Observations; NaN entries are ignored.
    confidence : float, default 0.95
        Confidence level of the interval.

    Returns
    -------
    tuple
        ``(mean, std, n, se, (lower, upper))``. When fewer than two
        observations are available, ``se`` and both bounds are NaN.
    """
    values = sample.dropna()
    n = len(values)
    mean = values.mean()
    std = values.std()
    if n < 2:
        # The sample standard deviation is undefined with fewer than two
        # observations, so no interval can be built.
        return mean, std, n, np.nan, (np.nan, np.nan)
    se = std / np.sqrt(n)
    # The confidence level is passed positionally, as in the original call.
    ci = stats.t.interval(confidence, n - 1, loc=mean, scale=se)
    return mean, std, n, se, ci


print("=" * 60)
print(f"CONFIDENCE INTERVALS ({CONFIDENCE:.0%})")
print("=" * 60)

# Keep the per-drug variable names so they stay available to other cells.
cis_mean, cis_std, cis_n, cis_se, cis_ci = mean_confidence_interval(cisplatin_dosage, CONFIDENCE)
niv_mean, niv_std, niv_n, niv_se, niv_ci = mean_confidence_interval(nivolumab_dosage, CONFIDENCE)

for drug_label, drug_mean, drug_ci in (
    ("Cisplatin", cis_mean, cis_ci),
    ("Nivolumab", niv_mean, niv_ci),
):
    print(f"\n{drug_label} Mean Dosage:")
    print(f"  Mean: {drug_mean:.2f}")
    print(f"  {CONFIDENCE:.0%} CI: [{drug_ci[0]:.2f}, {drug_ci[1]:.2f}]")


## 3. Exploratory Statistics


In [None]:
# Exploratory: flag dosages lying more than 1.5 * IQR beyond the quartiles
dosage = df['Dosage']
Q1, Q3 = dosage.quantile(0.25), dosage.quantile(0.75)
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Boolean mask of rows strictly below the lower or strictly above the upper bound
is_outlier = dosage.lt(lower_bound) | dosage.gt(upper_bound)
outliers = df[is_outlier]

banner = "=" * 60
print(banner)
print("OUTLIER DETECTION (IQR Method)")
print(banner)
print(f"Lower bound: {lower_bound:.2f}")
print(f"Upper bound: {upper_bound:.2f}")
print(f"\nNumber of outliers: {len(outliers)}")
if len(outliers):
    print("\nOutliers:")
    print(outliers[['PatientID', 'Drug', 'Dosage']])


In [None]:
# Exploratory: Temporal patterns
# Month names sort alphabetically (April, August, December, ...), which hides
# any trend over time. Order the rows by the numeric Month column and group
# with sort=False so the groups keep that calendar order.
df_by_month = df.sort_values('Month')

print("=" * 60)
print("TEMPORAL PATTERNS")
print("=" * 60)
print("\nTreatment starts by month:")
# monthly_counts is also used by the bar chart in the visualization cell.
monthly_counts = df_by_month.groupby('MonthName', sort=False)['PatientID'].count()
print(monthly_counts)

print("\nAverage dosage by month:")
monthly_dosage = df_by_month.groupby('MonthName', sort=False)['Dosage'].mean()
print(monthly_dosage)


In [None]:
# Visualizations for exploratory statistics
# Requires `monthly_counts` from the temporal-patterns cell.
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# Compute the reference values once; each is used for a line and its label
dosage_mean = df['Dosage'].mean()
dosage_median = df['Dosage'].median()

# 1. Distribution of Dosage
axes[0, 0].hist(df['Dosage'], bins=15, edgecolor='black', alpha=0.7)
axes[0, 0].axvline(dosage_mean, color='r', linestyle='--', label=f'Mean: {dosage_mean:.2f}')
axes[0, 0].axvline(dosage_median, color='g', linestyle='--', label=f'Median: {dosage_median:.2f}')
axes[0, 0].set_title('Distribution of Dosage')
axes[0, 0].set_xlabel('Dosage')
axes[0, 0].set_ylabel('Frequency')
axes[0, 0].legend()
axes[0, 0].grid(True, alpha=0.3)

# 2. Box plot by Drug
df.boxplot(column='Dosage', by='Drug', ax=axes[0, 1])
# pandas adds a figure-wide "Boxplot grouped by Drug" title when `by` is used;
# clear it so it does not sit above all four panels.
fig.suptitle('')
axes[0, 1].set_title('Dosage Distribution by Drug')
axes[0, 1].set_xlabel('Drug')
axes[0, 1].set_ylabel('Dosage')

# 3. Treatment starts per month (bar order follows the index of monthly_counts)
monthly_counts.plot(kind='bar', ax=axes[1, 0], color='steelblue')
axes[1, 0].set_title('Treatment Starts by Month')
axes[1, 0].set_xlabel('Month')
axes[1, 0].set_ylabel('Count')
axes[1, 0].tick_params(axis='x', rotation=45)

# 4. Q-Q plot for normality check
stats.probplot(df['Dosage'], dist="norm", plot=axes[1, 1])
axes[1, 1].set_title('Q-Q Plot: Dosage Normality Check')
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()
