# Comprehensive Exploratory Data Analysis (EDA)

This notebook performs a thorough exploratory data analysis of the treatment starts dataset.

## Objectives
- Data quality assessment
- Missing value analysis
- Outlier detection and treatment
- Distribution analysis
- Relationship exploration
- Temporal pattern analysis
- Data summary and insights


In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

# Plot defaults: seaborn's white grid theme and a wide default figure
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})

# Read the treatment starts CSV and show its size plus a preview
df = pd.read_csv('../../data/mock_treatment_starts_2016.csv')
print("Dataset shape:", df.shape)
print("\nFirst few rows:")
df.head(n=5)


In [None]:
# Data preprocessing and overview
# Parse the start dates (month/day/two-digit-year), then derive calendar
# columns from the parsed values in a single pass.
df['TreatmentStart'] = pd.to_datetime(df['TreatmentStart'], format='%m/%d/%y')
start_dt = df['TreatmentStart'].dt
calendar_features = {
    'Year': start_dt.year,
    'Month': start_dt.month,
    'Day': start_dt.day,
    'MonthName': start_dt.strftime('%B'),
    'Weekday': start_dt.day_name(),
    'Quarter': start_dt.quarter,
}
for feature_name, feature_values in calendar_features.items():
    df[feature_name] = feature_values

# Banner followed by shape, date span, per-column null counts and dtypes
print("=" * 60, "DATA OVERVIEW", "=" * 60, sep="\n")
print(f"\nDataset shape: {df.shape}")
print(f"Date range: {df['TreatmentStart'].min()} to {df['TreatmentStart'].max()}")
print(f"\nMissing values:\n{df.isnull().sum()}")
print(f"\nData types:\n{df.dtypes}")
df.head()


In [None]:
# Outlier detection: flag dosages outside the 1.5 * IQR fences
dosage = df['Dosage']
Q1 = dosage.quantile(0.25)
Q3 = dosage.quantile(0.75)
IQR = Q3 - Q1

lower_fence = Q1 - 1.5*IQR
upper_fence = Q3 + 1.5*IQR
# Explicit comparisons keep missing dosages out of the outlier set
# (any comparison against NaN is False).
is_outlier = (dosage < lower_fence) | (dosage > upper_fence)
outliers = df[is_outlier]

print("Outliers:", len(outliers))
if not outliers.empty:
    # Show only the identifying columns for the flagged rows
    print(outliers[['PatientID', 'Drug', 'Dosage']])

# Comprehensive visualizations: a 2x3 grid of summary plots
fig, axes = plt.subplots(2, 3, figsize=(18, 10))

# Distribution
# Missing dosages are dropped explicitly so the histogram does not depend on
# how the plotting backend treats NaN values.
axes[0, 0].hist(df['Dosage'].dropna(), bins=15, edgecolor='black', alpha=0.7)
axes[0, 0].set_title('Dosage Distribution')
axes[0, 0].set_xlabel('Dosage')
axes[0, 0].set_ylabel('Frequency')

# Box plot by Drug
sns.boxplot(data=df, x='Drug', y='Dosage', ax=axes[0, 1])
axes[0, 1].set_title('Dosage by Drug')

# Temporal pattern
# January-June are always shown (zero-filled when empty). If the data holds
# later months, the axis is extended through the latest one so those rows are
# not silently dropped by the reindex below.
# NOTE(review): assumes 'MonthName' holds English month names - confirm locale.
all_month_names = ['January', 'February', 'March', 'April', 'May', 'June',
                   'July', 'August', 'September', 'October', 'November', 'December']
observed_months = df['Month'].dropna()
last_month = 6 if observed_months.empty else max(6, int(observed_months.max()))
month_order = all_month_names[:last_month]
monthly_counts = df['MonthName'].value_counts().reindex(month_order, fill_value=0)
axes[0, 2].bar(range(len(monthly_counts)), monthly_counts.values)
axes[0, 2].set_xticks(range(len(monthly_counts)))
axes[0, 2].set_xticklabels(monthly_counts.index, rotation=45)
axes[0, 2].set_title('Treatment Starts by Month')

# Drug usage heatmap (rows follow the same calendar order as the bar chart)
drug_month = pd.crosstab(df['MonthName'], df['Drug']).reindex(month_order, fill_value=0)
sns.heatmap(drug_month, annot=True, fmt='d', cmap='YlOrRd', ax=axes[1, 0])
axes[1, 0].set_title('Drug Usage by Month')

# Correlation
# Work on a copy so the numeric drug code is not added to df itself.
# Drugs not listed in the mapping become NaN in 'Drug_encoded'.
df_encoded = df.copy()
df_encoded['Drug_encoded'] = df_encoded['Drug'].map({'Cisplatin': 0, 'Nivolumab': 1})
corr = df_encoded[['Dosage', 'Drug_encoded', 'Month']].corr()
sns.heatmap(corr, annot=True, fmt='.3f', cmap='coolwarm', center=0, ax=axes[1, 1])
axes[1, 1].set_title('Correlation Matrix')

# Scatter plot, colored by the numeric drug code
scatter = axes[1, 2].scatter(df['Month'], df['Dosage'], c=df_encoded['Drug_encoded'], cmap='viridis', alpha=0.6)
axes[1, 2].set_title('Month vs Dosage (colored by Drug)')
axes[1, 2].set_xlabel('Month')
axes[1, 2].set_ylabel('Dosage')
plt.colorbar(scatter, ax=axes[1, 2])

plt.tight_layout()
plt.show()
