# Healthcare Patient Risk Analysis

This notebook demonstrates an automated patient risk stratification system built from three stages:
1. **Outlier Detection** - Clean medical data
2. **K-Means Clustering** - Identify patient symptom profiles
3. **Classification** - Predict disease risks

In [None]:
import sys
import os
sys.path.insert(0, os.path.join(os.getcwd(), '..', 'src'))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from data_loader import HeartDiseaseDataLoader
from outlier_detection import OutlierDetector
from clustering import PatientClustering
from classification import DiseaseRiskClassifier
from pipeline import PatientRiskAnalysisPipeline

# Plot styling: use seaborn's 'whitegrid' style for any figures drawn later.
sns.set_style('whitegrid')
# IPython line magic (not plain Python, so this cell needs an IPython/Jupyter
# kernel): render matplotlib figures inline in the notebook output.
%matplotlib inline

## 1. Load Data

In [None]:
# Load the heart disease dataset.
# The raw frame gets its own name instead of being bound to `data` and then
# immediately overwritten: if preprocess() raises, `data` is not left pointing
# at unprocessed records for later cells to pick up silently.
loader = HeartDiseaseDataLoader()
raw_data = loader.load_data()
# preprocess() takes no arguments; presumably it works on the frame the loader
# just read -- TODO confirm against the data_loader module.
data = loader.preprocess()

print(f"Dataset shape: {data.shape}")
# Plain string: there is nothing to interpolate, so no f-prefix is needed.
print("\nFirst few rows:")
# Bare last expression so the notebook renders the preview.
data.head()

In [None]:
# Summarise the dataset: the loader's target distribution first, then the
# output of data.describe().
info = loader.get_data_info()
target_distribution = info['target_distribution']
print(f"Target distribution: {target_distribution}")
# Last expression of the cell, so its result is displayed.
data.describe()

## 2. Outlier Detection

In [None]:
# Outlier handling with the 'iqr' method (threshold 1.5), restricted to every
# column except 'target'.
feature_cols = [name for name in data.columns if name != 'target']
detector = OutlierDetector(method='iqr', threshold=1.5)

cleaned_data, outlier_stats = detector.fit_transform(data, columns=feature_cols)

# A single print call; sep="\n" puts each statistic on its own line.
print(
    f"Original size: {outlier_stats['original_size']}",
    f"Cleaned size: {outlier_stats['cleaned_size']}",
    f"Outliers removed: {outlier_stats['total_outliers']}",
    f"Outlier percentage: {outlier_stats['outlier_percentage']:.2f}%",
    sep="\n",
)

## 3. Patient Clustering

In [None]:
# Group the cleaned patients into 3 clusters (K-Means, per the notebook intro).
clustering = PatientClustering(n_clusters=3, random_state=42)
# The same feature selection is passed to both calls below.
feature_kwargs = {'feature_columns': feature_cols}
clustering.fit(cleaned_data, **feature_kwargs)

# Cluster statistics; kept as the final expression so the notebook shows them.
cluster_stats = clustering.get_cluster_statistics(cleaned_data, **feature_kwargs)
print("Cluster Statistics:")
cluster_stats

In [None]:
# Report the clustering quality metrics returned by evaluate().
metrics = clustering.evaluate(cleaned_data, feature_columns=feature_cols)
print("\n".join([
    f"Silhouette Score: {metrics['silhouette_score']:.4f}",
    f"Davies-Bouldin Score: {metrics['davies_bouldin_score']:.4f}",
    f"Inertia: {metrics['inertia']:.2f}",
]))

## 4. Disease Risk Classification

In [None]:
# Set up the 'random_forest' classifier and split the cleaned data.
classifier = DiseaseRiskClassifier(algorithm='random_forest', random_state=42)

# test_size=0.2 is presumably the held-out fraction -- TODO confirm against the
# classification module.
split = classifier.prepare_data(cleaned_data, target_column='target', test_size=0.2)
X_train, X_test, y_train, y_test = split

print(
    f"Training set size: {len(X_train)}",
    f"Test set size: {len(X_test)}",
    sep="\n",
)

In [None]:
# Fit on the training split, then score on the test split.
classifier.fit(X_train, y_train)
metrics = classifier.evaluate(X_test, y_test)

# Collect the report lines first so the optional ROC AUC line can be appended
# before everything is printed in one go.
report_lines = [
    f"Accuracy: {metrics['accuracy']:.4f}",
    f"Precision: {metrics['precision']:.4f}",
    f"Recall: {metrics['recall']:.4f}",
    f"F1-Score: {metrics['f1_score']:.4f}",
]
# 'roc_auc' is only reported when evaluate() included it in the result.
if 'roc_auc' in metrics:
    report_lines.append(f"ROC AUC: {metrics['roc_auc']:.4f}")
print("\n".join(report_lines))

In [None]:
# Ask the trained classifier for its feature importances and preview the
# first ten entries.
feature_importance = classifier.get_feature_importance()
top_features = feature_importance.head(10)
print("\nTop 10 Most Important Features:")
# Final expression, so the notebook displays the preview.
top_features

## 5. Complete Pipeline Example

In [None]:
# End-to-end run: one call on the pipeline object replaces the manual steps above.
pipeline = PatientRiskAnalysisPipeline(random_state=42)
analysis_options = {
    'filepath': None,  # Uses sample data (per the original author's note)
    'n_clusters': 3,
    'algorithm': 'random_forest',
}
results = pipeline.run_complete_analysis(**analysis_options)

In [None]:
# Build one hand-written example patient and ask the pipeline for a profile.
sample_patient = dict(
    age=65, sex=1, cp=3, trestbps=150,
    chol=280, fbs=1, restecg=1, thalach=110,
    exang=1, oldpeak=3.5, slope=2, ca=2, thal=3,
)

profile = pipeline.get_patient_profile(sample_patient)

print("Patient Risk Profile:")
print(f"  Cluster: {profile['cluster']}")

# A prediction equal to 1 is reported as high risk; anything else as low risk.
if profile['risk_prediction'] == 1:
    risk_label = 'High Risk'
else:
    risk_label = 'Low Risk'
print(f"  Risk Prediction: {risk_label}")

# Formatted with .2%, i.e. shown as a percentage with two decimals.
disease_probability = profile['risk_probability']['disease']
print(f"  Disease Probability: {disease_probability:.2%}")

## Summary

This notebook demonstrated a complete patient risk stratification pipeline that:
1. Loads and preprocesses heart disease data
2. Detects and removes outliers to clean the data
3. Identifies patient clusters with similar symptom profiles
4. Predicts disease risk using machine learning classification
5. Provides individual patient risk assessments