# Healthcare Data Analysis - Understanding Disease Risk

## Project Overview
This notebook provides a comprehensive analysis of healthcare datasets to understand disease risk factors. We'll analyze patterns in health features like blood pressure, cholesterol, sugar levels, BMI, age, and other factors to identify which contribute most to disease risk.

## Table of Contents
1. [Data Loading and Initial Exploration](#1-data-loading-and-initial-exploration)
2. [Data Preprocessing](#2-data-preprocessing)
3. [Exploratory Data Analysis (EDA)](#3-exploratory-data-analysis-eda)
4. [Risk Factor Analysis](#4-risk-factor-analysis)
5. [Key Insights and Recommendations](#5-key-insights-and-recommendations)


In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
import requests
import io

# Silence every warning category for the rest of the session so cell output
# stays readable.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# used in later cells — consider narrowing the filter.
warnings.filterwarnings('ignore')

# Set plotting style: a matplotlib style sheet plus seaborn's "husl" palette.
# The EDA cell later switches the matplotlib style back to 'default'.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Confirmation that the import/setup cell ran to completion.
print("Libraries imported successfully!")


## 1. Data Loading and Initial Exploration


In [None]:
def load_healthcare_data():
    """
    Load a healthcare dataset from a local CSV, downloading a sample if none exists.

    Lookup order:
      1. ``diabetes.csv`` in the working directory
      2. ``heart_disease.csv`` in the working directory
      3. The Pima Indians diabetes CSV downloaded from GitHub, which is then
         cached as ``diabetes.csv`` so later runs take path 1.

    Returns:
        tuple: ``(DataFrame, dataset_type)`` where ``dataset_type`` is
        ``'diabetes'`` or ``'heart_disease'``; ``(None, None)`` when no local
        file exists and the download (or parsing/caching of it) fails.
    """
    # Local files take priority, in this order.
    local_sources = [
        ('diabetes.csv', 'diabetes', "✅ Diabetes dataset loaded successfully!"),
        ('heart_disease.csv', 'heart_disease', "✅ Heart disease dataset loaded successfully!"),
    ]
    for path, kind, message in local_sources:
        try:
            df = pd.read_csv(path)
        except FileNotFoundError:
            continue
        print(message)
        return df, kind

    print("📥 No local dataset found. Downloading sample diabetes dataset...")

    # Sample diabetes dataset hosted on GitHub
    url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.csv"

    # Column names for diabetes dataset; passing them via names= makes pandas
    # read the first line of the download as data rather than as a header.
    columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
               'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome']

    try:
        # Bounded wait: without a timeout a stalled connection hangs the notebook.
        response = requests.get(url, timeout=30)
        # Fail on 4xx/5xx instead of parsing (and caching) an error page as CSV.
        response.raise_for_status()
        df = pd.read_csv(io.StringIO(response.text), names=columns)
        df.to_csv('diabetes.csv', index=False)
        print("✅ Diabetes dataset downloaded and saved successfully!")
        return df, 'diabetes'
    except Exception as e:
        # Deliberately best-effort: report the problem and let the caller
        # handle the (None, None) result.
        print(f"❌ Error downloading dataset: {e}")
        print("Please ensure you have a healthcare dataset (diabetes.csv or heart_disease.csv) in the project directory.")
        return None, None

# Load the dataset; both names are None when no dataset could be obtained.
df, dataset_type = load_healthcare_data()

if df is not None:
    # dataset_type is a snake_case key such as 'heart_disease'; swap underscores
    # for spaces so it prints as "Heart Disease" rather than "Heart_Disease".
    print(f"\n📊 Dataset Type: {dataset_type.replace('_', ' ').title()}")
    print(f"📏 Dataset Shape: {df.shape}")
    print("\n🔍 First 5 rows:")
    display(df.head())
else:
    print("❌ Could not load dataset. Please check your data files.")


In [None]:
if df is not None:
    # Gather the headline numbers for the raw dataset first, then report them.
    record_count = len(df)
    feature_count = len(df.columns)
    memory_mb = df.memory_usage(deep=True).sum() / 1024**2
    missing_total = df.isnull().sum().sum()
    duplicate_total = df.duplicated().sum()

    print("📋 Dataset Information:")
    print(f"• Total Records: {record_count:,}")
    print(f"• Total Features: {feature_count}")
    print(f"• Memory Usage: {memory_mb:.2f} MB")
    print(f"• Missing Values: {missing_total}")
    print(f"• Duplicate Records: {duplicate_total}")

    # Per-column dtypes
    print("\n📊 Data Types:")
    print(df.dtypes)

    # Descriptive statistics table
    print("\n📈 Statistical Summary:")
    summary_table = df.describe()
    display(summary_table)


## 2. Data Preprocessing


In [None]:
def preprocess_data(df):
    """
    Clean a healthcare DataFrame and return the cleaned copy.

    Steps, in order:
      1. Fill missing values: median for numeric columns, mode for
         object-dtype columns ('Unknown' when a column has no mode).
      2. Drop exact duplicate rows.
      3. Cap numeric values lying outside the 1.5 * IQR fences to the fence
         values. Columns whose name contains 'outcome' or 'target'
         (case-insensitive) are treated as targets and left alone, as are
         columns whose IQR is 0.

    Progress is reported with ``print``.

    Args:
        df: Input DataFrame. It is copied first and never modified.

    Returns:
        The cleaned DataFrame.
    """
    print("🔧 Starting data preprocessing...")

    # Create a copy to avoid modifying original
    df_clean = df.copy()

    # 1. Handle missing values
    print("\n1️⃣ Handling missing values...")
    missing_before = df_clean.isnull().sum().sum()

    if missing_before > 0:
        print(f"   Found {missing_before} missing values")

        # For numerical columns, fill with median
        for col in df_clean.select_dtypes(include=[np.number]).columns:
            # Count BEFORE filling; counting afterwards always reports 0.
            n_missing = df_clean[col].isnull().sum()
            if n_missing > 0:
                median_val = df_clean[col].median()
                # Assign the result back instead of fillna(inplace=True) on a
                # column selection: that form is chained assignment and does
                # not update df_clean when pandas copy-on-write is active.
                df_clean[col] = df_clean[col].fillna(median_val)
                print(f"   • {col}: filled {n_missing} missing values with median ({median_val:.2f})")

        # For categorical columns, fill with mode
        for col in df_clean.select_dtypes(include=['object']).columns:
            n_missing = df_clean[col].isnull().sum()
            if n_missing > 0:
                modes = df_clean[col].mode()
                mode_val = modes[0] if not modes.empty else 'Unknown'
                df_clean[col] = df_clean[col].fillna(mode_val)
                print(f"   • {col}: filled {n_missing} missing values with mode ({mode_val})")
    else:
        print("   ✅ No missing values found")

    # 2. Remove duplicates
    print("\n2️⃣ Removing duplicates...")
    duplicates_before = df_clean.duplicated().sum()
    if duplicates_before > 0:
        df_clean = df_clean.drop_duplicates()
        print(f"   • Removed {duplicates_before} duplicate records")
    else:
        print("   ✅ No duplicates found")

    # 3. Handle outliers using IQR method
    print("\n3️⃣ Handling outliers...")
    # Same target-detection rule the analysis cells use (substring, any case).
    target_markers = ('outcome', 'target')
    values_capped = 0

    for col in df_clean.select_dtypes(include=[np.number]).columns:
        if any(marker in str(col).lower() for marker in target_markers):
            continue  # Skip target variables

        Q1 = df_clean[col].quantile(0.25)
        Q3 = df_clean[col].quantile(0.75)
        IQR = Q3 - Q1

        if IQR == 0:
            # Both fences collapse onto Q1, so capping would overwrite every
            # other value (e.g. wipe out the minority class of a 0/1 flag).
            print(f"   • {col}: skipped (IQR is 0)")
            continue

        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR

        is_outlier = (df_clean[col] < lower_bound) | (df_clean[col] > upper_bound)
        n_outliers = int(is_outlier.sum())
        if n_outliers > 0:
            print(f"   • {col}: found {n_outliers} outliers (IQR method)")
            # Cap outliers instead of removing them
            df_clean[col] = df_clean[col].clip(lower=lower_bound, upper=upper_bound)
            values_capped += n_outliers

    if values_capped == 0:
        print("   ✅ No significant outliers found")
    else:
        print(f"   • Capped {values_capped} outlier values")

    print("\n✅ Data preprocessing completed!")
    print(f"   • Final dataset shape: {df_clean.shape}")
    # Nothing above shrinks dtypes, so report the figure without claiming a reduction.
    print(f"   • Memory usage: {df_clean.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

    return df_clean

# Apply preprocessing
if df is not None:
    df_processed = preprocess_data(df)
    print("\n📊 Processed dataset info:")
    # DataFrame.info() prints its report itself and returns None, so wrapping
    # it in display() only added a stray "None" to the output.
    df_processed.info()
else:
    # Define the name so the `df_processed is not None` guards in later cells
    # reach their else-branches instead of raising NameError.
    df_processed = None
    print("❌ Cannot preprocess data - no dataset loaded")


## 3. Exploratory Data Analysis (EDA)


In [None]:
if df_processed is not None:
    # Set up the plotting style (replaces the style sheet chosen in the import cell)
    plt.style.use('default')
    sns.set_palette("husl")
    
    # 1. Distribution Analysis
    print("📊 Distribution Analysis")
    print("=" * 50)
    
    # Split numerical columns into targets (name contains 'outcome' or
    # 'target', case-insensitive) and features (everything else).
    # target_cols is reused by the risk-factor cell.
    numerical_cols = df_processed.select_dtypes(include=[np.number]).columns
    target_cols = [col for col in numerical_cols if any(t in col.lower() for t in ['outcome', 'target'])]
    feature_cols = [col for col in numerical_cols if col not in target_cols]
    
    # Create distribution plots: one histogram per feature on a 3-wide grid.
    # Guarded because plt.subplots() rejects a grid with zero rows.
    if feature_cols:
        n_cols = 3
        n_rows = (len(feature_cols) + n_cols - 1) // n_cols  # ceiling division
        
        # squeeze=False always yields a 2-D axes array, so a single-row grid
        # needs no special-casing; ravel() walks it row by row.
        fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5*n_rows), squeeze=False)
        flat_axes = axes.ravel()
        
        for ax, col in zip(flat_axes, feature_cols):
            ax.hist(df_processed[col].dropna(), bins=30, alpha=0.7, color='skyblue', edgecolor='black')
            ax.set_title(f'Distribution of {col}')
            ax.set_xlabel(col)
            ax.set_ylabel('Frequency')
            ax.grid(True, alpha=0.3)
        
        # Hide unused subplots
        for ax in flat_axes[len(feature_cols):]:
            ax.set_visible(False)
        
        plt.tight_layout()
        plt.show()
    else:
        print("No numerical feature columns to plot")
    
    # 2. Correlation Analysis
    print("\n🔗 Correlation Analysis")
    print("=" * 50)
    
    if len(numerical_cols) > 1:
        corr_matrix = df_processed[numerical_cols].corr()
        
        plt.figure(figsize=(12, 8))
        # Mask the upper triangle (diagonal included) so each pair is drawn once.
        mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
        sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0, 
                   square=True, mask=mask, fmt='.2f', cbar_kws={"shrink": .8})
        plt.title('Correlation Matrix of All Features')
        plt.tight_layout()
        plt.show()
        
        # Show top correlations, ranked by absolute value
        print("\nTop 10 Feature Correlations:")
        corr_pairs = []
        for i in range(len(corr_matrix.columns)):
            for j in range(i+1, len(corr_matrix.columns)):
                corr_pairs.append((
                    corr_matrix.columns[i], 
                    corr_matrix.columns[j], 
                    abs(corr_matrix.iloc[i, j])
                ))
        
        corr_pairs.sort(key=lambda x: x[2], reverse=True)
        for i, (col1, col2, corr) in enumerate(corr_pairs[:10]):
            print(f"{i+1:2d}. {col1} ↔ {col2}: {corr:.3f}")
    
    # 3. Box Plots for Outlier Detection
    print("\n📦 Outlier Detection (Box Plots)")
    print("=" * 50)
    
    if len(feature_cols) > 0:
        fig, ax = plt.subplots(figsize=(12, 6))
        df_processed[feature_cols].boxplot(ax=ax)
        ax.set_title('Box Plots for Outlier Detection')
        ax.set_xticklabels(feature_cols, rotation=45)
        ax.grid(True, alpha=0.3)
        plt.tight_layout()
        plt.show()
    
    # 4. Target Variable Analysis (uses the first detected target column)
    if target_cols:
        target_col = target_cols[0]
        print(f"\n🎯 Target Variable Analysis: {target_col}")
        print("=" * 50)
        
        # Target distribution
        target_counts = df_processed[target_col].value_counts()
        print("Target Distribution:")
        for value, count in target_counts.items():
            percentage = (count / len(df_processed)) * 100
            print(f"  {value}: {count} ({percentage:.1f}%)")
        
        # Visualize target distribution
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
        
        # Bar plot
        target_counts.plot(kind='bar', ax=ax1, color=['lightcoral', 'lightblue'])
        ax1.set_title(f'Distribution of {target_col}')
        ax1.set_xlabel(target_col)
        ax1.set_ylabel('Count')
        ax1.grid(True, alpha=0.3)
        
        # Pie chart
        ax2.pie(target_counts.values, labels=target_counts.index, autopct='%1.1f%%', 
                colors=['lightcoral', 'lightblue'])
        ax2.set_title(f'{target_col} Distribution (Percentage)')
        
        plt.tight_layout()
        plt.show()
    
    print("\n✅ EDA completed successfully!")
else:
    print("❌ Cannot perform EDA - no processed dataset available")


## 4. Risk Factor Analysis


In [None]:
if df_processed is not None and target_cols:
    print("🔍 Risk Factor Analysis")
    print("=" * 50)
    
    # Analyse against the first detected target; every other numeric column
    # is treated as a candidate risk factor.
    target_col = target_cols[0]
    feature_cols = [col for col in df_processed.select_dtypes(include=[np.number]).columns 
                   if col not in target_cols]
    
    # 1. Correlation with target
    print("1️⃣ Feature Correlations with Target")
    print("-" * 40)
    
    # Absolute correlation: ranks strength only, the sign is discarded here.
    correlations = df_processed[feature_cols + [target_col]].corr()[target_col].drop(target_col).abs().sort_values(ascending=False)
    
    print("Top Risk Factors (by correlation):")
    for i, (feature, corr) in enumerate(correlations.head(10).items(), 1):
        print(f"{i:2d}. {feature}: {corr:.3f}")
    
    # Visualize correlations
    plt.figure(figsize=(10, 6))
    correlations.plot(kind='barh', color='coral')
    plt.title('Risk Factor Correlations with Target')
    plt.xlabel('Absolute Correlation with Target')
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()
    
    # 2. Detailed analysis for top risk factors
    print("\n2️⃣ Detailed Risk Factor Analysis")
    print("-" * 40)
    
    top_factors = correlations.head(3).index
    
    # Sorted once so each class keeps the same colour in every plot;
    # unique() alone returns classes in order of first appearance.
    target_values = np.sort(df_processed[target_col].unique())
    colors = ['lightblue', 'lightcoral']
    
    for i, factor in enumerate(top_factors, 1):
        print(f"\n{i}. {factor} Analysis:")
        print("=" * 30)
        
        # Statistics by target
        stats_by_target = df_processed.groupby(target_col)[factor].describe()
        print("Statistics by Target:")
        display(stats_by_target)
        
        # Risk insights: compares the group means of target == 0 and target == 1
        # NOTE(review): assumes the target is coded 0 = no disease, 1 = disease.
        mean_0 = df_processed[df_processed[target_col] == 0][factor].mean()
        mean_1 = df_processed[df_processed[target_col] == 1][factor].mean()
        diff = mean_1 - mean_0
        
        print("\nRisk Insights:")
        print(f"• Average {factor} for non-disease: {mean_0:.2f}")
        print(f"• Average {factor} for disease: {mean_1:.2f}")
        print(f"• Difference: {diff:.2f}")
        
        if diff > 0:
            print("• Higher values increase disease risk")
        else:
            print("• Lower values increase disease risk")
        
        # Create comparison plots
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
        
        # Distribution by target
        for j, target_val in enumerate(target_values):
            subset = df_processed[df_processed[target_col] == target_val]
            # Cycle through the colours so a target with more than two classes
            # does not index past the end of the list.
            ax1.hist(subset[factor].dropna(), alpha=0.7, 
                    label=f'Target = {target_val}', bins=20, color=colors[j % len(colors)])
        
        ax1.set_title(f'{factor} Distribution by Target')
        ax1.set_xlabel(factor)
        ax1.set_ylabel('Frequency')
        ax1.legend()
        ax1.grid(True, alpha=0.3)
        
        # Box plot
        df_processed.boxplot(column=factor, by=target_col, ax=ax2)
        # A grouped pandas boxplot adds its own figure-level title; clear it
        # so only the per-axes titles set here remain.
        fig.suptitle('')
        ax2.set_title(f'{factor} by Target')
        ax2.set_xlabel('Target')
        ax2.set_ylabel(factor)
        ax2.grid(True, alpha=0.3)
        
        plt.tight_layout()
        plt.show()
    
    # 3. Machine Learning Feature Importance
    print("\n3️⃣ Machine Learning Feature Importance")
    print("-" * 40)
    
    try:
        # Prepare data for ML
        X = df_processed[feature_cols]
        y = df_processed[target_col]
        
        # Train Random Forest on the full dataset; only the fitted importances
        # are used, no held-out evaluation is performed here.
        rf = RandomForestClassifier(n_estimators=100, random_state=42)
        rf.fit(X, y)
        
        # Get feature importance
        feature_importance = pd.DataFrame({
            'feature': feature_cols,
            'importance': rf.feature_importances_
        }).sort_values('importance', ascending=False)
        
        print("Random Forest Feature Importance:")
        display(feature_importance)
        
        # Visualize feature importance
        plt.figure(figsize=(10, 6))
        sns.barplot(data=feature_importance, x='importance', y='feature', palette='viridis')
        plt.title('Random Forest Feature Importance')
        plt.xlabel('Importance Score')
        plt.grid(True, alpha=0.3)
        plt.tight_layout()
        plt.show()
        
    except Exception as e:
        # Best-effort section: a modelling failure is reported, not raised,
        # so the rest of the notebook still runs.
        print(f"Error in ML analysis: {e}")
    
    print("\n✅ Risk factor analysis completed!")
else:
    print("❌ Cannot perform risk analysis - no target variable found")


## 5. Key Insights and Recommendations


In [None]:
if df_processed is not None:
    print("🎯 Key Insights and Recommendations")
    print("=" * 50)
    
    # Get dataset info: an 'Outcome' column is taken to mean the diabetes
    # dataset, anything else is labelled heart disease.
    dataset_type = "Diabetes" if 'Outcome' in df_processed.columns else "Heart Disease"
    total_records = len(df_processed)
    
    print(f"📊 Dataset: {dataset_type}")
    print(f"📏 Total Records: {total_records:,}")
    
    # Get target info
    target_cols = [col for col in df_processed.columns if any(t in col.lower() for t in ['outcome', 'target'])]
    if target_cols:
        target_col = target_cols[0]
        # NOTE(review): sum / count is only a rate when the target is coded 0/1.
        disease_rate = (df_processed[target_col].sum() / len(df_processed)) * 100
        print(f"🎯 Disease Rate: {disease_rate:.1f}%")
    
    print("\n🔍 Key Findings:")
    print("-" * 30)
    
    # Data quality insights
    missing_values = df_processed.isnull().sum().sum()
    duplicates = df_processed.duplicated().sum()
    
    # missing_values counts cells, so completeness is measured against the
    # total number of cells (rows x columns), not the number of rows.
    total_cells = df_processed.size
    completeness = ((total_cells - missing_values) / total_cells) * 100 if total_cells else 0.0
    
    print("1. Data Quality:")
    print(f"   • Missing values: {missing_values}")
    print(f"   • Duplicate records: {duplicates}")
    print(f"   • Data completeness: {completeness:.1f}%")
    
    # Feature insights
    numerical_cols = df_processed.select_dtypes(include=[np.number]).columns
    feature_cols = [col for col in numerical_cols if col not in target_cols]
    
    print("\n2. Feature Analysis:")
    print(f"   • Total features analyzed: {len(feature_cols)}")
    print("   • Feature types: All numerical")
    
    # Risk factor insights
    if target_cols:
        correlations = df_processed[feature_cols + [target_col]].corr()[target_col].drop(target_col).abs().sort_values(ascending=False)
        top_risk_factors = correlations.head(3)
        
        print("\n3. Top Risk Factors:")
        for i, (factor, corr) in enumerate(top_risk_factors.items(), 1):
            print(f"   {i}. {factor}: {corr:.3f} correlation")
    
    # Statistical insights
    print("\n4. Statistical Insights:")
    for col in feature_cols[:3]:  # First 3 feature columns in dataset order (not ranked)
        mean_val = df_processed[col].mean()
        std_val = df_processed[col].std()
        print(f"   • {col}: Mean={mean_val:.2f}, Std={std_val:.2f}")
    
    # The recommendations below are fixed text; they are not derived from the data.
    print("\n💡 Recommendations:")
    print("-" * 30)
    print("1. Data Collection:")
    print("   • Ensure regular data collection for all health metrics")
    print("   • Implement data validation to reduce missing values")
    print("   • Consider collecting additional lifestyle factors")
    
    print("\n2. Risk Assessment:")
    print("   • Focus on top risk factors identified in analysis")
    print("   • Implement regular monitoring for high-risk individuals")
    print("   • Develop early warning systems based on key indicators")
    
    print("\n3. Healthcare Strategy:")
    print("   • Create targeted intervention programs")
    print("   • Develop personalized risk profiles")
    print("   • Implement preventive care measures")
    
    print("\n4. Future Analysis:")
    print("   • Consider longitudinal studies for trend analysis")
    print("   • Explore machine learning models for prediction")
    print("   • Include demographic and lifestyle factors")
    
    print("\n✅ Analysis completed successfully!")
    print("📈 Use the Streamlit dashboard for interactive exploration")
    print("📊 Run 'streamlit run app.py' to launch the dashboard")
else:
    print("❌ Cannot generate insights - no processed dataset available")
