In [None]:
# Essential imports for medical AI development
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import (classification_report, confusion_matrix, 
                           roc_curve, auc, precision_recall_curve)
import seaborn as sns
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

# Fix NumPy's global seed up front so any randomized step is reproducible
np.random.seed(42)

# Plot appearance: apply the matplotlib style first, then the seaborn
# palette, so the palette is layered on top of the style's defaults
plt.style.use('seaborn-v0_8')
sns.set_palette("Set2")

# Startup banner
for banner_line in (
    "🏥 Medical AI Diagnosis System Ready!",
    "⚕️  Developing responsible AI for healthcare",
    "🎯 Breast Cancer Wisconsin Dataset Analysis",
):
    print(banner_line)


In [None]:
# Load and analyze the Breast Cancer Wisconsin dataset
def load_and_analyze_medical_data():
    """Load the scikit-learn breast cancer dataset and print a short summary.

    The feature matrix is wrapped in a DataFrame and two label columns are
    appended: ``target`` (the raw 0/1 labels) and ``diagnosis`` (0 mapped to
    ``'Malignant'``, 1 mapped to ``'Benign'``).

    Returns:
        tuple: ``(df, data)`` -- the assembled DataFrame and the object
        returned by ``load_breast_cancer()``.
    """
    data = load_breast_cancer()
    feature_names = data.feature_names

    # Assemble the analysis frame: features first, then the label columns
    df = pd.DataFrame(data.data, columns=feature_names)
    df['target'] = data.target
    df['diagnosis'] = df['target'].map({0: 'Malignant', 1: 'Benign'})

    # Class counts are used twice each below, so compute them once
    total_patients = len(df)
    malignant_count = sum(df['target'] == 0)
    benign_count = sum(df['target'] == 1)

    summary_lines = [
        "📋 Dataset Overview:",
        f"• Total patients: {total_patients}",
        f"• Features: {len(feature_names)}",
        f"• Malignant cases: {malignant_count} ({malignant_count/total_patients*100:.1f}%)",
        f"• Benign cases: {benign_count} ({benign_count/total_patients*100:.1f}%)",
    ]
    for line in summary_lines:
        print(line)

    return df, data

def medical_data_exploration(df):
    """Report data completeness and return the feature groups for ``df``.

    Prints a section header followed by the number of missing cells and the
    share of cells that are populated.

    The returned groups map each measurement (Radius, Texture, ...) to its
    mean / standard-error / worst columns. Two column-naming schemes are
    supported: the space-separated one (``'mean radius'``,
    ``'radius error'``, ``'worst radius'``) and the underscore one
    (``'radius_mean'``, ``'radius_se'``, ``'radius_worst'``). Whichever
    scheme matches more columns of ``df`` is returned; on a tie the
    underscore names are returned.

    Args:
        df: DataFrame to inspect.

    Returns:
        dict: group label -> list of three column names.
    """
    # Basic statistics
    print("\n📊 Medical Data Analysis:")
    print("=" * 80)

    # Check for missing values (critical in medical data).
    # Completeness is measured per cell, so the denominator is df.size
    # (rows x columns) rather than the row count.
    missing_values = df.isnull().sum().sum()
    total_cells = df.size
    # An empty frame has no cells that could be missing.
    completeness = (1 - missing_values / total_cells) * 100 if total_cells else 100.0
    print(f"Missing values: {missing_values} (Data completeness: {completeness:.1f}%)")

    # Feature groups (mean, se, worst for each measurement), underscore naming
    underscore_groups = {
        'Radius': ['radius_mean', 'radius_se', 'radius_worst'],
        'Texture': ['texture_mean', 'texture_se', 'texture_worst'],
        'Perimeter': ['perimeter_mean', 'perimeter_se', 'perimeter_worst'],
        'Area': ['area_mean', 'area_se', 'area_worst'],
        'Smoothness': ['smoothness_mean', 'smoothness_se', 'smoothness_worst'],
        'Compactness': ['compactness_mean', 'compactness_se', 'compactness_worst'],
        'Concavity': ['concavity_mean', 'concavity_se', 'concavity_worst'],
        'Concave Points': ['concave points_mean', 'concave points_se', 'concave points_worst'],
        'Symmetry': ['symmetry_mean', 'symmetry_se', 'symmetry_worst'],
        'Fractal Dimension': ['fractal_dimension_mean', 'fractal_dimension_se', 'fractal_dimension_worst']
    }

    # Same groups with space-separated naming; the base name is the lowercased label
    spaced_groups = {
        label: [f'mean {label.lower()}', f'{label.lower()} error', f'worst {label.lower()}']
        for label in underscore_groups
    }

    present = set(df.columns)

    def matched(groups):
        """Count how many of the group's column names exist in ``df``."""
        return sum(name in present for names in groups.values() for name in names)

    if matched(spaced_groups) > matched(underscore_groups):
        return spaced_groups
    return underscore_groups

def _resolve_feature_column(df, name):
    """Return the column of ``df`` that holds feature ``name``, or ``None``.

    ``name`` is returned unchanged when it is a column of ``df``. Otherwise
    an underscore-style name is translated to its space-separated form
    (``'radius_mean'`` -> ``'mean radius'``, ``'radius_se'`` ->
    ``'radius error'``, ``'radius_worst'`` -> ``'worst radius'``) and that
    form is returned if it is a column of ``df``.
    """
    if name in df.columns:
        return name

    base, separator, suffix = name.rpartition('_')
    if not separator:
        return None

    base = base.replace('_', ' ')
    aliases = {
        'mean': f'mean {base}',
        'se': f'{base} error',
        'worst': f'worst {base}',
    }
    alias = aliases.get(suffix)
    if alias is not None and alias in df.columns:
        return alias
    return None


def visualize_medical_features(df, feature_groups):
    """Plot the diagnosis distribution, key feature box plots and a heatmap.

    Two figures are shown:

    1. A 2x3 grid with a bar chart of ``df['diagnosis']`` counts and box
       plots (split by diagnosis) of the mean radius, texture, perimeter
       and area. The sixth cell is removed.
    2. A correlation heatmap of the columns in the first five entries of
       ``feature_groups``.

    Feature names are looked up through ``_resolve_feature_column`` so both
    underscore-style and space-separated column names work. Features that
    cannot be found in ``df`` are skipped instead of raising ``KeyError``.

    Args:
        df: DataFrame with a ``diagnosis`` column and numeric feature columns.
        feature_groups: dict mapping a group label to a list of feature names.
    """
    main_title = 'Breast Cancer Dataset: Medical Feature Analysis'
    fig, axes = plt.subplots(2, 3, figsize=(18, 12))

    # Target distribution. value_counts() orders the bars by frequency, so
    # colours are looked up by label rather than assigned by position.
    diagnosis_counts = df['diagnosis'].value_counts()
    label_colors = {'Malignant': 'lightcoral', 'Benign': 'lightblue'}
    bar_colors = [label_colors.get(label, 'lightgray') for label in diagnosis_counts.index]
    diagnosis_counts.plot(kind='bar', ax=axes[0, 0], color=bar_colors)
    axes[0, 0].set_title('Diagnosis Distribution')
    axes[0, 0].set_ylabel('Number of Cases')
    axes[0, 0].tick_params(axis='x', rotation=0)

    # Key feature distributions by diagnosis, filling the grid after the bar chart
    key_features = ['radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean']
    panels = [(0, 1), (0, 2), (1, 0), (1, 1)]

    for (row, col), feature in zip(panels, key_features):
        ax = axes[row, col]
        column = _resolve_feature_column(df, feature)
        if column is None:
            # Nothing to draw for this feature; drop the empty panel
            fig.delaxes(ax)
            continue

        # Box plot for each diagnosis
        df.boxplot(column=column, by='diagnosis', ax=ax)
        ax.set_title(f'{column.replace("_", " ").title()}')
        ax.set_xlabel('Diagnosis')

    # Remove the extra subplot
    fig.delaxes(axes[1, 2])

    # DataFrame.boxplot(by=...) replaces the figure suptitle with
    # "Boxplot grouped by ...", so the real title is set after the box plots.
    fig.suptitle(main_title, fontsize=16, fontweight='bold')

    plt.tight_layout()
    plt.show()

    # Correlation heatmap for key features
    key_feature_cols = []
    for group in list(feature_groups.values())[:5]:
        for feature in group:
            column = _resolve_feature_column(df, feature)
            if column is not None:
                key_feature_cols.append(column)

    if not key_feature_cols:
        # An empty correlation matrix cannot be drawn as a heatmap
        print("No key feature columns found; skipping correlation heatmap.")
        return

    correlation_matrix = df[key_feature_cols].corr()

    plt.figure(figsize=(12, 10))
    sns.heatmap(correlation_matrix, annot=False, cmap='coolwarm', center=0,
                square=True, linewidths=0.5)
    plt.title('Feature Correlation Matrix (Key Medical Features)', fontsize=14, fontweight='bold')
    plt.tight_layout()
    plt.show()

def statistical_analysis(df, feature_groups):
    """Compare malignant and benign cases feature by feature.

    Rows with ``target == 0`` are treated as malignant and rows with
    ``target == 1`` as benign. For every feature in ``feature_groups`` that
    is a column of ``df``, an independent two-sample t-test is run between
    the two classes. Features with ``p < 0.001`` are kept together with the
    absolute Cohen's d (pooled standard deviation) and both class means.

    A table of the ten kept features with the largest effect size is
    printed. The relevance label is "Very High" above 1.5, "High" above 0.8
    and "Moderate" otherwise.

    Args:
        df: DataFrame with a ``target`` column and numeric feature columns.
        feature_groups: dict mapping a group label to a list of column names;
            names that are not columns of ``df`` are skipped.

    Returns:
        list[dict]: one dict per significant feature with keys ``feature``,
        ``p_value``, ``effect_size``, ``malignant_mean`` and ``benign_mean``,
        sorted by ``effect_size`` in descending order.
    """
    print("\n🔬 Statistical Analysis for Clinical Significance:")
    print("=" * 80)

    malignant = df[df['target'] == 0]
    benign = df[df['target'] == 1]

    significant_features = []

    for features in feature_groups.values():
        for feature in features:
            if feature not in df.columns:
                continue

            mal_values = malignant[feature]
            ben_values = benign[feature]

            # Perform independent t-test
            _, p_value = stats.ttest_ind(mal_values, ben_values)

            # Only highly significant features are reported; a NaN p-value
            # also fails this comparison and is skipped.
            if not p_value < 0.001:
                continue

            # Calculate effect size (Cohen's d) using the pooled standard deviation
            n_mal, n_ben = len(mal_values), len(ben_values)
            pooled_std = np.sqrt(((n_mal - 1) * np.var(mal_values, ddof=1) +
                                  (n_ben - 1) * np.var(ben_values, ddof=1)) /
                                 (n_mal + n_ben - 2))
            mal_mean = np.mean(mal_values)
            ben_mean = np.mean(ben_values)
            cohens_d = (mal_mean - ben_mean) / pooled_std

            significant_features.append({
                'feature': feature,
                'p_value': p_value,
                'effect_size': abs(cohens_d),
                'malignant_mean': mal_mean,
                'benign_mean': ben_mean
            })

    # Sort by effect size
    significant_features.sort(key=lambda x: x['effect_size'], reverse=True)

    print(f"{'Feature':<25} | {'P-value':<10} | {'Effect Size':<12} | {'Clinical Relevance':<15}")
    print("-" * 80)

    for feat in significant_features[:10]:  # Top 10 by effect size
        if feat['effect_size'] > 1.5:
            clinical_relevance = "Very High"
        elif feat['effect_size'] > 0.8:
            clinical_relevance = "High"
        else:
            clinical_relevance = "Moderate"
        print(f"{feat['feature']:<25} | {feat['p_value']:<10.2e} | {feat['effect_size']:<12.3f} | {clinical_relevance:<15}")

    return significant_features

# Load and analyze the dataset
print("🔍 Loading Breast Cancer Wisconsin Dataset...")
df, data = load_and_analyze_medical_data()

# Explore the medical data
feature_groups = medical_data_exploration(df)

# Visualize medical features
visualize_medical_features(df, feature_groups)

# Statistical analysis
significant_features = statistical_analysis(df, feature_groups)

# The first two insights are derived from the values computed above, so the
# summary cannot contradict the data it follows (target 1 is the benign class).
benign_pct = (df['target'] == 1).mean() * 100
missing_cells = int(df.isnull().sum().sum())

print("\n💡 Medical Insights:")
print(f"• Class balance: {100 - benign_pct:.1f}% malignant / {benign_pct:.1f}% benign; "
      f"missing values: {missing_cells}")
print(f"• {len(significant_features)} features differ significantly between diagnoses (p < 0.001)")
print("• Large effect sizes indicate clear biological differences")
print("• Feature correlations suggest underlying biological relationships")
