In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler, PolynomialFeatures
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.ensemble import RandomForestClassifier
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Data validation & preprocessing
import sklearn
import pandera as pa
from pandera import Column, DataFrameSchema
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler, RobustScaler
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold

# Advanced feature engineering
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction import FeatureHasher
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# SMOTE balancing
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as ImbPipeline

# MLOps & tracking
import mlflow
import mlflow.sklearn
from mlflow.tracking import MlflowClient

# Performance monitoring
import time
import psutil
import gc

# Set style for better visualizations
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
# Notebook-wide matplotlib defaults: small bold serif text on 10x5 inch
# figures rendered (and saved) at 300 dpi.
plt.rcParams.update({
    'font.size': 6,
    'font.family': 'serif',
    'font.weight': 'bold',
    'figure.figsize': (10, 5),
    'figure.dpi': 300,
    'savefig.dpi': 300,
})

# Report library versions so a run can be reproduced later.
# (Several of these lines previously printed U+FFFD replacement characters
# where an emoji had been mangled; they are restored to readable glyphs.)
print("✅ All libraries imported successfully!")
print(f"📊 Pandas version: {pd.__version__}")
print(f"🔢 NumPy version: {np.__version__}")
print(f"📈 Matplotlib version: {plt.matplotlib.__version__}")
print(f"🧪 Pandera version: {pa.__version__}")
print(f"🤖 Scikit-learn version: {sklearn.__version__}")
print(f"📊 MLflow version: {mlflow.__version__}")

# Initialize MLflow
# Tracking data goes to a SQLite file named mlflow.db, resolved relative to the
# kernel's working directory.
mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("diabetic_readmission_pipeline")

print(f"📁 MLflow experiment: {mlflow.get_experiment_by_name('diabetic_readmission_pipeline')}")

# Performance monitoring setup
# start_time / initial_memory are baselines captured here for later comparison.
start_time = time.time()
initial_memory = psutil.Process().memory_info().rss / 1024 / 1024  # MB

print(f"\n📊 Performance Monitoring:")
print(f"   • Start time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print(f"   • Initial memory usage: {initial_memory:.2f} MB")
print(f"   • CPU cores available: {psutil.cpu_count()}")

In [None]:
# Location of the raw encounter-level CSV.
# NOTE(review): this is an absolute path on one developer's machine; the
# notebook will not run elsewhere until it is pointed at a shared/relative
# data directory.
file_path="/Users/javadbeni/Desktop/Diabetes_Phase1_1/diabetic_data.csv"

df=pd.read_csv(file_path)

# Column names, dtypes and non-null counts (printed by .info() itself).
df.info()

# Only the last expression of a cell is rendered, so the bare df.describe()
# and df.isnull()... calls that used to sit here were computed and thrown
# away; both are displayed by their own dedicated cells further down.
df.shape


In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# Summary statistics for the frame's columns (pandas' default describe()).
df.describe()

In [None]:
# NaN count per column, largest first.
df.isna().sum().sort_values(ascending=False)

In [None]:
# Per-column missing-value summary, ordered from most to least missing.
missing_data=df.isnull().sum().sort_values(ascending=False)
missing_percent=(missing_data/len(df))*100
missing_summary=pd.DataFrame({
    'Column':missing_data.index,
    'Missing_Count':missing_data.values,
    'Missing_Percent':missing_percent.values,
    # df.dtypes is in the frame's original column order, whereas the three
    # columns above follow the sorted order; re-align by column name so each
    # row reports the dtype of its own column.
    'Data_Type':df.dtypes.loc[missing_data.index].values
})
# Show only the columns that actually contain missing values.
print(missing_summary[missing_summary.Missing_Percent>0])



In [None]:
# Bar charts of missing values (percentage and absolute count) per column.
# Index by column name so the x tick labels show the column names rather than
# the integer row positions of missing_summary.
missing_to_plot = missing_summary[missing_summary['Missing_Percent'] > 0].set_index('Column')

if missing_to_plot.empty:
    # pandas refuses to draw a bar chart from an empty Series, so skip plotting.
    print('✅ No missing values to plot')
else:
    # Create a figure with two subplots
    fig, (ax_percent, ax_count) = plt.subplots(1, 2, figsize=(15, 8))

    # Subplot 1: Missing percentage
    missing_to_plot['Missing_Percent'].plot(kind='bar', ax=ax_percent)
    ax_percent.set_title('Missing Values by Column (%)')
    ax_percent.set_xlabel('Columns')
    ax_percent.set_ylabel('Missing Percentage')
    plt.setp(ax_percent.get_xticklabels(), rotation=45, ha='right')

    # Subplot 2: Missing count
    missing_to_plot['Missing_Count'].plot(kind='bar', ax=ax_count)
    ax_count.set_title('Missing Values by Column (Count)')
    ax_count.set_xlabel('Columns')
    ax_count.set_ylabel('Missing Count')
    plt.setp(ax_count.get_xticklabels(), rotation=45, ha='right')

    fig.tight_layout()
    plt.show()

In [None]:
# How many columns use each dtype.
print('Data Type Summary')
dtype_counts = df.dtypes.value_counts()
print(dtype_counts)

In [None]:
# Detect object columns whose non-null values are of more than one Python type.
print('Potential Data types issues')

object_cols = df.select_dtypes(include=['object']).columns
mixed_type_columns=[]
for col in object_cols:
    # Number of distinct Python types among the non-null values of this column.
    unique_types = df[col].dropna().apply(type).nunique()
    if unique_types <= 1:
        continue
    mixed_type_columns.append(col)
    print(f'⚠️ {col}:{unique_types} different types detected')

if len(mixed_type_columns) == 0:
    print('✅ No mixed type columns detected')

In [None]:
# Numeric columns with fewer than 20 distinct values may really be categorical codes.
numeric_cols=df.select_dtypes(include=['int64','float64']).columns

# Distinct (non-null) value count for every numeric column, computed in one pass.
numeric_cardinality = df[numeric_cols].nunique()
for col, unique_val in numeric_cardinality.items():
    if unique_val >= 20:
        continue
    print(f' ⚠️ {col}:{unique_val} unique values , it might be potential categorical')

In [None]:
# Report the cardinality of every object column and flag those with 20+ levels.
categorical_cols=df.select_dtypes(include=['object']).columns
print(f'Found {len(categorical_cols)} categorical columns')

categorical_cardinality = df[categorical_cols].nunique()
for col, unique_vals in categorical_cardinality.items():
    print(f'{col}:{unique_vals} unique values')
    if not unique_vals < 20:
        print(f' ⚠️ {col}:{unique_vals} high cardinality , encoding might be needed')


In [None]:
# Correlation between the missingness indicators of columns with >5% missing
# values (i.e. do values tend to be missing together?).
print('Missing Values Correlation Matrix ( Columns with >5% missings)')
above_5_percent = missing_summary['Missing_Percent'] > 5
high_missing_cols = missing_summary.loc[above_5_percent, 'Column'].tolist()

if not high_missing_cols:
    print('✅ No columns with missing values')
else:
    # isnull() yields a boolean indicator per cell; corr() compares the indicators.
    missing_corr = df[high_missing_cols].isnull().corr()
    print(missing_corr)

    # Heatmap of the indicator correlations, colour scale centred on zero.
    plt.figure(figsize=(10,5))
    sns.heatmap(missing_corr, annot=True, cmap='coolwarm', center=0)
    plt.title('Columns with Missing Values Correlation Matrix')
    plt.tight_layout()
    plt.show()

In [None]:
# Data-quality recap built from missing_summary.
# (The two section headings previously printed U+FFFD replacement characters
# in place of their emoji; they are restored to readable glyphs.)
print("\n📋 6. SUMMARY AND RECOMMENDATIONS")
print("-" * 50)

print("🔍 Key Findings:")
print(f"   • Total rows: {len(df):,}")
print(f"   • Total columns: {len(df.columns)}")
print(f"   • Columns with missing values: {int((missing_summary['Missing_Percent'] > 0).sum())}")


print("\n🚨 Critical Issues to Address:")
# "Critical" here means more than 20% of the column is missing.
critical_missing = missing_summary[missing_summary['Missing_Percent'] > 20]
if not critical_missing.empty:
    # Walk the two needed columns directly instead of materialising each row.
    for column_name, percent_missing in zip(critical_missing['Column'],
                                            critical_missing['Missing_Percent']):
        print(f"   • {column_name}: {percent_missing:.1f}% missing")
else:
    print("   • No critical missing value issues (>20%)")

print("\n💡 Recommendations:")
print("   • Consider imputation strategies for columns with <20% missing")
print("   • Investigate high cardinality categorical variables")
print("   • Plan encoding strategies for categorical variables")
print("   • Monitor memory usage during feature engineering")

In [None]:
# 🎯 TARGET VARIABLE CREATION
print("🎯 CREATING TARGET VARIABLE")
print("=" * 50)

# Check current readmission distribution
print("Current readmission distribution:")
print(df['readmitted'].value_counts())
print("\nDetailed breakdown:")
print(df['readmitted'].value_counts(normalize=True) * 100)

# Count missing labels on the SOURCE column. The derived 0/1 column below can
# never be null (a missing label compares unequal to '<30' and becomes 0), so
# checking it would always report "no missing values".
missing_target_count = int(df['readmitted'].isnull().sum())

# Create binary target variable for 30-day readmission
df['readmission_30d'] = (df['readmitted'] == '<30').astype(int)

# Verify target creation
print(f"\n✅ Target variable 'readmission_30d' created:")
print(f"   • 0 (No readmission): {(df['readmission_30d'] == 0).sum():,} patients")
print(f"   • 1 (Readmission <30 days): {(df['readmission_30d'] == 1).sum():,} patients")
print(f"   • Readmission rate: {(df['readmission_30d'] == 1).mean() * 100:.2f}%")

# Check for any missing values in target
if missing_target_count > 0:
    print(f"⚠️ Warning: {missing_target_count} missing values in target")
else:
    print("✅ No missing values in target variable")

# Save target variable info for later use
target_info = {
    'total_patients': len(df),
    'readmission_count': df['readmission_30d'].sum(),
    'readmission_rate': df['readmission_30d'].mean(),
    # Below a 10% positive rate the target is labelled imbalanced.
    'class_balance': 'Imbalanced' if df['readmission_30d'].mean() < 0.1 else 'Balanced'
}

print(f"\n📊 Target Variable Summary:")
print(f"   • Class balance: {target_info['class_balance']}")
positive_rate = target_info['readmission_rate']
if positive_rate > 0:
    print(f"   • Imbalance ratio: 1:{int((1-positive_rate)/positive_rate)}")
else:
    # Rate is 0 (no positives) or NaN (empty frame): the ratio is undefined and
    # int() of inf/NaN would raise.
    print("   • Imbalance ratio: n/a (no positive cases)")

In [None]:
# 🏥 CLINICAL RISK STRATIFICATION
print("🏥 CLINICAL RISK STRATIFICATION")
print("=" * 50)

# Create clinical risk categories based on number of diagnoses.
# Intervals are right-closed: (0,3] Low, (3,6] Medium, (6,10] High, (10,100] Critical.
df['clinical_risk'] = pd.cut(df['number_diagnoses'],
                            bins=[0, 3, 6, 10, 100],
                            labels=['Low', 'Medium', 'High', 'Critical'])

# Analyze readmission by clinical risk.
# observed=False is spelled out so that every risk category keeps a row even
# when it has no patients; the summary code further down looks categories up
# by label and would raise KeyError if an empty category were dropped (which
# is what happens when pandas' default for `observed` is True).
risk_analysis = df.groupby(['clinical_risk', 'readmission_30d'], observed=False).size().unstack(fill_value=0)
risk_analysis['total'] = risk_analysis.sum(axis=1)
# Column 1 holds the count of 30-day readmissions for each category.
risk_analysis['readmission_rate'] = (risk_analysis[1] / risk_analysis['total'] * 100).round(2)

print("Readmission Analysis by Clinical Risk:")
print(risk_analysis)

# Three side-by-side bar charts describing the clinical risk categories.
fig, (ax_count, ax_rate, ax_status) = plt.subplots(1, 3, figsize=(15, 6))

# Panel 1: number of patients per risk category
risk_analysis[['total']].plot(kind='bar', color='skyblue', ax=ax_count)
ax_count.set_title('Patient Count by Clinical Risk Category', fontweight='bold')
ax_count.set_xlabel('Clinical Risk Category')
ax_count.set_ylabel('Number of Patients')
plt.setp(ax_count.get_xticklabels(), rotation=45)

# Panel 2: 30-day readmission rate per risk category
risk_analysis['readmission_rate'].plot(kind='bar', color='coral', ax=ax_rate)
ax_rate.set_title('Readmission Rate by Clinical Risk Category', fontweight='bold')
ax_rate.set_xlabel('Clinical Risk Category')
ax_rate.set_ylabel('Readmission Rate (%)')
plt.setp(ax_rate.get_xticklabels(), rotation=45)

# Panel 3: stacked counts, not readmitted (0) below readmitted (1)
risk_analysis[[0, 1]].plot(kind='bar', stacked=True, ax=ax_status)
ax_status.set_title('Readmission Status by Clinical Risk Category', fontweight='bold')
ax_status.set_xlabel('Clinical Risk Category')
ax_status.set_ylabel('Number of Patients')
plt.setp(ax_status.get_xticklabels(), rotation=45)
ax_status.legend(['No Readmission', 'Readmission <30 days'])

fig.tight_layout()
plt.show()

# Patient totals per risk category, looked up by category label.
risk_totals = risk_analysis['total']

print(f"\n✅ Clinical risk stratification completed:")
for risk_label, diagnosis_span in (('Low', '1-3'), ('Medium', '4-6'),
                                   ('High', '7-10'), ('Critical', '11+')):
    print(f"   • {risk_label} risk ({diagnosis_span} diagnoses): {risk_totals[risk_label]:,} patients")

# Save risk analysis for later use: one patient count per category, followed by
# the highest readmission rate and the category it belongs to.
clinical_risk_summary = {
    f'{risk_label.lower()}_risk_patients': risk_totals[risk_label]
    for risk_label in ('Low', 'Medium', 'High', 'Critical')
}
risk_rates = risk_analysis['readmission_rate']
clinical_risk_summary['highest_readmission_rate'] = risk_rates.max()
clinical_risk_summary['highest_readmission_category'] = risk_rates.idxmax()

In [None]:
# 💊 TREATMENT COMPLEXITY ANALYSIS
print("💊 TREATMENT COMPLEXITY ANALYSIS")
print("=" * 50)

# Create treatment complexity score: weighted sum of procedures (0.3),
# medications (0.4) and diagnoses (0.3).
df['treatment_complexity'] = (
    df['num_procedures'] * 0.3 +
    df['num_medications'] * 0.4 +
    df['number_diagnoses'] * 0.3
)

# Categorize treatment complexity.
# The score has no fixed ceiling (0.4 * num_medications alone reaches 20 at 50
# medications), so the last bin is open-ended; with a finite top edge of 20
# any higher score fell outside every bin and became NaN. include_lowest keeps
# a score of exactly 0 in the 'Low' bin instead of dropping it.
df['complexity_level'] = pd.cut(df['treatment_complexity'],
                               bins=[0, 2, 4, 6, np.inf],
                               labels=['Low', 'Medium', 'High', 'Critical'],
                               include_lowest=True)

# Analyze readmission by treatment complexity.
# observed=False keeps a row for every level, even an empty one, so the later
# lookup of the 'Critical' row by label cannot raise KeyError.
complexity_analysis = df.groupby(['complexity_level', 'readmission_30d'], observed=False).size().unstack(fill_value=0)
complexity_analysis['total'] = complexity_analysis.sum(axis=1)
complexity_analysis['readmission_rate'] = (complexity_analysis[1] / complexity_analysis['total'] * 100).round(2)

print("Readmission Analysis by Treatment Complexity:")
print(complexity_analysis)

# Four-panel figure describing the treatment complexity score.
fig, (ax_hist, ax_rate, ax_box, ax_scatter) = plt.subplots(1, 4, figsize=(18, 6))

# Panel 1: histogram of the raw score
ax_hist.hist(df['treatment_complexity'], bins=30, color='lightgreen', alpha=0.7, edgecolor='black')
ax_hist.set_title('Distribution of Treatment Complexity Score', fontweight='bold')
ax_hist.set_xlabel('Treatment Complexity Score')
ax_hist.set_ylabel('Frequency')

# Panel 2: readmission rate for each complexity level
complexity_analysis['readmission_rate'].plot(kind='bar', color='orange', ax=ax_rate)
ax_rate.set_title('Readmission Rate by Complexity Level', fontweight='bold')
ax_rate.set_xlabel('Complexity Level')
ax_rate.set_ylabel('Readmission Rate (%)')
plt.setp(ax_rate.get_xticklabels(), rotation=45)

# Panel 3: score distribution split by readmission status
df.boxplot(column='treatment_complexity', by='readmission_30d', ax=ax_box)
ax_box.set_title('Treatment Complexity by Readmission Status', fontweight='bold')
fig.suptitle('')  # boxplot(by=...) adds a figure-level title; blank it out

# Panel 4: score vs. number of diagnoses, one colour per readmission status
for status_flag, status_label, status_colour in ((0, 'No Readmission', 'blue'),
                                                 (1, 'Readmission <30 days', 'red')):
    status_rows = df[df['readmission_30d'] == status_flag]
    ax_scatter.scatter(status_rows['treatment_complexity'],
                       status_rows['number_diagnoses'],
                       alpha=0.6, label=status_label, color=status_colour)
ax_scatter.set_xlabel('Treatment Complexity Score')
ax_scatter.set_ylabel('Number of Diagnoses')
ax_scatter.set_title('Complexity vs Diagnoses by Readmission Status', fontweight='bold')
ax_scatter.legend()

fig.tight_layout()
plt.show()

# Descriptive statistics of the complexity score, computed once and reused for
# both the printout and the saved summary.
complexity_scores = df['treatment_complexity']
complexity_summary = {
    'avg_complexity': complexity_scores.mean(),
    'min_complexity': complexity_scores.min(),
    'max_complexity': complexity_scores.max(),
    'std_complexity': complexity_scores.std(),
    'highest_complexity_readmission_rate': complexity_analysis.loc['Critical', 'readmission_rate']
}

print(f"\n✅ Treatment complexity analysis completed:")
print(f"   • Average complexity score: {complexity_summary['avg_complexity']:.2f}")
print(f"   • Complexity range: {complexity_summary['min_complexity']:.1f} - {complexity_summary['max_complexity']:.1f}")
print(f"   • Standard deviation: {complexity_summary['std_complexity']:.2f}")

In [None]:
# 💰 INSURANCE & SOCIOECONOMIC ANALYSIS
print("💰 INSURANCE & SOCIOECONOMIC ANALYSIS")
print("=" * 50)

# Analyze readmission by insurance type
insurance_analysis = df.groupby(['payer_code', 'readmission_30d']).size().unstack(fill_value=0)
insurance_analysis['total'] = insurance_analysis.sum(axis=1)
insurance_analysis['readmission_rate'] = (insurance_analysis[1] / insurance_analysis['total'] * 100).round(2)

# Sort by readmission rate for better insights
insurance_analysis = insurance_analysis.sort_values('readmission_rate', ascending=False)

print("Top 15 Insurance Types by Readmission Rate:")
print(insurance_analysis.head(15))

# Create socioeconomic risk score (reset to 0 first so re-running the cell
# does not accumulate points).
df['socioeconomic_risk'] = 0

# Add risk points based on various factors.
# NOTE(review): confirm the payer_code meanings against the dataset's code
# book; the MC=Medicaid / MD=Medicare labels used here may be swapped.
df.loc[df['payer_code'] == 'MC', 'socioeconomic_risk'] += 2      # Medicaid
df.loc[df['payer_code'] == 'MD', 'socioeconomic_risk'] += 1      # Medicare
df.loc[df['race'] == 'AfricanAmerican', 'socioeconomic_risk'] += 1
df.loc[df['age'] == '?', 'socioeconomic_risk'] += 1              # Unknown age
df.loc[df['weight'] == '?', 'socioeconomic_risk'] += 1            # Unknown weight

# Categorize socioeconomic risk.
# include_lowest=True puts a score of 0 in the 'Low' bin; without it the first
# interval is (0, 1], so every patient scoring 0 became NaN and vanished from
# the analysis below. Scores 0-1 -> Low, 2 -> Medium, 3 -> High, 4+ -> Critical.
df['socioeconomic_level'] = pd.cut(df['socioeconomic_risk'],
                                  bins=[0, 1, 2, 3, 10],
                                  labels=['Low', 'Medium', 'High', 'Critical'],
                                  include_lowest=True)

# Analyze readmission by socioeconomic risk.
# observed=False keeps a row for every level, even an empty one, so the later
# lookup of the 'Critical' row by label cannot raise KeyError.
socio_analysis = df.groupby(['socioeconomic_level', 'readmission_30d'], observed=False).size().unstack(fill_value=0)
socio_analysis['total'] = socio_analysis.sum(axis=1)
socio_analysis['readmission_rate'] = (socio_analysis[1] / socio_analysis['total'] * 100).round(2)

print("\nReadmission Analysis by Socioeconomic Risk:")
print(socio_analysis)

# Four-panel figure: payer, socioeconomic score, socioeconomic level and race.
fig, (ax_payer, ax_scores, ax_level, ax_race) = plt.subplots(1, 4, figsize=(18, 6))

# Panel 1: the 15 payer codes with the highest readmission rate
top_insurance = insurance_analysis.head(15)
top_insurance['readmission_rate'].plot(kind='bar', color='purple', ax=ax_payer)
ax_payer.set_title('Top 15 Insurance Types by Readmission Rate', fontweight='bold')
ax_payer.set_xlabel('Insurance Type')
ax_payer.set_ylabel('Readmission Rate (%)')
plt.setp(ax_payer.get_xticklabels(), rotation=45)

# Panel 2: how many patients have each raw socioeconomic risk score
score_counts = df['socioeconomic_risk'].value_counts().sort_index()
score_counts.plot(kind='bar', color='brown', ax=ax_scores)
ax_scores.set_title('Distribution of Socioeconomic Risk Scores', fontweight='bold')
ax_scores.set_xlabel('Risk Score')
ax_scores.set_ylabel('Number of Patients')

# Panel 3: readmission rate per socioeconomic level
socio_analysis['readmission_rate'].plot(kind='bar', color='red', ax=ax_level)
ax_level.set_title('Readmission Rate by Socioeconomic Risk Level', fontweight='bold')
ax_level.set_xlabel('Socioeconomic Risk Level')
ax_level.set_ylabel('Readmission Rate (%)')
plt.setp(ax_level.get_xticklabels(), rotation=45)

# Panel 4: readmission rate per race (table is built here and kept as race_analysis)
race_analysis = (
    df.groupby(['race', 'readmission_30d']).size().unstack(fill_value=0)
    .assign(total=lambda counts: counts.sum(axis=1),
            readmission_rate=lambda counts: (counts[1] / counts['total'] * 100).round(2))
)
race_analysis['readmission_rate'].plot(kind='bar', color='green', ax=ax_race)
ax_race.set_title('Readmission Rate by Race', fontweight='bold')
ax_race.set_xlabel('Race')
ax_race.set_ylabel('Readmission Rate (%)')
plt.setp(ax_race.get_xticklabels(), rotation=45)

fig.tight_layout()
plt.show()

# Shorthand for the two columns summarised below.
socio_scores = df['socioeconomic_risk']
payer_codes = df['payer_code']

print(f"\n✅ Socioeconomic analysis completed:")
print(f"   • Average socioeconomic risk score: {socio_scores.mean():.2f}")
print(f"   • Risk score range: {socio_scores.min()} - {socio_scores.max()}")

# Save socioeconomic analysis for later use.
# The payer counts use the same code-to-label mapping as the scoring step
# ('MC' counted as Medicaid, 'MD' as Medicare).
socioeconomic_summary = dict(
    avg_risk_score=socio_scores.mean(),
    min_risk_score=socio_scores.min(),
    max_risk_score=socio_scores.max(),
    highest_risk_readmission_rate=socio_analysis.loc['Critical', 'readmission_rate'],
    medicaid_patients=(payer_codes == 'MC').sum(),
    medicare_patients=(payer_codes == 'MD').sum(),
)

In [None]:
# 🚀 ADVANCED FEATURE ENGINEERING
print("🚀 ADVANCED FEATURE ENGINEERING")
print("=" * 50)

# 1. "Medication adherence" score: medications per (diagnoses + 1), limited to [0, 5].
#    The +1 keeps the denominator non-zero.
medications_per_diagnosis = df['num_medications'].div(df['number_diagnoses'].add(1))
df['medication_adherence'] = medications_per_diagnosis.clip(0, 5)

# 2. Hospital utilization score: weighted sum of prior outpatient (0.3),
#    emergency (0.4) and inpatient (0.3) visit counts.
df['hospital_utilization'] = (
    df['number_outpatient'].mul(0.3)
    .add(df['number_emergency'].mul(0.4))
    .add(df['number_inpatient'].mul(0.3))
)

# 3. Lab procedure efficiency: lab procedures per (days in hospital + 1).
df['lab_efficiency'] = df['num_lab_procedures'].div(df['time_in_hospital'].add(1))

# 4. Age group categorization: show the raw labels before they are grouped.
print("Processing age groups...")
print("Unique age values:", df['age'].unique())

# Create age groups based on the string format in your data

# Create age groups based on the string format in your data
def categorize_age(age_str):
    """Map a decade-bracket age label to a coarse life-stage group.

    Brackets below 30 map to 'Young', 30-50 to 'Middle', 50-70 to 'Senior'
    and 70-100 to 'Elderly'. The '?' placeholder and any value that is not
    one of the known bracket strings map to 'Unknown'.
    """
    if age_str in ('[0-10)', '[10-20)', '[20-30)'):
        return 'Young'
    if age_str in ('[30-40)', '[40-50)'):
        return 'Middle'
    if age_str in ('[50-60)', '[60-70)'):
        return 'Senior'
    if age_str in ('[70-80)', '[80-90)', '[90-100)'):
        return 'Elderly'
    # '?' and every unrecognised value end up here.
    return 'Unknown'

# Coarse life-stage group for every encounter.
df['age_group'] = df['age'].apply(categorize_age)

# 5. Length of stay risk: right-closed day ranges (0,3], (3,7], (7,14], (14,30].
los_edges = [0, 3, 7, 14, 30]
df['los_risk'] = pd.cut(df['time_in_hospital'],
                        bins=los_edges,
                        labels=['Low', 'Medium', 'High', 'Critical'])

# 6. Diagnosis complexity: each diagnosis code whose text is longer than three
#    characters contributes its weight (primary 0.4, secondary/tertiary 0.3 each).
diagnosis_weights = (('diag_1', 0.4), ('diag_2', 0.3), ('diag_3', 0.3))
df['diagnosis_complexity'] = sum(
    df[diag_col].astype(str).str.len().gt(3).astype(int) * weight
    for diag_col, weight in diagnosis_weights
)

# 7. Insurance-age interaction: "<payer_code>_<age bracket>" label.
df['insurance_age_risk'] = df['payer_code'].astype(str).str.cat(df['age'].astype(str), sep='_')

# 8. Clinical severity index: weighted sum of diagnoses (0.3), procedures (0.2),
#    medications (0.2) and days in hospital (0.3).
severity_weights = (('number_diagnoses', 0.3), ('num_procedures', 0.2),
                    ('num_medications', 0.2), ('time_in_hospital', 0.3))
df['clinical_severity'] = sum(df[source_col] * weight for source_col, weight in severity_weights)

# Categorize clinical severity into four right-closed bands.
severity_edges = [0, 5, 10, 15, 50]
df['severity_level'] = pd.cut(df['clinical_severity'],
                             bins=severity_edges,
                             labels=['Mild', 'Moderate', 'Severe', 'Critical'])

# Report the mean of each continuous score as a quick sanity check.
print("✅ Advanced features created:")
for score_title, score_col in (('Medication adherence', 'medication_adherence'),
                               ('Hospital utilization', 'hospital_utilization'),
                               ('Lab efficiency', 'lab_efficiency'),
                               ('Clinical severity', 'clinical_severity')):
    print(f"   • {score_title} score: {df[score_col].mean():.2f}")

# Analyze new features
print("\n📊 Analysis of New Features:")

# Clinical severity analysis.
# observed=False keeps a row for every severity level, even an empty one, so
# the later lookup of the 'Critical' row by label cannot raise KeyError.
severity_analysis = df.groupby(['severity_level', 'readmission_30d'], observed=False).size().unstack(fill_value=0)
severity_analysis['total'] = severity_analysis.sum(axis=1)
severity_analysis['readmission_rate'] = (severity_analysis[1] / severity_analysis['total'] * 100).round(2)

print("\nReadmission Rate by Clinical Severity:")
print(severity_analysis)

# Age group analysis (age_group holds plain strings, so only groups that occur get a row)
age_analysis = df.groupby(['age_group', 'readmission_30d']).size().unstack(fill_value=0)
age_analysis['total'] = age_analysis.sum(axis=1)
age_analysis['readmission_rate'] = (age_analysis[1] / age_analysis['total'] * 100).round(2)

print("\nReadmission Rate by Age Group:")
print(age_analysis)

# Visualize new features: a 2x4 grid drawn through pyplot's "current axes", so
# each plt.subplot(...) call selects the panel that the following calls draw on.
plt.figure(figsize=(20, 10))

# Subplot 1: Clinical severity distribution
plt.subplot(2, 4, 1)
df['severity_level'].value_counts().plot(kind='bar', color='darkblue')
plt.title('Distribution of Clinical Severity Levels', fontweight='bold')
plt.xlabel('Severity Level')
plt.ylabel('Number of Patients')
plt.xticks(rotation=45)

# Subplot 2: Readmission rate by severity
plt.subplot(2, 4, 2)
severity_analysis['readmission_rate'].plot(kind='bar', color='darkred')
plt.title('Readmission Rate by Clinical Severity', fontweight='bold')
plt.xlabel('Severity Level')
plt.ylabel('Readmission Rate (%)')
plt.xticks(rotation=45)

# Subplot 3: Medication adherence distribution
plt.subplot(2, 4, 3)
plt.hist(df['medication_adherence'], bins=30, color='lightblue', alpha=0.7, edgecolor='black')
plt.title('Distribution of Medication Adherence Score', fontweight='bold')
plt.xlabel('Medication Adherence Score')
plt.ylabel('Frequency')

# Subplot 4: Hospital utilization by readmission status
plt.subplot(2, 4, 4)
df.boxplot(column='hospital_utilization', by='readmission_30d', ax=plt.gca())
plt.title('Hospital Utilization by Readmission Status', fontweight='bold')
plt.suptitle('')  # boxplot(by=...) adds a figure-level title; blank it out

# Subplot 5: Lab efficiency distribution
plt.subplot(2, 4, 5)
plt.hist(df['lab_efficiency'], bins=30, color='lightgreen', alpha=0.7, edgecolor='black')
plt.title('Distribution of Lab Efficiency Score', fontweight='bold')
plt.xlabel('Lab Efficiency Score')
plt.ylabel('Frequency')

# Subplot 6: Age group analysis
plt.subplot(2, 4, 6)
age_analysis['readmission_rate'].plot(kind='bar', color='orange')
plt.title('Readmission Rate by Age Group', fontweight='bold')
plt.xlabel('Age Group')
plt.ylabel('Readmission Rate (%)')
plt.xticks(rotation=45)

# Subplot 7: Length of stay risk analysis.
# observed=False keeps every length-of-stay band on the x axis, including a
# band with no patients, instead of relying on pandas' default for `observed`.
plt.subplot(2, 4, 7)
los_analysis = df.groupby(['los_risk', 'readmission_30d'], observed=False).size().unstack(fill_value=0)
los_analysis['total'] = los_analysis.sum(axis=1)
los_analysis['readmission_rate'] = (los_analysis[1] / los_analysis['total'] * 100).round(2)
los_analysis['readmission_rate'].plot(kind='bar', color='purple')
plt.title('Readmission Rate by Length of Stay Risk', fontweight='bold')
plt.xlabel('Length of Stay Risk')
plt.ylabel('Readmission Rate (%)')
plt.xticks(rotation=45)

# Subplot 8: Diagnosis complexity analysis
plt.subplot(2, 4, 8)
plt.hist(df['diagnosis_complexity'], bins=10, color='lightcoral', alpha=0.7, edgecolor='black')
plt.title('Distribution of Diagnosis Complexity', fontweight='bold')
plt.xlabel('Diagnosis Complexity Score')
plt.ylabel('Frequency')

plt.tight_layout()
plt.show()

# Save feature engineering summary: the mean of each engineered score, keyed
# "<feature>_mean", plus the readmission rate of the 'Critical' severity band.
feature_engineering_summary = {
    f'{feature_name}_mean': df[feature_name].mean()
    for feature_name in ('medication_adherence', 'hospital_utilization',
                         'lab_efficiency', 'clinical_severity',
                         'diagnosis_complexity')
}
feature_engineering_summary['highest_severity_readmission_rate'] = (
    severity_analysis.loc['Critical', 'readmission_rate']
)

print(f"\n✅ Feature engineering completed successfully!")
print(f"📊 Created 8 new clinical features for modeling!")

In [None]:
# 📋 COMPREHENSIVE SUMMARY & NEXT STEPS
print("📋 COMPREHENSIVE SUMMARY & NEXT STEPS")
print("=" * 60)

# Each entry is (section heading, bullet lines); printed in order below.
summary_sections = [
    ("🎯 TARGET VARIABLE SUMMARY:", [
        f"   • Total patients: {target_info['total_patients']:,}",
        f"   • Readmission rate: {target_info['readmission_rate']*100:.2f}%",
        f"   • Class balance: {target_info['class_balance']}",
    ]),
    ("\n🏥 CLINICAL RISK SUMMARY:", [
        f"   • Critical risk patients: {clinical_risk_summary['critical_risk_patients']:,}",
        f"   • Highest readmission rate: {clinical_risk_summary['highest_readmission_rate']:.2f}%",
        f"   • Risk category with highest rate: {clinical_risk_summary['highest_readmission_category']}",
    ]),
    ("\n💊 TREATMENT COMPLEXITY SUMMARY:", [
        f"   • Average complexity score: {complexity_summary['avg_complexity']:.2f}",
        f"   • Complexity range: {complexity_summary['min_complexity']:.1f} - {complexity_summary['max_complexity']:.1f}",
        f"   • Critical complexity readmission rate: {complexity_summary['highest_complexity_readmission_rate']:.2f}%",
    ]),
    ("\n💰 SOCIOECONOMIC SUMMARY:", [
        f"   • Average risk score: {socioeconomic_summary['avg_risk_score']:.2f}",
        f"   • Medicaid patients: {socioeconomic_summary['medicaid_patients']:,}",
        f"   • Medicare patients: {socioeconomic_summary['medicare_patients']:,}",
        f"   • Highest risk readmission rate: {socioeconomic_summary['highest_risk_readmission_rate']:.2f}%",
    ]),
    ("\n🚀 FEATURE ENGINEERING SUMMARY:", [
        f"   • Medication adherence: {feature_engineering_summary['medication_adherence_mean']:.2f}",
        f"   • Hospital utilization: {feature_engineering_summary['hospital_utilization_mean']:.2f}",
        f"   • Lab efficiency: {feature_engineering_summary['lab_efficiency_mean']:.2f}",
        f"   • Clinical severity: {feature_engineering_summary['clinical_severity_mean']:.2f}",
        f"   • Diagnosis complexity: {feature_engineering_summary['diagnosis_complexity_mean']:.2f}",
    ]),
]
for section_heading, section_lines in summary_sections:
    print(section_heading)
    for section_line in section_lines:
        print(section_line)

print("\n" + "=" * 60)
print("🎯 NEXT STEPS FOR WEEK 2:")
for next_step in (
    "1. Feature Selection: Choose top 20-30 features for modeling",
    "2. Data Preprocessing: Handle missing values and encode categorical variables",
    "3. Baseline Models: Train Logistic Regression, Random Forest, XGBoost",
    "4. Hyperparameter Tuning: Use Optuna for optimization",
    "5. Model Evaluation: Compare performance and interpretability",
    "6. Feature Importance: Analyze SHAP values for clinical insights",
):
    print(next_step)

print("\n💡 KEY INSIGHTS FOR MODELING:")
for insight in (
    "• Focus on clinical risk and treatment complexity features",
    "• Consider socioeconomic factors for bias detection",
    "• Use medication adherence and hospital utilization patterns",
    "• Implement proper cross-validation for imbalanced data",
    "• Monitor for data leakage in temporal features",
):
    print(insight)

# Save all summaries for later use, bundled under one name.
eda_summary = dict(
    target_info=target_info,
    clinical_risk_summary=clinical_risk_summary,
    complexity_summary=complexity_summary,
    socioeconomic_summary=socioeconomic_summary,
    feature_engineering_summary=feature_engineering_summary,
)

print(f"\n✅ EDA Phase 1.2 completed successfully!")
print(f"📊 Ready for feature engineering and baseline model training!")

In [None]:


# Set professional color palette
professional_colors = ['#2E86AB', '#A23B72', '#F18F01', '#C73E1D', '#3A1772', '#6B5B95']

# Create a professional summary dashboard with PROPER SPACING
fig, axes = plt.subplots(2, 3, figsize=(24, 16))  # Increased size for better spacing
fig.suptitle('Diabetes Readmission Prediction - Clinical Insights Dashboard',
             fontsize=20, fontweight='bold', y=0.98)

# Plot 1: Target Distribution (readmitted within 30 days vs. everyone else)
axes[0, 0].pie([target_info['readmission_count'],
                target_info['total_patients'] - target_info['readmission_count']],
                labels=['Readmission <30 days', 'No Readmission'],
                autopct='%1.1f%%', colors=['#C73E1D', '#2E86AB'],
                textprops={'fontsize': 12})
axes[0, 0].set_title('Readmission Rate Distribution', fontsize=16, fontweight='bold', pad=20)

# Plots 2-6: one readmission-rate bar chart per analysis table.
# Each entry: (axes, table, title, x-axis label, starting index into the palette).
rate_panels = [
    (axes[0, 1], risk_analysis, 'Readmission Rate by Clinical Risk', 'Clinical Risk Category', 0),
    (axes[0, 2], complexity_analysis, 'Readmission Rate by Treatment Complexity', 'Complexity Level', 1),
    (axes[1, 0], age_analysis, 'Readmission Rate by Age Group', 'Age Group', 2),
    (axes[1, 1], socio_analysis, 'Readmission Rate by Socioeconomic Risk', 'Socioeconomic Risk Level', 3),
    (axes[1, 2], severity_analysis, 'Readmission Rate by Clinical Severity', 'Severity Level', 4),
]
palette_size = len(professional_colors)
for panel_ax, panel_table, panel_title, panel_xlabel, color_offset in rate_panels:
    panel_rates = panel_table['readmission_rate']
    # Walk the palette from this panel's offset and wrap around at the end, so
    # every bar gets its own colour. Plain slices such as [3:7] or [4:8] run
    # past the six-colour palette and come back with fewer colours than bars.
    bar_colors = [professional_colors[(color_offset + position) % palette_size]
                  for position in range(len(panel_rates))]
    panel_rates.plot(kind='bar', ax=panel_ax, color=bar_colors)
    panel_ax.set_title(panel_title, fontsize=16, fontweight='bold', pad=20)
    panel_ax.set_xlabel(panel_xlabel, fontsize=14, fontweight='bold')
    panel_ax.set_ylabel('Readmission Rate (%)', fontsize=14, fontweight='bold')
    panel_ax.tick_params(axis='x', rotation=45, labelsize=12)
    panel_ax.tick_params(axis='y', labelsize=12)

    # Add value labels on bars for better readability
    for container in panel_ax.containers:
        panel_ax.bar_label(container, fmt='%.1f%%', fontsize=10, padding=3)

# Adjust layout with more space
plt.tight_layout(pad=3.0, h_pad=2.0, w_pad=2.0)
plt.show()


In [None]:
# 🏥 PHASE 1: CLINICAL DOMAIN FEATURES
print("🏥 CREATING CLINICAL DOMAIN FEATURES")
print("=" * 50)

# 1. MEDICATION COMPLEXITY SCORE
# Weighted sum of medications (0.4), diagnoses (0.3) and days in hospital (0.3).
print("1️⃣ Creating Medication Complexity Score...")
df['medication_complexity'] = (
    df['num_medications'] * 0.4 +
    df['number_diagnoses'] * 0.3 +
    df['time_in_hospital'] * 0.3
)

# 2. CLINICAL RISK STRATIFICATION
# Points for each measure that is above its own median; totals range 0-7.
print("2️⃣ Creating Clinical Risk Stratification...")
df['clinical_risk_score'] = (
    (df['num_procedures'] > df['num_procedures'].median()).astype(int) * 2 +
    (df['num_lab_procedures'] > df['num_lab_procedures'].median()).astype(int) * 1.5 +
    (df['number_diagnoses'] > df['number_diagnoses'].median()).astype(int) * 2 +
    (df['time_in_hospital'] > df['time_in_hospital'].median()).astype(int) * 1.5
)

# Risk categories.
# include_lowest=True puts a score of 0 (nothing above its median) in the 'Low'
# bin; without it the first interval is (0, 2] and those rows became NaN.
df['risk_category'] = pd.cut(df['clinical_risk_score'],
                             bins=[0, 2, 4, 6, 10],
                             labels=['Low', 'Medium', 'High', 'Critical'],
                             include_lowest=True)

# 3. TREATMENT ADHERENCE INDEX
# Adds a weight for each of medications / procedures / lab procedures that is
# present at all (count > 0).
print("3️⃣ Creating Treatment Adherence Index...")
df['treatment_adherence'] = (
    (df['num_medications'] > 0).astype(int) * 0.4 +
    (df['num_procedures'] > 0).astype(int) * 0.3 +
    (df['num_lab_procedures'] > 0).astype(int) * 0.3
)

# 4. COMORBIDITY PATTERNS
# Number of the three diagnosis slots that are not the '?' placeholder.
print("4️⃣ Creating Comorbidity Patterns...")
df['comorbidity_count'] = (
    (df['diag_1'] != '?').astype(int) +
    (df['diag_2'] != '?').astype(int) +
    (df['diag_3'] != '?').astype(int)
)

df['comorbidity_severity'] = df['comorbidity_count'] * df['number_diagnoses']

# 5. LABORATORY EFFICIENCY
# Recomputes 'lab_efficiency' (lab procedures per day + 1) and, unlike the
# earlier uncapped version of this column, limits it to [0, 10].
print("5️⃣ Creating Laboratory Efficiency Metrics...")
df['lab_efficiency'] = df['num_lab_procedures'] / (df['time_in_hospital'] + 1)
df['lab_efficiency'] = df['lab_efficiency'].clip(0, 10)  # Cap at reasonable values

# 6. PROCEDURE INTENSITY
# Procedures per (day in hospital + 1), limited to [0, 5].
print("6️⃣ Creating Procedure Intensity Metrics...")
df['procedure_intensity'] = df['num_procedures'] / (df['time_in_hospital'] + 1)
df['procedure_intensity'] = df['procedure_intensity'].clip(0, 5)

print("✅ Clinical domain features created successfully!")
# Lists every column whose name starts with one of these prefixes, which also
# matches columns created by earlier cells.
print(f"📊 New features: {[col for col in df.columns if col.startswith(('medication', 'clinical', 'treatment', 'comorbidity', 'lab', 'procedure'))]}")

In [None]:
# 👥 PHASE 2: DEMOGRAPHIC & SOCIOECONOMIC FEATURES
# (The banner previously printed U+FFFD replacement characters in place of its
# emoji; it is restored to a readable glyph.)
print("👥 CREATING DEMOGRAPHIC & SOCIOECONOMIC FEATURES")
print("=" * 60)

# 1. AGE RISK GROUPS
print("1️⃣ Creating Age Risk Groups...")
def categorize_age_risk(age_str):
    """Translate an age-bracket label into a coarse risk tier.

    Parameters
    ----------
    age_str
        Age bracket label such as ``'[40-50)'``, or ``'?'`` when unknown.

    Returns
    -------
    str
        ``'Low_Risk'`` for the brackets up to 30, ``'Medium_Risk'`` for 30-50,
        ``'High_Risk'`` for 50-70, ``'Critical_Risk'`` for 70-100, and
        ``'Unknown'`` for ``'?'`` or any label that is not listed.
    """
    # The placeholder is handled up front, before any bracket lookup.
    if age_str == '?':
        return 'Unknown'

    tiers = (
        ('Low_Risk', ('[0-10)', '[10-20)', '[20-30)')),
        ('Medium_Risk', ('[30-40)', '[40-50)')),
        ('High_Risk', ('[50-60)', '[60-70)')),
        ('Critical_Risk', ('[70-80)', '[80-90)', '[90-100)')),
    )
    for tier, brackets in tiers:
        if age_str in brackets:
            return tier

    # Anything unrecognised is treated like the placeholder.
    return 'Unknown'

# Risk tier for every row, derived from the age-bracket label.
df['age_risk_group'] = df['age'].apply(categorize_age_risk)

# Suffix shared by both string interaction features: "_<age risk tier>".
age_tier_suffix = '_' + df['age_risk_group'].astype(str)

# 2. INSURANCE-AGE INTERACTION RISK
print("2️⃣ Creating Insurance-Age Interaction Risk...")
# Payer code is cast to text first, so a missing payer becomes the text 'nan'.
df['insurance_age_risk'] = df['payer_code'].astype(str) + age_tier_suffix

# 3. GENDER-AGE RISK COMBINATIONS
print("3️⃣ Creating Gender-Age Risk Combinations...")
# Gender is concatenated as-is (no cast), so a missing gender stays missing.
df['gender_age_risk'] = df['gender'] + age_tier_suffix

# 4. SOCIOECONOMIC RISK INDEX
print("4️⃣ Creating Socioeconomic Risk Index...")
# Points awarded whenever a demographic field holds its "unknown" marker:
# (column, marker value, points).
unknown_marker_points = (
    ('payer_code', '?', 2),
    ('gender', 'Unknown/Invalid', 1),
    ('age', '?', 1),
)
df['socioeconomic_risk'] = sum(
    df[field].eq(marker).astype(int) * points
    for field, marker, points in unknown_marker_points
)

# 5. LENGTH OF STAY RISK CATEGORIES
print("5️⃣ Creating Length of Stay Risk Categories...")
# Bins are right-closed: (0, 3], (3, 7], (7, 14], (14, 30], (30, 100].
# include_lowest=True also closes the first bin on the left, so a stay recorded
# as 0 days is labelled 'Very_Low' instead of silently turning into NaN.
# Values above 100 still fall outside every bin and remain NaN.
df['los_risk_category'] = pd.cut(df['time_in_hospital'],
                                 bins=[0, 3, 7, 14, 30, 100],
                                 labels=['Very_Low', 'Low', 'Medium', 'High', 'Critical'],
                                 include_lowest=True)

# 6. READMISSION RISK WINDOWS
print("6️⃣ Creating Readmission Risk Windows...")
# `readmitted` only distinguishes '<30' from '>30', so the 7-day and 15-day
# flags are approximations: they are the very same indicator as the 30-day flag.
# All of these columns are derived directly from the outcome column.
readmitted_within_30d = df['readmitted'].eq('<30').astype(int)
for window in ('7d', '15d', '30d'):
    df[f'readmission_{window}'] = readmitted_within_30d

# The "90-day" flag marks any recorded readmission ('<30' or '>30').
df['readmission_90d'] = df['readmitted'].isin(['<30', '>30']).astype(int)

print("✅ Demographic & socioeconomic features created successfully!")
# Report every column whose name contains one of the demographic markers.
demographic_markers = ('age_risk', 'insurance', 'gender', 'socioeconomic', 'los_risk', 'readmission_')
demographic_columns_created = [
    name for name in df.columns
    if any(marker in name for marker in demographic_markers)
]
print(f"📊 New features: {demographic_columns_created}")

In [None]:
# 🤖 PHASE 3: ADVANCED ML FEATURES
print("🤖 CREATING ADVANCED ML FEATURES", "=" * 50, sep="\n")

# 1. POLYNOMIAL INTERACTIONS
print("1️⃣ Creating Polynomial Interactions...")
# Text label "<age risk tier>_<medication count>" (a string, not a numeric product).
age_tier_text = df['age_risk_group'].astype(str)
medication_count_text = df['num_medications'].astype(str)
df['age_medication_interaction'] = age_tier_text + '_' + medication_count_text
# Numeric pairwise products.
df['diagnosis_procedure_interaction'] = df['number_diagnoses'].mul(df['num_procedures'])
df['time_medication_efficiency'] = df['time_in_hospital'].mul(df['num_medications'])

# 2. RATIO FEATURES
print("2️⃣ Creating Ratio Features...")
# Every ratio divides a count by (length of stay + 1).
stay_plus_one = df['time_in_hospital'] + 1
per_day_sources = {
    'medications_per_day': 'num_medications',
    'procedures_per_day': 'num_procedures',
    'lab_procedures_per_day': 'num_lab_procedures',
    'diagnoses_per_day': 'number_diagnoses',
}
for ratio_name, count_column in per_day_sources.items():
    df[ratio_name] = df[count_column].div(stay_plus_one)

# 3. BINNED NUMERICAL FEATURES
print("3️⃣ Creating Binned Numerical Features...")
# pd.cut bins are right-closed, so without include_lowest=True the first bin is
# (0, upper] and a count of exactly 0 would silently become NaN. Closing the
# first bin on the left keeps zero-count rows in 'Very_Low'.
df['medications_binned'] = pd.cut(df['num_medications'],
                                  bins=[0, 5, 10, 15, 20, 100],
                                  labels=['Very_Low', 'Low', 'Medium', 'High', 'Very_High'],
                                  include_lowest=True)

df['diagnoses_binned'] = pd.cut(df['number_diagnoses'],
                                bins=[0, 3, 6, 9, 12, 100],
                                labels=['Very_Low', 'Low', 'Medium', 'High', 'Very_High'],
                                include_lowest=True)

# 4. AGGREGATION FEATURES
print("4️⃣ Creating Aggregation Features...")
# Procedures plus lab procedures.
df['total_procedures'] = df['num_procedures'].add(df['num_lab_procedures'])
# The above plus medications: every counted clinical activity.
df['total_clinical_activities'] = df['total_procedures'].add(df['num_medications'])
# Activities per (length of stay + 1) day.
df['clinical_intensity'] = df['total_clinical_activities'].div(df['time_in_hospital'] + 1)

# 5. CATEGORICAL ENCODING PREPARATION
print("5️⃣ Preparing Categorical Encoding...")
# Columns produced by pd.cut (risk_category, los_risk_category, *_binned) have
# dtype 'category', not 'object', so both dtypes are selected here; selecting
# 'object' alone would leave those columns out of the encoding list.
categorical_columns = df.select_dtypes(include=['object', 'category']).columns.tolist()
print(f"📋 Categorical columns identified: {len(categorical_columns)}")
print(f"   • High cardinality columns: {[col for col in categorical_columns if df[col].nunique() > 20]}")

print("✅ Advanced ML features created successfully!")
# The ratio features are named '*_per_day', so that marker is needed alongside
# 'ratio' for them to show up in the report.
ml_feature_markers = ['interaction', 'ratio', 'per_day', 'binned', 'total', 'intensity']
print(f"📊 New features: {[col for col in df.columns if any(x in col for x in ml_feature_markers)]}")

In [None]:
# FEATURE ANALYSIS & VALIDATION
print("�� ANALYZING ENGINEERED FEATURES")
print("=" * 50)

# 1. FEATURE SUMMARY STATISTICS
print("1️⃣ Feature Summary Statistics...")
# Every column except the two target columns; this includes the dataset's
# original columns, not only the engineered ones.
target_columns = ('readmitted', 'readmission_30d')
new_features = [name for name in df.columns if name not in target_columns]
print(f"📊 Total features created: {len(new_features)}")

# 2. FEATURE CORRELATION ANALYSIS
print("2️⃣ Feature Correlation Analysis...")
numerical_features = df.select_dtypes(include=[np.number]).columns.tolist()
correlation_matrix = df[numerical_features].corr()

# Find highly correlated features.
# Only the strict upper triangle is scanned so each unordered pair appears once;
# np.triu_indices yields the (row, column) positions in row-major order.
corr_values = correlation_matrix.to_numpy()
corr_labels = correlation_matrix.columns
upper_rows, upper_cols = np.triu_indices(len(corr_labels), k=1)
high_corr_features = [
    (corr_labels[row], corr_labels[col], corr_values[row, col])
    for row, col in zip(upper_rows, upper_cols)
    if abs(corr_values[row, col]) > 0.8
]

print(f"🔗 Highly correlated feature pairs (|r| > 0.8): {len(high_corr_features)}")
# Show first 5
for left_name, right_name, r_value in high_corr_features[:5]:
    print(f"   • {left_name} ↔ {right_name}: r = {r_value:.3f}")

# 3. FEATURE IMPORTANCE ANALYSIS
print("3️⃣ Feature Importance Analysis...")
try:
    # Columns that encode the outcome itself must not be used as predictors:
    # readmission_7d / readmission_15d are built from the same
    # `readmitted == '<30'` test as the target, and readmission_90d is another
    # view of `readmitted`. Leaving them in would let the forest read the
    # answer directly and make the importance ranking meaningless.
    target_derived_cols = {
        'readmitted',
        'readmission_7d',
        'readmission_15d',
        'readmission_30d',
        'readmission_90d',
    }

    # Prepare data for feature importance.
    # Numeric columns are selected the same way as for the correlation analysis
    # (np.number) so integer/float columns of any width are included.
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    feature_cols = [col for col in numeric_cols if col not in target_derived_cols]
    X_temp = df[feature_cols].fillna(0)
    y_temp = df['readmission_30d']

    # Train a simple model for feature importance
    rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
    rf.fit(X_temp, y_temp)

    # Get feature importance, most important first
    feature_importance = pd.DataFrame({
        'feature': feature_cols,
        'importance': rf.feature_importances_
    }).sort_values('importance', ascending=False)

    print(f"🏆 Top 10 Most Important Features:")
    print(feature_importance.head(10))

except Exception as e:
    # Best-effort step: report the failure and let the notebook continue.
    print(f"⚠️ Feature importance analysis failed: {e}")
    print("   This is normal for some data types. Continuing...")

print("✅ Feature analysis completed successfully!")

In [None]:
# 📈 FEATURE VISUALIZATION DASHBOARD
print("📈 CREATING FEATURE VISUALIZATIONS")
print("=" * 50)

# One figure holding a 3x4 grid; every panel is addressed by its 1-based slot.
dashboard = plt.figure(figsize=(24, 16))
title_style = {'fontweight': 'bold', 'fontsize': 12}

# Panels 1, 2, 5, 9: bar chart of value counts.
# (slot, column, colour, title, x label, tick rotation, order bars by value?)
count_panels = [
    (1, 'risk_category', 'darkred',
     'Clinical Risk Category Distribution', 'Risk Category', 45, False),
    (2, 'age_risk_group', 'darkblue',
     'Age Risk Group Distribution', 'Age Risk Group', 45, False),
    (5, 'los_risk_category', 'purple',
     'Length of Stay Risk Distribution', 'LOS Risk Category', 45, False),
    (9, 'socioeconomic_risk', 'orange',
     'Socioeconomic Risk Distribution', 'Socioeconomic Risk Score', 0, True),
]
for slot, column, colour, title, x_label, rotation, order_by_value in count_panels:
    ax = dashboard.add_subplot(3, 4, slot)
    counts = df[column].value_counts()
    if order_by_value:
        counts = counts.sort_index()
    counts.plot(kind='bar', color=colour, ax=ax)
    ax.set_title(title, **title_style)
    ax.set_xlabel(x_label)
    ax.set_ylabel('Number of Patients')
    plt.setp(ax.get_xticklabels(), rotation=rotation)

# Panels 3, 6, 7, 8, 10: histogram of a numeric feature.
# (slot, column, number of bins, colour, title, x label)
histogram_panels = [
    (3, 'medication_complexity', 30, 'lightblue',
     'Medication Complexity Score Distribution', 'Medication Complexity Score'),
    (6, 'treatment_adherence', 20, 'lightgreen',
     'Treatment Adherence Index Distribution', 'Treatment Adherence Index'),
    (7, 'comorbidity_severity', 20, 'lightcoral',
     'Comorbidity Severity Distribution', 'Comorbidity Severity Score'),
    (8, 'lab_efficiency', 20, 'lightyellow',
     'Laboratory Efficiency Distribution', 'Lab Efficiency Score'),
    (10, 'clinical_intensity', 20, 'lightsteelblue',
     'Clinical Intensity Distribution', 'Clinical Intensity Score'),
]
for slot, column, n_bins, colour, title, x_label in histogram_panels:
    ax = dashboard.add_subplot(3, 4, slot)
    ax.hist(df[column], bins=n_bins, color=colour, alpha=0.7, edgecolor='black')
    ax.set_title(title, **title_style)
    ax.set_xlabel(x_label)
    ax.set_ylabel('Frequency')

# Panel 4: Clinical Risk vs Readmission.
# Rows are risk categories, columns the 0/1 readmission flag; the rate is the
# share of flag == 1 within each category, in percent.
ax = dashboard.add_subplot(3, 4, 4)
risk_readmission = df.groupby(['risk_category', 'readmission_30d']).size().unstack(fill_value=0)
readmitted_share = risk_readmission[1].div(risk_readmission.sum(axis=1))
risk_readmission['readmission_rate'] = readmitted_share.mul(100).round(2)
risk_readmission['readmission_rate'].plot(kind='bar', color='darkgreen', ax=ax)
ax.set_title('Readmission Rate by Clinical Risk', **title_style)
ax.set_xlabel('Clinical Risk Category')
ax.set_ylabel('Readmission Rate (%)')
plt.setp(ax.get_xticklabels(), rotation=45)

# Panel 11: Readmission Risk Windows (mean of each 0/1 flag, in percent).
ax = dashboard.add_subplot(3, 4, 11)
readmission_windows = ['readmission_7d', 'readmission_15d', 'readmission_30d', 'readmission_90d']
readmission_rates = [df[flag].mean() * 100 for flag in readmission_windows]
ax.bar(['7d', '15d', '30d', '90d'], readmission_rates, color=['red', 'orange', 'yellow', 'green'])
ax.set_title('Readmission Rates by Time Window', **title_style)
ax.set_xlabel('Time Window')
ax.set_ylabel('Readmission Rate (%)')

# Panel 12: Feature Correlation Heatmap (Top 15).
# Uses the importance ranking when it exists in this namespace; otherwise
# falls back to the first 15 numeric columns.
ax = dashboard.add_subplot(3, 4, 12)
if 'feature_importance' in locals():
    top_features = feature_importance.head(15)['feature'].tolist()
else:
    top_features = numerical_features[:15]
corr_subset = df[top_features].corr()
sns.heatmap(corr_subset, annot=False, cmap='coolwarm', center=0, square=True,
            cbar_kws={'shrink': 0.8}, ax=ax)
ax.set_title('Top Features Correlation Heatmap', **title_style)

dashboard.tight_layout()
plt.show()

print("✅ Feature visualizations created successfully!")

In [None]:
# 💾 FEATURE ENGINEERING SUMMARY & EXPORT
print("💾 FEATURE ENGINEERING SUMMARY & EXPORT")
print("=" * 50)

# 1. FEATURE SUMMARY
print("1️⃣ FEATURE ENGINEERING SUMMARY:")
# The frame carries no record of which columns were in the raw dataset, so the
# only split that can be computed here is target vs. non-target columns. The
# labels say exactly that instead of presenting the target count as
# "original features" and every other column as "new features".
summary_target_cols = ['readmitted', 'readmission_30d']
n_target_cols = len([col for col in df.columns if col in summary_target_cols])
n_feature_cols = len([col for col in df.columns if col not in summary_target_cols])
print(f"   �� Target columns: {n_target_cols}")
print(f"   🚀 Feature columns (original + engineered): {n_feature_cols}")
print(f"   🎯 Total features: {len(df.columns)}")

# 2. FEATURE CATEGORIES
# Membership is decided by substring match on the column name, so a column can
# appear in more than one list.
print("\n2️⃣ FEATURE CATEGORIES:")
clinical_features = [col for col in df.columns if any(x in col for x in ['medication', 'clinical', 'treatment', 'comorbidity', 'lab', 'procedure'])]
demographic_features = [col for col in df.columns if any(x in col for x in ['age_risk', 'insurance', 'gender', 'socioeconomic', 'los_risk'])]
# 'per_day' is required for the ratio features (named '*_per_day'): no column
# name contains 'ratio', so without it e.g. diagnoses_per_day matches nothing.
ml_features = [col for col in df.columns if any(x in col for x in ['interaction', 'ratio', 'per_day', 'binned', 'total', 'intensity'])]

print(f"   �� Clinical features: {len(clinical_features)}")
print(f"   👥 Demographic features: {len(demographic_features)}")
print(f"   �� ML features: {len(ml_features)}")

# 3. DATA QUALITY CHECK
print("\n3️⃣ DATA QUALITY CHECK:")
# Total number of missing cells across the whole frame.
missing_after = df.isnull().sum().sum()
print(f"   ❌ Missing values: {missing_after:,}")
print(f"   ✅ Data shape: {df.shape}")
print(f"   💾 Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

# 4. EXPORT ENGINEERED DATASET
import os  # imported here because only this step needs it (size of the written file)

print("\n4️⃣ EXPORTING ENGINEERED DATASET...")
output_filename = 'diabetic_data_engineered.csv'
df.to_csv(output_filename, index=False)
print(f"   ✅ Dataset exported to: {output_filename}")
# Report the size of the CSV actually written to disk; the in-memory footprint
# of the DataFrame (already shown above) is a different quantity.
print(f"   📁 File size: {os.path.getsize(output_filename) / 1024**2:.2f} MB")

# 5. FEATURE LIST EXPORT
print("\n5️⃣ EXPORTING FEATURE LIST...")

# Category rules in priority order: the first group containing the column wins,
# and a column found in none of them is labelled 'Original'.
category_rules = (
    ('Target', ['readmitted', 'readmission_30d']),
    ('Clinical', clinical_features),
    ('Demographic', demographic_features),
    ('ML', ml_features),
)
column_categories = [
    next((label for label, members in category_rules if name in members), 'Original')
    for name in df.columns
]

# One row per column: name, dtype, distinct-value count, missing count, category.
feature_list = pd.DataFrame({
    'feature_name': df.columns.tolist(),
    'feature_type': df.dtypes.astype(str).tolist(),
    'unique_values': [df[name].nunique() for name in df.columns],
    'missing_values': df.isnull().sum().tolist(),
    'category': column_categories,
})

summary_filename = 'feature_engineering_summary.csv'
feature_list.to_csv(summary_filename, index=False)
print(f"   ✅ Feature list exported to: {summary_filename}")

# Closing banner: final counts, output locations and suggested next steps.
print("\n🎉 FEATURE ENGINEERING COMPLETED SUCCESSFULLY! ��")
print("=" * 60)
print(f"📊 Total features: {len(df.columns)}")
# This count is every non-target column (raw dataset columns included), so it
# is labelled as such rather than as "new features created".
print(f"🚀 Feature columns (original + engineered): {len([col for col in df.columns if col not in ['readmitted', 'readmission_30d']])}")
print(f"💾 Dataset exported: {output_filename}")
print(f"📋 Feature summary: feature_engineering_summary.csv")
print("\n🎯 NEXT STEPS:")
print("   1. Review feature engineering summary")
print("   2. Proceed to Power BI dashboard creation")
print("   3. Model development and training")
print("   4. MLOps pipeline implementation")