# 🚀 DIABETIC READMISSION PREDICTION - FEATURE ENGINEERING

**Phase 2: Advanced Feature Engineering Pipeline**

This notebook implements a comprehensive feature engineering strategy for hospital readmission prediction:
1. **Clinical Domain Features** - Medical expertise-driven features
2. **Demographic & Socioeconomic Features** - Patient risk factors
3. **Advanced ML Features** - Performance optimization features

- **Author**: Data Science Portfolio Project
- **Date**: August 2024
- **Goal**: Create production-ready features for the ML pipeline

## 📚 IMPORTS & SETUP

In [None]:
# 🚀 FEATURE ENGINEERING IMPORTS
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler, PolynomialFeatures
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.ensemble import RandomForestClassifier
import warnings
# Suppress every warning for the rest of the session so the cell outputs
# below are not interleaved with library warning text.
warnings.filterwarnings('ignore')

# Plot appearance: apply the matplotlib style sheet first, then the seaborn
# palette (the palette call comes second, as in the original cell order).
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Confirmation banner plus the versions of the core libraries in use.
for banner in (
    "✅ All libraries imported successfully!",
    f"📊 Pandas version: {pd.__version__}",
    f"🔢 NumPy version: {np.__version__}",
    f"📈 Matplotlib version: {plt.matplotlib.__version__}",
):
    print(banner)

## 📥 DATA LOADING & PREPARATION

In [None]:
# 📥 LOAD PREPARED DATA
# Reads the raw CSV into `df` and guarantees a binary `readmission_30d`
# target column exists before any feature engineering runs.
print("📥 Loading prepared data...")

# Main dataset, read from the current working directory.
df = pd.read_csv('diabetic_data.csv')

# Build the target only when the file does not already carry it:
# 1 where `readmitted` is '<30', 0 otherwise.
target_present = 'readmission_30d' in df.columns
if not target_present:
    print("⚠️ Target variable not found. Creating it now...")
    df['readmission_30d'] = df['readmitted'].eq('<30').astype(int)

# Quick load report: shape, class balance and column count.
target_counts = df['readmission_30d'].value_counts().to_dict()
print("✅ Data loaded successfully!")
print(f"📊 Shape: {df.shape}")
print(f"🎯 Target distribution: {target_counts}")
print(f"📋 Columns: {df.shape[1]}")

# Last expression of the cell, so the notebook renders the preview table.
df.head()

## 🔍 DATA QUALITY CHECK

In [None]:
# 🔍 DATA QUALITY ASSESSMENT
# Read-only report on `df`: dtypes, NaN counts, literal '?' placeholders,
# memory footprint and low-cardinality columns. Nothing is modified.
print("🔍 ASSESSING DATA QUALITY FOR FEATURE ENGINEERING")
print("=" * 60)

# How many columns there are of each dtype.
print("📋 Data Types:")
print(df.dtypes.value_counts())

# NaN counts per column, and the same expressed as a percentage of rows.
missing_data = df.isnull().sum()
missing_percent = (missing_data / len(df)) * 100
columns_with_missing = (missing_data > 0).sum()

print(f"\n❌ Missing Values:")
print(f"   • Total missing: {missing_data.sum():,}")
print(f"   • Columns with missing: {columns_with_missing}")
print(f"   • Max missing %: {missing_percent.max():.2f}%")

# Cells holding the literal string '?' are counted separately from NaN.
question_marks = df.eq('?').sum().sum()
print(f"\n❓ Question marks ('?'): {question_marks:,}")

# Deep memory footprint (object contents included), converted to MB.
memory_usage = df.memory_usage(deep=True).sum() / (1024 * 1024)
print(f"\n💾 Memory usage: {memory_usage:.2f} MB")

# List only columns with fewer than 50 distinct values.
print(f"\n🔢 Unique values per column:")
for col, distinct in df.nunique().items():
    unique_count = int(distinct)
    if unique_count >= 50:
        continue
    print(f"   • {col}: {unique_count} unique values")

## 🏥 PHASE 1: CLINICAL DOMAIN FEATURES

In [None]:
# 🏥 PHASE 1: CLINICAL DOMAIN FEATURES
# Adds utilisation-based columns to `df` in place. All inputs are the raw
# count columns (num_medications, num_procedures, num_lab_procedures,
# number_diagnoses, time_in_hospital) and the three diag_* code columns.
print("🏥 CREATING CLINICAL DOMAIN FEATURES")
print("=" * 50)

# 1. MEDICATION COMPLEXITY SCORE
# Weighted sum (0.4 / 0.3 / 0.3) of medication count, diagnosis count and
# length of stay.
print("1️⃣ Creating Medication Complexity Score...")
df['medication_complexity'] = (
    df['num_medications'] * 0.4 +
    df['number_diagnoses'] * 0.3 +
    df['time_in_hospital'] * 0.3
)

# 2. CLINICAL RISK STRATIFICATION
# Each measure that exceeds its own column median contributes 2 or 1.5
# points, so the score ranges from 0 (nothing above median) to 7.
print("2️⃣ Creating Clinical Risk Stratification...")
df['clinical_risk_score'] = (
    (df['num_procedures'] > df['num_procedures'].median()).astype(int) * 2 +
    (df['num_lab_procedures'] > df['num_lab_procedures'].median()).astype(int) * 1.5 +
    (df['number_diagnoses'] > df['number_diagnoses'].median()).astype(int) * 2 +
    (df['time_in_hospital'] > df['time_in_hospital'].median()).astype(int) * 1.5
)

# Risk categories
# include_lowest=True closes the first bin on the left. With the default
# open edge, a score of exactly 0 falls outside (0, 2] and becomes NaN, which
# silently removes the lowest-risk encounters from every later count/plot.
df['risk_category'] = pd.cut(
    df['clinical_risk_score'],
    bins=[0, 2, 4, 6, 10],
    labels=['Low', 'Medium', 'High', 'Critical'],
    include_lowest=True,
)

# 3. TREATMENT ADHERENCE INDEX
# Weighted presence flags: 0.4 if any medication, 0.3 if any procedure,
# 0.3 if any lab procedure was recorded.
print("3️⃣ Creating Treatment Adherence Index...")
df['treatment_adherence'] = (
    (df['num_medications'] > 0).astype(int) * 0.4 +
    (df['num_procedures'] > 0).astype(int) * 0.3 +
    (df['num_lab_procedures'] > 0).astype(int) * 0.3
)

# 4. COMORBIDITY PATTERNS
# Number of diagnosis slots (0-3) that are actually filled. A slot counts as
# empty when it holds the '?' placeholder or NaN, so the result is the same
# whether or not the loader converted '?' to missing values.
print("4️⃣ Creating Comorbidity Patterns...")
diagnosis_slots = df[['diag_1', 'diag_2', 'diag_3']]
df['comorbidity_count'] = (
    diagnosis_slots.notna() & diagnosis_slots.ne('?')
).sum(axis=1)

df['comorbidity_severity'] = df['comorbidity_count'] * df['number_diagnoses']

# 5. LABORATORY EFFICIENCY
# Lab procedures per (day + 1); the +1 keeps the denominator non-zero for a
# zero-day stay. Values are capped to the range [0, 10].
print("5️⃣ Creating Laboratory Efficiency Metrics...")
df['lab_efficiency'] = (
    df['num_lab_procedures'] / (df['time_in_hospital'] + 1)
).clip(0, 10)

# 6. PROCEDURE INTENSITY
# Same construction for non-lab procedures, capped to [0, 5].
print("6️⃣ Creating Procedure Intensity Metrics...")
df['procedure_intensity'] = (
    df['num_procedures'] / (df['time_in_hospital'] + 1)
).clip(0, 5)

print("✅ Clinical domain features created successfully!")
print(f"📊 New features: {[col for col in df.columns if col.startswith(('medication', 'clinical', 'treatment', 'comorbidity', 'lab', 'procedure'))]}")

## 👥 PHASE 2: DEMOGRAPHIC & SOCIOECONOMIC FEATURES

In [None]:
# 👥 PHASE 2: DEMOGRAPHIC & SOCIOECONOMIC FEATURES
print("👥 CREATING DEMOGRAPHIC & SOCIOECONOMIC FEATURES")
print("=" * 60)

# 1. AGE RISK GROUPS
print("1️⃣ Creating Age Risk Groups...")

# Risk tier -> the decade bracket labels that belong to it.
_AGE_RISK_TIERS = (
    ('Low_Risk', ('[0-10)', '[10-20)', '[20-30)')),
    ('Medium_Risk', ('[30-40)', '[40-50)')),
    ('High_Risk', ('[50-60)', '[60-70)')),
    ('Critical_Risk', ('[70-80)', '[80-90)', '[90-100)')),
)


def categorize_age_risk(age_str):
    """Map a decade bracket label such as '[50-60)' to its risk tier.

    Returns 'Low_Risk', 'Medium_Risk', 'High_Risk' or 'Critical_Risk' for
    the ten known brackets, and 'Unknown' for anything else, including the
    '?' placeholder.
    """
    for tier, brackets in _AGE_RISK_TIERS:
        if age_str in brackets:
            return tier
    return 'Unknown'

# Tier label for every row, derived from the raw `age` bracket string.
df['age_risk_group'] = df['age'].map(categorize_age_risk)
age_group_text = df['age_risk_group'].astype(str)

# 2. INSURANCE-AGE INTERACTION RISK
# String key "<payer_code>_<age tier>".
print("2️⃣ Creating Insurance-Age Interaction Risk...")
df['insurance_age_risk'] = df['payer_code'].astype(str) + '_' + age_group_text

# 3. GENDER-AGE RISK COMBINATIONS
# String key "<gender>_<age tier>".
print("3️⃣ Creating Gender-Age Risk Combinations...")
df['gender_age_risk'] = df['gender'] + '_' + age_group_text

# 4. SOCIOECONOMIC RISK INDEX
# Counts placeholder demographics: 2 points for a '?' payer code, 1 point
# each for an 'Unknown/Invalid' gender and a '?' age.
print("4️⃣ Creating Socioeconomic Risk Index...")
payer_unknown = df['payer_code'].eq('?').astype(int)
gender_unknown = df['gender'].eq('Unknown/Invalid').astype(int)
age_unknown = df['age'].eq('?').astype(int)
df['socioeconomic_risk'] = payer_unknown * 2 + gender_unknown + age_unknown

# 5. LENGTH OF STAY RISK CATEGORIES
# Right-closed day ranges: (0,3], (3,7], (7,14], (14,30], (30,100].
print("5️⃣ Creating Length of Stay Risk Categories...")
los_edges = [0, 3, 7, 14, 30, 100]
los_labels = ['Very_Low', 'Low', 'Medium', 'High', 'Critical']
df['los_risk_category'] = pd.cut(df['time_in_hospital'], bins=los_edges, labels=los_labels)

# 6. READMISSION RISK WINDOWS
# The 7d, 15d and 30d columns are all computed from the same condition
# (`readmitted == '<30'`), so they hold identical values; the 90d column
# additionally counts '>30'. All four are derived from the outcome column.
print("6️⃣ Creating Readmission Risk Windows...")
for window_col in ('readmission_7d', 'readmission_15d', 'readmission_30d'):
    df[window_col] = df['readmitted'].eq('<30').astype(int)
df['readmission_90d'] = df['readmitted'].isin(['<30', '>30']).astype(int)

print("✅ Demographic & socioeconomic features created successfully!")
demographic_keywords = ['age_risk', 'insurance', 'gender', 'socioeconomic', 'los_risk', 'readmission_']
print(f"📊 New features: {[col for col in df.columns if any(x in col for x in demographic_keywords)]}")

## 🤖 PHASE 3: ADVANCED ML FEATURES

In [None]:
# 🤖 PHASE 3: ADVANCED ML FEATURES
# Adds interaction, per-day ratio, binned and aggregate columns to `df` in
# place, then reports which object-dtype columns will need encoding.
print("🤖 CREATING ADVANCED ML FEATURES")
print("=" * 50)

# 1. POLYNOMIAL INTERACTIONS
# The first feature is a string key "<age tier>_<medication count>"; the
# other two are numeric products.
print("1️⃣ Creating Polynomial Interactions...")
df['age_medication_interaction'] = df['age_risk_group'].astype(str) + '_' + df['num_medications'].astype(str)
df['diagnosis_procedure_interaction'] = df['number_diagnoses'] * df['num_procedures']
df['time_medication_efficiency'] = df['time_in_hospital'] * df['num_medications']

# 2. RATIO FEATURES
# Every ratio shares the same denominator, days + 1, which stays non-zero
# even for a zero-day stay.
print("2️⃣ Creating Ratio Features...")
stay_days_plus_one = df['time_in_hospital'] + 1
df['medications_per_day'] = df['num_medications'] / stay_days_plus_one
df['procedures_per_day'] = df['num_procedures'] / stay_days_plus_one
df['lab_procedures_per_day'] = df['num_lab_procedures'] / stay_days_plus_one
df['diagnoses_per_day'] = df['number_diagnoses'] / stay_days_plus_one

# 3. BINNED NUMERICAL FEATURES
# include_lowest=True puts a count of exactly 0 into the first bin; with the
# default open left edge it would fall outside every bin and become NaN.
print("3️⃣ Creating Binned Numerical Features...")
bin_labels = ['Very_Low', 'Low', 'Medium', 'High', 'Very_High']
df['medications_binned'] = pd.cut(
    df['num_medications'],
    bins=[0, 5, 10, 15, 20, 100],
    labels=bin_labels,
    include_lowest=True,
)

df['diagnoses_binned'] = pd.cut(
    df['number_diagnoses'],
    bins=[0, 3, 6, 9, 12, 100],
    labels=bin_labels,
    include_lowest=True,
)

# 4. AGGREGATION FEATURES
print("4️⃣ Creating Aggregation Features...")
df['total_procedures'] = df['num_procedures'] + df['num_lab_procedures']
df['total_clinical_activities'] = df['num_procedures'] + df['num_lab_procedures'] + df['num_medications']
df['clinical_intensity'] = df['total_clinical_activities'] / stay_days_plus_one

# 5. CATEGORICAL ENCODING PREPARATION
# Only object-dtype columns are selected here; the pd.cut outputs above are
# categorical dtype and therefore not part of this list.
print("5️⃣ Preparing Categorical Encoding...")
categorical_columns = df.select_dtypes(include=['object']).columns.tolist()
print(f"📋 Categorical columns identified: {len(categorical_columns)}")
print(f"   • High cardinality columns: {[col for col in categorical_columns if df[col].nunique() > 20]}")

print("✅ Advanced ML features created successfully!")
# 'per_day' is included because the ratio features are named *_per_day; the
# 'ratio' keyword alone matches none of them, so they were never reported.
ml_keywords = ['interaction', 'ratio', 'per_day', 'binned', 'total', 'intensity']
print(f"📊 New features: {[col for col in df.columns if any(x in col for x in ml_keywords)]}")

## 📊 FEATURE ANALYSIS & VALIDATION

In [None]:
# 📊 FEATURE ANALYSIS & VALIDATION
# Three read-only checks on `df`: a column count, pairwise correlations
# between numeric columns, and a RandomForest importance ranking against
# `readmission_30d`. Leaves `numerical_features`, `correlation_matrix`,
# `high_corr_features` and (on success) `feature_importance` defined.
print("📊 ANALYZING ENGINEERED FEATURES")
print("=" * 50)

# 1. FEATURE SUMMARY STATISTICS
# Counts every column other than the two named target columns.
print("1️⃣ Feature Summary Statistics...")
new_features = [col for col in df.columns if col not in ['readmitted', 'readmission_30d']]
print(f"📊 Total features created: {len(new_features)}")

# 2. FEATURE CORRELATION ANALYSIS
# All numeric columns take part here, target-derived ones included.
print("2️⃣ Feature Correlation Analysis...")
numerical_features = df.select_dtypes(include=[np.number]).columns.tolist()
correlation_matrix = df[numerical_features].corr()

# Find highly correlated features
# Walk the upper triangle only, so each pair is reported once. NaN
# correlations compare False against the threshold and are skipped.
high_corr_features = []
corr_columns = correlation_matrix.columns
for i, left in enumerate(corr_columns):
    for j in range(i + 1, len(corr_columns)):
        r_value = correlation_matrix.iloc[i, j]
        if abs(r_value) > 0.8:
            high_corr_features.append((left, corr_columns[j], r_value))

print(f"🔗 Highly correlated feature pairs (|r| > 0.8): {len(high_corr_features)}")
for pair in high_corr_features[:5]:  # Show first 5
    print(f"   • {pair[0]} ↔ {pair[1]}: r = {pair[2]:.3f}")

# 3. FEATURE IMPORTANCE ANALYSIS
print("3️⃣ Feature Importance Analysis...")
try:
    # Prepare data for feature importance
    # Model inputs must not contain the answer. The readmission_* columns are
    # all computed from `readmitted` (7d/15d are copies of the 30d target), so
    # leaving them in makes them dominate the ranking. `encounter_id` and
    # `patient_nbr` are excluded as row identifiers when present
    # (NOTE(review): names assumed from the public dataset - confirm).
    excluded_columns = {'readmitted', 'encounter_id', 'patient_nbr'}
    # select_dtypes covers every numeric width, not only int64/float64.
    feature_cols = [
        col for col in df.select_dtypes(include=[np.number]).columns
        if col not in excluded_columns and not str(col).startswith('readmission_')
    ]
    X_temp = df[feature_cols].fillna(0)
    y_temp = df['readmission_30d']

    # Train a simple model for feature importance
    rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
    rf.fit(X_temp, y_temp)

    # Get feature importance
    feature_importance = pd.DataFrame({
        'feature': feature_cols,
        'importance': rf.feature_importances_
    }).sort_values('importance', ascending=False)

    print(f"🏆 Top 10 Most Important Features:")
    print(feature_importance.head(10))

except Exception as e:
    # Best-effort step: report the failure and carry on. The visualization
    # cell falls back to `numerical_features` if `feature_importance` is
    # undefined.
    print(f"⚠️ Feature importance analysis failed: {e}")
    print("   This is normal for some data types. Continuing...")

print("✅ Feature analysis completed successfully!")

## 📈 FEATURE VISUALIZATION

In [None]:
# 📈 FEATURE VISUALIZATION
# Draws a 3x4 dashboard of the engineered features on one 24x16 figure.
# Every panel relies on pyplot's implicit "current axes": each plt.subplot
# call selects the panel that the following plotting calls draw into, so the
# statement order within this cell matters.
print("📈 CREATING FEATURE VISUALIZATIONS")
print("=" * 50)

# Create comprehensive visualization dashboard
plt.figure(figsize=(24, 16))

# 1. Clinical Risk Distribution
# Bar chart of row counts per risk_category.
plt.subplot(3, 4, 1)
df['risk_category'].value_counts().plot(kind='bar', color='darkred')
plt.title('Clinical Risk Category Distribution', fontweight='bold', fontsize=12)
plt.xlabel('Risk Category')
plt.ylabel('Number of Patients')
plt.xticks(rotation=45)

# 2. Age Risk Groups
# Bar chart of row counts per age_risk_group.
plt.subplot(3, 4, 2)
df['age_risk_group'].value_counts().plot(kind='bar', color='darkblue')
plt.title('Age Risk Group Distribution', fontweight='bold', fontsize=12)
plt.xlabel('Age Risk Group')
plt.ylabel('Number of Patients')
plt.xticks(rotation=45)

# 3. Medication Complexity Distribution
plt.subplot(3, 4, 3)
plt.hist(df['medication_complexity'], bins=30, color='lightblue', alpha=0.7, edgecolor='black')
plt.title('Medication Complexity Score Distribution', fontweight='bold', fontsize=12)
plt.xlabel('Medication Complexity Score')
plt.ylabel('Frequency')

# 4. Clinical Risk vs Readmission
# Cross-tabulate risk category against the 0/1 target, then express the
# readmitted count (column 1) as a percentage of each category's row total.
# The row total is evaluated before the rate column is added, so it sums
# the count columns only.
plt.subplot(3, 4, 4)
risk_readmission = df.groupby(['risk_category', 'readmission_30d']).size().unstack(fill_value=0)
risk_readmission['readmission_rate'] = (risk_readmission[1] / risk_readmission.sum(axis=1) * 100).round(2)
risk_readmission['readmission_rate'].plot(kind='bar', color='darkgreen')
plt.title('Readmission Rate by Clinical Risk', fontweight='bold', fontsize=12)
plt.xlabel('Clinical Risk Category')
plt.ylabel('Readmission Rate (%)')
plt.xticks(rotation=45)

# 5. Length of Stay Risk
# Bar chart of row counts per los_risk_category.
plt.subplot(3, 4, 5)
df['los_risk_category'].value_counts().plot(kind='bar', color='purple')
plt.title('Length of Stay Risk Distribution', fontweight='bold', fontsize=12)
plt.xlabel('LOS Risk Category')
plt.ylabel('Number of Patients')
plt.xticks(rotation=45)

# 6. Treatment Adherence
plt.subplot(3, 4, 6)
plt.hist(df['treatment_adherence'], bins=20, color='lightgreen', alpha=0.7, edgecolor='black')
plt.title('Treatment Adherence Index Distribution', fontweight='bold', fontsize=12)
plt.xlabel('Treatment Adherence Index')
plt.ylabel('Frequency')

# 7. Comorbidity Severity
plt.subplot(3, 4, 7)
plt.hist(df['comorbidity_severity'], bins=20, color='lightcoral', alpha=0.7, edgecolor='black')
plt.title('Comorbidity Severity Distribution', fontweight='bold', fontsize=12)
plt.xlabel('Comorbidity Severity Score')
plt.ylabel('Frequency')

# 8. Lab Efficiency
plt.subplot(3, 4, 8)
plt.hist(df['lab_efficiency'], bins=20, color='lightyellow', alpha=0.7, edgecolor='black')
plt.title('Laboratory Efficiency Distribution', fontweight='bold', fontsize=12)
plt.xlabel('Lab Efficiency Score')
plt.ylabel('Frequency')

# 9. Socioeconomic Risk
# sort_index() orders the bars by score value rather than by frequency.
plt.subplot(3, 4, 9)
df['socioeconomic_risk'].value_counts().sort_index().plot(kind='bar', color='orange')
plt.title('Socioeconomic Risk Distribution', fontweight='bold', fontsize=12)
plt.xlabel('Socioeconomic Risk Score')
plt.ylabel('Number of Patients')
plt.xticks(rotation=0)

# 10. Clinical Intensity
plt.subplot(3, 4, 10)
plt.hist(df['clinical_intensity'], bins=20, color='lightsteelblue', alpha=0.7, edgecolor='black')
plt.title('Clinical Intensity Distribution', fontweight='bold', fontsize=12)
plt.xlabel('Clinical Intensity Score')
plt.ylabel('Frequency')

# 11. Readmission Risk Windows
# Mean of each 0/1 flag, shown as a percentage.
# NOTE(review): the Phase 2 cell of this notebook builds the 7d, 15d and 30d
# flags from the same condition (readmitted == '<30'), so those three bars
# are equal; only the 90d bar can differ.
plt.subplot(3, 4, 11)
readmission_windows = ['readmission_7d', 'readmission_15d', 'readmission_30d', 'readmission_90d']
readmission_rates = [df[col].mean() * 100 for col in readmission_windows]
plt.bar(['7d', '15d', '30d', '90d'], readmission_rates, color=['red', 'orange', 'yellow', 'green'])
plt.title('Readmission Rates by Time Window', fontweight='bold', fontsize=12)
plt.xlabel('Time Window')
plt.ylabel('Readmission Rate (%)')

# 12. Feature Correlation Heatmap (Top 15)
# Uses the 15 highest-ranked features when `feature_importance` exists;
# otherwise falls back to the first 15 entries of `numerical_features`.
plt.subplot(3, 4, 12)
top_features = feature_importance.head(15)['feature'].tolist() if 'feature_importance' in locals() else numerical_features[:15]
corr_subset = df[top_features].corr()
sns.heatmap(corr_subset, annot=False, cmap='coolwarm', center=0, square=True, cbar_kws={'shrink': 0.8})
plt.title('Top Features Correlation Heatmap', fontweight='bold', fontsize=12)

# Reflow the panels so titles and tick labels do not overlap, then render.
plt.tight_layout()
plt.show()

print("✅ Feature visualizations created successfully!")

## 💾 FEATURE ENGINEERING SUMMARY & EXPORT

In [None]:
# 💾 FEATURE ENGINEERING SUMMARY & EXPORT
# Reports how many columns are original, engineered or outcome-related,
# then writes two files to the working directory:
#   - diabetic_data_engineered.csv   (the full `df`)
#   - feature_engineering_summary.csv (one row of metadata per column)
import os

print("💾 FEATURE ENGINEERING SUMMARY & EXPORT")
print("=" * 50)

# Read only the header row of the raw file so that original columns can be
# told apart from engineered ones without relying on name patterns.
raw_columns = set(pd.read_csv('diabetic_data.csv', nrows=0).columns)
# `readmitted` and every readmission_* flag describe the outcome.
target_columns = [col for col in df.columns
                  if col == 'readmitted' or str(col).startswith('readmission_')]
original_features = [col for col in df.columns
                     if col in raw_columns and col not in target_columns]
engineered_features = [col for col in df.columns
                       if col not in raw_columns and col not in target_columns]

# 1. FEATURE SUMMARY
print("1️⃣ FEATURE ENGINEERING SUMMARY:")
print(f"   📊 Original features: {len(original_features)}")
print(f"   🚀 New features created: {len(engineered_features)}")
print(f"   🎯 Target / outcome columns: {len(target_columns)}")
print(f"   🎯 Total features: {len(df.columns)}")

# 2. FEATURE CATEGORIES
# Keyword matching is applied to engineered columns only, so raw columns
# such as num_medications or gender are no longer counted as engineered.
# 'per_day' is listed because the ratio features are named *_per_day.
print("\n2️⃣ FEATURE CATEGORIES:")
clinical_features = [col for col in engineered_features if any(x in col for x in ['medication', 'clinical', 'treatment', 'comorbidity', 'lab', 'procedure'])]
demographic_features = [col for col in engineered_features if any(x in col for x in ['age_risk', 'insurance', 'gender', 'socioeconomic', 'los_risk'])]
ml_features = [col for col in engineered_features if any(x in col for x in ['interaction', 'ratio', 'per_day', 'binned', 'total', 'intensity'])]

print(f"   🏥 Clinical features: {len(clinical_features)}")
print(f"   👥 Demographic features: {len(demographic_features)}")
print(f"   🤖 ML features: {len(ml_features)}")

# 3. DATA QUALITY CHECK
print("\n3️⃣ DATA QUALITY CHECK:")
missing_after = df.isnull().sum().sum()
print(f"   ❌ Missing values: {missing_after:,}")
print(f"   ✅ Data shape: {df.shape}")
print(f"   💾 Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

# 4. EXPORT ENGINEERED DATASET
print("\n4️⃣ EXPORTING ENGINEERED DATASET...")
output_filename = 'diabetic_data_engineered.csv'
df.to_csv(output_filename, index=False)
print(f"   ✅ Dataset exported to: {output_filename}")
# Size of the file just written, not the in-memory size of `df`.
print(f"   📁 File size: {os.path.getsize(output_filename) / 1024**2:.2f} MB")


def _feature_category(col):
    """Return the summary category for one column of `df`.

    Precedence: 'Target', then 'Original' (present in the raw file), then
    the first keyword group containing the column ('Clinical',
    'Demographic', 'ML'); engineered columns matching no group are 'Other'.
    """
    if col in target_columns:
        return 'Target'
    if col in raw_columns:
        return 'Original'
    for label, members in (('Clinical', clinical_features),
                           ('Demographic', demographic_features),
                           ('ML', ml_features)):
        if col in members:
            return label
    return 'Other'


# 5. FEATURE LIST EXPORT
print("\n5️⃣ EXPORTING FEATURE LIST...")
feature_list = pd.DataFrame({
    'feature_name': df.columns.tolist(),
    'feature_type': df.dtypes.astype(str).tolist(),
    'unique_values': [df[col].nunique() for col in df.columns],
    'missing_values': df.isnull().sum().tolist(),
    'category': [_feature_category(col) for col in df.columns]
})

feature_list.to_csv('feature_engineering_summary.csv', index=False)
print(f"   ✅ Feature list exported to: feature_engineering_summary.csv")

print("\n🎉 FEATURE ENGINEERING COMPLETED SUCCESSFULLY! 🎉")
print("=" * 60)
print(f"📊 Total features: {len(df.columns)}")
print(f"🚀 New features created: {len(engineered_features)}")
print(f"💾 Dataset exported: {output_filename}")
print(f"📋 Feature summary: feature_engineering_summary.csv")
print("\n🎯 NEXT STEPS:")
print("   1. Review feature engineering summary")
print("   2. Proceed to Power BI dashboard creation")
print("   3. Model development and training")
print("   4. MLOps pipeline implementation")