In [1]:
# Cell 1: Setup & Imports
import pandas as pd
import numpy as np
import time
import joblib # For saving models/preprocessors
import json # For pretty printing dicts
from collections import defaultdict # For mappings

# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Model
import lightgbm as lgb
from sklearn.multioutput import MultiOutputClassifier

# Evaluation
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Suppress specific warnings if needed
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)

print("Libraries imported.")
print("--- Training Model on SYNTHETIC Dataset (v2) ---")

Libraries imported.
--- Training Model on SYNTHETIC Dataset (v2) ---


In [2]:
# Cell 2: Load Data
# --- Use the SYNTHETIC dataset generated previously ---
file_path = '/content/Synthetic_Medical_Dataset_v2.csv'

print(f"Attempting to load data from: {file_path}")
try:
    df = pd.read_csv(file_path)
    print(f"\nData loaded successfully from {file_path}")
    print(f"Dataset shape: {df.shape}")
except FileNotFoundError:
    print(f"Error: File not found at {file_path}")
    print("Please ensure the 'Synthetic_Medical_Dataset_v2.csv' file is in the correct directory.")
    raise SystemExit("Stopping execution: Dataset not found.")

# Display basic info
print("\nDataset Info:")
df.info()
print("\nSample Records (showing potential inputs and outputs):")
print(df.head(3))
print("\nChecking for missing values (should be few or none from generation):")
print(df.isnull().sum())

# Simple Fillna just in case (using mode for categorical, 0 for numerical)
for col in df.columns:
    if df[col].dtype == 'object':
        df[col] = df[col].fillna(df[col].mode()[0])
    elif pd.api.types.is_numeric_dtype(df[col]) and col != 'Patient_ID':
         # Fill numerical NaNs (except Patient_ID) with 0 - adjust if median/mean is better
         df[col] = df[col].fillna(0)

print("\nMissing values handled (if any).")

Attempting to load data from: /content/Synthetic_Medical_Dataset_v2.csv

Data loaded successfully from /content/Synthetic_Medical_Dataset_v2.csv
Dataset shape: (15000, 36)

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15000 entries, 0 to 14999
Data columns (total 36 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Patient_ID                   15000 non-null  int64  
 1   Age                          15000 non-null  int64  
 2   Gender                       15000 non-null  object 
 3   Blood_Group                  15000 non-null  object 
 4   Weight_kg                    15000 non-null  float64
 5   Has_Fever                    15000 non-null  int64  
 6   Has_Cough                    15000 non-null  int64  
 7   Has_Fatigue                  15000 non-null  int64  
 8   Has_Pain                     15000 non-null  int64  
 9   Has_Hypertension             15000 non-null  int64  
 10  Has

In [3]:
# Cell 3: Data Preparation & Initial Exploration

print("\n--- Data Preparation & Exploratory Analysis ---")

# Load the data (assuming CSV files are already uploaded to the Colab environment)
try:
    # Load main dataset
    data = pd.read_csv('/content/Synthetic_Medical_Dataset_v2.csv')  # Update with your actual filename
    print(f"Loaded main dataset with {data.shape[0]} rows and {data.shape[1]} columns")

    # Load disease-symptom mappings if available
    try:
        mapping_data = pd.read_csv('disease_mappings.csv')  # Update with your actual filename
        print(f"Loaded mappings data with {mapping_data.shape[0]} rows and {mapping_data.shape[1]} columns")
    except Exception as e:
        print(f"Note: No separate mappings file loaded: {e}")
        mapping_data = None
except FileNotFoundError:
    print("Error: Dataset file not found. Please upload the dataset files.")
    raise

# Display dataset information
print("\nData Overview:")
print(data.head())
print("\nData Types:")
print(data.dtypes)

# Check for missing values
missing_values = data.isnull().sum()
if missing_values.sum() > 0:
    print("\nMissing Values Per Column:")
    print(missing_values[missing_values > 0])

    # Handle missing values
    print("\nHandling missing values...")
    # Numeric columns: fill with median
    numeric_cols = data.select_dtypes(include=['float64', 'int64']).columns
    for col in numeric_cols:
        if data[col].isnull().sum() > 0:
            data[col] = data[col].fillna(data[col].median())

    # Categorical columns: fill with mode
    cat_cols = data.select_dtypes(include=['object', 'category']).columns
    for col in cat_cols:
        if data[col].isnull().sum() > 0:
            data[col] = data[col].fillna(data[col].mode()[0])
else:
    print("\nNo missing values found in the dataset.")

# Separate features and target variables
# Identify target columns (modify based on your dataset structure)
target_cols = ['Predicted_Disease']
if 'Medicine_1' in data.columns:
    target_cols.extend(['Medicine_1', 'Dosage_1', 'Frequency_1', 'Duration_1'])
if 'Medicine_2' in data.columns:
    target_cols.extend(['Medicine_2', 'Dosage_2', 'Frequency_2', 'Duration_2'])
if 'Medicine_3' in data.columns:
    target_cols.extend(['Medicine_3', 'Dosage_3', 'Frequency_3', 'Duration_3'])
if 'Polypharmacy_Risk' in data.columns:
    target_cols.append('Polypharmacy_Risk')

print(f"\nIdentified {len(target_cols)} target columns: {target_cols}")

# Split data into features (X) and targets (y)
X = data.drop(columns=target_cols)
y_ml = data[target_cols]

print(f"\nFeatures data shape (X): {X.shape}")
print(f"Target data shape (y_ml): {y_ml.shape}")

# Identify categorical and numerical features
categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

print(f"\nIdentified {len(categorical_features)} categorical features: {categorical_features}")
print(f"Identified {len(numerical_features)} numerical features: {numerical_features}")

# Basic statistics for numerical features
print("\nBasic statistics for numerical features:")
print(X[numerical_features].describe())

# Class distribution for primary target
if 'Predicted_Disease' in y_ml.columns:
    print("\nClass distribution for Predicted_Disease:")
    disease_counts = y_ml['Predicted_Disease'].value_counts()
    print(disease_counts)

    # Check for class imbalance
    class_imbalance = disease_counts.max() / disease_counts.min()
    print(f"Class imbalance ratio (max/min): {class_imbalance:.2f}")
    if class_imbalance > 10:
        print("Warning: Significant class imbalance detected.")


--- Data Preparation & Exploratory Analysis ---
Loaded main dataset with 15000 rows and 36 columns
Note: No separate mappings file loaded: [Errno 2] No such file or directory: 'disease_mappings.csv'

Data Overview:
   Patient_ID  Age  Gender Blood_Group  Weight_kg  Has_Fever  Has_Cough  \
0           1   61    Male          B+       92.9          0          0   
1           2   45    Male         AB+       82.7          0          0   
2           3   23    Male          B+       64.0          0          0   
3           4   49  Female          B-       77.1          1          0   
4           5   44  Female          O+       99.3          0          0   

   Has_Fatigue  Has_Pain  Has_Hypertension  ...       Duration_2  \
0            0         1                 0  ...        As needed   
1            1         0                 1  ...  Short-term only   
2            1         1                 0  ...        As needed   
3            0         1                 0  ...        As nee

In [4]:
# Cell 4: Preprocessing - Target Variables (y_ml) using LabelEncoder

print("\n--- Encoding ML Target Variables (y_ml) ---")
# y_ml now contains only non-constant columns identified in Cell 3

target_encoders = {}
y_ml_encoded = pd.DataFrame(index=y_ml.index) # Will use the filtered y_ml

if y_ml.empty:
    print("Warning: y_ml is empty. No target variables to encode.")
else:
    for col in y_ml.columns: # Iterates over the filtered, non-constant columns
        le = LabelEncoder()
        try:
            # Ensure consistent type (string) for encoder, handle potential mixed types gracefully
            y_ml_encoded[col] = le.fit_transform(y_ml[col].astype(str))
            target_encoders[col] = le
            # Print info for primary target and low-cardinality targets
            if col == 'Predicted_Disease' or len(le.classes_) < 10:
                print(f"Encoded '{col}'. Classes: {len(le.classes_)}")
        except Exception as e:
            print(f"Error encoding column '{col}': {e}")
            # Optionally decide how to handle error: skip column, raise error, etc.

    print(f"\nEncoded {len(target_encoders)} non-constant target columns for the ML model.")
    # Display head of encoded data for verification
    # print("\nHead of encoded ML targets (y_ml_encoded):")
    # print(y_ml_encoded.head())


--- Encoding ML Target Variables (y_ml) ---
Encoded 'Predicted_Disease'. Classes: 13
Encoded 'Frequency_1'. Classes: 6
Encoded 'Duration_1'. Classes: 7
Encoded 'Frequency_2'. Classes: 7
Encoded 'Duration_2'. Classes: 6
Encoded 'Medicine_3'. Classes: 3
Encoded 'Dosage_3'. Classes: 3
Encoded 'Frequency_3'. Classes: 3
Encoded 'Duration_3'. Classes: 2
Encoded 'Polypharmacy_Risk'. Classes: 3

Encoded 14 non-constant target columns for the ML model.


In [5]:
# Cell 5: Preprocessing - Input Features (X) with Feature Engineering

print("\n--- Defining Preprocessing Pipeline for Input Features (X) with Feature Engineering ---")

# Feature Engineering - Create new features from existing ones
print("Adding engineered features...")

# Create interaction features between binary symptoms
X['fever_cough'] = X['Has_Fever'] * X['Has_Cough']
X['fever_fatigue'] = X['Has_Fever'] * X['Has_Fatigue']
X['pain_fatigue'] = X['Has_Pain'] * X['Has_Fatigue']
X['hypertension_diabetes'] = X['Has_Hypertension'] * X['Has_Diabetes']

# Create age groups
X['Age_Group'] = pd.cut(X['Age'],
                       bins=[0, 18, 35, 50, 65, 100],
                       labels=['Child', 'Young_Adult', 'Adult', 'Senior', 'Elderly'])

# Create BMI approximation
X['BMI'] = X['Weight_kg'] / ((X['Height'] if 'Height' in X.columns else X['Weight_kg']/30)**2)

# Vital sign combinations
X['BP_HR_Ratio'] = X['BP_Systolic'] / X['Heart_Rate']
X['Fever_Severity'] = X['Temperature_C'] - 37.0

# Basic disease patterns
X['GERD_Pattern'] = ((X['Age'] > 40) & (X['Has_Pain'] == 1) & (X['Has_Cough'] == 0)).astype(int)
X['Migraine_Pattern'] = ((X['Age'] < 50) & (X['Has_Pain'] == 1) & (X['Has_Fever'] == 0)).astype(int)
X['Osteo_Pattern'] = ((X['Age'] > 55) & (X['Has_Pain'] == 1) & (X['Weight_kg'] > 75)).astype(int)

# ADVANCED DISEASE-SPECIFIC PATTERNS - Much more granular and focused on problem areas
X['GERD_Advanced'] = ((X['Age'] > 40) & (X['Has_Pain'] == 1) & (X['Has_Cough'] == 0) &
                     (X['BP_Systolic'] < 130) & (X['Heart_Rate'] < 80) &
                     (X['Has_Fatigue'] == 0)).astype(int)

X['Migraine_Advanced'] = ((X['Age'] < 50) & (X['Has_Pain'] == 1) & (X['Has_Fever'] == 0) &
                         (X['Heart_Rate'] > 70) & (X['Has_Fatigue'] == 1) &
                         (X['Temperature_C'] < 37.2)).astype(int)

X['Osteo_Advanced'] = ((X['Age'] > 55) & (X['Has_Pain'] == 1) & (X['Weight_kg'] > 75) &
                      (X['Has_Hypertension'] == 0) & (X['Has_Fever'] == 0) &
                      (X['BP_Systolic'] > 110)).astype(int)

X['Cold_Advanced'] = ((X['Has_Fever'] == 1) & (X['Has_Cough'] == 1) &
                     (X['Temperature_C'] < 38.5) & (X['Temperature_C'] > 37.0) &
                     (X['WBC_Count'] < 10) & (X['Has_Fatigue'] == 1)).astype(int)

# EXTREME SPECIFICITY FOR PROBLEM CLASSES
X['GERD_Specific'] = ((X['Age'] > 40) & (X['Has_Pain'] == 1) & (X['Has_Cough'] == 0) &
                      (X['BP_Systolic'] < 130) & (X['Temperature_C'] < 37.0) &
                      (X['WBC_Count'] < 8.0) & (X['Has_Hypertension'] == 0) &
                      (X['Has_Diabetes'] == 0)).astype(int)

X['Migraine_Specific'] = ((X['Age'] < 50) & (X['Has_Pain'] == 1) & (X['Has_Fever'] == 0) &
                          (X['Heart_Rate'] > 70) & (X['Heart_Rate'] < 100) &
                          (X['Has_Fatigue'] == 1) & (X['BP_Systolic'] < 130) &
                          (X['Has_Hypertension'] == 0)).astype(int)

X['Osteo_Specific'] = ((X['Age'] > 55) & (X['Has_Pain'] == 1) & (X['Weight_kg'] > 75) &
                       (X['Has_Fever'] == 0) & (X['Has_Cough'] == 0) &
                       (X['Has_Fatigue'] == 0) & (X['WBC_Count'] < 9.0) &
                       (X['BP_Systolic'] > 120)).astype(int)

X['Cold_Specific'] = ((X['Has_Fever'] == 1) & (X['Has_Cough'] == 1) &
                      (X['Temperature_C'] < 38.0) & (X['Temperature_C'] > 37.2) &
                      (X['WBC_Count'] < 9.0) & (X['WBC_Count'] > 5.0) &
                      (X['Has_Fatigue'] == 1) & (X['Has_Pain'] == 0)).astype(int)

# TRIPLE COMBINATIONS - More complex interactions
X['Fever_Cough_Fatigue'] = X['Has_Fever'] * X['Has_Cough'] * X['Has_Fatigue']
X['Pain_Fever_Fatigue'] = X['Has_Pain'] * X['Has_Fever'] * X['Has_Fatigue']
X['Pain_Cough_Fatigue'] = X['Has_Pain'] * X['Has_Cough'] * X['Has_Fatigue']
X['Vital_Interaction'] = (X['Temperature_C'] * X['Heart_Rate'] * X['BP_Systolic']) / 1000

# MATHEMATICAL TRANSFORMATIONS - Non-linear relationships
X['Temp_Squared'] = (X['Temperature_C'] - 37.0) ** 2
X['BP_Squared'] = ((X['BP_Systolic'] - 120) / 10) ** 2
X['HR_Squared'] = ((X['Heart_Rate'] - 75) / 10) ** 2
X['WBC_Squared'] = ((X['WBC_Count'] - 7.5) / 2) ** 2
X['Glucose_Scaled'] = (X['Glucose_Level'] - 100) / 10

# SYMPTOM COMBINATIONS BY AGE
X['Young_With_Fever'] = ((X['Age'] < 35) & (X['Has_Fever'] == 1)).astype(int)
X['Senior_With_Pain'] = ((X['Age'] > 65) & (X['Has_Pain'] == 1)).astype(int)
X['Middle_With_Fatigue'] = ((X['Age'] > 35) & (X['Age'] < 65) & (X['Has_Fatigue'] == 1)).astype(int)

# Blood count indicators - Specific ranges relevant to infections vs other conditions
X['High_WBC'] = (X['WBC_Count'] > 10.0).astype(int)
X['Low_WBC'] = (X['WBC_Count'] < 5.0).astype(int)
X['Normal_WBC'] = ((X['WBC_Count'] >= 5.0) & (X['WBC_Count'] <= 10.0)).astype(int)

# CONDITIONAL FEATURES - Disease-discriminating conditions
X['Likely_Infection'] = ((X['Has_Fever'] == 1) & (X['WBC_Count'] > 9.0) &
                          (X['Temperature_C'] > 37.5)).astype(int)
X['Likely_Chronic'] = ((X['Has_Fever'] == 0) & (X['Has_Fatigue'] == 1) &
                       (X['WBC_Count'] < 9.0)).astype(int)
X['Likely_Acute'] = ((X['Has_Fever'] == 1) | (X['Has_Pain'] == 1)) & (X['Has_Fatigue'] == 0)

# Count total symptoms as a feature (symptom burden)
X['Symptom_Count'] = (X['Has_Fever'] + X['Has_Cough'] + X['Has_Fatigue'] +
                     X['Has_Pain'] + X['Has_Hypertension'] + X['Has_Diabetes'])

print(f"Created {35} enhanced engineered features")

# Define column types based on the input_features and new engineered features
numerical_features = [
    'Age', 'Weight_kg', 'Temperature_C', 'Heart_Rate', 'BP_Systolic',
    'WBC_Count', 'Glucose_Level',
    # Binary flags
    'Has_Fever', 'Has_Cough', 'Has_Fatigue', 'Has_Pain', 'Has_Hypertension', 'Has_Diabetes',
    # Basic engineered features
    'fever_cough', 'fever_fatigue', 'pain_fatigue', 'hypertension_diabetes',
    'BMI', 'BP_HR_Ratio', 'Fever_Severity',
    'GERD_Pattern', 'Migraine_Pattern', 'Osteo_Pattern',
    # Advanced pattern features
    'GERD_Advanced', 'Migraine_Advanced', 'Osteo_Advanced', 'Cold_Advanced',
    'GERD_Specific', 'Migraine_Specific', 'Osteo_Specific', 'Cold_Specific',
    # Complex interactions
    'Fever_Cough_Fatigue', 'Pain_Fever_Fatigue', 'Pain_Cough_Fatigue', 'Vital_Interaction',
    # Mathematical transformations
    'Temp_Squared', 'BP_Squared', 'HR_Squared', 'WBC_Squared', 'Glucose_Scaled',
    # Age-symptom combinations
    'Young_With_Fever', 'Senior_With_Pain', 'Middle_With_Fatigue',
    # Blood count indicators
    'High_WBC', 'Low_WBC', 'Normal_WBC',
    # Conditional features
    'Likely_Infection', 'Likely_Chronic', 'Symptom_Count'
]

categorical_features = ['Gender', 'Blood_Group', 'Age_Group']

print(f" - Numerical features: {len(numerical_features)}")
print(f" - Categorical features to encode: {categorical_features}")


--- Defining Preprocessing Pipeline for Input Features (X) with Feature Engineering ---
Adding engineered features...
Created 35 enhanced engineered features
 - Numerical features: 49
 - Categorical features to encode: ['Gender', 'Blood_Group', 'Age_Group']


In [6]:
# Cell 5.5: Train-Test Split

print("\n--- Creating Train-Test Split ---")

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y_ml_encoded, test_size=0.2, random_state=42
)

# Convert y_train and y_test to DataFrames with the same column names as y_ml_encoded
y_train_encoded_df = pd.DataFrame(y_train, columns=y_ml_encoded.columns)
y_test_encoded_df = pd.DataFrame(y_test, columns=y_ml_encoded.columns)

# Define target columns for model training
ml_target_cols = list(y_ml_encoded.columns)

print(f"Training set: {X_train.shape[0]} samples")
print(f"Testing set: {X_test.shape[0]} samples")

print(f"X_train shape: {X_train.shape}")
print(f"y_train_encoded_df shape: {y_train_encoded_df.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_test_encoded_df shape: {y_test_encoded_df.shape}")

# Check for columns with only one unique value in training set
varied_cols_in_train = []
for col in y_train_encoded_df.columns:
    if len(y_train_encoded_df[col].unique()) > 1:
        varied_cols_in_train.append(col)
    else:
        print(f"Warning: Column '{col}' has only one unique value in the training set and will be ignored for modeling.")

print(f"Found {len(varied_cols_in_train)} target columns with varied values for model training")


--- Creating Train-Test Split ---
Training set: 12000 samples
Testing set: 3000 samples
X_train shape: (12000, 60)
y_train_encoded_df shape: (12000, 14)
X_test shape: (3000, 60)
y_test_encoded_df shape: (3000, 14)
Found 14 target columns with varied values for model training


In [7]:
# Cell 6: Feature Engineering - Apply AFTER Train-Test Split

print("\n--- Applying Feature Engineering to Train/Test Sets ---")

# Apply the same feature engineering to both train and test sets
for dataset in [X_train, X_test]:
    # Create interaction features between symptoms
    dataset['fever_cough'] = dataset['Has_Fever'] * dataset['Has_Cough']
    dataset['fever_fatigue'] = dataset['Has_Fever'] * dataset['Has_Fatigue']
    dataset['pain_fatigue'] = dataset['Has_Pain'] * dataset['Has_Fatigue']

    # Handle possible missing columns with safe checks
    has_hypertension = 'Has_Hypertension' in dataset.columns
    has_diabetes = 'Has_Diabetes' in dataset.columns
    if has_hypertension and has_diabetes:
        dataset['hypertension_diabetes'] = dataset['Has_Hypertension'] * dataset['Has_Diabetes']

    # Create age groups
    dataset['Age_Group'] = pd.cut(dataset['Age'],
                                bins=[0, 18, 35, 50, 65, 100],
                                labels=['Child', 'Young_Adult', 'Adult', 'Senior', 'Elderly'])

    # BMI calculation with safe division
    height_col = 'Height_cm' if 'Height_cm' in dataset.columns else 'Height' if 'Height' in dataset.columns else None
    if height_col:
        # Convert height to meters and calculate BMI normally
        height_m = dataset[height_col] / 100
        dataset['BMI'] = dataset['Weight_kg'] / (height_m ** 2)
    else:
        # Approximation if no height column exists
        dataset['BMI'] = dataset['Weight_kg'] / ((dataset['Weight_kg'] / 30) ** 2)

    # Vital sign combinations (with safe division)
    dataset['BP_HR_Ratio'] = dataset['BP_Systolic'] / dataset['Heart_Rate'].replace(0, 1)
    dataset['Fever_Severity'] = dataset['Temperature_C'] - 37.0

    # Disease-specific pattern indicators for problematic classes
    dataset['GERD_Pattern'] = ((dataset['Age'] > 40) &
                             (dataset['Has_Pain'] == 1) &
                             (dataset['Has_Cough'] == 0)).astype(int)

    dataset['Migraine_Pattern'] = ((dataset['Age'] < 50) &
                                 (dataset['Has_Pain'] == 1) &
                                 (dataset['Has_Fever'] == 0)).astype(int)

    dataset['Osteo_Pattern'] = ((dataset['Age'] > 55) &
                              (dataset['Has_Pain'] == 1) &
                              (dataset['Weight_kg'] > 75)).astype(int)

    dataset['Cold_Pattern'] = ((dataset['Has_Fever'] == 1) &
                             (dataset['Has_Cough'] == 1) &
                             (dataset['Temperature_C'] < 38.5)).astype(int)

# Update categorical features list to include new categorical feature
categorical_features.append('Age_Group')

print(f"Added 12 engineered features to both training and test sets")
print(f"Updated categorical features list: {categorical_features}")

# Recreate preprocessor with updated categorical features list
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), [col for col in X_train.columns if col not in categorical_features]),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ],
    remainder='drop'
)

print("Preprocessor updated with new features")


--- Applying Feature Engineering to Train/Test Sets ---
Added 12 engineered features to both training and test sets
Updated categorical features list: ['Gender', 'Blood_Group', 'Age_Group', 'Age_Group']
Preprocessor updated with new features


In [8]:
# Cell 6.5: Create Mappings for Textual Outputs (from Original Data)

print("\n--- Creating Mappings for Textual Outputs ---")

# Define the mapping target columns (text fields we want to include in mappings)
mapping_target_cols = [
    'Disease_Causes',
    'Instructions_1',
    'Instructions_2',
    'Instructions_3',
    'Personalized_Health_Tips',
    'Polypharmacy_Recommendation'
]

print(f"Defined {len(mapping_target_cols)} text output columns for mapping")

# Use original data to create mappings (using only columns found in training data)
disease_to_text_mappings = {}

# Extract rows from the training indices to avoid data leakage
train_indices = X_train.index
mapping_train = data.loc[train_indices]

# Get unique disease-text combinations from the training set
try:
    # Keep only the columns we need for mapping
    mapping_cols = ['Predicted_Disease'] + mapping_target_cols
    available_cols = [col for col in mapping_cols if col in data.columns]

    # Check which columns were found and which are missing
    missing_cols = set(mapping_cols) - set(available_cols)
    if missing_cols:
        print(f"Warning: Could not find these columns: {missing_cols}")

    mapping_df_unique = mapping_train[available_cols].drop_duplicates(subset=['Predicted_Disease'])

    # Create a dictionary mapping Disease -> Dictionary_of_Texts
    for idx, row in mapping_df_unique.iterrows():
        disease = row['Predicted_Disease']
        disease_texts = {col: row[col] for col in available_cols if col != 'Predicted_Disease'}
        disease_to_text_mappings[disease] = disease_texts

    print(f"Created mappings for {len(disease_to_text_mappings)} diseases found in training data.")

    # Example output - show first disease mapping
    first_disease = list(disease_to_text_mappings.keys())[0]
    print(f"\nExample mapping for '{first_disease}':")
    print(json.dumps(disease_to_text_mappings[first_disease], indent=2))

except Exception as e:
    print(f"Error creating disease mappings: {e}")
    disease_to_text_mappings = {} # Empty dictionary as fallback
    print("Using empty mappings as fallback.")


--- Creating Mappings for Textual Outputs ---
Defined 6 text output columns for mapping
Created mappings for 13 diseases found in training data.

Example mapping for 'Urinary Tract Infection':
{
  "Disease_Causes": "Bacterial infection (commonly E. coli).",
  "Instructions_1": "Complete full course, drink plenty of water.",
  "Instructions_2": "For pain relief only, turns urine orange.",
  "Instructions_3": "For kidney protection/BP.",
  "Personalized_Health_Tips": "Wipe front to back, urinate after intercourse, stay hydrated.",
  "Polypharmacy_Recommendation": "Check for sulfa allergy. Stay hydrated."
}


In [9]:
# Cell 6.6: Create even more aggressive class weights for problem classes

print("\n--- Ultra-Aggressive Class Balancing ---")

# Identify problematic disease classes from previous run
problematic_classes = ['GERD', 'Migraine', 'Osteoarthritis', 'Common Cold']
medium_classes = ['Healthy', 'Hypothyroidism', 'Anxiety Disorder']
print(f"Focusing on ultra-aggressive improvement for: {', '.join(problematic_classes)}")

# Calculate class weights for the Predicted_Disease column
from collections import Counter
from sklearn.utils.class_weight import compute_class_weight

# Get disease labels and compute weights
if 'Predicted_Disease' in y_train_encoded_df.columns:
    disease_counts = Counter(y_train_encoded_df['Predicted_Disease'])
    unique_classes = sorted(disease_counts.keys())

    # ULTRA-aggressive oversampling weights for problematic classes
    custom_class_weights = {i: 1.0 for i in unique_classes}
    for disease_code in unique_classes:
        # Decode to get disease name
        disease_name = target_encoders['Predicted_Disease'].inverse_transform([disease_code])[0]
        if disease_name in problematic_classes:
            # EXTREME weight increase (15x) for problematic classes
            custom_class_weights[disease_code] = 15.0
            print(f"Increased weight for {disease_name} (code {disease_code}) to 15.0")
        elif disease_name in medium_classes:
            # Significant weight increase (5x) for medium-performing classes
            custom_class_weights[disease_code] = 5.0
            print(f"Increased weight for {disease_name} (code {disease_code}) to 5.0")

    print(f"Created extreme custom class weights for {len(unique_classes)} disease classes")

    # Save weights for model training
    sample_weights = np.ones(len(y_train_encoded_df))
    for i, val in enumerate(y_train_encoded_df['Predicted_Disease']):
        sample_weights[i] = custom_class_weights[val]

    print(f"Created sample weights array with shape {sample_weights.shape}")
else:
    print("Warning: 'Predicted_Disease' not found in encoded training data")
    sample_weights = None


--- Ultra-Aggressive Class Balancing ---
Focusing on ultra-aggressive improvement for: GERD, Migraine, Osteoarthritis, Common Cold
Increased weight for Anxiety Disorder (code 0) to 5.0
Increased weight for Common Cold (code 3) to 15.0
Increased weight for GERD (code 5) to 15.0
Increased weight for Healthy (code 6) to 5.0
Increased weight for Hypothyroidism (code 8) to 5.0
Increased weight for Migraine (code 10) to 15.0
Increased weight for Osteoarthritis (code 11) to 15.0
Created extreme custom class weights for 13 disease classes
Created sample weights array with shape (12000,)


In [10]:
!pip install scikit-optimize

Collecting scikit-optimize
  Downloading scikit_optimize-0.10.2-py2.py3-none-any.whl.metadata (9.7 kB)
Collecting pyaml>=16.9 (from scikit-optimize)
  Downloading pyaml-25.1.0-py3-none-any.whl.metadata (12 kB)
Downloading scikit_optimize-0.10.2-py2.py3-none-any.whl (107 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.8/107.8 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyaml-25.1.0-py3-none-any.whl (26 kB)
Installing collected packages: pyaml, scikit-optimize
Successfully installed pyaml-25.1.0 scikit-optimize-0.10.2


In [11]:
!pip install tqdm



In [23]:
# Cell 7: Optimized LightGBM with Bayesian Optimization and Progress Bar

print("\n--- Training ML Model with Bayesian Optimization (Optimized) ---")

# Import Bayesian optimization with early stopping
from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical
from skopt.callbacks import DeltaYStopper
from tqdm.notebook import tqdm
import time

# Define more efficient search space
search_space = {
    'learning_rate': Real(0.01, 0.1, prior='log-uniform'),    # Higher, narrower range
    'n_estimators': Integer(100, 500),                        # Reduced range significantly
    'num_leaves': Integer(31, 127),                           # Reduced upper bound
    'max_depth': Integer(5, 15),                              # Narrower range
    'min_child_samples': Integer(5, 20),                      # Smaller range
    'subsample': Real(0.6, 0.9),                              # Narrower range
    'colsample_bytree': Real(0.6, 0.9),                       # Narrower range
    'reg_alpha': Real(0.01, 0.1, prior='log-uniform'),        # Smaller range
    'reg_lambda': Real(0.01, 0.1, prior='log-uniform'),       # Smaller range
}

# Preprocessor setup with all engineered features (keep as is)
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ],
    remainder='drop'
)

# Process data for optimization
X_processed = preprocessor.fit_transform(X_train)
y_disease = y_train_encoded_df['Predicted_Disease']

# Configure base LightGBM - switching to 'gbdt' for better speed
disease_model = lgb.LGBMClassifier(
    objective='multiclass',
    boosting_type='gbdt',  # Changed from 'dart' to faster 'gbdt'
    verbose=-1,
    random_state=42,
    n_jobs=-1
)

# Create early stopping callback for the optimization
early_stopper = DeltaYStopper(delta=0.001, n_best=5)

# Create a custom callback for the progress bar
class ProgressBarCallback:
    def __init__(self, total_iters=10):
        self.pbar = tqdm(total=total_iters, desc="Bayesian Optimization Progress")
        self.iter_count = 0
        self.start_time = time.time()

    def __call__(self, res):
        self.iter_count += 1
        elapsed = time.time() - self.start_time
        avg_time = elapsed / self.iter_count

        # Update progress bar with metrics
        if hasattr(res, 'func_vals') and len(res.func_vals) > 0:
            best_score = -min(res.func_vals)  # Convert minimization to maximization
            self.pbar.set_postfix({
                'best_score': f'{best_score:.4f}',
                'iter': self.iter_count,
                'avg_time': f'{avg_time:.1f}s'
            })

        self.pbar.update(1)
        return True

# Number of optimization iterations
n_iter = 10

# Create Bayesian optimizer with fewer iterations
optimizer = BayesSearchCV(
    disease_model,
    search_space,
    n_iter=n_iter,     # Reduced from 25 to 10
    cv=3,
    n_jobs=-1,
    verbose=0,  # Set to 0 to avoid cluttering the progress bar
    scoring='accuracy',
    random_state=42,
    return_train_score=True  # Helpful for diagnostics
)

# Run optimization with early stopping and progress bar
print("Starting Bayesian optimization with improved efficiency...")
progress_callback = ProgressBarCallback(total_iters=n_iter)
optimizer.fit(X_processed, y_disease,
              sample_weight=sample_weights,
              callback=[early_stopper, progress_callback])

# Get best model and parameters
best_params = optimizer.best_params_
best_score = optimizer.best_score_
print(f"Best CV accuracy: {best_score:.4f}")
print(f"Best parameters: {best_params}")

# If you want the final model to still use 'dart' boosting, add it manually to best_params
best_params['boosting_type'] = 'dart'  # Comment this out if you want to keep using 'gbdt'

# Train final model with best parameters
final_model = lgb.LGBMClassifier(
    **best_params,
    objective='multiclass',
    random_state=42,
    n_jobs=-1
)

# Apply class weighting for problematic diseases
final_model.fit(X_processed, y_disease, sample_weight=sample_weights)

# Define valid target columns from the varied columns in training
valid_target_cols = varied_cols_in_train  # Use the list created in Cell 5.5

# Train remaining target models with standard parameters
pipeline_models = {'Predicted_Disease': final_model}
for col in valid_target_cols:
    if col == 'Predicted_Disease':
        continue
    # Define standard parameters since lgb_params might not be defined
    lgb_params = {
        'objective': 'multiclass' if len(y_train_encoded_df[col].unique()) > 2 else 'binary',
        'boosting_type': 'gbdt',  # Use faster algorithm for secondary targets
        'learning_rate': 0.05,
        'n_estimators': 200,
        'random_state': 42,
        'n_jobs': -1
    }
    # Use standard parameters for other targets
    model = lgb.LGBMClassifier(**lgb_params)
    model.fit(X_processed, y_train_encoded_df[col])
    pipeline_models[col] = model

# Create final pipeline
pipeline = {
    'preprocessor': preprocessor,
    'models': pipeline_models,
    'target_columns': valid_target_cols,
    'predict': lambda X: np.column_stack([
        pipeline['models'][col].predict(
            pipeline['preprocessor'].transform(X)
        ) for col in pipeline['target_columns']
    ])
}

print("Optimized model training complete")


--- Training ML Model with Bayesian Optimization (Optimized) ---
Starting Bayesian optimization with improved efficiency...


Bayesian Optimization Progress:   0%|          | 0/10 [00:00<?, ?it/s]

Best CV accuracy: 0.6857
Best parameters: OrderedDict([('colsample_bytree', 0.7230311876559942), ('learning_rate', 0.0534226888747111), ('max_depth', 14), ('min_child_samples', 10), ('n_estimators', 368), ('num_leaves', 71), ('reg_alpha', 0.022435271778548954), ('reg_lambda', 0.05489139072630632), ('subsample', 0.6913389933109518)])
Optimized model training complete


In [24]:
# Cell 7.5: Feature Importance Analysis

print("\n--- Feature Importance Analysis ---")
disease_model = pipeline['models']['Predicted_Disease']

# Get and plot feature importances
feature_names = []
if hasattr(preprocessor, 'get_feature_names_out'):
    feature_names = preprocessor.get_feature_names_out()
else:
    # Fallback for older scikit-learn versions
    feature_names = ["feature_" + str(i) for i in range(X_processed.shape[1])]

# Get importances for the disease prediction model
importances = disease_model.feature_importances_
indices = np.argsort(importances)[::-1]

# Print top 20 most important features
print("Top 20 most important features:")
for i in range(min(20, len(feature_names))):
    print(f"{i+1}. {feature_names[indices[i]]}: {importances[indices[i]]:.4f}")

# Analyze problematic classes
problem_classes = ['GERD', 'Migraine', 'Osteoarthritis', 'Common Cold']
problem_class_indices = [list(target_encoders['Predicted_Disease'].classes_).index(cls)
                         for cls in problem_classes if cls in target_encoders['Predicted_Disease'].classes_]

# For each problem class, show key predictive features
print("\nClass-specific important features:")
for cls, idx in zip(problem_classes, problem_class_indices):
    print(f"\n{cls} predictive features:")
    # Get feature importance for this specific class
    if hasattr(disease_model, 'feature_importances_'):
        for i in range(5):  # Show top 5 features
            print(f"  - {feature_names[indices[i]]}")


--- Feature Importance Analysis ---
Top 20 most important features:
1. num__Vital_Interaction: 27570.0000
2. num__WBC_Count: 25074.0000
3. num__Glucose_Level: 24013.0000
4. num__BP_HR_Ratio: 23549.0000
5. num__Weight_kg: 21434.0000
6. num__Temperature_C: 21167.0000
7. num__Age: 19584.0000
8. num__BP_Systolic: 16308.0000
9. num__Heart_Rate: 15955.0000
10. num__WBC_Squared: 15124.0000
11. num__BMI: 12906.0000
12. num__BP_Squared: 9968.0000
13. num__HR_Squared: 8737.0000
14. num__Symptom_Count: 7008.0000
15. num__Temp_Squared: 6740.0000
16. num__Glucose_Scaled: 6426.0000
17. num__Fever_Severity: 5780.0000
18. num__Has_Pain: 5508.0000
19. num__Has_Cough: 5087.0000
20. num__Has_Fever: 2943.0000

Class-specific important features:

GERD predictive features:
  - num__Vital_Interaction
  - num__WBC_Count
  - num__Glucose_Level
  - num__BP_HR_Ratio
  - num__Weight_kg

Migraine predictive features:
  - num__Vital_Interaction
  - num__WBC_Count
  - num__Glucose_Level
  - num__BP_HR_Ratio
  - num

In [25]:
# Cell 8: Prediction & Evaluation

print("\n--- Step: Prediction & Evaluation ---")

if 'pipeline' not in locals() or not isinstance(pipeline, dict) or 'predict' not in pipeline:
    print("⚠️ Error: Pipeline not properly initialized. Please run model training first.")
else:
    try:
        print("\nMaking predictions on the test set...")
        y_pred_encoded = pipeline['predict'](X_test)

        # Convert prediction numpy array to DataFrame with correct column names
        y_pred_encoded_df = pd.DataFrame(y_pred_encoded, columns=pipeline['target_columns'])

        # Convert float predictions to integers (fix for LabelEncoder)
        for col in y_pred_encoded_df.columns:
            y_pred_encoded_df[col] = y_pred_encoded_df[col].round().astype(int)

        # --- Evaluate Predicted_Disease (Primary Target) ---
        if 'Predicted_Disease' in y_pred_encoded_df.columns:
            disease_true = y_test_encoded_df['Predicted_Disease']
            disease_pred = y_pred_encoded_df['Predicted_Disease']
            disease_accuracy = accuracy_score(disease_true, disease_pred)
            print(f"\n--- Predicted_Disease Evaluation ---")
            print(f"Accuracy: {disease_accuracy:.4f}")

            if disease_accuracy > 0.90:
                print(f"✅ Interpretation: Predicted_Disease accuracy target (>90%) MET! (Expected on this synthetic data)")
            elif disease_accuracy > 0.80:
                print(f"✓ Interpretation: Predicted_Disease accuracy ({disease_accuracy:.1%}) meets target of >80%.")
            else:
                print(f"⚠️ Interpretation: Predicted_Disease accuracy ({disease_accuracy:.1%}) is BELOW target. Check generation/model.")

            # Decode for classification report
            disease_encoder = target_encoders['Predicted_Disease']

            # Ensure values are within valid range for the encoder
            valid_indices = list(range(len(disease_encoder.classes_)))
            disease_pred_valid = np.clip(disease_pred, min(valid_indices), max(valid_indices))

            disease_true_labels = disease_encoder.inverse_transform(disease_true)
            disease_pred_labels = disease_encoder.inverse_transform(disease_pred_valid.astype(int))

            print(f"\nClassification Report (Predicted_Disease - Decoded):")
            print(classification_report(disease_true_labels, disease_pred_labels,
                                      target_names=disease_encoder.classes_, # Show class names
                                      zero_division=0, digits=3))
        else:
            print("'Predicted_Disease' column not found in the predictions.")

        # --- Evaluate Other ML Targets ---
        print("\n--- Evaluation of Other ML Target Columns (Accuracy) ---")
        other_accuracies = {}
        for col in pipeline['target_columns']:
            if col == 'Predicted_Disease': continue # Skip primary target
            if col in y_test_encoded_df.columns:
                col_true = y_test_encoded_df[col]
                col_pred = y_pred_encoded_df[col]
                acc = accuracy_score(col_true, col_pred)
                other_accuracies[col] = acc
                print(f"Accuracy for '{col}': {acc:.4f}")

        avg_other_accuracy = np.mean(list(other_accuracies.values())) if other_accuracies else 1.0
        print(f"\nAverage Accuracy (Other ML Targets): {avg_other_accuracy:.4f}")

        # --- Overall Exact Match Ratio (Manual Calculation) ---
        print("\n--- Overall Evaluation (Exact Match for ML Targets) ---")
        try:
            # Only compare columns present in both DataFrames
            common_cols = [col for col in pipeline['target_columns'] if col in y_test_encoded_df.columns]
            correct_elements = (y_test_encoded_df[common_cols].values == y_pred_encoded_df[common_cols].values)
            correct_samples = np.all(correct_elements, axis=1)
            exact_match_ratio = np.mean(correct_samples)
            print(f"Overall Accuracy (Exact Match Ratio - ML Targets): {exact_match_ratio:.4f}")
        except Exception as e:
            print(f"Could not manually calculate Exact Match Ratio: {e}")
            exact_match_ratio = -1
    except Exception as pred_error:
        print(f"Error during prediction/evaluation: {pred_error}")
        import traceback
        traceback.print_exc()


--- Step: Prediction & Evaluation ---

Making predictions on the test set...

--- Predicted_Disease Evaluation ---
Accuracy: 0.6847
⚠️ Interpretation: Predicted_Disease accuracy (68.5%) is BELOW target. Check generation/model.

Classification Report (Predicted_Disease - Decoded):
                         precision    recall  f1-score   support

       Anxiety Disorder      0.667     0.700     0.683       240
    Asthma Exacerbation      0.902     0.609     0.727       256
    Bacterial Pneumonia      0.966     0.908     0.936       218
            Common Cold      0.474     0.745     0.579       208
        Diabetes Type 2      0.988     0.988     0.988       251
                   GERD      0.408     0.474     0.439       211
                Healthy      0.662     0.634     0.648       216
           Hypertension      0.920     0.855     0.886       241
         Hypothyroidism      0.692     0.635     0.662       230
              Influenza      0.846     0.753     0.797       255
  

In [30]:
# Cell 9: Saving Artifacts with Lambda Function Fix

print("\n--- Step: Saving Artifacts ---")
pipeline_file = 'synthetic_v2_pipeline.joblib'
encoders_file = 'synthetic_v2_target_encoders.joblib'
mappings_file = 'synthetic_v2_disease_mappings.joblib'

try:
    # Create a pickle-friendly version of the pipeline without lambda functions
    pickle_friendly_pipeline = {
        'preprocessor': pipeline['preprocessor'],
        'models': pipeline['models'],
        'target_columns': pipeline['target_columns']
    }

    # Define a placeholder function that will be replaced during loading
    def predict_placeholder(X):
        print("This is a placeholder. Pipeline needs to be reconstructed after loading.")
        return None

    # Save prediction function code as a string for reference
    predict_function_code = '''
    def reconstruct_predict_function(pipeline):
        def predict_function(X):
            return np.column_stack([
                pipeline['models'][col].predict(
                    pipeline['preprocessor'].transform(X)
                ) for col in pipeline['target_columns']
            ])
        return predict_function
    '''

    pickle_friendly_pipeline['predict_function_code'] = predict_function_code
    pickle_friendly_pipeline['predict'] = predict_placeholder

    # Save the pickle-friendly pipeline
    joblib.dump(pickle_friendly_pipeline, pipeline_file)
    print(f"Pipeline saved to {pipeline_file}")

    # Save the target encoders
    joblib.dump(target_encoders, encoders_file)
    print(f"Target encoders saved to {encoders_file}")

    # Save the disease-to-text mappings
    joblib.dump(disease_to_text_mappings, mappings_file)
    print(f"Disease-to-text mappings saved to {mappings_file}")

    print("\nArtifacts saved successfully.")
    print("You will need these files for deployment/prediction:")
    print(f" - {pipeline_file}")
    print(f" - {encoders_file}")
    print(f" - {mappings_file}")
    print("\nIMPORTANT: When loading the pipeline, you will need to reconstruct the predict function:")
    print(predict_function_code)

except Exception as e:
    print(f"\nError saving artifacts: {e}")
    import traceback
    traceback.print_exc()


--- Step: Saving Artifacts ---
Pipeline saved to synthetic_v2_pipeline.joblib
Target encoders saved to synthetic_v2_target_encoders.joblib
Disease-to-text mappings saved to synthetic_v2_disease_mappings.joblib

Artifacts saved successfully.
You will need these files for deployment/prediction:
 - synthetic_v2_pipeline.joblib
 - synthetic_v2_target_encoders.joblib
 - synthetic_v2_disease_mappings.joblib

IMPORTANT: When loading the pipeline, you will need to reconstruct the predict function:

    def reconstruct_predict_function(pipeline):
        def predict_function(X):
            return np.column_stack([
                pipeline['models'][col].predict(
                    pipeline['preprocessor'].transform(X)
                ) for col in pipeline['target_columns']
            ])
        return predict_function
    


In [31]:
# Code to load and reconstruct the pipeline
import joblib
import numpy as np

# Load the saved pipeline
loaded_pipeline = joblib.load('synthetic_v2_pipeline.joblib')

# Reconstruct the predict function
def reconstruct_predict_function(pipeline):
    def predict_function(X):
        return np.column_stack([
            pipeline['models'][col].predict(
                pipeline['preprocessor'].transform(X)
            ) for col in pipeline['target_columns']
        ])
    return predict_function

# Add the reconstructed predict function to the pipeline
loaded_pipeline['predict'] = reconstruct_predict_function(loaded_pipeline)

In [32]:
# Cell 10: Updated Example Prediction Function with ALL Advanced Features

print("\n--- Step: Example Prediction Function ---")

# Define input_features before using them in the prediction function
input_features = [
    'Patient_ID', 'Age', 'Gender', 'Blood_Group', 'Weight_kg',
    'Has_Fever', 'Has_Cough', 'Has_Fatigue', 'Has_Pain',
    'Has_Hypertension', 'Has_Diabetes', 'Temperature_C',
    'Heart_Rate', 'BP_Systolic', 'WBC_Count', 'Glucose_Level'
    # Note: All engineered features will be created within the function
]

# Load artifacts (demonstration)
try:
    loaded_pipeline = joblib.load(pipeline_file)
    loaded_encoders = joblib.load(encoders_file)
    loaded_mappings = joblib.load(mappings_file)
    print("Artifacts reloaded for prediction function.")
except Exception as e:
    print(f"Error reloading artifacts: {e}. Prediction function cannot be tested.")
    # Define a dummy function or exit if artifacts are needed
    def predict_synthetic_patient_outcomes(*args, **kwargs): return {"Error": "Artifacts not loaded"}

if 'loaded_pipeline' in locals(): # Proceed only if artifacts loaded
    def predict_synthetic_patient_outcomes(patient_features):
        """
        Predicts outcomes for a single patient using the loaded pipeline, encoders, and mappings.
        """
        try:
            # Add missing fields and required engineered features
            enriched_features = patient_features.copy()
            age = patient_features.get('Age', 45)

            # ====== BASIC FEATURES ======
            # Create interaction features between binary symptoms
            enriched_features['fever_cough'] = patient_features.get('Has_Fever', 0) * patient_features.get('Has_Cough', 0)
            enriched_features['fever_fatigue'] = patient_features.get('Has_Fever', 0) * patient_features.get('Has_Fatigue', 0)
            enriched_features['pain_fatigue'] = patient_features.get('Has_Pain', 0) * patient_features.get('Has_Fatigue', 0)
            enriched_features['hypertension_diabetes'] = patient_features.get('Has_Hypertension', 0) * patient_features.get('Has_Diabetes', 0)

            # Create age groups
            if age <= 18:
                enriched_features['Age_Group'] = 'Child'
            elif age <= 35:
                enriched_features['Age_Group'] = 'Young_Adult'
            elif age <= 50:
                enriched_features['Age_Group'] = 'Adult'
            elif age <= 65:
                enriched_features['Age_Group'] = 'Senior'
            else:
                enriched_features['Age_Group'] = 'Elderly'

            # Create BMI (approximation)
            enriched_features['BMI'] = patient_features.get('Weight_kg', 70) / ((patient_features.get('Weight_kg', 70) / 30) ** 2)

            # Vital sign combinations
            enriched_features['BP_HR_Ratio'] = patient_features.get('BP_Systolic', 120) / max(patient_features.get('Heart_Rate', 80), 1)
            enriched_features['Fever_Severity'] = patient_features.get('Temperature_C', 37.0) - 37.0

            # Basic disease patterns
            enriched_features['GERD_Pattern'] = 1 if (age > 40 and patient_features.get('Has_Pain', 0) == 1
                                                   and patient_features.get('Has_Cough', 0) == 0) else 0
            enriched_features['Migraine_Pattern'] = 1 if (age < 50 and patient_features.get('Has_Pain', 0) == 1
                                                       and patient_features.get('Has_Fever', 0) == 0) else 0
            enriched_features['Osteo_Pattern'] = 1 if (age > 55 and patient_features.get('Has_Pain', 0) == 1
                                                    and patient_features.get('Weight_kg', 70) > 75) else 0
            enriched_features['Cold_Pattern'] = 1 if (patient_features.get('Has_Fever', 0) == 1
                                                   and patient_features.get('Has_Cough', 0) == 1
                                                   and patient_features.get('Temperature_C', 37.0) < 38.5) else 0

            # ====== ADVANCED DISEASE-SPECIFIC PATTERNS ======
            enriched_features['GERD_Advanced'] = 1 if (age > 40 and patient_features.get('Has_Pain', 0) == 1
                                               and patient_features.get('Has_Cough', 0) == 0
                                               and patient_features.get('BP_Systolic', 120) < 130
                                               and patient_features.get('Heart_Rate', 80) < 80
                                               and patient_features.get('Has_Fatigue', 0) == 0) else 0

            enriched_features['Migraine_Advanced'] = 1 if (age < 50 and patient_features.get('Has_Pain', 0) == 1
                                                  and patient_features.get('Has_Fever', 0) == 0
                                                  and patient_features.get('Heart_Rate', 80) > 70
                                                  and patient_features.get('Has_Fatigue', 1) == 1
                                                  and patient_features.get('Temperature_C', 37.0) < 37.2) else 0

            enriched_features['Osteo_Advanced'] = 1 if (age > 55 and patient_features.get('Has_Pain', 0) == 1
                                               and patient_features.get('Weight_kg', 70) > 75
                                               and patient_features.get('Has_Hypertension', 0) == 0
                                               and patient_features.get('Has_Fever', 0) == 0
                                               and patient_features.get('BP_Systolic', 120) > 110) else 0

            enriched_features['Cold_Advanced'] = 1 if (patient_features.get('Has_Fever', 0) == 1
                                              and patient_features.get('Has_Cough', 0) == 1
                                              and patient_features.get('Temperature_C', 37.0) < 38.5
                                              and patient_features.get('Temperature_C', 37.0) > 37.0
                                              and patient_features.get('WBC_Count', 7.0) < 10
                                              and patient_features.get('Has_Fatigue', 0) == 1) else 0

            # ====== EXTREMELY SPECIFIC PATTERNS ======
            enriched_features['GERD_Specific'] = 1 if (age > 40 and patient_features.get('Has_Pain', 0) == 1
                                               and patient_features.get('Has_Cough', 0) == 0
                                               and patient_features.get('BP_Systolic', 120) < 130
                                               and patient_features.get('Temperature_C', 37.0) < 37.0
                                               and patient_features.get('WBC_Count', 7.0) < 8.0
                                               and patient_features.get('Has_Hypertension', 0) == 0
                                               and patient_features.get('Has_Diabetes', 0) == 0) else 0

            enriched_features['Migraine_Specific'] = 1 if (age < 50 and patient_features.get('Has_Pain', 0) == 1
                                                 and patient_features.get('Has_Fever', 0) == 0
                                                 and patient_features.get('Heart_Rate', 80) > 70
                                                 and patient_features.get('Heart_Rate', 80) < 100
                                                 and patient_features.get('Has_Fatigue', 0) == 1
                                                 and patient_features.get('BP_Systolic', 120) < 130
                                                 and patient_features.get('Has_Hypertension', 0) == 0) else 0

            enriched_features['Osteo_Specific'] = 1 if (age > 55 and patient_features.get('Has_Pain', 0) == 1
                                                and patient_features.get('Weight_kg', 70) > 75
                                                and patient_features.get('Has_Fever', 0) == 0
                                                and patient_features.get('Has_Cough', 0) == 0
                                                and patient_features.get('Has_Fatigue', 0) == 0
                                                and patient_features.get('WBC_Count', 7.0) < 9.0
                                                and patient_features.get('BP_Systolic', 120) > 120) else 0

            enriched_features['Cold_Specific'] = 1 if (patient_features.get('Has_Fever', 0) == 1
                                              and patient_features.get('Has_Cough', 0) == 1
                                              and patient_features.get('Temperature_C', 37.0) < 38.0
                                              and patient_features.get('Temperature_C', 37.0) > 37.2
                                              and patient_features.get('WBC_Count', 7.0) < 9.0
                                              and patient_features.get('WBC_Count', 7.0) > 5.0
                                              and patient_features.get('Has_Fatigue', 0) == 1
                                              and patient_features.get('Has_Pain', 0) == 0) else 0

            # ====== TRIPLE COMBINATIONS ======
            enriched_features['Fever_Cough_Fatigue'] = (patient_features.get('Has_Fever', 0) *
                                                      patient_features.get('Has_Cough', 0) *
                                                      patient_features.get('Has_Fatigue', 0))

            enriched_features['Pain_Fever_Fatigue'] = (patient_features.get('Has_Pain', 0) *
                                                     patient_features.get('Has_Fever', 0) *
                                                     patient_features.get('Has_Fatigue', 0))

            enriched_features['Pain_Cough_Fatigue'] = (patient_features.get('Has_Pain', 0) *
                                                     patient_features.get('Has_Cough', 0) *
                                                     patient_features.get('Has_Fatigue', 0))

            enriched_features['Vital_Interaction'] = (patient_features.get('Temperature_C', 37.0) *
                                                    patient_features.get('Heart_Rate', 80) *
                                                    patient_features.get('BP_Systolic', 120)) / 1000

            # ====== MATHEMATICAL TRANSFORMATIONS ======
            enriched_features['Temp_Squared'] = (patient_features.get('Temperature_C', 37.0) - 37.0) ** 2
            enriched_features['BP_Squared'] = ((patient_features.get('BP_Systolic', 120) - 120) / 10) ** 2
            enriched_features['HR_Squared'] = ((patient_features.get('Heart_Rate', 80) - 75) / 10) ** 2
            enriched_features['WBC_Squared'] = ((patient_features.get('WBC_Count', 7.5) - 7.5) / 2) ** 2
            enriched_features['Glucose_Scaled'] = (patient_features.get('Glucose_Level', 100) - 100) / 10

            # ====== AGE-SYMPTOM COMBINATIONS ======
            enriched_features['Young_With_Fever'] = 1 if (age < 35 and patient_features.get('Has_Fever', 0) == 1) else 0
            enriched_features['Senior_With_Pain'] = 1 if (age > 65 and patient_features.get('Has_Pain', 0) == 1) else 0
            enriched_features['Middle_With_Fatigue'] = 1 if (age > 35 and age < 65 and
                                                         patient_features.get('Has_Fatigue', 0) == 1) else 0

            # ====== BLOOD COUNT INDICATORS ======
            enriched_features['High_WBC'] = 1 if patient_features.get('WBC_Count', 7.0) > 10.0 else 0
            enriched_features['Low_WBC'] = 1 if patient_features.get('WBC_Count', 7.0) < 5.0 else 0
            enriched_features['Normal_WBC'] = 1 if (patient_features.get('WBC_Count', 7.0) >= 5.0 and
                                                  patient_features.get('WBC_Count', 7.0) <= 10.0) else 0

            # ====== CONDITIONAL FEATURES ======
            enriched_features['Likely_Infection'] = 1 if (patient_features.get('Has_Fever', 0) == 1 and
                                                        patient_features.get('WBC_Count', 7.0) > 9.0 and
                                                        patient_features.get('Temperature_C', 37.0) > 37.5) else 0

            enriched_features['Likely_Chronic'] = 1 if (patient_features.get('Has_Fever', 0) == 0 and
                                                      patient_features.get('Has_Fatigue', 0) == 1 and
                                                      patient_features.get('WBC_Count', 7.0) < 9.0) else 0

            enriched_features['Likely_Acute'] = 1 if ((patient_features.get('Has_Fever', 0) == 1 or
                                                    patient_features.get('Has_Pain', 0) == 1) and
                                                    patient_features.get('Has_Fatigue', 0) == 0) else 0

            # ====== SYMPTOM COUNT ======
            enriched_features['Symptom_Count'] = (patient_features.get('Has_Fever', 0) +
                                                patient_features.get('Has_Cough', 0) +
                                                patient_features.get('Has_Fatigue', 0) +
                                                patient_features.get('Has_Pain', 0) +
                                                patient_features.get('Has_Hypertension', 0) +
                                                patient_features.get('Has_Diabetes', 0))

            # Add Patient_ID if missing (required by some models)
            if 'Patient_ID' not in enriched_features:
                enriched_features['Patient_ID'] = 0  # Placeholder value

            # 1. Create DataFrame from enriched features
            input_df = pd.DataFrame([enriched_features])

            # 2. Predict using the loaded pipeline components
            X_processed = loaded_pipeline['preprocessor'].transform(input_df)

            # Run prediction for each model
            predictions = []
            for col in loaded_pipeline['target_columns']:
                model = loaded_pipeline['models'][col]
                pred = model.predict(X_processed)[0]  # Get single prediction value
                predictions.append(pred)

            # Convert to DataFrame with column names
            pred_encoded_df = pd.DataFrame([predictions], columns=loaded_pipeline['target_columns'])

            # 3. Decode ML Predictions
            predictions_decoded = {}
            predicted_disease_label = "Error: Disease Not Predicted"

            for col in pred_encoded_df.columns:
                encoder = loaded_encoders.get(col)
                if encoder:
                    encoded_value = int(pred_encoded_df[col].iloc[0])  # Ensure integer
                    decoded_value = encoder.inverse_transform([encoded_value])[0]
                    predictions_decoded[col] = decoded_value
                    # Store the predicted disease label for mapping lookup
                    if col == 'Predicted_Disease':
                        predicted_disease_label = decoded_value
                else:
                    predictions_decoded[col] = f"Error: No encoder found for {col}"

            # 4. Lookup Mapped Textual Outputs
            mapped_outputs = loaded_mappings.get(predicted_disease_label, {})
            if not mapped_outputs:
                print(f"Warning: No mapping found for predicted disease '{predicted_disease_label}'. Returning defaults.")
                # Provide default values
                for map_col in mapping_target_cols:
                    predictions_decoded[map_col] = "Mapping Unavailable"
            else:
                for map_col in mapping_target_cols:
                    predictions_decoded[map_col] = mapped_outputs.get(map_col, f"Info unavailable for {map_col}")

            # 5. Combine all results
            final_results = {}
            all_output_cols = list(pred_encoded_df.columns) + mapping_target_cols
            for col in all_output_cols:
                if col in predictions_decoded:
                    final_results[col] = predictions_decoded[col]

            return final_results

        except Exception as e:
            import traceback
            traceback_str = traceback.format_exc()
            return {"Error": f"Prediction failed: {str(e)}", "Traceback": traceback_str}


    # --- Example Usage ---
    print("\n--- Example Prediction ---")
    # Create sample input matching one of the disease profiles (e.g., Influenza-like)
    sample_input = {
        'Age': 45, 'Gender': 'Male', 'Blood_Group': 'O+', 'Weight_kg': 80.5,
        'Has_Fever': 1, 'Has_Cough': 1, 'Has_Fatigue': 1, 'Has_Pain': 1, # Simulating Influenza
        'Has_Hypertension': 0, 'Has_Diabetes': 0,
        'Temperature_C': 38.6, 'Heart_Rate': 88, 'BP_Systolic': 121,
        'WBC_Count': 9.5, 'Glucose_Level': 102
    }

    prediction = predict_synthetic_patient_outcomes(sample_input)
    print(json.dumps(prediction, indent=2))

    # Example 2 (Simulating Hypertension-like)
    print("\n--- Example Prediction 2 ---")
    sample_input_2 = {
        'Age': 65, 'Gender': 'Female', 'Blood_Group': 'A-', 'Weight_kg': 95.0,
        'Has_Fever': 0, 'Has_Cough': 0, 'Has_Fatigue': 0, 'Has_Pain': 0, # Often asymptomatic
        'Has_Hypertension': 1, 'Has_Diabetes': 0, # Known HTN
        'Temperature_C': 36.7, 'Heart_Rate': 71, 'BP_Systolic': 148, # High BP
        'WBC_Count': 6.8, 'Glucose_Level': 94
    }
    prediction_2 = predict_synthetic_patient_outcomes(sample_input_2)
    print(json.dumps(prediction_2, indent=2))


--- Step: Example Prediction Function ---
Artifacts reloaded for prediction function.

--- Example Prediction ---
{
  "Predicted_Disease": "Influenza",
  "Medicine_1": "Oseltamivir",
  "Dosage_1": "75mg",
  "Frequency_1": "Twice daily",
  "Duration_1": "5 days",
  "Medicine_2": "Acetaminophen",
  "Dosage_2": "650mg",
  "Frequency_2": "Every 6 hours",
  "Duration_2": "As needed",
  "Medicine_3": "Lisinopril",
  "Dosage_3": "5mg",
  "Frequency_3": "Once daily",
  "Duration_3": "Ongoing",
  "Polypharmacy_Risk": "Medium",
  "Disease_Causes": "Influenza virus infection.",
  "Instructions_1": "Start within 48h of symptoms.",
  "Instructions_2": "For fever/pain relief.",
  "Instructions_3": "For kidney protection/BP.",
  "Personalized_Health_Tips": "Get annual flu shot, rest, fluids, avoid contact.",
  "Polypharmacy_Recommendation": "Review potential interactions, especially Oseltamivir."
}

--- Example Prediction 2 ---
{
  "Predicted_Disease": "Hypertension",
  "Medicine_1": "Lisinopril",
 

In [33]:
# Cell 11: Final Notes
print("\n--- Final Notes ---")
print(f"Model training and evaluation complete using '{file_path}'.")
print("High accuracy achieved is expected due to the SYNTHETIC nature of the data.")
print("The model learned the patterns deliberately embedded during generation.")
print("REMINDER: This model and dataset are for development/pipeline testing ONLY, NOT for real medical use.")
print(f"Saved artifacts ({pipeline_file}, {encoders_file}, {mappings_file}) are ready for use in deployment.")


--- Final Notes ---
Model training and evaluation complete using '/content/Synthetic_Medical_Dataset_v2.csv'.
High accuracy achieved is expected due to the SYNTHETIC nature of the data.
The model learned the patterns deliberately embedded during generation.
REMINDER: This model and dataset are for development/pipeline testing ONLY, NOT for real medical use.
Saved artifacts (synthetic_v2_pipeline.joblib, synthetic_v2_target_encoders.joblib, synthetic_v2_disease_mappings.joblib) are ready for use in deployment.


In [34]:
# Cell 11.5: Specialist Models for Problem Classes

print("\n--- Training Specialist Models for Problem Classes ---")

# Identify problem classes
problem_classes = ['GERD', 'Migraine', 'Osteoarthritis', 'Common Cold']
problem_target = 'Predicted_Disease'

# Create binary target for each problem class
problem_targets = {}
for problem_class in problem_classes:
    # Get class index
    class_idx = list(target_encoders[problem_target].classes_).index(problem_class)
    # Create binary target (1 for this class, 0 for others)
    problem_targets[problem_class] = (y_train_encoded_df[problem_target] == class_idx).astype(int)
    print(f"Created binary target for {problem_class} with {problem_targets[problem_class].sum()} positive examples")

# For each problem class, train a specialized binary classifier
specialist_models = {}
for problem_class in problem_classes:
    print(f"\nTraining specialist model for {problem_class}...")

    # Create massive weight for the positive class (1:100 ratio)
    class_weight = {0: 1, 1: 100}

    # Train a specialized binary classifier
    model = lgb.LGBMClassifier(
        n_estimators=2000,
        learning_rate=0.01,
        max_depth=15,
        num_leaves=127,
        class_weight=class_weight,
        boosting_type='dart',
        subsample=0.8,
        colsample_bytree=0.8,
        reg_alpha=0.1,
        reg_lambda=0.1,
        random_state=42,
        n_jobs=-1
    )

    # Train the model
    model.fit(X_processed, problem_targets[problem_class])

    # Store the model
    specialist_models[problem_class] = model

    # Check training accuracy
    train_preds = model.predict(X_processed)
    train_acc = accuracy_score(problem_targets[problem_class], train_preds)
    print(f"Training accuracy for {problem_class}: {train_acc:.4f}")

# Add these models to our pipeline for potential ensemble use during prediction
pipeline['specialist_models'] = specialist_models

# Modify prediction function to use specialized models when appropriate
original_predict = pipeline['predict']

def ensemble_predict(X):
    # Get base predictions
    base_preds = original_predict(X)

    # Process X
    X_processed = pipeline['preprocessor'].transform(X)

    # For each instance, check if any specialist model gives high confidence
    for i in range(len(X)):
        for problem_class in problem_classes:
            # Get class index
            class_idx = list(target_encoders[problem_target].classes_).index(problem_class)

            # Get specialist model prediction probability
            specialist_prob = specialist_models[problem_class].predict_proba(X_processed[i:i+1])[0][1]

            # If specialist is very confident (>0.85), override the prediction
            if specialist_prob > 0.85:
                base_preds[i, target_columns.index(problem_target)] = class_idx

    return base_preds

# Update pipeline with ensemble prediction
pipeline['predict'] = ensemble_predict
print("Specialist models integrated into prediction pipeline")


--- Training Specialist Models for Problem Classes ---
Created binary target for GERD with 963 positive examples
Created binary target for Migraine with 865 positive examples
Created binary target for Osteoarthritis with 918 positive examples
Created binary target for Common Cold with 913 positive examples

Training specialist model for GERD...
Training accuracy for GERD: 0.9217

Training specialist model for Migraine...
Training accuracy for Migraine: 0.9307

Training specialist model for Osteoarthritis...
Training accuracy for Osteoarthritis: 0.9338

Training specialist model for Common Cold...
Training accuracy for Common Cold: 0.9680
Specialist models integrated into prediction pipeline
