In [9]:
import pandas as pd
# Load the generated COPD dataset; the relative path is resolved against the notebook's working directory.
df = pd.read_csv('../datasets/datasets/apollo_hospitals/copd_generated_data.csv')

# Print the first rows as plain text for a quick look at the columns.
print(df.head())


  LOCATION                  DOA                  DOD   City_type  \
0        A  2020-11-25 10:57:00  2021-01-08 09:44:04  Semi Urban   
1        A  2020-11-28 12:13:00  2021-01-06 15:24:37       Rural   
2        B  2020-12-01 12:55:00  2021-01-03 08:40:45       Urban   
3        A  2020-12-03 10:05:00  2021-01-13 18:36:03       Rural   
4        A  2020-12-06 18:36:00  2021-01-08 02:26:35       Urban   

                                       DIAGNOSISNAME           SPECIALITY  \
0  POORLY CONTROLLED ASTHMA\nRHEUMATOID ARTHRITIS...  INFECTIOUS DISEASES   
1  LEFT LL BRONCHOPNEUMONIA\nBRONCHOMALACIA\nEVEN...    INTERNAL MEDICINE   
2  CA PENISON SPCSEPSIS AKI ON CKD TYPE II DM COP...          PULMONOLOGY   
3  BRONCHIAL ASTHMA\nTYPE II DIABETIC MELLITUS\nL...    INTERNAL MEDICINE   
4  LARGE RECTAL POLYP  RECTAL ADENOCARCINOMA   pT...           CARDIOLOGY   

  CASESPLIT  AGE  GENDER  PATIENT_STATUS  ...      NO    NO2      CO    SO2  \
0   MEDICAL   55       1               1  ...    

In [10]:
# Climate-Health Regression Model with PERCENTAGE PREDICTIONS
# Predicting health outcomes (stroke, cardiac arrest, respiratory failure) from multiple variables

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import seaborn as sns

# Set up plotting
# NOTE(review): the style is applied before the palette; presumably reversing the
# order would let the style call reset the palette -- confirm before reordering.
plt.style.use('default')
sns.set_palette("husl")


In [11]:
# Step 1: Load the COPD extract, derive the outcome flag, and keep the modelling columns
apollo_data_path = '../datasets/datasets/apollo_hospitals/copd_generated_data.csv'
apollo_df = pd.read_csv(apollo_data_path)

# Y is 1 when at least one of the three events is recorded for the admission, else 0.
apollo_df['Y_combined_outcome'] = (
    apollo_df[['Stroke', 'Cardiac_Arrest', 'Respiratory_Failure']]
    .eq(1)
    .any(axis=1)
    .astype(int)
)

# Demographics and exposure first, then comorbidity flags, then the target.
feature_columns = (
    ['AGE', 'GENDER', 'AQI', 'max_temperature']
    + ['Diabetes', 'Hypertension', 'Chronic_Kidney',
       'Liver_Disease', 'COPD', 'Heart_Disease']
    + ['Y_combined_outcome']
)

# Work on an independent copy and discard rows with any missing modelling value.
model_data = apollo_df.loc[:, feature_columns].copy().dropna()

print("Model data shape:", model_data.shape)
print(f"Outcome prevalence: {model_data['Y_combined_outcome'].mean()*100:.2f}%")


Model data shape: (70383, 11)
Outcome prevalence: 84.58%


In [12]:
# Step 2: Fit the ten-feature logistic model and score the hold-out set as percentages
feature_names = [
    'AGE',
    'GENDER',
    'AQI',
    'max_temperature',
    'Diabetes',
    'Hypertension',
    'Chronic_Kidney',
    'Liver_Disease',
    'COPD',
    'Heart_Disease',
]

X = model_data.loc[:, feature_names]
y = model_data.loc[:, 'Y_combined_outcome']

# 70/30 split, stratified so both parts keep the outcome prevalence.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

# Standardize using statistics learned from the training part only.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit the classifier on the standardized training features.
logistic_model = LogisticRegression(random_state=42, max_iter=1000).fit(X_train_scaled, y_train)

# Column 1 of predict_proba is the probability of the adverse outcome (class 1).
y_pred_test_proba = logistic_model.predict_proba(X_test_scaled)[:, 1]
y_pred_test_percent = 100 * y_pred_test_proba

# Hard 0/1 labels are only needed for the accuracy figure.
y_pred_test = logistic_model.predict(X_test_scaled)
test_accuracy = accuracy_score(y_test, y_pred_test)
test_auc = roc_auc_score(y_test, y_pred_test_proba)

print("Model training completed!")
print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Test AUC: {test_auc:.4f}")


Model training completed!
Test Accuracy: 0.8458
Test AUC: 0.5321


In [13]:
import statsmodels.api as sm
from statsmodels.formula.api import logit

print("=" * 80)
print("           MODEL ANALYSIS WITH STATISTICAL SIGNIFICANCE")
print("=" * 80)

# Prepare data for statsmodels. The columns are used unscaled, so every
# coefficient below is the change in log-odds per one-unit change of that column.
model_data_stats = model_data.copy()

# Create formula for logistic regression
feature_formula = ' + '.join(feature_names)
formula = f'Y_combined_outcome ~ {feature_formula}'

print(f"Model Formula: {formula}")
print("\nFitting logistic regression model...")

# Fit logistic regression using statsmodels
logit_model = logit(formula, data=model_data_stats).fit()

# Print the complete statistical summary
print("\n" + "=" * 80)
print("COMPLETE STATISTICAL MODEL SUMMARY:")
print("=" * 80)
print(logit_model.summary())

# Extract significant coefficients only (p < 0.05)
print("\n" + "=" * 80)
print("STATISTICALLY SIGNIFICANT FACTORS ONLY (p < 0.05):")
print("=" * 80)

# Coefficient table: one row per term, including the intercept.
results_table = logit_model.summary2().tables[1]
significant_factors = results_table[results_table['P>|z|'] < 0.05]

# The intercept is not a risk factor. Counting it would report a "significant
# factor" even when no predictor passes the threshold, so it is excluded here.
significant_predictors = significant_factors.drop(index='Intercept', errors='ignore')

if len(significant_predictors) > 0:
    print(f"\nFound {len(significant_predictors)} significant factors:")
    print(significant_predictors)

    print(f"\nCLINICAL INTERPRETATION:")
    print("-" * 50)
    for var, row in significant_predictors.iterrows():
        coef = row['Coef.']
        pval = row['P>|z|']
        odds_ratio = np.exp(coef)

        # exp(coef) is an odds ratio: it scales the odds, not the probability,
        # so the percentage is reported as a change in odds.
        if coef > 0:
            effect = "increases"
            odds_change = (odds_ratio - 1) * 100
        else:
            effect = "decreases"
            odds_change = (1 - odds_ratio) * 100

        print(f"• {var}: each one-unit increase {effect} the odds by {odds_change:.1f}% "
              f"(OR={odds_ratio:.3f}, p={pval:.4f})")
else:
    print("⚠️  No factors are statistically significant at p < 0.05")

print("=" * 80)


           MODEL ANALYSIS WITH STATISTICAL SIGNIFICANCE
Model Formula: Y_combined_outcome ~ AGE + GENDER + AQI + max_temperature + Diabetes + Hypertension + Chronic_Kidney + Liver_Disease + COPD + Heart_Disease

Fitting logistic regression model...
Optimization terminated successfully.
         Current function value: 0.428850
         Iterations 6

COMPLETE STATISTICAL MODEL SUMMARY:
                           Logit Regression Results                           
Dep. Variable:     Y_combined_outcome   No. Observations:                70383
Model:                          Logit   Df Residuals:                    70372
Method:                           MLE   Df Model:                           10
Date:                Sat, 20 Sep 2025   Pseudo R-squ.:                0.002498
Time:                        23:48:27   Log-Likelihood:                -30184.
converged:                       True   LL-Null:                       -30259.
Covariance Type:            nonrobust   LLR p-value:       

In [15]:
# RETRAIN MODEL WITH SIGNIFICANT FEATURES AND SAVE
import pickle

# Predictors kept for the final model, and the matching slice of the modelling frame
significant_features = ['AGE', 'AQI', 'Diabetes', 'Hypertension', 'Heart_Disease']
X_filtered = model_data.loc[:, significant_features]
y = model_data.loc[:, 'Y_combined_outcome']

# Same 70/30 stratified split settings as the full model; scaler fitted on the training part only
X_train, X_test, y_train, y_test = train_test_split(
    X_filtered,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)
scaler_final = StandardScaler().fit(X_train)
X_train_scaled = scaler_final.transform(X_train)
X_test_scaled = scaler_final.transform(X_test)

# Fit the reduced model
final_model = LogisticRegression(random_state=42, max_iter=1000).fit(X_train_scaled, y_train)

# Hold-out metrics: accuracy from hard labels, AUC from class-1 probabilities
y_pred = final_model.predict(X_test_scaled)
y_pred_proba = final_model.predict_proba(X_test_scaled)[:, 1]
accuracy = accuracy_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred_proba)

# Persist the model together with its scaler and the feature order it expects
model_data_to_save = {
    'model': final_model,
    'scaler': scaler_final,
    'features': significant_features,
}
with open('climate_health_model.pkl', 'wb') as f:
    pickle.dump(model_data_to_save, f)

# Report the metrics, then one line per coefficient with its odds ratio
print(f"Accuracy: {accuracy:.4f} | AUC: {auc:.4f} | Features: {len(significant_features)}")
for feature, coef in zip(significant_features, final_model.coef_.ravel()):
    print(f"{feature}: {coef:.6f} (OR: {np.exp(coef):.6f})")

Accuracy: 0.8458 | AUC: 0.5359 | Features: 5
AGE: 0.053708 (OR: 1.055176)
AQI: 0.018208 (OR: 1.018375)
Diabetes: 0.032392 (OR: 1.032923)
Hypertension: 0.090924 (OR: 1.095185)
Heart_Disease: 0.038333 (OR: 1.039077)


In [None]:
# Function to predict risk percentage for new patients
def predict_risk_percentage(age, aqi, diabetes, hypertension,
                            heart_disease, gender=None, max_temp=None,
                            chronic_kidney=None, liver_disease=None, copd=None,
                            *, model=None, scaler=None):
    """
    Predict the adverse-outcome risk, as a percentage, for one new patient.

    The prediction uses the five-feature final model, so only the first five
    arguments influence the result. They are passed to the scaler in the order
    AGE, AQI, Diabetes, Hypertension, Heart_Disease, which is the order of
    `significant_features` used to fit `scaler_final` and `final_model`.

    Parameters:
    - age: patient age
    - aqi: air quality index
    - diabetes: 0 for no, 1 for yes
    - hypertension: 0 for no, 1 for yes
    - heart_disease: 0 for no, 1 for yes
    - gender, max_temp, chronic_kidney, liver_disease, copd: accepted only so
      that calls written for the earlier ten-feature version keep working;
      the final model was not trained on them, so they are ignored
    - model: fitted classifier exposing predict_proba; defaults to the
      notebook-level `final_model`
    - scaler: fitted transformer exposing transform; defaults to the
      notebook-level `scaler_final`

    Returns:
    - risk_percentage: predicted risk as percentage (0-100%)
    """
    # Resolve the defaults at call time so the function picks up whatever
    # final model the notebook most recently trained.
    if model is None:
        model = final_model
    if scaler is None:
        scaler = scaler_final

    # One-row frame whose columns carry the same names and order as the
    # training data handed to the scaler.
    input_data = pd.DataFrame(
        [[age, aqi, diabetes, hypertension, heart_disease]],
        columns=['AGE', 'AQI', 'Diabetes', 'Hypertension', 'Heart_Disease'],
    )

    # Scale the input
    input_scaled = scaler.transform(input_data)

    # Column 1 of predict_proba is the probability of the adverse outcome.
    risk_probability = np.asarray(model.predict_proba(input_scaled))[0, 1]
    risk_percentage = float(risk_probability) * 100

    return risk_percentage

# Example predictions
# Each call passes exactly the five predictors named in the function signature.
print("EXAMPLE RISK PREDICTIONS (as percentages):")
print("=" * 60)

# Example 1: Young healthy person, good air quality
risk1 = predict_risk_percentage(age=25, aqi=50, diabetes=0,
                                hypertension=0, heart_disease=0)
print(f"Young healthy person, good air quality: {risk1:.2f}%")

# Example 2: Elderly person with multiple conditions, poor air quality
risk2 = predict_risk_percentage(age=75, aqi=200, diabetes=1,
                                hypertension=1, heart_disease=1)
print(f"Elderly person with multiple conditions, poor air quality: {risk2:.2f}%")

# Example 3: Middle-aged person with moderate risk factors
risk3 = predict_risk_percentage(age=50, aqi=100, diabetes=1,
                                hypertension=0, heart_disease=0)
print(f"Middle-aged person with diabetes, moderate air quality: {risk3:.2f}%")

print("=" * 60)
print("MODEL NOW OUTPUTS PERCENTAGE RISK INSTEAD OF BINARY 0/1!")
print("=" * 60)


EXAMPLE RISK PREDICTIONS (as percentages):
Young healthy person, good air quality: 80.76%
Elderly person with multiple conditions, poor air quality: 88.38%
Middle-aged person with diabetes, moderate air quality: 83.77%
MODEL NOW OUTPUTS PERCENTAGE RISK INSTEAD OF BINARY 0/1!




In [1]:
import pandas as pd
# Load the generated COPD dataset; the relative path is resolved against the notebook's working directory.
df = pd.read_csv('../datasets/datasets/apollo_hospitals/copd_generated_data.csv')

# Print the first rows as plain text for a quick look at the columns.
print(df.head())


  LOCATION                  DOA                  DOD   City type  \
0        A  2020-11-25 10:57:00  2021-01-08 09:44:04  Semi Urban   
1        A  2020-11-28 12:13:00  2021-01-06 15:24:37       Rural   
2        B  2020-12-01 12:55:00  2021-01-03 08:40:45       Urban   
3        A  2020-12-03 10:05:00  2021-01-13 18:36:03       Rural   
4        A  2020-12-06 18:36:00  2021-01-08 02:26:35       Urban   

                                       DIAGNOSISNAME           SPECIALITY  \
0  POORLY CONTROLLED ASTHMA\nRHEUMATOID ARTHRITIS...  INFECTIOUS DISEASES   
1  LEFT LL BRONCHOPNEUMONIA\nBRONCHOMALACIA\nEVEN...    INTERNAL MEDICINE   
2  CA PENISON SPCSEPSIS AKI ON CKD TYPE II DM COP...          PULMONOLOGY   
3  BRONCHIAL ASTHMA\nTYPE II DIABETIC MELLITUS\nL...    INTERNAL MEDICINE   
4  LARGE RECTAL POLYP  RECTAL ADENOCARCINOMA   pT...           CARDIOLOGY   

  CASESPLIT  AGE  GENDER  PATIENT_STATUS  ...      NO    NO2      CO    SO2  \
0   MEDICAL   55       1               1  ...    

In [2]:
# Climate-Health Regression Model
# Predicting health outcomes (stroke, cardiac arrest, respiratory failure) from age, gender, and AQI

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns

# Set up plotting
# NOTE(review): the style is applied before the palette; presumably reversing the
# order would let the style call reset the palette -- confirm before reordering.
plt.style.use('default')
sns.set_palette("husl")


ModuleNotFoundError: No module named 'sklearn'

In [3]:
# Step 1: Load the Apollo Hospitals COPD extract and take a first look at it
apollo_data_path = '../datasets/datasets/apollo_hospitals/copd_generated_data.csv'
apollo_df = pd.read_csv(apollo_data_path)

# Dimensions first, then the full column list, then a short preview.
print("Apollo Hospitals Data Shape:", apollo_df.shape)
print("\nColumn names:", apollo_df.columns.tolist(), sep="\n")
print("\nFirst few rows:", apollo_df.head(), sep="\n")


Apollo Hospitals Data Shape: (70383, 52)

Column names:
['LOCATION', 'DOA', 'DOD', 'City type', 'DIAGNOSISNAME', 'SPECIALITY', 'CASESPLIT', 'AGE', 'GENDER', 'PATIENT_STATUS', 'Unnamed: 10', 'Fever', 'Cough', 'Weakness', 'Diabetes', 'Hypertension', 'Chronic_Kidney', 'Liver_Disease', 'COPD', 'Heart_Disease', 'Unnamed: 20', 'AQI', 'Stroke', 'Cardiac_Arrest', 'Respiratory_Failure', 'Hospitalization', 'PM2.5', 'PM10', 'AST', 'CRP', 'Creatinine', 'Eosin', 'Ferritin', 'INR', 'LDH', 'Lymphocyte', 'RDW', 'LOS', 'WARD_TYPE', 'Unnamed: 39', 'AQI.1', 'Unnamed: 41', 'NO', 'NO2', 'CO', 'SO2', 'O3', 'mean temperature', 'max temperature', 'min temperature', 'humidity', 'Unnamed: 51']

First few rows:
  LOCATION                  DOA                  DOD   City type  \
0        A  2020-11-25 10:57:00  2021-01-08 09:44:04  Semi Urban   
1        A  2020-11-28 12:13:00  2021-01-06 15:24:37       Rural   
2        B  2020-12-01 12:55:00  2021-01-03 08:40:45       Urban   
3        A  2020-12-03 10:05:00  2

In [4]:
# Step 2: Inspect data quality before modelling
# Null counts and dtypes for every column
print("Missing values per column:", apollo_df.isna().sum(), sep="\n")

print("\nData types:", apollo_df.dtypes, sep="\n")

# Distinct values of gender and of the three outcome flags
print("\nUnique values in key columns:")
for label, column in (
    ("GENDER unique values:", 'GENDER'),
    ("Stroke unique values:", 'Stroke'),
    ("Cardiac_Arrest unique values:", 'Cardiac_Arrest'),
    ("Respiratory_Failure unique values:", 'Respiratory_Failure'),
):
    print(label, apollo_df[column].unique())


Missing values per column:
LOCATION                   0
DOA                        0
DOD                        0
City type                  0
DIAGNOSISNAME            278
SPECIALITY                 0
CASESPLIT                  0
AGE                        0
GENDER                     0
PATIENT_STATUS             0
Unnamed: 10            70383
Fever                      0
Cough                      0
Weakness                   0
Diabetes                   0
Hypertension               0
Chronic_Kidney             0
Liver_Disease              0
COPD                       0
Heart_Disease              0
Unnamed: 20            70383
AQI                        0
Stroke                     0
Cardiac_Arrest             0
Respiratory_Failure        0
Hospitalization            0
PM2.5                      0
PM10                       0
AST                        0
CRP                        0
Creatinine                 0
Eosin                      0
Ferritin                   0
INR             

In [5]:
# Step 3: Build the target Y from the three recorded events
# Y is 1 when stroke, cardiac arrest or respiratory failure is flagged, and 0 when none is.
apollo_df['Y_combined_outcome'] = (
    apollo_df[['Stroke', 'Cardiac_Arrest', 'Respiratory_Failure']]
    .eq(1)
    .any(axis=1)
    .astype(int)
)

print("Distribution of combined outcome (Y):")
print(apollo_df['Y_combined_outcome'].value_counts())
print(f"\nPercentage with adverse outcome: {apollo_df['Y_combined_outcome'].mean()*100:.2f}%")

# Counts for each component event on its own
print("\nIndividual condition distributions:")
for label, column in (
    ("Stroke:", 'Stroke'),
    ("Cardiac Arrest:", 'Cardiac_Arrest'),
    ("Respiratory Failure:", 'Respiratory_Failure'),
):
    print(label, apollo_df[column].value_counts())


Distribution of combined outcome (Y):
Y_combined_outcome
1    59530
0    10853
Name: count, dtype: int64

Percentage with adverse outcome: 84.58%

Individual condition distributions:
Stroke: Stroke
0    61931
1     8452
Name: count, dtype: int64
Cardiac Arrest: Cardiac_Arrest
0    67352
1     3031
Name: count, dtype: int64
Respiratory Failure: Respiratory_Failure
1    57707
0    12676
Name: count, dtype: int64


In [None]:
# Step 4: Assemble the modelling frame (demographics, exposure, comorbidities, target)
feature_columns = (
    ['AGE', 'GENDER', 'AQI', 'max temperature']
    + ['Diabetes', 'Hypertension', 'Chronic_Kidney',
       'Liver_Disease', 'COPD', 'Heart_Disease']
    + ['Y_combined_outcome']
)

# Independent copy of the selected columns, keeping complete rows only
model_data = apollo_df.loc[:, feature_columns].copy().dropna()

print("Model data shape after cleaning:", model_data.shape)
print("\nSummary statistics:", model_data.describe(), sep="\n")

# Ranges and means of the original three predictors
print("\nAge distribution:")
print(f"Age range: {model_data['AGE'].min()} - {model_data['AGE'].max()}")
print(f"Mean age: {model_data['AGE'].mean():.1f}")

print("\nGender distribution:", model_data['GENDER'].value_counts(), sep="\n")

print("\nAQI distribution:")
print(f"AQI range: {model_data['AQI'].min()} - {model_data['AQI'].max()}")
print(f"Mean AQI: {model_data['AQI'].mean():.1f}")

# Ranges and means of the added temperature predictor
print("\nMax Temperature distribution:")
print(f"Temperature range: {model_data['max temperature'].min()} - {model_data['max temperature'].max()}")
print(f"Mean max temperature: {model_data['max temperature'].mean():.1f}")

# Share of rows flagged with each comorbidity
print("\nMedical conditions prevalence:")
medical_conditions = [
    'Diabetes',
    'Hypertension',
    'Chronic_Kidney',
    'Liver_Disease',
    'COPD',
    'Heart_Disease',
]
for condition in medical_conditions:
    prevalence = 100 * model_data[condition].mean()
    print(f"{condition}: {prevalence:.1f}%")


Model data shape after cleaning: (70383, 4)

Summary statistics:
                AGE        GENDER           AQI  Y_combined_outcome
count  70383.000000  70383.000000  70383.000000        70383.000000
mean      64.004518      0.558189    192.271053            0.845801
std       16.442539      0.496606    196.542329            0.361142
min        2.000000      0.000000      0.000000            0.000000
25%       53.000000      0.000000      5.630000            1.000000
50%       65.000000      1.000000    102.260000            1.000000
75%       76.000000      1.000000    403.905000            1.000000
max      102.000000      1.000000    500.000000            1.000000

Age distribution:
Age range: 2 - 102
Mean age: 64.0

Gender distribution:
GENDER
1    39287
0    31096
Name: count, dtype: int64

AQI distribution:
AQI range: 0.0 - 500.0
Mean AQI: 192.3


In [7]:
# Step 5: Four-panel overview of the modelling data
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(15, 12))

# Top-left: age histograms, one series per outcome class
axes[0, 0].hist(
    [
        model_data.loc[model_data['Y_combined_outcome'] == 0, 'AGE'],
        model_data.loc[model_data['Y_combined_outcome'] == 1, 'AGE'],
    ],
    bins=30,
    alpha=0.7,
    label=['No adverse outcome', 'Adverse outcome'],
)
axes[0, 0].set(xlabel='Age', ylabel='Frequency', title='Age Distribution by Outcome')
axes[0, 0].legend()

# Top-right: gender counts split by outcome class
gender_outcome = pd.crosstab(index=model_data['GENDER'], columns=model_data['Y_combined_outcome'])
gender_outcome.plot.bar(ax=axes[0, 1])
axes[0, 1].set(
    xlabel='Gender (0=Female, 1=Male)',
    ylabel='Count',
    title='Gender Distribution by Outcome',
)
axes[0, 1].legend(['No adverse outcome', 'Adverse outcome'])

# Bottom-left: AQI histograms, one series per outcome class
axes[1, 0].hist(
    [
        model_data.loc[model_data['Y_combined_outcome'] == 0, 'AQI'],
        model_data.loc[model_data['Y_combined_outcome'] == 1, 'AQI'],
    ],
    bins=30,
    alpha=0.7,
    label=['No adverse outcome', 'Adverse outcome'],
)
axes[1, 0].set(xlabel='AQI', ylabel='Frequency', title='AQI Distribution by Outcome')
axes[1, 0].legend()

# Bottom-right: pairwise correlations of every modelling column
correlation_matrix = model_data.corr()
sns.heatmap(
    data=correlation_matrix,
    annot=True,
    cmap='coolwarm',
    center=0,
    ax=axes[1, 1],
)
axes[1, 1].set(title='Correlation Matrix')

plt.tight_layout()
plt.show()


NameError: name 'plt' is not defined

In [None]:
# Step 6: Split into train/test and standardize the predictors
# Predictors are every modelling column except the target.
feature_names = [
    'AGE',
    'GENDER',
    'AQI',
    'max temperature',
    'Diabetes',
    'Hypertension',
    'Chronic_Kidney',
    'Liver_Disease',
    'COPD',
    'Heart_Disease',
]

X = model_data.loc[:, feature_names]
y = model_data.loc[:, 'Y_combined_outcome']

# 70/30 split, stratified on the outcome
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

print("Training set shape:", X_train.shape)
print("Test set shape:", X_test.shape)
print(f"Number of features: {len(feature_names)}")
print("Features:", feature_names)
print("\nTraining set outcome distribution:", y_train.value_counts(normalize=True), sep="\n")
print("\nTest set outcome distribution:", y_test.value_counts(normalize=True), sep="\n")

# Standardize with statistics learned from the training part only
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("\nFeature scaling completed")


In [None]:
# Step 7: Fit the classifier
# The outcome is binary, so a logistic regression is fitted on the standardized features.
logistic_model = LogisticRegression(random_state=42, max_iter=1000).fit(X_train_scaled, y_train)

# Hard labels for both parts, plus class-1 probabilities for the test part
y_pred_train = logistic_model.predict(X_train_scaled)
y_pred_test = logistic_model.predict(X_test_scaled)
y_pred_proba_test = logistic_model.predict_proba(X_test_scaled)[:, 1]

print("Model training completed!")
print("\nModel coefficients:")
for feature, coef in zip(feature_names, logistic_model.coef_.ravel()):
    print(f"{feature}: {coef:.4f}")
print(f"Intercept: {logistic_model.intercept_[0]:.4f}")

# Exponentiating a coefficient turns it into an odds ratio
print("\nOdds Ratios (how much each variable increases odds of adverse outcome):")
for feature, coef in zip(feature_names, logistic_model.coef_.ravel()):
    odds_ratio = np.exp(coef)
    print(f"{feature}: {odds_ratio:.4f}")


In [None]:
# Step 8: Score the fitted model on both parts of the split
# Accuracy on the data the model was trained on
train_accuracy = accuracy_score(y_true=y_train, y_pred=y_pred_train)
print(f"Training Accuracy: {train_accuracy:.4f}")

# Accuracy on the held-out data
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_test)
print(f"Test Accuracy: {test_accuracy:.4f}")

# Per-class precision / recall / F1
print("\nClassification Report (Test Set):")
print(classification_report(y_true=y_test, y_pred=y_pred_test))

# Raw confusion counts
print("\nConfusion Matrix (Test Set):")
cm = confusion_matrix(y_true=y_test, y_pred=y_pred_test)
print(cm)

# The same counts as an annotated heatmap
plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=['No Adverse Outcome', 'Adverse Outcome'],
    yticklabels=['No Adverse Outcome', 'Adverse Outcome'],
)
plt.title('Confusion Matrix')
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()


In [None]:
# Step 9: Rank predictors by coefficient magnitude and plot them
# Start from the raw coefficients, derive magnitude and odds ratio, then sort by magnitude.
feature_importance = (
    pd.DataFrame({
        'Feature': feature_names,
        'Coefficient': logistic_model.coef_[0],
    })
    .assign(
        Abs_Coefficient=lambda frame: np.abs(frame['Coefficient']),
        Odds_Ratio=lambda frame: np.exp(frame['Coefficient']),
    )
    .sort_values('Abs_Coefficient', ascending=False)
)

print("Feature Importance (sorted by absolute coefficient value):")
print(feature_importance)

# Left panel: signed coefficients. Right panel: odds ratios with the no-effect line at 1.
plt.figure(figsize=(10, 6))

plt.subplot(1, 2, 1)
plt.bar(feature_importance['Feature'], feature_importance['Coefficient'])
plt.title('Feature Coefficients')
plt.ylabel('Coefficient Value')
plt.xticks(rotation=45)

plt.subplot(1, 2, 2)
plt.bar(feature_importance['Feature'], feature_importance['Odds_Ratio'])
plt.title('Odds Ratios')
plt.ylabel('Odds Ratio')
plt.xticks(rotation=45)
plt.axhline(y=1, color='red', linestyle='--', alpha=0.7, label='No effect')
plt.legend()

plt.tight_layout()
plt.show()


In [None]:
# Step 10: Model summary and insights
print("=" * 60)
print("CLIMATE-HEALTH REGRESSION MODEL SUMMARY")
print("=" * 60)

print(f"\nDataset Information:")
print(f"- Total samples: {len(model_data):,}")
# List the predictors that were actually fitted rather than a fixed trio.
print(f"- Features: {', '.join(feature_names)}")
print(f"- Target: Combined outcome (Stroke OR Cardiac Arrest OR Respiratory Failure)")
print(f"- Prevalence of adverse outcomes: {model_data['Y_combined_outcome'].mean()*100:.2f}%")

print(f"\nModel Performance:")
print(f"- Training Accuracy: {train_accuracy:.3f}")
print(f"- Test Accuracy: {test_accuracy:.3f}")

# The model was fitted on standardized features, so each coefficient is a change
# in log-odds per one standard deviation of that feature, not a change in risk.
print(f"\nKey Findings:")
for idx, row in feature_importance.iterrows():
    feature = row['Feature']
    coef = row['Coefficient']
    odds_ratio = row['Odds_Ratio']

    if coef > 0:
        direction = "increases"
    else:
        direction = "decreases"

    print(f"- {feature}: {direction} the log-odds by {abs(coef):.3f} per standard deviation "
          f"(odds ratio: {odds_ratio:.3f})")

# Build the equation from every fitted term so it stays in step with feature_names.
print(f"\nModel Equation (log-odds, standardized features):")
equation_terms = " + ".join(
    f"{term_coef:.3f}*{term_name}"
    for term_name, term_coef in zip(feature_names, logistic_model.coef_[0])
)
print(f"log(odds) = {logistic_model.intercept_[0]:.3f} + {equation_terms}")

# Look coefficients up by feature name; positional indexing silently reads the
# wrong term if the order of feature_names ever changes.
coef_by_feature = dict(zip(feature_names, logistic_model.coef_[0]))

print(f"\nInterpretation:")
if coef_by_feature['AGE'] > 0:
    print("- Older age is associated with higher risk of adverse health outcomes")
else:
    print("- Older age is associated with lower risk of adverse health outcomes")

if coef_by_feature['GENDER'] > 0:
    print("- Male gender is associated with higher risk")
else:
    print("- Female gender is associated with higher risk")

if coef_by_feature['AQI'] > 0:
    print("- Higher AQI (worse air quality) is associated with higher risk")
else:
    print("- Higher AQI (worse air quality) is associated with lower risk")


In [None]:
# FINAL MODEL SCORES TABLE - coefficients, odds ratios and headline metrics in one place
print("=" * 80)
print("                    FINAL MODEL SCORES AND COEFFICIENTS")
print("=" * 80)

# One row per predictor: derived columns are added in a single chain, then rows
# are ordered by coefficient magnitude.
results_df = (
    pd.DataFrame({
        'Variable': feature_names,
        'Coefficient': logistic_model.coef_[0],
    })
    .assign(
        Odds_Ratio=lambda frame: np.exp(frame['Coefficient']),
        Abs_Coefficient=lambda frame: np.abs(frame['Coefficient']),
        Effect_Direction=lambda frame: np.where(
            frame['Coefficient'] > 0, 'Increases Risk', 'Decreases Risk'
        ),
        Risk_Change_Percent=lambda frame: frame['Odds_Ratio'].sub(1).mul(100).round(2),
    )
    .sort_values('Abs_Coefficient', ascending=False)
)

print("\nFINAL MODEL COEFFICIENTS AND ODDS RATIOS:")
print("-" * 80)
print(f"{'Variable':<20} {'Coefficient':<12} {'Odds Ratio':<12} {'Risk Change':<15} {'Effect':<15}")
print("-" * 80)

# Fixed-width row per predictor, matching the header columns above
for _, row in results_df.iterrows():
    print(f"{row['Variable']:<20} {row['Coefficient']:<12.4f} {row['Odds_Ratio']:<12.4f} "
          f"{row['Risk_Change_Percent']:>6.1f}%{'':<7} {row['Effect_Direction']:<15}")

print("-" * 80)
print(f"{'Intercept':<20} {logistic_model.intercept_[0]:<12.4f}")
print("=" * 80)

# Headline metrics
print(f"\nMODEL PERFORMANCE SUMMARY:")
print(f"Training Accuracy: {train_accuracy:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Number of Features: {len(feature_names)}")
print(f"Sample Size: {len(model_data):,}")

# The three predictors with the largest coefficient magnitude
print(f"\nTOP 3 MOST IMPORTANT FEATURES (by absolute coefficient):")
top_features = results_df.iloc[:3]
for i, (_, row) in enumerate(top_features.iterrows(), start=1):
    print(f"{i}. {row['Variable']}: {row['Effect_Direction']} (OR: {row['Odds_Ratio']:.3f})")

print("=" * 80)


In [None]:
columns = 

'/Users/nikkihu/Documents/Github/climate-hackathon-2025/data-exploration-jiwon'