# Deepsurv


In [1]:
!pip install -U sentence-transformers > /dev/null 2>&1
!pip install xgboost > /dev/null 2>&1
!pip install scikit-learn==1.4.2 scikit-survival==0.23.1 > /dev/null 2>&1
!pip install torchtuples > /dev/null 2>&1
!pip install pycox > /dev/null 2>&1
!pip install numpy==1.21.5  > /dev/null 2>&1
!pip install interpret-core  > /dev/null 2>&1
!pip install lightgbm > /dev/null 2>&1
!pip install shap > /dev/null 2>&1
!pip install lifelines pycox > /dev/null 2>&1
!pip install pycountry > /dev/null 2>&1
!pip install -U sentence-transformers xgboost scikit-learn==1.4.2 scikit-survival==0.23.1 torchtuples pycox numpy==1.21.5 interpret-core lightgbm shap lifelines pycox pycountry > /dev/null 2>&1


In [6]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import torchtuples as tt
import kagglehub
import os

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from lifelines import CoxPHFitter
from sksurv.metrics import concordance_index_censored
from pycox.models.cox import CoxPH
from pycox.evaluation import EvalSurv
import torch
import torch.nn as nn
from torch.optim import Adam


In [7]:
# Life Expectancy dataset
life_exp_path = kagglehub.dataset_download("kumarajarshi/life-expectancy-who")
life_exp_file = os.path.join(life_exp_path, "Life Expectancy Data.csv")
life_exp_df = pd.read_csv(life_exp_file)
print("Life Expectancy Sample:")
print(life_exp_df.head())

# Heart Failure dataset (not used in LightGBM, but kept for context)
heart_path = kagglehub.dataset_download("fedesoriano/heart-failure-prediction")
heart_file = os.path.join(heart_path, "heart.csv")
heart_df = pd.read_csv(heart_file)
print("Heart Failure Sample:")
print(heart_df.head())

# Age Dataset
age_path = kagglehub.dataset_download("imoore/age-dataset")
age_file = os.path.join(age_path, "AgeDataset-V1.csv")
age_df = pd.read_csv(age_file)
print("Age Dataset Sample:")
print(age_df.head())

# World important events Dataset
events_path = kagglehub.dataset_download("saketk511/world-important-events-ancient-to-modern")
events_file = os.path.join(events_path, "World Important Dates.csv")
events_df = pd.read_csv(events_file)
print("World Important Events Sample:")
print(events_df.head())

# Plane Crash Dataset
plane_crash_path = kagglehub.dataset_download("nguyenhoc/plane-crash")
plane_crash_file = os.path.join(plane_crash_path, "planecrashinfo_20181121001952.csv")  
planes_df = pd.read_csv(plane_crash_file)
print("Historical Plane Crashes Sample:")
print(planes_df.head())

# Gloabl Life Expectancy dataset
global_le_path = kagglehub.dataset_download("hasibalmuzdadid/global-life-expectancy-historical-dataset")
global_le_file = os.path.join(global_le_path, "global life expectancy dataset.csv")
global_le_df = pd.read_csv(global_le_file)
print("Global Life Expectancy Historical Dataset Sample:")
print(global_le_df.head())

# US death rate Dataset
death_rates_path = kagglehub.dataset_download("melissamonfared/death-rates-united-states")
death_rates_file = os.path.join(death_rates_path, "Death_rates.csv")
death_rates_df = pd.read_csv(death_rates_file)
print("Death Rates United States Dataset Sample:")
print(death_rates_df.head())

Life Expectancy Sample:
       Country  Year      Status  Life expectancy   Adult Mortality  \
0  Afghanistan  2015  Developing              65.0            263.0   
1  Afghanistan  2014  Developing              59.9            271.0   
2  Afghanistan  2013  Developing              59.9            268.0   
3  Afghanistan  2012  Developing              59.5            272.0   
4  Afghanistan  2011  Developing              59.2            275.0   

   infant deaths  Alcohol  percentage expenditure  Hepatitis B  Measles   ...  \
0             62     0.01               71.279624         65.0      1154  ...   
1             64     0.01               73.523582         62.0       492  ...   
2             66     0.01               73.219243         64.0       430  ...   
3             69     0.01               78.184215         67.0      2787  ...   
4             71     0.01                7.097109         68.0      3013  ...   

   Polio  Total expenditure  Diphtheria    HIV/AIDS         GD

In [8]:
def enhanced_feature_engineering(df, life_exp_df, global_le_df, death_rates_df):
    """
    Feature engineering for survival analysis with proper censoring handling
    """
    # -------- Validate Input Columns --------
    required_columns = {
        'Country', 'Gender', 'Occupation', 'Birth year',
        'Death year', 'Age of death'
    }
    missing = required_columns - set(df.columns)
    if missing:
        raise KeyError(f"Missing required columns: {missing}")

    # -------- Set Observation Year --------
    current_year = 2019  # Single source of truth for current year
    
    # -------- Basic Cleaning --------
    df['Country'] = df['Country'].str.split(';').str[0].str.strip()
    df['Gender'] = np.where(df['Gender'] == 'Male', 1, 
                          np.where(df['Gender'] == 'Female', 0, 0.5))

    # -------- Clinical Features --------
    stress_map = {'Politician': 9, 'Military personnel': 8, 'Journalist': 7,
                  'Businessperson': 6, 'Artist': 5, 'Teacher': 4, 
                  'Researcher': 3, 'Other': 5, 'Unknown': 5}
    df['stress_score'] = df['Occupation'].map(stress_map).fillna(5).astype('float32') / 9.0

    life_exp_df[' BMI '] = pd.to_numeric(life_exp_df[' BMI '], errors='coerce')
    country_bmi = life_exp_df.groupby('Country')[' BMI '].median().to_dict()
    df['avg_bmi'] = df['Country'].map(country_bmi).fillna(25).astype('float32')

    df['smoking_prev'] = (1 / (1 + np.exp((df['Birth year'] - 1950) / 10))).astype('float32')
    df['smoking_prev'] = np.clip(df['smoking_prev'], 0.1, 0.6)

    # -------- Country-Level Features --------
    global_le_df.columns = global_le_df.columns.str.strip()
    global_le_melted = global_le_df.melt(
        id_vars=['Country Name', 'Country Code'],
        value_vars=[str(y) for y in range(1960, current_year+1)],
        var_name='Year',
        value_name='Life_Exp_Value'
    )
    
    global_le_agg = (
        global_le_melted
        .sort_values(['Country Name', 'Year'], ascending=[True, False])
        .groupby('Country Name')
        ['Life_Exp_Value']
        .first()
        .reset_index()
        .rename(columns={'Country Name': 'Country'})
    )
    
    df = df.merge(global_le_agg, on='Country', how='left')
    df['global_life_exp'] = df['Life_Exp_Value'].fillna(df['Life_Exp_Value'].median())

    # -------- Survival Data Setup --------
    df['censored'] = (df['Death year'] > current_year).astype(int)
    df['T'] = np.where(
        df['censored'] == 1,
        current_year - df['Birth year'],
        df['Age of death']
    ).clip(0, 120)

    return df


In [9]:
class DeepSurv(nn.Module):
    def __init__(self, input_dim):
        super(DeepSurv, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )

    def forward(self, x):
        return self.model(x)


In [10]:
def train_deepsurv_model(df):
    """
    Train DeepSurv Model for survival analysis
    """
    features = ['stress_score', 'avg_bmi', 'smoking_prev', 'global_life_exp']
    X = df[features].fillna(0).values
    T = df['T'].values
    E = df['censored'].values

    # Normalize Features
    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    # Convert to Tensors
    X_train, X_test, T_train, T_test, E_train, E_test = train_test_split(X, T, E, test_size=0.2, random_state=42)
    X_train = torch.tensor(X_train, dtype=torch.float32)
    X_test = torch.tensor(X_test, dtype=torch.float32)
    T_train = torch.tensor(T_train, dtype=torch.float32)
    T_test = torch.tensor(T_test, dtype=torch.float32)
    E_train = torch.tensor(E_train, dtype=torch.float32)
    E_test = torch.tensor(E_test, dtype=torch.float32)

    # Model Initialization
    input_dim = X_train.shape[1]
    model = DeepSurv(input_dim)
    optimizer = Adam(model.parameters(), lr=0.001)
    loss_fn = nn.MSELoss()

    # Training Loop
    epochs = 50
    for epoch in range(epochs):
        model.train()
        optimizer.zero_grad()
        predictions = model(X_train).squeeze()
        loss = loss_fn(predictions, T_train)
        loss.backward()
        optimizer.step()

        if epoch % 10 == 0:
            print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}")

    return model, scaler


In [11]:
def evaluate_deepsurv_model(model, scaler, df):
    """
    Evaluate DeepSurv Model Performance
    """
    features = ['stress_score', 'avg_bmi', 'smoking_prev', 'global_life_exp']
    X_test = df[features].fillna(0).values
    X_test = scaler.transform(X_test)
    X_test = torch.tensor(X_test, dtype=torch.float32)

    model.eval()
    predictions = model(X_test).detach().numpy().flatten()

    print("\n📊 DeepSurv Model Predictions:")
    print(predictions[:10])


In [12]:
# 1️⃣ Process the Dataset
processed_batch = enhanced_feature_engineering(age_df, life_exp_df, global_le_df, death_rates_df)

# 2️⃣ Train DeepSurv
print("\n🚀 Training DeepSurv Model...")
deepsurv_model, scaler = train_deepsurv_model(processed_batch)

# 3️⃣ Evaluate Model
evaluate_deepsurv_model(deepsurv_model, scaler, processed_batch)



🚀 Training DeepSurv Model...
Epoch 1/50, Loss: nan
Epoch 11/50, Loss: nan
Epoch 21/50, Loss: nan
Epoch 31/50, Loss: nan
Epoch 41/50, Loss: nan

📊 DeepSurv Model Predictions:
[nan nan nan nan nan nan nan nan nan nan]
