In [1]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from lifelines import CoxPHFitter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Load data (path is relative to the notebook's working directory)
data = pd.read_csv('synthetic_data.csv')

# Inspect data
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called bare; wrapping it in print() would add a stray "None" line.
data.info()
print(data.describe())
print(data.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57592 entries, 0 to 57591
Data columns (total 29 columns):
 #   Column                           Non-Null Count  Dtype 
---  ------                           --------------  ----- 
 0   Patient ID                       57592 non-null  object
 1   Date of birth                    57592 non-null  object
 2   Gender                           57592 non-null  object
 3   Date confirmed HIV positive      57592 non-null  object
 4   Previous ART exposure            57592 non-null  object
 5   Current ART regimen              57592 non-null  object
 6   Start date of current ART        57592 non-null  object
 7   Date of first visit              57592 non-null  object
 8   Date of most recent visit        57592 non-null  object
 9   CD4 count at first visit         57592 non-null  int64 
 10  Viral load at first visit        57592 non-null  int64 
 11  CD4 count at most recent visit   57592 non-null  int64 
 12  Viral load at most recent visit 

In [3]:
# Calculate risk score
def calculate_risk_score(df):
    """Add an additive 'Risk Score' column to ``df`` and return ``df``.

    The score of a row is the sum of six component scores, each between 0 and 5:

    * age at first visit: one point per decade starting at 30, capped at 5 for 70+;
    * CD4 count at most recent visit: >= 500 -> 0, >= 350 -> 1, >= 200 -> 3, else 5;
    * viral load at most recent visit: < 20 -> 0, <= 1000 -> 2, else 5;
    * adherence to ART: looked up by label, unknown labels score 0;
    * comorbidities: comma-separated list, 2 points per condition ('None' is 0),
      and any total above 2 (i.e. more than one condition) is raised to 5;
    * substance use history: looked up by label, unknown/missing scores 0.

    Args:
        df (pandas.DataFrame): Must contain the columns 'Age at first visit',
            'CD4 count at most recent visit', 'Viral load at most recent visit',
            'Adherence to ART', 'Comorbidities' and 'Substance use history'.

    Returns:
        pandas.DataFrame: The same object that was passed in; the 'Risk Score'
        column is written onto it in place.
    """
    # Lookup tables are built once here rather than once per row.
    adherence_risk_map = {
        'Always': 0,
        'Often': 1,
        'Sometimes': 3,
        'Rarely': 4,
        'Never': 5
    }
    comorbidities_risk_map = {
        'None': 0,
        'Diabetes': 2,
        'Hypertension': 2,
        'Tuberculosis': 2,
        'Hepatitis B/C': 2,
        'Cardiovascular diseases': 2,
        'Other': 2
    }
    substance_use_risk_map = {
        'None': 0,
        'Alcohol': 3,
        'Tobacco': 3,
        'Illicit drugs': 5,
        'Other': 5
    }

    def assign_risk(row):
        # Age at First Visit
        age = row['Age at first visit']
        if age < 30:
            age_risk = 0
        elif age < 40:
            age_risk = 1
        elif age < 50:
            age_risk = 2
        elif age < 60:
            age_risk = 3
        elif age < 70:
            age_risk = 4
        else:
            age_risk = 5

        # CD4 Count at Most Recent Visit
        # Descending lower bounds leave no gaps between the bands: a count of
        # exactly 500 is lowest risk, and fractional values such as 499.5 land
        # in the band below instead of falling through to the maximum score.
        cd4 = row['CD4 count at most recent visit']
        if cd4 >= 500:
            cd4_risk = 0
        elif cd4 >= 350:
            cd4_risk = 1
        elif cd4 >= 200:
            cd4_risk = 3
        else:
            cd4_risk = 5

        # Viral Load at Most Recent Visit
        viral_load = row['Viral load at most recent visit']
        if viral_load < 20:
            viral_load_risk = 0
        elif viral_load <= 1000:
            viral_load_risk = 2
        else:
            viral_load_risk = 5

        # Adherence to ART
        adherence_risk = adherence_risk_map.get(row['Adherence to ART'], 0)

        # Comorbidities: missing means no recorded condition; an unrecognised
        # condition name is scored like any other condition (2).
        comorbidities = row['Comorbidities']
        if pd.isna(comorbidities):
            comorbidities_risk = 0
        else:
            comorbidities_risk = sum(
                comorbidities_risk_map.get(comorbidity, 2)
                for comorbidity in comorbidities.split(', ')
            )

        # More than one condition is treated as the maximum comorbidity risk.
        if comorbidities_risk > 2:
            comorbidities_risk = 5

        # Substance Use History
        substance_use_risk = substance_use_risk_map.get(row['Substance use history'], 0)

        # Calculate total risk score
        return (
            age_risk
            + cd4_risk
            + viral_load_risk
            + adherence_risk
            + comorbidities_risk
            + substance_use_risk
        )

    # Apply the function to each row
    df['Risk Score'] = df.apply(assign_risk, axis=1)
    return df

In [4]:
# Attach the additive 'Risk Score' column; calculate_risk_score writes the column
# onto the frame it is given and returns that same frame.
data = calculate_risk_score(data)

In [5]:
# Check for missing values
print("\nMissing values in each column:")
print(data.isnull().sum())

# Fill missing values where appropriate
# A missing entry in these categorical columns is recorded as the explicit
# category 'None'. The filled columns are assigned back to the frame: calling
# fillna(inplace=True) on a column selection acts on an intermediate object,
# which pandas deprecates and which has no effect under Copy-on-Write.
none_fill_columns = [
    'Comorbidities', 'Substance use history', 'Reported symptoms', 'Physical activity'
]
data[none_fill_columns] = data[none_fill_columns].fillna('None')

# Convert date columns to datetime format
date_columns = [
    'Date of birth', 'Date confirmed HIV positive', 'Start date of current ART',
    'Date of first visit', 'Date of most recent visit', 'Date of exit from the study'
]
for col in date_columns:
    data[col] = pd.to_datetime(data[col])

# Check data types
print("\nData types after conversion:")
print(data.dtypes)


Missing values in each column:
Patient ID                             0
Date of birth                          0
Gender                                 0
Date confirmed HIV positive            0
Previous ART exposure                  0
Current ART regimen                    0
Start date of current ART              0
Date of first visit                    0
Date of most recent visit              0
CD4 count at first visit               0
Viral load at first visit              0
CD4 count at most recent visit         0
Viral load at most recent visit        0
Age at first visit                     0
Employment status                      0
Education level                        0
Income level                           0
Marital status                         0
Substance use history              11552
Comorbidities                       8194
Reported symptoms                   9604
Dietary habits                         0
Physical activity                  19037
Adherence to ART         

In [6]:
# Re-check the per-column null counts after the imputation cell
data.isnull().sum()

Patient ID                         0
Date of birth                      0
Gender                             0
Date confirmed HIV positive        0
Previous ART exposure              0
Current ART regimen                0
Start date of current ART          0
Date of first visit                0
Date of most recent visit          0
CD4 count at first visit           0
Viral load at first visit          0
CD4 count at most recent visit     0
Viral load at most recent visit    0
Age at first visit                 0
Employment status                  0
Education level                    0
Income level                       0
Marital status                     0
Substance use history              0
Comorbidities                      0
Reported symptoms                  0
Dietary habits                     0
Physical activity                  0
Adherence to ART                   0
Missed doses in the last month     0
Adverse event                      0
Duration of followups              0
D

In [7]:
# The patient identifier is not used as a feature, so remove it
data = data.drop(columns=['Patient ID'])

In [8]:
# Days from the start of the current ART regimen to study exit
exit_date = pd.to_datetime(data['Date of exit from the study'])
art_start_date = pd.to_datetime(data['Start date of current ART'])
data['DurationToExitDate'] = (exit_date - art_start_date).dt.days

# Days from the confirmed HIV-positive date to the most recent visit
latest_visit_date = pd.to_datetime(data['Date of most recent visit'])
hiv_confirmed_date = pd.to_datetime(data['Date confirmed HIV positive'])
data['DurationHIVPositive'] = (latest_visit_date - hiv_confirmed_date).dt.days

In [9]:
# Drop the raw date columns now that the duration features have been derived
raw_date_columns = [
    'Date of birth',
    'Date confirmed HIV positive',
    'Start date of current ART',
    'Date of first visit',
    'Date of most recent visit',
    'Date of exit from the study',
]
data = data.drop(columns=raw_date_columns)

In [10]:
#Remove negative values
def convert_to_zero(df, column_name):
    """
    Replace every negative value in one column with zero.

    The column is overwritten on the dataframe that was passed in; values that
    are already zero or positive are left as they are.

    Args:
        df (pandas.DataFrame): The dataframe holding the column.
        column_name (str): Name of the column to floor at zero.

    Returns:
        pandas.DataFrame: The same dataframe object, with the column floored at zero.
    """
    floored = df[column_name].clip(lower=0)
    df[column_name] = floored
    return df

In [11]:
# Floor both derived duration columns at zero days
for duration_column in ('DurationToExitDate', 'DurationHIVPositive'):
    data = convert_to_zero(data, duration_column)

In [12]:
# Normalize the Risk Score to a range of 0 to 100 (min-max scaling)
# The column minimum and maximum are computed once; evaluating them inside a
# per-row lambda rescans the whole column for every row.
risk_score_min = data['Risk Score'].min()
risk_score_max = data['Risk Score'].max()
data['Risk Score'] = (data['Risk Score'] - risk_score_min) / (risk_score_max - risk_score_min) * 100

In [13]:
# Define risk category thresholds
# Upper bounds (inclusive) for the 'low risk' and 'medium risk' bands; both are
# read as globals by categorize_risk, and any score above the second is 'high risk'.
low_risk_threshold = 33.33
medium_risk_threshold = 66.67

In [14]:
# Create a new column for risk categories
def categorize_risk(score):
    """Map a risk score to 'low risk', 'medium risk' or 'high risk'.

    Both thresholds are inclusive upper bounds read from the module-level
    ``low_risk_threshold`` and ``medium_risk_threshold``; a score that passes
    neither comparison is labelled 'high risk'.
    """
    if score <= low_risk_threshold:
        return 'low risk'
    if score <= medium_risk_threshold:
        return 'medium risk'
    return 'high risk'


# Label every row from its Risk Score
data['Risk Category'] = data['Risk Score'].map(categorize_risk)

In [15]:
from sklearn.preprocessing import LabelEncoder

# Encoding risk categories
# Explicit ordinal codes for the three category labels
category_mapping = {'low risk': 0, 'medium risk': 1, 'high risk': 2}

# Create the label encoder
# NOTE(review): label_encoder is fitted below but is not referenced again in the
# visible cells; the codes written to the column come from category_mapping.
# LabelEncoder presumably numbers classes in sorted label order, which would not
# match category_mapping -- confirm before using it to decode predictions.
label_encoder = LabelEncoder()

# Fit the encoder on the category names used as keys of the mapping
label_encoder.fit(list(category_mapping.keys()))

# Replace each category label in the column with its integer code from category_mapping
data['Risk Category'] = data['Risk Category'].map(category_mapping)

In [16]:
# Drop rows that have no Risk Score, then drop exact duplicate rows
data = (
    data
    .dropna(subset=['Risk Score'])
    .drop_duplicates()
)

In [17]:
# Display the frame after cleaning, scoring and de-duplication
data

Unnamed: 0,Gender,Previous ART exposure,Current ART regimen,CD4 count at first visit,Viral load at first visit,CD4 count at most recent visit,Viral load at most recent visit,Age at first visit,Employment status,Education level,...,Physical activity,Adherence to ART,Missed doses in the last month,Adverse event,Duration of followups,Reason for exit,Risk Score,DurationToExitDate,DurationHIVPositive,Risk Category
0,Male,No,Regimen C,372,184642,228,187361,48,Student,Other,...,Regular,Rarely,30,Yes,5461,Other,73.913043,595,0,2
1,Female,Yes,Regimen A,1430,166189,852,181030,23,Unemployed,Secondary education,...,,Sometimes,16,No,693,Died,56.521739,309,903,1
2,Male,No,Regimen C,1315,65963,1264,6042,42,Retired,Other,...,Regular,Always,7,Yes,295,Lost to followups,52.173913,761,5500,1
3,Male,Yes,Regimen A,848,79956,1163,50047,47,Other,Primary education,...,Regular,Often,24,No,4425,Transferred out,47.826087,1801,3480,1
4,Male,No,Regimen C,1031,62308,854,143833,19,Student,Tertiary education,...,,Never,8,Yes,30,Transferred out,65.217391,0,405,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57587,Female,Yes,Regimen A,466,15433,723,125612,31,Student,Secondary education,...,Regular,Sometimes,24,No,8567,Lost to followups,60.869565,0,8957,1
57588,Male,Yes,Regimen B,417,186293,941,141698,21,Employed,Tertiary education,...,Occasional,Sometimes,28,Yes,867,Transferred out,34.782609,0,0,1
57589,Female,No,Regimen A,268,96406,752,77336,56,Retired,Primary education,...,Regular,Sometimes,7,No,3096,Lost to followups,60.869565,4598,7570,1
57590,Female,No,Regimen B,1108,88934,1227,82175,22,Other,Primary education,...,Regular,Often,8,Yes,120,Transferred out,26.086957,610,0,0


In [18]:
# Column dtypes before one-hot encoding (object columns are still categorical text)
data.dtypes

Gender                              object
Previous ART exposure               object
Current ART regimen                 object
CD4 count at first visit             int64
Viral load at first visit            int64
CD4 count at most recent visit       int64
Viral load at most recent visit      int64
Age at first visit                   int64
Employment status                   object
Education level                     object
Income level                        object
Marital status                      object
Substance use history               object
Comorbidities                       object
Reported symptoms                   object
Dietary habits                      object
Physical activity                   object
Adherence to ART                    object
Missed doses in the last month       int64
Adverse event                       object
Duration of followups                int64
Reason for exit                     object
Risk Score                         float64
DurationToE

In [19]:
# One-hot encode the categorical features; drop_first removes the first level
# of each column so that level acts as the reference category
categorical_columns = [
    'Gender',
    'Marital status',
    'Education level',
    'Adherence to ART',
    'Comorbidities',
    'Previous ART exposure',
    'Current ART regimen',
    'Income level',
    'Dietary habits',
    'Physical activity',
    'Substance use history',
    'Reported symptoms',
    'Adverse event',
    'Employment status',
]
data = pd.get_dummies(data, columns=categorical_columns, drop_first=True)

In [20]:
# Column dtypes after one-hot encoding (the new indicator columns are bool)
data.dtypes

CD4 count at first visit                 int64
Viral load at first visit                int64
CD4 count at most recent visit           int64
Viral load at most recent visit          int64
Age at first visit                       int64
Missed doses in the last month           int64
Duration of followups                    int64
Reason for exit                         object
Risk Score                             float64
DurationToExitDate                       int64
DurationHIVPositive                      int64
Risk Category                            int64
Gender_Male                               bool
Marital status_Married                    bool
Marital status_Other                      bool
Marital status_Single                     bool
Marital status_Widowed                    bool
Education level_Primary education         bool
Education level_Secondary education       bool
Education level_Tertiary education        bool
Adherence to ART_Never                    bool
Adherence to 

In [21]:
# Cast every boolean indicator column produced by get_dummies to 0/1 integers.
# Selecting by dtype replaces one hand-written cell per column and cannot miss
# an indicator if the set of category levels changes; an empty selection is a no-op.
bool_columns = data.select_dtypes(include='bool').columns
data = data.astype({column: int for column in bool_columns})

In [63]:
from sklearn.model_selection import train_test_split

# Features and target variable for classification
# NOTE(review): only 'Reason for exit' is dropped here, so X still contains
# 'Risk Category' (the target assigned to y below) and the 'Risk Score' it was
# derived from -- the classifier can read its label straight from the features.
# Removing them here would also require updating the later cells that pass
# X_test rows to the Cox model, which was fitted with both columns as covariates.
X = data.drop(columns=['Reason for exit'])  # Drops the raw exit reason only; the target column stays in X
y = data['Risk Category']

# Split into train and test sets (80/20, fixed seed)
X_train, X_test, y_train_classification, y_test_classification = train_test_split(X, y, test_size=0.2, random_state=42)

In [64]:
from imblearn.over_sampling import SMOTE

# Apply SMOTE to the training set
# Only the training split is resampled; X_test / y_test_classification are left untouched.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train_classification)

# Check class distribution after resampling
print("Class distribution after resampling:", y_train_resampled.value_counts())

Class distribution after resampling: Risk Category
1    36585
0    36585
2    36585
Name: count, dtype: int64


In [65]:
#Train XGBoost Classifier on the resampled (unscaled) training set
# NOTE(review): the name `xgb` is rebound later by the train_ensemble_model(...)
# call, which fits a new classifier on standardized features, so the model
# fitted in this cell is not the one used for the predictions further down.
from xgboost import XGBClassifier

xgb = XGBClassifier(random_state=42)
xgb.fit(X_train_resampled, y_train_resampled)

In [66]:
# Create one 0/1 indicator column per exit reason
reason_by_indicator = {
    'Event_Died': 'Died',
    'Event_Lost_to_followups': 'Lost to followups',
    'Event_Transferred_out': 'Transferred out',
    'Event_Other': 'Other',
}
for indicator_column, exit_reason in reason_by_indicator.items():
    data[indicator_column] = data['Reason for exit'].eq(exit_reason).astype('int64')

In [67]:
# Create a multi-state event indicator
def event_indicator(row):
    """Return the integer event code for one row of exit indicators.

    The indicators are checked in a fixed order and the first one equal to 1
    decides the code: died -> 1, lost to follow-ups -> 2, transferred out -> 3,
    other -> 4. A row with none of them set gets 0.
    """
    coded_indicators = (
        ('Event_Died', 1),
        ('Event_Lost_to_followups', 2),
        ('Event_Transferred_out', 3),
        ('Event_Other', 4),
    )
    for indicator_column, event_code in coded_indicators:
        if row[indicator_column] == 1:
            return event_code
    return 0

# Collapse the four exit indicators into one integer code per row (0 when none is set)
data['Event'] = data.apply(event_indicator, axis=1)

In [68]:
# Define features (covariates): every column except the duration, the raw exit
# reason and the event encodings derived from it
non_covariate_columns = [
    'DurationToExitDate',
    'Event_Died',
    'Event_Lost_to_followups',
    'Event_Transferred_out',
    'Event_Other',
    'Reason for exit',
    'Event',
]
X_survival = data.drop(columns=non_covariate_columns)

# Define target variables for survival analysis
duration = data['DurationToExitDate']
event = data['Event']

# Verify the prepared data
for prepared in (X_survival, duration, event):
    print(prepared.head())

   CD4 count at first visit  Viral load at first visit  \
0                       372                     184642   
1                      1430                     166189   
2                      1315                      65963   
3                       848                      79956   
4                      1031                      62308   

   CD4 count at most recent visit  Viral load at most recent visit  \
0                             228                           187361   
1                             852                           181030   
2                            1264                             6042   
3                            1163                            50047   
4                             854                           143833   

   Age at first visit  Missed doses in the last month  Duration of followups  \
0                  48                              30                   5461   
1                  23                              16                 

In [69]:
from lifelines import CoxPHFitter

# Combine features and target variables into one DataFrame for lifelines.
# Only event code 1 (died) counts as an observed event; every other code is
# encoded as 0 in the 'Event' column.
survival_data = X_survival.assign(
    Duration=duration,
    Event=(event == 1).astype(int),
)

In [70]:
# Calculate the correlation matrix
# Computed over every column of survival_data, which at this point also
# includes the 'Duration' and 'Event' columns added in the previous cell.
correlation_matrix = survival_data.corr()

In [71]:
# Identify features with high correlation (absolute value > 0.9)
# Only the strict upper triangle is scanned, so each pair is listed once
# (row label before column label) and the diagonal is ignored.
strong_upper = np.triu(np.abs(correlation_matrix.to_numpy()) > 0.9, k=1)
high_corr_pairs = [
    (correlation_matrix.index[row_pos], correlation_matrix.columns[col_pos])
    for row_pos, col_pos in zip(*np.nonzero(strong_upper))
]

In [72]:
# From each highly correlated pair, mark the second member for removal
features_to_drop = {second_feature for _, second_feature in high_corr_pairs}

In [73]:
# Remove the features marked as redundant from the survival frame
survival_data = survival_data.drop(columns=[*features_to_drop])

In [74]:
# Train Cox Proportional Hazards model with reduced data
# 'Duration' is the time column and 'Event' the event indicator; all other
# columns of survival_data enter the model as covariates.
# NOTE(review): `cph` is rebound later by the train_ensemble_model(...) call,
# which fits a new CoxPHFitter on the same survival_data.
cph = CoxPHFitter()
cph.fit(survival_data, duration_col='Duration', event_col='Event')
cph.print_summary()

0,1
model,lifelines.CoxPHFitter
duration col,'Duration'
event col,'Event'
baseline estimation,breslow
number of observations,57592
number of events observed,14492
partial log-likelihood,-143481.19
time fit was run,2024-06-05 03:21:18 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,cmp to,z,p,-log2(p)
CD4 count at first visit,-0.0,1.0,0.0,-0.0,0.0,1.0,1.0,0.0,-0.73,0.47,1.1
Viral load at first visit,-0.0,1.0,0.0,-0.0,0.0,1.0,1.0,0.0,-0.3,0.77,0.39
CD4 count at most recent visit,0.0,1.0,0.0,-0.0,0.0,1.0,1.0,0.0,0.46,0.65,0.62
Viral load at most recent visit,0.0,1.0,0.0,-0.0,0.0,1.0,1.0,0.0,0.24,0.81,0.31
Age at first visit,-0.01,0.99,0.0,-0.01,-0.0,0.99,1.0,0.0,-4.36,<0.005,16.26
Missed doses in the last month,-0.0,1.0,0.0,-0.0,0.0,1.0,1.0,0.0,-1.01,0.31,1.69
Duration of followups,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,15.48,<0.005,177.23
Risk Score,0.01,1.01,0.0,0.0,0.01,1.0,1.01,0.0,2.33,0.02,5.64
DurationHIVPositive,-0.0,1.0,0.0,-0.0,-0.0,1.0,1.0,0.0,-51.64,<0.005,inf
Risk Category,-0.01,0.99,0.03,-0.07,0.05,0.93,1.05,0.0,-0.46,0.65,0.63

0,1
Concordance,0.67
Partial AIC,287066.37
log-likelihood ratio test,4915.86 on 52 df
-log2(p) of ll-ratio test,inf


In [75]:
from sklearn.preprocessing import StandardScaler

# Train ensemble model
def train_ensemble_model(X_train_resampled, y_train_resampled, survival_data_reduced):
    """Fit the scaler, the risk classifier and the Cox survival model.

    Args:
        X_train_resampled: Training features for the classifier; the scaler is
            fitted on these and the classifier is trained on their scaled form.
        y_train_resampled: Class labels aligned with ``X_train_resampled``.
        survival_data_reduced: Frame for the Cox model containing the
            covariates plus a 'Duration' and an 'Event' column.

    Returns:
        tuple: ``(xgb, cph, scaler)`` -- the fitted XGBClassifier, the fitted
        CoxPHFitter and the fitted StandardScaler, in that order.
    """
    # Standardize the classifier's training features
    feature_scaler = StandardScaler().fit(X_train_resampled)
    scaled_features = feature_scaler.transform(X_train_resampled)

    # Risk-category classifier, trained on the standardized features
    risk_classifier = XGBClassifier(random_state=42)
    risk_classifier.fit(scaled_features, y_train_resampled)

    # Survival model, trained on the unscaled survival frame
    survival_model = CoxPHFitter()
    survival_model.fit(survival_data_reduced, duration_col='Duration', event_col='Event')

    return risk_classifier, survival_model, feature_scaler

In [76]:
# Predict risk category, risk probability, and survival time
def predict_risk_and_survival(patient_data, xgb, cph, scaler):
    """Predict risk category, its probability, expected survival time and a care plan.

    Only the first row of ``patient_data`` is reported.

    Args:
        patient_data: Feature frame for the patient (one row is expected). It is
            passed to ``scaler.transform`` for the classifier and, unscaled, to
            ``cph.predict_expectation``.
        xgb: Fitted classifier exposing ``predict`` and ``predict_proba``; its
            classes are assumed to be the codes 0 (low), 1 (medium), 2 (high),
            with ``predict_proba`` columns in that order.
        cph: Fitted survival model exposing ``predict_expectation``.
        scaler: Fitted scaler exposing ``transform``.

    Returns:
        tuple: ``(risk_category, risk_probability, survival_time_prediction,
        interventions)`` where ``risk_category`` is 'Low Risk', 'Medium Risk' or
        'High Risk', ``risk_probability`` is the classifier's probability for
        that predicted category, ``survival_time_prediction`` is the first value
        returned by ``cph.predict_expectation`` and ``interventions`` is the
        list of recommended actions for the category.
    """
    risk_labels = ('Low Risk', 'Medium Risk', 'High Risk')

    # Interventions dictionary
    interventions = {
        'Low Risk': [
            'Regular Follow-ups (every 3-6 months)',
            'Health Education',
            'Support Groups',
            'Medication Refill Reminders'
        ],
        'Medium Risk': [
            'Increased Frequency of Check-ups (every 1-3 months)',
            'Counseling Services',
            'Home Visits',
            'Enhanced Medication Reminders',
            'Nutritional Support',
            'Transportation Assistance'
        ],
        'High Risk': [
            'Intensive Case Management',
            'Directly Observed Therapy (DOT)',
            'Mental Health Services',
            'Social Support Services',
            'Emergency Medical Services',
            'Customized Adherence Plans',
            'Intensive Nutritional Support',
            'Family Involvement'
        ]
    }

    # Standardize the patient data
    patient_data_scaled = scaler.transform(patient_data)

    # predict() returns one entry per row; take the first row's class code as a
    # plain int instead of comparing the whole array against a number.
    risk_category_encoded = int(xgb.predict(patient_data_scaled)[0])

    # Report the probability of the class that was actually predicted. A fixed
    # column index of 1 would always return the 'Medium Risk' probability,
    # whatever the predicted category is.
    class_probabilities = xgb.predict_proba(patient_data_scaled)[0]
    risk_probability = float(class_probabilities[risk_category_encoded])

    # Predict survival time using the Cox model
    survival_time_prediction = cph.predict_expectation(patient_data).values[0]

    # Convert the class code to its label; any code other than 0 or 1 is
    # reported as 'High Risk'.
    if 0 <= risk_category_encoded < len(risk_labels):
        risk_category = risk_labels[risk_category_encoded]
    else:
        risk_category = 'High Risk'

    return risk_category, risk_probability, survival_time_prediction, interventions[risk_category]

In [77]:
# Train the ensemble model
# Rebinds the notebook-level names xgb and cph to the freshly fitted models and
# introduces scaler; the cells below use these three objects.
xgb, cph, scaler = train_ensemble_model(X_train_resampled, y_train_resampled, survival_data)

In [78]:
# Display the held-out feature frame
X_test

Unnamed: 0,CD4 count at first visit,Viral load at first visit,CD4 count at most recent visit,Viral load at most recent visit,Age at first visit,Missed doses in the last month,Duration of followups,Risk Score,DurationToExitDate,DurationHIVPositive,...,Reported symptoms_Fever,Reported symptoms_Night sweats,Reported symptoms_None,Reported symptoms_Other,Reported symptoms_Weight loss,Adverse event_Yes,Employment status_Other,Employment status_Retired,Employment status_Student,Employment status_Unemployed
49654,1045,56476,1390,18576,33,30,3062,65.217391,6590,8398,...,0,0,1,0,0,0,0,0,1,0
18655,350,21580,1263,29770,22,2,2452,52.173913,4226,3990,...,1,0,0,0,0,1,0,0,0,1
37561,1443,18665,238,23719,25,25,2621,56.521739,0,3006,...,0,1,0,0,0,1,1,0,0,0
1583,563,108956,467,5360,27,7,7890,56.521739,0,9439,...,1,0,0,0,0,1,0,0,0,0
29512,370,36459,1398,32724,20,10,6,56.521739,311,435,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11861,1182,199243,1060,44775,18,12,4434,43.478261,0,7,...,0,0,1,0,0,0,1,0,0,0
579,904,184314,268,199379,20,11,7853,78.260870,0,0,...,0,0,0,1,0,1,0,0,0,0
14857,352,64357,767,139308,63,15,542,47.826087,1329,975,...,0,0,1,0,0,0,0,0,0,1
49415,979,12613,703,138774,20,12,2709,52.173913,10110,3464,...,0,0,0,1,0,0,0,0,0,1


In [79]:
# Usage: score a single held-out patient
# The slice iloc[18:19] (rather than iloc[18]) keeps the row as a one-row
# DataFrame, which is what gets passed to scaler.transform and to the Cox model
# inside predict_risk_and_survival.
patient_data = X_test.iloc[18:19]  #patient data
risk_category, risk_probability, survival_time_prediction, intervention_plan = predict_risk_and_survival(patient_data, xgb, cph, scaler)

print(f"Risk Category: {risk_category}")
print(f"Risk Probability: {risk_probability:.8f}")
print(f"Survival Time Prediction (days): {survival_time_prediction:.2f}")
print(f"Intervention Plan: {intervention_plan}")

Risk Category: Medium Risk
Risk Probability: 0.99997997
Survival Time Prediction (days): 7714.93
Intervention Plan: ['Increased Frequency of Check-ups (every 1-3 months)', 'Counseling Services', 'Home Visits', 'Enhanced Medication Reminders', 'Nutritional Support', 'Transportation Assistance']


In [80]:
import pickle

In [81]:
# Persist the classifier, the survival model and the scaler, one pickle file each
artifacts_by_path = (
    ('xgb.pkl', xgb),
    ('cph.pkl', cph),
    ('scaler.pkl', scaler),
)
for artifact_path, artifact in artifacts_by_path:
    with open(artifact_path, 'wb') as f:
        pickle.dump(artifact, f)

In [82]:
# Display the SMOTE-resampled training features
X_train_resampled

Unnamed: 0,CD4 count at first visit,Viral load at first visit,CD4 count at most recent visit,Viral load at most recent visit,Age at first visit,Missed doses in the last month,Duration of followups,Risk Score,DurationToExitDate,DurationHIVPositive,...,Reported symptoms_Fever,Reported symptoms_Night sweats,Reported symptoms_None,Reported symptoms_Other,Reported symptoms_Weight loss,Adverse event_Yes,Employment status_Other,Employment status_Retired,Employment status_Student,Employment status_Unemployed
0,370,104499,421,59293,32,23,544,43.478261,535,3249,...,0,1,0,0,0,0,0,0,1,0
1,572,87772,235,100238,75,11,167,60.869565,2278,2361,...,0,0,1,0,0,1,1,0,0,0
2,603,38617,1041,150862,21,13,2690,21.739130,0,0,...,1,0,0,0,0,1,0,1,0,0
3,1442,98542,996,33319,26,20,679,47.826087,0,1107,...,1,0,0,0,0,0,0,0,0,1
4,750,31833,371,182198,32,6,14172,52.173913,0,3344,...,0,1,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109750,225,195296,1168,156986,66,0,1190,73.892705,422,9453,...,0,0,0,0,0,0,0,0,0,0
109751,1355,18960,347,120570,31,7,335,71.588575,136,2611,...,0,0,0,0,0,0,0,0,0,0
109752,1100,173274,775,73020,58,13,456,73.305729,1037,4076,...,0,0,0,0,1,0,0,0,0,0
109753,891,84766,376,38313,55,4,1628,72.851681,165,1964,...,0,0,0,0,0,0,0,0,0,1


In [83]:
# Log in to Modelbit; `mb` is the handle the following cells call
# (get_model / deploy).
import modelbit
mb = modelbit.login()

In [84]:
# add the model to the registry
#mb.add_model("train_ensemble_model", xgb)

In [85]:
# Look up the registry entry named "train_ensemble_model"
# NOTE(review): the add_model call in the previous cell is commented out, so this
# presumably relies on the entry already existing in the registry -- confirm.
mb.get_model("train_ensemble_model")

In [86]:
# NOTE(review): this passes the training function train_ensemble_model itself to
# deploy (not the prediction function) -- confirm that is intended. The recorded
# output of this cell is "ModelbitError: Not Authenticated".
mb.deploy(train_ensemble_model)

ModelbitError: Not Authenticated

In [None]:
import types
import numpy as np

In [87]:
from explainerdashboard import ClassifierExplainer, ExplainerDashboard

def predict_proba(self, X):
    """Build a two-column array from the model's ``predict_expectation`` output.

    Written to be bound onto a fitted model object as a method. The result is
    the transpose of ``[1 - expectation, expectation]``: the first entry along
    the last axis is the complement and the second is the expectation itself.

    NOTE(review): elsewhere in this notebook the value of predict_expectation is
    printed as a survival time in days, so these columns are presumably not
    probabilities in [0, 1] -- confirm what the dashboard should receive.
    """
    expectation = self.predict_expectation(X)
    complement = 1 - expectation
    return np.transpose(np.array([complement, expectation]))

In [None]:
# Bind the predict_proba shim onto the fitted Cox model so that
# ClassifierExplainer can call cph.predict_proba(X)
cph.predict_proba = types.MethodType(predict_proba, cph)

# The explainer needs features and labels from the SAME rows: X_test goes with
# y_test_classification. The resampled training labels belong to a different,
# differently sized set of rows.
explainer = ClassifierExplainer(cph, X_test, y_test_classification,
                               labels=['Duration','Intervention Plan'])

ExplainerDashboard(explainer).run(port=8052)

Note: shap values for shap='kernel' normally get calculated against X_background, but paramater X_background=None, so setting X_background=shap.sample(X, 50)...
Generating self.shap_explainer = shap.KernelExplainer(model, X, link='identity')
Building ExplainerDashboard..
Detected notebook environment, consider setting mode='external', mode='inline' or mode='jupyterlab' to keep the notebook interactive while the dashboard is running...
For this type of model and model_output interactions don't work, so setting shap_interaction=False...
The explainer object has no decision_trees property. so setting decision_trees=False...
Generating layout...
Calculating shap values...


  0%|          | 0/11519 [00:00<?, ?it/s]