In [None]:
%%configure -f
{
    "conf": {
        "spark.executor.memory": "24g",
        "spark.executor.cores": "4",       
        "spark.driver.memory": "24g",      
        "spark.yarn.am.memory": "4g",     
        "spark.dynamicAllocation.enabled": "true", 
        "spark.task.cpus": "4",          
        "spark.jars.packages.resolve.transitive": "true",
        "spark.executor.extraJavaOptions": "--add-exports java.base/sun.net.util=ALL-UNNAMED",
        "spark.driver.extraJavaOptions": "--add-exports java.base/sun.net.util=ALL-UNNAMED",
        "spark.network.timeout": "1200s",  
        "spark.rpc.askTimeout": "1200s", 
        "spark.executor.memoryOverhead": "4g"
    }
}

In [None]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when
from pyspark.ml.linalg import Vectors
from pyspark.sql.types import StructType, StructField, DoubleType, StringType

In [None]:
# Cohort identifier (kept as a string). It is interpolated into the S3 input/output
# paths, the saved model filename and the uploaded feature-importance key below.
cohort_num = "9"

In [None]:
from pyspark.sql.functions import col

# Paths for input data (one folder per cohort)
s3_bucket = f"s3://pgx-repository/ade-risk-model/Step5_Time_to_Event_Model/1_input_datasets/cohort{cohort_num}"
train_input_path = f"{s3_bucket}/train"
test_input_path = f"{s3_bucket}/test"

# Path for the feature importance CSV.
# This has to be an f-string: without the prefix the literal text "{cohort_num}"
# becomes part of the S3 key and the read points at a non-existent cohort folder.
feature_importance_path = f"s3://pgx-repository/ade-risk-model/Step5_Time_to_Event_Model/5_feature_importances/cohort{cohort_num}/consolidated_feature_importances.csv"

# Read train and test datasets
train_df = spark.read.parquet(train_input_path)
test_df = spark.read.parquet(test_input_path)

# Print both schemas so the column names used by the later cells can be checked
train_df.printSchema()
test_df.printSchema()

In [None]:
# Read the drug names that were flagged as important in the feature importance CSV
drug_filter = spark.read.option("header", "true").csv(feature_importance_path)

# Collect the distinct, non-null drug names to the driver as a plain Python list
drug_names = [
    row["drug_name"]
    for row in (
        drug_filter
        .select("drug_name")
        .where(col("drug_name").isNotNull())
        .distinct()
        .collect()
    )
]


def _keep_relevant_rows(df):
    """Keep rows for an important drug, a positive label, or a hospitalization.

    Both splits go through this one helper so they are always filtered with the
    same drug list (the test split previously referenced an undefined name).

    NOTE(review): this filters on "standardized_drug_names" (plural) while the
    modelling cells use 'standardized_drug_name' -- confirm against the printed schema.
    """
    return df.filter(
        (col("standardized_drug_names").isin(drug_names)) |  # important drug features
        (col("label") == 1) |                                # positive label
        (col("hospitalization") == 1)                        # hospitalization event
    )


# Apply the identical filter to both splits
filtered_train_df = _keep_relevant_rows(train_df)
filtered_test_df = _keep_relevant_rows(test_df)

# Paths for saving filtered datasets to S3
filtered_train_output_path = f"{s3_bucket}/filtered_train"
filtered_test_output_path = f"{s3_bucket}/filtered_test"

# Save filtered DataFrames back to S3 (replaces any previous output at these paths)
filtered_train_df.write.mode("overwrite").parquet(filtered_train_output_path)
filtered_test_df.write.mode("overwrite").parquet(filtered_test_output_path)

print(f"Filtered train dataset saved to {filtered_train_output_path}")
print(f"Filtered test dataset saved to {filtered_test_output_path}")

In [None]:
%%python

import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# Convert Spark DataFrames to Pandas (pulls every filtered row into local memory)
train_pandas_df = filtered_train_df.toPandas()
test_pandas_df = filtered_test_df.toPandas()

# Sort by 'mi_person_key' and 'drug_date' so each person's rows are contiguous and
# in time order (the later cells pass mi_person_key as the CatBoost group_id).
# The index is reset after sorting: otherwise the frames keep their shuffled
# pre-sort index, and any later index-aligned assignment from a freshly built
# frame or Series (which has a 0..n-1 index) attaches values to the wrong rows.
train_pandas = (
    train_pandas_df
    .sort_values(by=['mi_person_key', 'drug_date'])
    .reset_index(drop=True)
)
test_pandas = (
    test_pandas_df
    .sort_values(by=['mi_person_key', 'drug_date'])
    .reset_index(drop=True)
)


In [None]:
%%python

import pandas as pd
import numpy as np
from catboost import CatBoostClassifier, Pool
import optuna
from sklearn.metrics import recall_score, f1_score
import shap

# Identifier/date columns that describe a row but must not be fed to the model as
# features. mi_person_key is still used below as the CatBoost group_id.
NON_FEATURE_COLS = ['drug_date', 'mi_person_key']
CAT_FEATURES = ['standardized_drug_name']
SEED = 42  # fixed seed so the search and the fits are repeatable across runs

# Settings shared by every Optuna trial AND by the final refit, so the final model
# is trained under the same regime the hyper-parameters were selected for.
FIXED_PARAMS = {
    "iterations": 500,
    "early_stopping_rounds": 50,
    "eval_metric": 'Recall',
    "random_seed": SEED,
}

# Preprocess the train and test data
X_Train_sorted = train_pandas.drop(columns=['label'])
Y = train_pandas['label']

X_Test_sorted = test_pandas.drop(columns=['label'])
y = test_pandas['label']

# Feature matrices actually given to CatBoost
X_Train_features = X_Train_sorted.drop(columns=NON_FEATURE_COLS)
X_Test_features = X_Test_sorted.drop(columns=NON_FEATURE_COLS)

# Build the pools once; they do not depend on the trial, so rebuilding them inside
# the objective only repeated the same work for every trial.
train_pool = Pool(
    data=X_Train_features,
    label=Y,
    cat_features=CAT_FEATURES,
    group_id=X_Train_sorted['mi_person_key']
)
test_pool = Pool(
    data=X_Test_features,
    label=y,
    cat_features=CAT_FEATURES,
    group_id=X_Test_sorted['mi_person_key']
)


# Optuna objective function
def objective(trial):
    """Train one CatBoost candidate and score it on the evaluation pool.

    Args:
        trial: Optuna trial used to sample objective, learning_rate, depth and
            l2_leaf_reg.

    Returns:
        The mean of recall and F1 on the evaluation labels (higher is better).

    NOTE(review): the same test split is used for early stopping, for model
    selection and for the reported metrics, so the final scores are optimistic.
    Consider carving a validation split out of the training data.
    """
    params = {
        "objective": trial.suggest_categorical("objective", ["Logloss", "CrossEntropy"]),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "depth": trial.suggest_int("depth", 1, 6),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1, 12),
        **FIXED_PARAMS,
    }

    # Categorical features are declared on the pools, so they are not repeated here
    model = CatBoostClassifier(**params)
    model.fit(train_pool, eval_set=test_pool, verbose=0)

    # Predict on the pool so the model sees the same feature layout it was trained on
    preds = model.predict(test_pool)
    recall = recall_score(y, preds)
    f1 = f1_score(y, preds)
    return (recall + f1) / 2  # Combined metric


# Run Optuna optimization with a seeded sampler for repeatable searches
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=SEED),
)
study.optimize(objective, n_trials=20)

# Train the final model with the best parameters.
# study.best_trial.params only holds the *sampled* values, so the fixed settings
# are merged back in; otherwise the refit silently falls back to CatBoost defaults
# for iterations, early stopping and the eval metric.
best_params = {**FIXED_PARAMS, **study.best_trial.params}
final_model = CatBoostClassifier(**best_params)
final_model.fit(train_pool, eval_set=test_pool, verbose=100)

# Calculate SHAP values for the final model; the last column returned by CatBoost
# is the expected value, so it is dropped.
shap_values = final_model.get_feature_importance(type='ShapValues', data=test_pool)
# Reuse the feature frame's index so the column assignment below lines up row by row
shap_df = pd.DataFrame(
    shap_values[:, :-1],
    columns=X_Test_features.columns,
    index=X_Test_features.index,
)

# Group SHAP values for each level of standardized_drug_name
level_shap_df = X_Test_sorted[['standardized_drug_name']].copy()
level_shap_df['SHAP'] = shap_df['standardized_drug_name']

# Aggregate SHAP values by drug levels.
# NOTE(review): this is |mean(SHAP)| per drug, so opposite-signed contributions
# cancel out; use mean(|SHAP|) instead if magnitude-only importance is wanted.
level_importance = level_shap_df.groupby('standardized_drug_name')['SHAP'].mean().abs().sort_values(ascending=False)

# Output the SHAP importance by individual levels
print("SHAP importance by standardized_drug_name levels:")
print(level_importance)


In [None]:
%%python

# Start from a copy of the tuned hyper-parameters so the study's own record of the
# best trial is never mutated by the overrides below
best_params = dict(study.best_trial.params)

# Fixed settings for the final model.
# NOTE(review): the tuning trials visible in this notebook do not set boosting_type
# or bootstrap_type, so these two values were not part of the search -- confirm
# they are intended for the final fit.
best_params.update({
    "iterations": 1000,            # upper bound; early stopping may end training sooner
    "boosting_type": "Ordered",
    "bootstrap_type": "MVS",
    "early_stopping_rounds": 100,  # stop once the eval metric stalls for 100 rounds
    "eval_metric": 'Recall'
})

cat_features = ['standardized_drug_name']
group_id = 'mi_person_key'  # NAME of the grouping column, not the ids themselves

# Feature matrices: drop the identifier and date columns
final_x_train = X_Train_sorted.drop(['mi_person_key', 'drug_date'], axis=1)
final_y_train = train_pandas['label']

final_x_test = X_Test_sorted.drop(['mi_person_key', 'drug_date'], axis=1)
final_y_test =  test_pandas['label']

# Pool expects one group id per row, so pass the column values rather than the
# column name (a bare string is not a valid per-row id sequence).
final_train_pool = Pool(
    final_x_train,
    final_y_train,
    cat_features=cat_features,
    group_id=X_Train_sorted[group_id],
)
final_test_pool = Pool(
    final_x_test,
    final_y_test,
    cat_features=cat_features,
    group_id=X_Test_sorted[group_id],
)

# Train the final CatBoost model with the best parameters.
# early_stopping_rounds is already part of best_params, so it is not repeated here.
final_model = CatBoostClassifier(**best_params)
final_model.fit(
    final_train_pool,
    eval_set=final_test_pool,
    verbose=100
)

# Save the final model to the local working directory
final_model.save_model(f"final_cohort{cohort_num}_model.cbm")

# Evaluate the model.
# Predict on the pool itself so the categorical column is handled exactly as in
# training, and score against the pandas labels of the rows that were predicted
# (test_df is the unfiltered Spark DataFrame and cannot be passed to sklearn).
final_predictions = final_model.predict(final_test_pool)
final_pred_labels = final_predictions.round().astype(int)
final_recall = recall_score(final_y_test, final_pred_labels)
final_f1 = f1_score(final_y_test, final_pred_labels)

print(f"Final Model Recall: {final_recall}")
print(f"Final Model F1 Score: {final_f1}")


In [None]:
%%python

from catboost import CatBoostClassifier, Pool
import pandas as pd

# Tuned hyper-parameters of the best Optuna trial, overlaid with the fixed
# training settings used for this model
best_params = study.best_trial.params
best_params.update(
    iterations=1000,
    boosting_type="Ordered",
    bootstrap_type="MVS",
    early_stopping_rounds=100,
    eval_metric='Recall',
)

# Split both pandas frames into predictors and target
X_Train_sorted, Y = train_pandas.drop(columns=['label']), train_pandas['label']
X_Test_sorted, y = test_pandas.drop(columns=['label']), test_pandas['label']

# One Pool per split: the date and person-key columns are removed from the
# features, and the person key is supplied separately as the group id
train_pool, test_pool = (
    Pool(
        data=features.drop(columns=['drug_date', 'mi_person_key']),
        label=labels,
        cat_features=['standardized_drug_name'],
        group_id=features['mi_person_key'],
    )
    for features, labels in ((X_Train_sorted, Y), (X_Test_sorted, y))
)

# Fit on the training pool, monitoring the test pool for early stopping
model = CatBoostClassifier(**best_params)
model.fit(
    train_pool,
    eval_set=test_pool,
    early_stopping_rounds=100,
    verbose=100,
)

# SHAP interaction values for every row of the test pool
shap_interaction_values = model.get_feature_importance(
    type='ShapInteractionValues',
    data=test_pool,
)


In [None]:
%%python

# Save the final model to the local working directory
final_model.save_model(f"final_cohort{cohort_num}_model.cbm")

# Evaluate the model.
# Predict on the pool itself (not on Pool.get_features()) so the categorical
# column is handled exactly as in training.
final_predictions = final_model.predict(final_test_pool)
final_pred_labels = final_predictions.round().astype(int)

# Score against the labels the test pool was built from. test_df is the unfiltered
# Spark DataFrame, so test_df['label'] is a Spark Column, not an array of labels.
final_recall = recall_score(final_y_test, final_pred_labels)
final_f1 = f1_score(final_y_test, final_pred_labels)

print(f"Final Model Recall: {final_recall}")
print(f"Final Model F1 Score: {final_f1}")

In [None]:
%%python

import numpy as np

# Locate the drug feature inside the feature matrix the interaction values were
# computed on (the test frame minus the date and person-key columns), instead of
# assuming it is feature 0.
feature_columns = X_Test_sorted.drop(columns=['drug_date', 'mi_person_key']).columns
drug_feature_idx = feature_columns.get_loc('standardized_drug_name')

# Self-interaction of the drug feature: its diagonal entry in each row's
# interaction matrix
self_interactions = shap_interaction_values[:, drug_feature_idx, drug_feature_idx]

# Combine with test data. The person key is exported under the header 'group_id';
# if the frame has no 'group_id' column of its own, fall back to 'mi_person_key',
# which is the column used as the CatBoost group id.
group_col = 'group_id' if 'group_id' in X_Test_sorted.columns else 'mi_person_key'
interaction_df = (
    X_Test_sorted[[group_col, 'standardized_drug_name']]
    .rename(columns={group_col: 'group_id'})
    .copy()
)
# self_interactions is a NumPy array, so this assignment is positional (row i of
# the pool -> row i of the frame) and does not depend on the frame's index.
interaction_df['Self_SHAP_Interaction'] = self_interactions

# Filter out rows with zero SHAP contributions
interaction_df_filtered = interaction_df[interaction_df['Self_SHAP_Interaction'] != 0]

# Sort the DataFrame in descending order of SHAP values
interaction_df_sorted = interaction_df_filtered.sort_values(
    by='Self_SHAP_Interaction', ascending=False
)

# Display the top SHAP values
print("Top SHAP self-interaction values sorted in descending order:")
print(interaction_df_sorted)

# Save the sorted SHAP DataFrame to a CSV in the local working directory
interaction_df_sorted.to_csv("drugs_final.csv", index=False)

In [None]:
%%python

import boto3
import os

# Local file to publish and its destination in S3
file_name = 'drugs_final.csv'
bucket_name = 'pgx-repository'
s3_key = f"ade-risk-model/Step5_Time_to_Event_Model/5_feature_importances/cohort{cohort_num}/{file_name}"

# S3 client created with whatever credentials the environment provides
s3_client = boto3.client('s3')

# Attempt the upload; a failure is reported on stdout and the cell still completes
try:
    s3_client.upload_file(file_name, bucket_name, s3_key)
except Exception as e:
    print(f"An error occurred: {e}")
else:
    print(f"File '{file_name}' successfully uploaded to S3 bucket '{bucket_name}' as '{s3_key}'.")
