In [1]:
%%configure -f
{
    "conf": {
        "spark.jars.packages": "ai.catboost:catboost-spark_3.5_2.12:1.2.7",
        "spark.jars.packages.resolve.transitive": "true",

        "spark.driver.memory": "45g",
        "spark.driver.extraJavaOptions": "--add-exports java.base/sun.net.util=ALL-UNNAMED",
        "spark.yarn.am.memory": "45g",

        "spark.executor.memory": "180g",
        "spark.executor.memoryOverhead": "8g",
        "spark.executor.cores": "1",
        "spark.task.cpus": "1",
        "spark.executor.extraJavaOptions": "--add-exports java.base/sun.net.util=ALL-UNNAMED",
        "spark.executorEnv.CATBOOST_WORKER_INIT_TIMEOUT": "3600s",

        "spark.dynamicAllocation.enabled": "true",
        "spark.dynamicAllocation.minExecutors": "2",
        "spark.dynamicAllocation.maxExecutors": "103",
        "spark.shuffle.service.enabled": "true",

        "spark.network.timeout": "1200s",
        "spark.rpc.askTimeout": "1200s",
        "spark.rpc.message.maxSize": "512",
        "spark.sql.broadcastTimeout": "1200s",
        "spark.sql.session.timeout": "1200s",

        "spark.sql.shuffle.partitions": "103",
        "spark.sql.autoBroadcastJoinThreshold": "-1",
        "spark.broadcast.compress": "true"
    }
}

In [2]:
import pyspark
from pyspark import StorageLevel
from pyspark.ml.linalg import Vectors
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when
from pyspark.sql.types import StructType, StructField, DoubleType, StringType

import catboost_spark

VBox()

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
0,application_1735853484859_0001,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [3]:
# Notebook parameter: the cohort tag is interpolated into the S3 paths used for
# reading the processed datasets and for saving the trained models.
cohort = "cohort6"

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [4]:
# S3 prefix holding the processed train/test splits for the selected cohort
s3_bucket = f"s3://pgx-repository/ade-risk-model/Step5_Time_to_Event_Model/2_processed_datasets/{cohort}"
train_input_path, test_input_path = (
    f"{s3_bucket}/{split}" for split in ("train", "test")
)

# Load both splits (train first, then test) from their parquet locations
print("Reading train and test datasets...")
train_df, test_df = (
    spark.read.parquet(path) for path in (train_input_path, test_input_path)
)

print("Train and test datasets successfully loaded.")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Reading train and test datasets...
Train and test datasets successfully loaded.

In [5]:
# Sanity check: print the schema of each loaded split under its own heading
for split_name, split_df in (("Train", train_df), ("Test", test_df)):
    print(f"{split_name} Dataframe Schema:")
    split_df.printSchema()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Train Dataframe Schema:
root
 |-- mi_person_key: string (nullable = true)
 |-- drug_date: date (nullable = true)
 |-- ADE_Date: date (nullable = true)
 |-- standardized_drug_name: string (nullable = true)
 |-- label: integer (nullable = true)
 |-- person_key_index: double (nullable = true)
 |-- drug_name_index: double (nullable = true)
 |-- drug_name_one_hot: vector (nullable = true)
 |-- features: vector (nullable = true)

Test Dataframe Schema:
root
 |-- mi_person_key: string (nullable = true)
 |-- drug_date: date (nullable = true)
 |-- ADE_Date: date (nullable = true)
 |-- standardized_drug_name: string (nullable = true)
 |-- label: integer (nullable = true)
 |-- person_key_index: double (nullable = true)
 |-- drug_name_index: double (nullable = true)
 |-- drug_name_one_hot: vector (nullable = true)
 |-- features: vector (nullable = true)

In [7]:
# Spread each split over 103 partitions, the same value used in the %%configure
# cell for spark.dynamicAllocation.maxExecutors and spark.sql.shuffle.partitions.
num_partitions = 103
train_df, test_df = (
    split_df.repartition(num_partitions) for split_df in (train_df, test_df)
)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [8]:
# CatBoost Pool objects

# Keep only the two columns passed on to CatBoost and mark both splits for
# caching in memory, spilling to disk when they do not fit.
# NOTE: persist() is lazy -- nothing is cached until a Spark action first runs
# over these DataFrames.
train_df = train_df.select("features", "label").persist(StorageLevel.MEMORY_AND_DISK)
test_df = test_df.select("features", "label").persist(StorageLevel.MEMORY_AND_DISK)

# Wrap the persisted DataFrames in CatBoost Pool objects for training/evaluation
train_pool = catboost_spark.Pool(train_df)
test_pool = catboost_spark.Pool(test_df)

# Print the storage level assigned to each DataFrame. This confirms the level
# that was requested; it does not by itself show the data is materialized.
print(train_df.storageLevel)
print(test_df.storageLevel)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Disk Memory Serialized 1x Replicated
Disk Memory Serialized 1x Replicated

In [None]:
# Seeds for different runs - 10 models (10 runs for stable feature selection)
seeds = [3, 24, 18, 17, 19, 11, 38, 74, 35, 90]

# Destination prefix for this cohort's models; it does not change per run
models_root = f"s3://pgx-repository/ade-risk-model/Step5_Time_to_Event_Model/4_models/{cohort}"

# Train one classifier per seed. Model numbers are 1-based and follow the
# position of the seed in `seeds`, so seed 3 -> model 1, seed 24 -> model 2, ...
for model_num, seed in enumerate(seeds, start=1):
    print(f"Training model {model_num} with seed {seed}...")

    # Initialize CatBoost Spark Classifier with the current seed
    classifier = catboost_spark.CatBoostClassifier(randomSeed=seed)

    # Train the model, using the test pool as the evaluation dataset
    model = classifier.fit(train_pool, evalDatasets=[test_pool])

    # Save the Spark model (with metadata); an existing model at this path is
    # replaced because of overwrite()
    spark_model_path = f"{models_root}/spark_model_{model_num}"
    model.write().overwrite().save(spark_model_path)

    print(f"Spark model {model_num} with metadata saved to: {spark_model_path}")

    # Drop the Python references before the next run
    # (presumably so the underlying objects can be released -- TODO confirm)
    del classifier
    del model

In [9]:
# Re-run only the models that are missing or need to be redone.
# Each entry maps a model number to the seed it was trained with in the full
# 10-seed run (model number = 1-based position of the seed in that list).
# Spelling out the pairing keeps a re-run writing to its own spark_model_<n>
# path: with a counter restarting at 1, re-running any seed other than the
# first would silently overwrite spark_model_1.
seeds_by_model_num = {1: 3}

# Destination prefix for this cohort's models; it does not change per run
models_root = f"s3://pgx-repository/ade-risk-model/Step5_Time_to_Event_Model/4_models/{cohort}"

for model_num, seed in seeds_by_model_num.items():
    print(f"Training model {model_num} with seed {seed}...")

    # Initialize CatBoost Spark Classifier with the current seed
    classifier = catboost_spark.CatBoostClassifier(randomSeed=seed)

    # Train the model, using the test pool as the evaluation dataset
    model = classifier.fit(train_pool, evalDatasets=[test_pool])

    # Save the Spark model (with metadata); an existing model at this path is
    # replaced because of overwrite()
    spark_model_path = f"{models_root}/spark_model_{model_num}"
    model.write().overwrite().save(spark_model_path)

    print(f"Spark model {model_num} with metadata saved to: {spark_model_path}")

    # Drop the Python references before the next run
    # (presumably so the underlying objects can be released -- TODO confirm)
    del classifier
    del model

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Exception in thread cell_monitor-8:
Traceback (most recent call last):
  File "/mnt/notebook-env/lib/python3.9/threading.py", line 980, in _bootstrap_inner
    self.run()
  File "/mnt/notebook-env/lib/python3.9/threading.py", line 917, in run
    self._target(*self._args, **self._kwargs)
  File "/mnt/notebook-env/lib/python3.9/site-packages/awseditorssparkmonitoringwidget/cellmonitor.py", line 178, in cell_monitor
    job_binned_stages[job_id][stage_id] = all_stages[stage_id]
KeyError: 1733
An error was encountered:
Invalid status code '400' from http://localhost:8998/sessions/0/statements/8 with error payload: {"msg":"requirement failed: Session isn't active."}


In [None]:
# Release the cached train/test DataFrames now that training is finished
for cached_df in (train_df, test_df):
    cached_df.unpersist()