# CatBoost Analysis - R Kernel
[Return to Main Pipeline](../pgx_cohort_pipeline.ipynb)

## 1. Load Train/Test Datasets

In [11]:
#' Evaluate an expression with messages and warnings suppressed.
#'
#' @param expr Expression (typically a `{ ... }` block) to run. It is a lazy
#'   promise, so it is evaluated in the caller's environment and any
#'   assignments inside it land there.
#' @return The value of `expr`.
quiet <- function(expr) {
  # Pass the promise straight through: it is forced inside the suppress*
  # handlers. Wrapping it in eval() would force it and then evaluate its
  # *value* a second time, re-running any call or symbol the block returns.
  suppressMessages(suppressWarnings(expr))
}

In [12]:
# Load the train/test Parquet datasets for the 65-74 age band from S3 and
# preview their structure. The whole cell runs inside quiet() so package
# start-up messages and warnings stay out of the notebook output.
quiet({
  library(duckdb)
  library(arrow)
  library(dplyr)

  # NOTE: arrow reads the datasets directly from their s3:// URIs. The DuckDB
  # connection and S3FileSystem handle previously created here were never used
  # for the reads, and the connection was never closed, so they are no longer
  # opened. The parameter-loading cell opens (and closes) its own connection.

  # === Define S3 Paths ===
  train_path <- "s3://pgxdatalake/catboost_models/non_opioid_ed/age_band=65-74/train/dataset.parquet"
  test_path  <- "s3://pgxdatalake/catboost_models/non_opioid_ed/age_band=65-74/test/dataset.parquet"

  # === Load Train and Test Data ===
  train_df <- read_parquet(train_path)
  test_df  <- read_parquet(test_path)

  # === Preview schema ===
  # glimpse() prints the column summary itself and returns its input
  # invisibly; wrapping it in print() would also dump the table a second time.
  glimpse(train_df)
  glimpse(test_df)

  # Character columns are the ones later treated as categorical features.
  # vapply() guarantees a logical vector even for a zero-column table.
  cat_vars <- names(train_df)[vapply(train_df, is.character, logical(1))]
  print("Categorical variables:")
  print(cat_vars)
})

Rows: 454,453
Columns: 107
$ event_date            <dttm> 2017-03-07, 2017-03-07, 2017-03-10, 2017-03-10,…
$ event_year            <int> 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, …
$ mi_person_key         <chr> "1000000185", "1000000185", "1000000185", "10000…
$ drug_name             <chr> "tarka", "atorvastatin_calcium", "azithromycin",…
$ target                <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ pattern_1             <chr> "0", "9efb483a", "0", "0", "0", "0", …

In [13]:
library(DBI)
library(duckdb)
library(jsonlite)

# Load the tuned CatBoost hyper-parameters (one JSON document on S3) through
# an in-memory DuckDB connection and turn them into a named list.

# Define S3 JSON path and the query that reads it
s3_path <- "s3://pgxdatalake/catboost_models/non_opioid_ed/age_band=65-74/catboost_params_r.json"
query <- sprintf("SELECT * FROM read_json_auto('%s')", s3_path)

# Initialize DuckDB connection
con <- dbConnect(duckdb::duckdb(), dbdir = ":memory:")

# Everything that touches the connection runs inside tryCatch() so the
# connection is shut down even when an extension, the credential lookup or
# the S3 read fails.
params_df <- tryCatch(
  {
    # Enable S3 and JSON capabilities
    dbExecute(con, "INSTALL json; LOAD json;")
    dbExecute(con, "INSTALL httpfs; LOAD httpfs;")
    dbExecute(con, "INSTALL aws; LOAD aws;")
    dbExecute(con, "CALL load_aws_credentials();")

    # Read JSON file using DuckDB
    dbGetQuery(con, query)
  },
  finally = dbDisconnect(con, shutdown = TRUE)
)

# An empty result would otherwise become a list of NA "parameters" and only
# fail later, inside training, with an unrelated-looking error.
if (nrow(params_df) < 1L) {
  stop("No CatBoost parameters found in ", s3_path, call. = FALSE)
}

# Convert the first (only expected) row to a named list for CatBoost
params_list <- as.list(params_df[1, ])

# Optional: print or inspect
print(params_list)


$learning_rate
[1] 0.2864326

$depth
[1] 6

$colsample_bylevel
[1] 0.7341253

$min_data_in_leaf
[1] 70

$l2_leaf_reg
[1] 1.72146

$iterations
[1] 1000

$grow_policy
[1] "Lossguide"

$boosting_type
[1] "Plain"

$bootstrap_type
[1] "MVS"

$early_stopping_rounds
[1] 50

$eval_metric
[1] "Logloss"

$random_seed
[1] 1997

$verbose
[1] 0

$age_band
[1] "65-74"



In [14]:
# Finalise the parameter list handed to CatBoost in a single step:
#   * age_band is removed (it is not a CatBoost training option)
#   * eval_metric and loss_function are both pinned to "Logloss"
params_list <- modifyList(
  params_list,
  list(age_band = NULL, eval_metric = "Logloss", loss_function = "Logloss")
)

# Any entry that arrived wrapped in a list is flattened to a plain vector;
# everything else passes through untouched.
params_list <- lapply(params_list, function(value) {
  if (!is.list(value)) {
    return(value)
  }
  unlist(value)
})

# Show the resulting structure so the types can be checked by eye
str(params_list)


List of 14
 $ learning_rate        : num 0.286
 $ depth                : num 6
 $ colsample_bylevel    : num 0.734
 $ min_data_in_leaf     : num 70
 $ l2_leaf_reg          : num 1.72
 $ iterations           : num 1000
 $ grow_policy          : chr "Lossguide"
 $ boosting_type        : chr "Plain"
 $ bootstrap_type       : chr "MVS"
 $ early_stopping_rounds: num 50
 $ eval_metric          : chr "Logloss"
 $ random_seed          : num 1997
 $ verbose              : num 0
 $ loss_function        : chr "Logloss"


In [15]:
# Build the modelling tables: drop the columns that are not features, turn
# character columns into factors (categorical features) and make every other
# feature a plain double.

# Drop unnecessary columns
drop_cols <- c("mi_person_key", "event_date", "event_year", "group_id", "__index_level_0__")
train_data <- train_df %>% select(-all_of(drop_cols))
test_data  <- test_df %>% select(-all_of(drop_cols))

# Step 1: Coerce character columns to factors. The column set is taken from
# the training data and applied to both tables.
char_cols <- names(train_data)[vapply(train_data, is.character, logical(1))]
train_data[char_cols] <- lapply(train_data[char_cols], as.factor)
test_data[char_cols]  <- lapply(test_data[char_cols], as.factor)

# Step 2: Identify categorical columns (factors)
categorical_cols <- names(train_data)[vapply(train_data, is.factor, logical(1))]

# Step 3: Coerce all non-categorical features to numeric.
# Numeric and logical columns are converted directly: routing them through
# as.character() rounds doubles to 15 significant digits and turns
# TRUE/FALSE into NA. The text round-trip is kept only for other types.
to_numeric <- function(x) {
  if (is.numeric(x) || is.logical(x)) {
    as.numeric(x)
  } else {
    as.numeric(as.character(x))
  }
}

numeric_cols <- setdiff(names(train_data), c("target", categorical_cols))
train_data[numeric_cols] <- lapply(train_data[numeric_cols], to_numeric)
test_data[numeric_cols]  <- lapply(test_data[numeric_cols], to_numeric)


## 2. CatBoost Pipeline

In [16]:
library(catboost)
library(dplyr)


# Build the CatBoost pools from the prepared tables and train the model.

# Remove target column to inspect features only
features_train <- select(train_data, -target)

# Make sure every non-categorical feature is a double before pooling.
# Numeric and logical columns are converted directly; sending them through
# as.character() would round doubles to 15 significant digits and map
# TRUE/FALSE to NA. Other types keep the text round-trip.
numeric_cols <- setdiff(names(features_train), categorical_cols)
as_feature_double <- function(x) {
  if (is.numeric(x) || is.logical(x)) {
    as.numeric(x)
  } else {
    as.numeric(as.character(x))
  }
}

train_data[numeric_cols] <- lapply(train_data[numeric_cols], as_feature_double)
test_data[numeric_cols]  <- lapply(test_data[numeric_cols], as_feature_double)

# === Define training and testing pools ===
# Features are every column except `target`; `target` is the label.
train_pool <- catboost.load_pool(
  data = select(train_data, -target),
  label = train_data$target
)

test_pool <- catboost.load_pool(
  data = select(test_data, -target),
  label = test_data$target
)

# === Train final model ===
# NOTE(review): the test pool is passed as the evaluation set, and the
# parameter list contains early_stopping_rounds - confirm that using the
# held-out test data for this is intended.
ed_non_opioid_model_cohort6 <- catboost.train(
  learn_pool = train_pool,
  test_pool = test_pool,
  params = params_list
)


## 3. Feature Importance

In [17]:
# Per-feature importance scores of the trained model, computed on the
# training pool.
importances <- catboost.get_feature_importance(
  ed_non_opioid_model_cohort6,
  pool = train_pool,
  type = "FeatureImportance"
)

# Pair each score with its feature name (every column except the label).
all_columns <- colnames(train_data)
feature_names <- all_columns[all_columns != "target"]
importance_df <- data.frame(feature_name = feature_names, importance = importances)

# Sort by descending importance and show the 30 highest-ranked features.
descending <- order(-importance_df$importance)
importance_df <- importance_df[descending, ]
print(head(importance_df, 30))


                             feature_name   importance
drug_name                       drug_name 8.855378e+01
pattern_1                       pattern_1 4.650160e+00
pattern_5                       pattern_5 1.448110e+00
pattern_2_lift             pattern_2_lift 1.368222e+00
pattern_1_lift             pattern_1_lift 1.049881e+00
pattern_2_support       pattern_2_support 5.429555e-01
pattern_1_certainty   pattern_1_certainty 4.954519e-01
pattern_2_confidence pattern_2_confidence 4.233358e-01
pattern_1_support       pattern_1_support 4.021109e-01
pattern_8                       pattern_8 2.468538e-01
pattern_2_certainty   pattern_2_certainty 2.304825e-01
pattern_2                       pattern_2 1.524628e-01
pattern_4                       pattern_4 1.423138e-01
pattern_18                     pattern_18 1.378250e-01
pattern_7                       pattern_7 9.985262e-02
pattern_16                     pattern_16 3.856357e-02
pattern_11                     pattern_11 7.674536e-03
pattern_9 

## 4. Save CatBoost Model with Metadata

In [18]:
# Read back the parameters stored in the trained model and check whether the
# recorded grow policy is "SymmetricTree".
model_params <- catboost.get_model_params(ed_non_opioid_model_cohort6)
recorded_grow_policy <- model_params$tree_learner_options$grow_policy
is_symmetric <- recorded_grow_policy == "SymmetricTree"
is_symmetric

In [10]:
library(catboost)
library(jsonlite)
library(arrow)
library(paws)

# Save the trained model locally together with its metadata (JSON + Parquet).

# === AWS Setup ===
Sys.setenv("AWS_DEFAULT_REGION" = "us-east-1")
Sys.setenv("AWS_EC2_METADATA_DISABLED" = "FALSE")  # Required on EC2
s3 <- paws::s3()

# === Paths ===
s3_bucket <- "pgxdatalake"
s3_prefix <- "catboost_models/non_opioid_ed/age_band=65-74"
catboost_dir <- "/home/pgx3874/pgx-datasets/catboost_analysis/catboost_models/ed_non_opioid/cohort6"
dir.create(catboost_dir, recursive = TRUE, showWarnings = FALSE)

# === File Paths ===
tree_path <- file.path(catboost_dir, "tree_rules_r.json")
model_path <- file.path(catboost_dir, "catboost_model_r.cbm")
info_json_path <- file.path(catboost_dir, "catboost_model_info_r.json")
info_parquet_path <- file.path(catboost_dir, "catboost_model_info_r.parquet")

# === Save CatBoost model ===
catboost.save_model(ed_non_opioid_model_cohort6, model_path)
# tree_rules.json not supported in R; skip or do in Python if needed

# === Save model metadata ===
feature_names <- colnames(train_data)[colnames(train_data) != "target"]
importances <- catboost.get_feature_importance(ed_non_opioid_model_cohort6)

model_params <- catboost.get_model_params(ed_non_opioid_model_cohort6)
# Use the stored best iteration when the model parameters expose one;
# fall back to 0 if not found.
best_iteration <- if (!is.null(model_params$flat_params$best_iteration)) {
  as.numeric(model_params$flat_params$best_iteration)
} else {
  0
}

# === Save Model Info ===
model_info <- list(
  feature_names = feature_names,
  # A named *list* is written as a JSON object {"feature": score, ...}.
  # jsonlite drops the names of a named atomic vector and would emit a bare
  # array, losing the feature -> importance mapping.
  feature_importances = as.list(setNames(as.numeric(importances), feature_names)),
  best_iteration = best_iteration,
  best_score = NULL
)

# null = "null" writes best_score as JSON null instead of an empty object {}.
# digits = NA keeps full precision; the default rounds to 4 decimal places.
write_json(
  model_info,
  info_json_path,
  pretty = TRUE,
  auto_unbox = TRUE,
  null = "null",
  digits = NA
)

# Same importances in tabular form, one row per feature
info_df <- data.frame(
  feature_name = feature_names,
  feature_importance = as.numeric(importances)
)
write_parquet(info_df, info_parquet_path)

# === Upload using paws ===
#' Upload one local file to S3 under the model prefix.
#'
#' @param local_path Path of the file to upload.
#' @param s3_key Object name appended to `prefix` to form the S3 key.
#' @param bucket Target bucket; defaults to the notebook-level `s3_bucket`.
#' @param prefix Key prefix; defaults to the notebook-level `s3_prefix`.
#' @param client Object exposing `put_object(Bucket, Key, Body)`; defaults to
#'   the notebook-level paws client `s3`.
#' @return The full S3 key, invisibly. Stops with an error when `local_path`
#'   does not exist.
upload_with_paws <- function(local_path, s3_key,
                             bucket = s3_bucket,
                             prefix = s3_prefix,
                             client = s3) {
  # Fail with a clear message: a missing file has size NA, which would
  # otherwise surface as an obscure readBin() error.
  if (!file.exists(local_path)) {
    stop("Cannot upload: local file does not exist: ", local_path, call. = FALSE)
  }

  # Build the key once and reuse it for both the request and the log line.
  key <- file.path(prefix, s3_key)
  payload <- readBin(local_path, what = "raw", n = file.size(local_path))

  client$put_object(
    Bucket = bucket,
    Key = key,
    Body = payload
  )
  message(sprintf("✓ Uploaded %s to s3://%s/%s", s3_key, bucket, key))
  invisible(key)
}

# Push the model file and both metadata files to S3. Each object is stored
# under its own file name, in the same order as before.
artifact_paths <- c(model_path, info_json_path, info_parquet_path)
for (artifact in artifact_paths) {
  upload_with_paws(artifact, basename(artifact))
}

# === Log Summary ===
# Report where the outputs were written, locally and on S3.
cat("\nModel Information:\n")
cat("✓ All outputs saved to local path:", catboost_dir, "\n")
cat("✓ All outputs uploaded to: s3://", s3_bucket, "/", s3_prefix, "\n", sep = "")


✓ Uploaded catboost_model_r.cbm to s3://pgxdatalake/catboost_models/non_opioid_ed/age_band=65-74/catboost_model_r.cbm

✓ Uploaded catboost_model_info_r.json to s3://pgxdatalake/catboost_models/non_opioid_ed/age_band=65-74/catboost_model_info_r.json

✓ Uploaded catboost_model_info_r.parquet to s3://pgxdatalake/catboost_models/non_opioid_ed/age_band=65-74/catboost_model_info_r.parquet




Model Information:
✓ All outputs saved to local path: /home/pgx3874/pgx-datasets/catboost_analysis/catboost_models/ed_non_opioid/cohort6 
✓ All outputs uploaded to: s3://pgxdatalake/catboost_models/non_opioid_ed/age_band=65-74


# Back to Main Pipeline
[Return to Main Pipeline](../pgx_cohort_pipeline.ipynb)