Copied from M4 Modality version, 4/13



In [None]:
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import pickle
import os

from torch.utils.data import Dataset, DataLoader
from sklearn.utils.class_weight import compute_class_weight
from torch.optim.lr_scheduler import ReduceLROnPlateau

# Read Data

In [None]:
# SET: directory that holds trials_train.csv, trials_val.csv and trials_test.csv.
# The load cell appends '/<file name>' to this string, so an empty value
# resolves to paths directly under the filesystem root.
# NOTE(review): this name shadows Python's builtin dir() for the rest of the notebook.
dir = ''

In [None]:
# Read the three pre-split CSV files (train, validation, test) from `dir`
df_train, df_val, df_test = (
    pd.read_csv(dir + suffix)
    for suffix in ('/trials_train.csv', '/trials_val.csv', '/trials_test.csv')
)

# Preview the first rows of the training split
df_train.head()

Unnamed: 0,id,start_date,status,why_stopped,hasResults,phase,allocation,intervention_model,primary_purpose,acc_text,...,text_outcomes,text_criteria,dmc_oversight,fda_drug,fda_device,unapproved_device,ae_score,sae_events,other_ae_events,stringency_index
0,NCT05099822,2020-03-13,TERMINATED,Business objectives changed.,False,PHASE1,RANDOMIZED,SEQUENTIAL,TREATMENT,"This study aims to evaluate the safety, tolera...",...,Primary Outcomes: \n1. Measure: Incidence of A...,"Inclusion Criteria:\n* In good health, as dete...",False,False,False,,,,,30.09
1,NCT05225870,2021-01-01,COMPLETED,,False,,,SINGLE_GROUP,BASIC_SCIENCE,Colorectal carcinoma is one of the most aggres...,...,Primary Outcomes: \n1. Measure: immunohistoche...,Inclusion Criteria:\n* patients with colorecta...,True,False,False,,,,,71.76
2,NCT05617417,2021-05-05,COMPLETED,,False,,,SINGLE_GROUP,TREATMENT,We aimed to evaluate the efficacy of locally a...,...,Primary Outcomes: \n1. Measure: Changes in uri...,Inclusion Criteria:\n* The patient who has pur...,False,False,False,,,,,53.98
3,NCT03696576,2018-09-20,TERMINATED,"Due to the pandemic, recruitment ended earlier...",True,,RANDOMIZED,PARALLEL,TREATMENT,The larynx and vocal folds undergo many age-re...,...,Primary Outcomes: \n1. Measure: Voice Handicap...,Inclusion Criteria:\n* Age 65 or older\n* Diag...,False,False,True,,0.0,,,0.0
4,NCT02400723,2018-12-05,COMPLETED,,True,,RANDOMIZED,PARALLEL,TREATMENT,"Anxiety leads to poor quality of life, avoidan...",...,Primary Outcomes: \n1. Measure: Change in Anxi...,Inclusion Criteria:\n* Veterans aged 60 years ...,False,False,False,,0.005893,,{'Musculoskeletal and connective tissue disord...,0.0


## Get Trials With Results

In [None]:
# Filter to only trials with posted results.
# .copy() turns each filtered selection into an independent DataFrame, so the
# later preprocessing steps that add or overwrite columns do not write into a
# slice of the loaded frame (which raises pandas' SettingWithCopyWarning).
df_train = df_train[df_train['hasResults'] == True].copy()
df_val = df_val[df_val['hasResults'] == True].copy()
df_test = df_test[df_test['hasResults'] == True].copy()

# Load Embedding Model

Special tokens
- CLS - 101
- SEP - 102

**STEP:** Set embedding input size.

In [None]:
#SET
# Token length that every text section is truncated/padded to; used as the
# default `max_length` of tokenize_text_sections.
emb_input_size = 512

In [None]:
# Run on the GPU when PyTorch can see one, otherwise on the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# BioBERT checkpoint shared by the tokenizer and the encoder
model_name = "dmis-lab/biobert-base-cased-v1.1"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the encoder and place it on the selected device in one step
bb_model = AutoModel.from_pretrained(model_name).to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/313 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

# Data Preprocess

## Target Terminated
Create binary classification target indicator

In [None]:
def set_targets_terminated(df):
  """Return a copy of `df` with a numeric `terminated` target column.

  The `status` column is mapped as 'COMPLETED' -> 0 and 'TERMINATED' -> 1.
  Any other status value has no entry in the mapping and becomes NaN.

  The input frame is left untouched: the column is added to a new DataFrame,
  so calling this on a filtered selection of another frame does not trigger
  pandas' SettingWithCopyWarning.
  """
  status_mapping = {'COMPLETED': 0, 'TERMINATED': 1}  # Map strings to 0 and 1
  # assign() builds a new frame instead of writing into the caller's object
  return df.assign(terminated=df['status'].map(status_mapping))

## Target AE Risk Score

Check that risk score exists for hasResults == True


In [None]:
# Count the rows in each split that have no adverse-event score
train_missing = int(df_train['ae_score'].isna().sum())
val_missing = int(df_val['ae_score'].isna().sum())
test_missing = int(df_test['ae_score'].isna().sum())

print(f"Train -- Missing ae score rows: {train_missing}")
print(f"Val -- Missing ae score rows: {val_missing}")
print(f"Test -- Missing ae score rows: {test_missing}")

# Every trial kept so far must carry a score; stop the notebook otherwise
total_missing = train_missing + val_missing + test_missing
assert total_missing == 0, "Missing ae score rows"

Train -- Missing ae score rows: 0
Val -- Missing ae score rows: 0
Test -- Missing ae score rows: 0


Process the risk score: clip it at a ceiling of 1.33, so that scores fall in the range [0, 1.33].

Weight is set on 1 "Serious" AE and 1 "Other" AE per trial participant. 

In [None]:
##SET
# Upper bound applied to `ae_score`; used as the default `ceiling` argument
# of cap_targets_ae_score.
ceiling=1.33

In [None]:
def cap_targets_ae_score(df, ceiling = ceiling):
  """Return a copy of `df` whose `ae_score` column is capped at `ceiling`.

  Values above `ceiling` are replaced by `ceiling`; lower values are left
  unchanged (only an upper bound is applied).

  The input frame is left untouched: the capped column is placed in a new
  DataFrame, so calling this on a filtered selection of another frame does
  not trigger pandas' SettingWithCopyWarning.

  Note: the default for `ceiling` is read from the module-level `ceiling`
  once, when this function is defined.
  """
  # assign() overwrites the column in a new frame, keeping its position
  return df.assign(ae_score=df['ae_score'].clip(upper=ceiling))

## Date

In [None]:
def preprocess_date(df, date_col = 'start_date'):
    """Return a copy of `df` with date-derived features added.

    The `date_col` column is parsed to datetime, and three columns are
    appended: `year`, plus `month_sin` / `month_cos`, a cyclical (sine/cosine)
    encoding of the calendar month. The intermediate `month` column is not
    kept in the result. The input frame is not modified.
    """
    return (
        df.copy()
        # Parse the date strings into datetime values
        .assign(**{date_col: lambda frame: pd.to_datetime(frame[date_col])})
        # Calendar parts
        .assign(
            month=lambda frame: frame[date_col].dt.month,
            year=lambda frame: frame[date_col].dt.year,
        )
        # Place the month on the unit circle (period of 12)
        .assign(
            month_sin=lambda frame: np.sin(2 * np.pi * frame['month'] / 12),
            month_cos=lambda frame: np.cos(2 * np.pi * frame['month'] / 12),
        )
        # The raw month was only needed to build the encoding
        .drop(columns=['month'])
    )

## Text
1. Tokenize each individual text section and create input IDs. Leave space for special tokens.
2. Add special tokens CLS and SEP
3. Add padding and attention masks




In [None]:
# Use this one to create input ids and attention mask for BioBERT unfrozen training
def tokenize_text_sections(df, tokenizer=tokenizer, max_length=emb_input_size, batch_size=128):
    """Tokenize the intro, outcomes and criteria text of every row.

    Rows are processed in slices of `batch_size`. For each slice, the three
    text columns (`text_intro`, `text_outcomes`, `text_criteria`) are read,
    null entries are replaced by the empty string, and each list of texts is
    passed to `tokenizer` with truncation and padding to `max_length` and
    with special tokens added by the tokenizer.

    Six columns are written onto `df` itself (the frame is modified in place):
    `<section>_input_ids` and `<section>_attention_mask` for the sections
    intro, outcomes and criteria.

    Args:
        df: frame holding the three text columns.
        tokenizer: callable whose result can be indexed with 'input_ids' and
            'attention_mask'; defaults to the module-level tokenizer.
        max_length: length each sequence is truncated/padded to.
        batch_size: number of rows sent to the tokenizer per call.

    Returns:
        The same `df` object, with the six columns added.
    """
    # (source text column, ids output column, mask output column), listed in
    # the order in which the output columns are appended to the frame
    sections = (
        ('text_intro', 'intro_input_ids', 'intro_attention_mask'),
        ('text_outcomes', 'outcomes_input_ids', 'outcomes_attention_mask'),
        ('text_criteria', 'criteria_input_ids', 'criteria_attention_mask'),
    )
    collected = {}
    for _, ids_col, mask_col in sections:
        collected[ids_col] = []
        collected[mask_col] = []

    for start in range(0, len(df), batch_size):
        rows = df.iloc[start:start + batch_size]

        # Read all three text lists first; NaN values produced by read_csv
        # are turned into empty strings
        texts = {
            src_col: ['' if pd.isnull(text) else str(text) for text in rows[src_col].tolist()]
            for src_col, _, _ in sections
        }

        for src_col, ids_col, mask_col in sections:
            encoded = tokenizer(
                texts[src_col],
                max_length=max_length,
                truncation=True,
                padding='max_length',
                add_special_tokens=True,  # Let tokenizer add CLS and SEP
                return_attention_mask=True,
            )
            collected[ids_col].extend(encoded['input_ids'])
            collected[mask_col].extend(encoded['attention_mask'])

    # Attach one list of token ids / mask values per row
    for _, ids_col, mask_col in sections:
        df[ids_col] = collected[ids_col]
        df[mask_col] = collected[mask_col]

    return df

## Run Preprocess Steps

In [None]:
# Each step below is applied to train, val and test (in that order) before
# the next step starts.

# Set targets terminated
df_train, df_val, df_test = (
    set_targets_terminated(split) for split in (df_train, df_val, df_test)
)

# Cap the ae_score target at the ceiling
df_train, df_val, df_test = (
    cap_targets_ae_score(split) for split in (df_train, df_val, df_test)
)

# Preprocess date
df_train, df_val, df_test = (
    preprocess_date(split) for split in (df_train, df_val, df_test)
)

# Tokenize the text sections (truncated to the embedding input size)
df_train, df_val, df_test = (
    tokenize_text_sections(split) for split in (df_train, df_val, df_test)
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'terminated'] = df['status'].map(status_mapping)  # Create a new numeric column
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['ae_score'] = df['ae_score'].clip(upper=ceiling)


model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

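Note: the chunked variant of this pipeline produces multiple chunk rows for each chunked trial, and the CLS tokens are then aggregated, whether within the model or after, for classification. In this version each text section is truncated to the embedding input size instead, so every trial stays a single row.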

## Prepare and Check
- Each token_id and attention mask length should be the set embedding input size, default 512.
- Each chunk should start with 101 and end with 102
- Padding positions should have token ID 0 and attention mask 0
- No chunking, the resulting number of rows should be equal to original


In [None]:
# #SET split
# df = df_train

# print(f"Embedding input size: {emb_input_size}\n\nCheck:")

# # Check
# print(f"Check max token ids length: {df['intro_input_ids'].apply(len).max()}")
# print(f"Check min token ids length: {df['intro_input_ids'].apply(len).min()}")
# print(f"Check max attention mask length: {df['intro_attention_mask'].apply(len).max()}")
# print(f"Check min attention mask length: {df['intro_attention_mask'].apply(len).min()}")

# print(f"Check max token ids length: {df['outcomes_input_ids'].apply(len).max()}")
# print(f"Check min token ids length: {df['outcomes_input_ids'].apply(len).min()}")
# print(f"Check max attention mask length: {df['outcomes_attention_mask'].apply(len).max()}")
# print(f"Check min attention mask length: {df['outcomes_attention_mask'].apply(len).min()}")

# print(f"Check max token ids length: {df['criteria_input_ids'].apply(len).max()}")
# print(f"Check min token ids length: {df['criteria_input_ids'].apply(len).min()}")
# print(f"Check max attention mask length: {df['criteria_attention_mask'].apply(len).max()}")
# print(f"Check min attention mask length: {df['criteria_attention_mask'].apply(len).min()}")

# # Check known example k
# k=0
# print(f"k = {k}")
# print(f"Check token_ids: {df['intro_input_ids'].iloc[k]}")
# print(f"Check attention_mask: {df['intro_attention_mask'].iloc[k]}")
# print(f"Check token_ids: {df['outcomes_input_ids'].iloc[k]}")
# print(f"Check attention_mask: {df['outcomes_attention_mask'].iloc[k]}")
# print(f"Check token_ids: {df['criteria_input_ids'].iloc[k]}")
# print(f"Check attention_mask: {df['criteria_attention_mask'].iloc[k]}")

# display(df)

Embedding input size: 512

Check:
Check max token ids length: 512
Check min token ids length: 512
Check max attention mask length: 512
Check min attention mask length: 512
Check max token ids length: 512
Check min token ids length: 512
Check max attention mask length: 512
Check min attention mask length: 512
Check max token ids length: 512
Check min token ids length: 512
Check max attention mask length: 512
Check min attention mask length: 512
k = 0
Check token_ids: [101, 1103, 2495, 15023, 1775, 1105, 5563, 17373, 13971, 1242, 1425, 118, 2272, 2607, 1107, 1147, 25445, 1105, 2401, 1115, 1169, 1730, 1106, 5576, 18766, 9739, 3154, 1113, 1103, 1490, 117, 1114, 2607, 1107, 1103, 19192, 1449, 7090, 1158, 1292, 16312, 1116, 119, 1292, 2607, 117, 1145, 1270, 3073, 13341, 21250, 1465, 117, 1169, 1138, 3021, 1260, 19091, 15595, 3154, 1113, 1103, 2491, 1104, 9808, 2833, 119, 1175, 1132, 1374, 2527, 1115, 1138, 17428, 1103, 1329, 1104, 1490, 7606, 3252, 6665, 1111, 1292, 4420, 119, 1103, 2425, 64

Unnamed: 0,id,start_date,status,why_stopped,hasResults,phase,allocation,intervention_model,primary_purpose,acc_text,...,terminated,year,month_sin,month_cos,intro_input_ids,intro_attention_mask,outcomes_input_ids,outcomes_attention_mask,criteria_input_ids,criteria_attention_mask
3,NCT03696576,2018-09-20,TERMINATED,"Due to the pandemic, recruitment ended earlier...",True,,RANDOMIZED,PARALLEL,TREATMENT,The larynx and vocal folds undergo many age-re...,...,1,2018,-1.000000e+00,-1.836970e-16,"[101, 1103, 2495, 15023, 1775, 1105, 5563, 173...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 2425, 13950, 131, 122, 119, 4929, 131, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 10838, 9173, 131, 115, 1425, 2625, 1137,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,NCT02400723,2018-12-05,COMPLETED,,True,,RANDOMIZED,PARALLEL,TREATMENT,"Anxiety leads to poor quality of life, avoidan...",...,0,2018,-2.449294e-16,1.000000e+00,"[101, 10507, 4501, 1106, 2869, 3068, 1104, 129...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 2425, 13950, 131, 122, 119, 4929, 131, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 10838, 9173, 131, 115, 11461, 4079, 2539...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
15,NCT04683939,2022-01-18,TERMINATED,Sponsor decision,True,PHASE1,NON_RANDOMIZED,SEQUENTIAL,TREATMENT,"This study was planned as an open-label, multi...",...,1,2022,5.000000e-01,8.660254e-01,"[101, 1142, 2025, 1108, 2919, 1112, 1126, 1501...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 2425, 13950, 131, 122, 119, 4929, 131, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 2501, 10838, 9173, 131, 1111, 1155, 2192...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
16,NCT03410914,2018-08-13,COMPLETED,,True,PHASE2,,SINGLE_GROUP,PREVENTION,Despite improvements and advances in pancreas ...,...,0,2018,-8.660254e-01,-5.000000e-01,"[101, 2693, 8313, 1105, 11823, 1107, 13316, 13...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 2425, 13950, 131, 122, 119, 4929, 131, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 10838, 9173, 131, 115, 4533, 1106, 13971...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
26,NCT04749745,2020-06-09,COMPLETED,,True,EARLY_PHASE1,RANDOMIZED,CROSSOVER,OTHER,Major depressive disorder (MDD) is a serious m...,...,0,2020,1.224647e-16,-1.000000e+00,"[101, 1558, 1260, 16568, 8936, 113, 182, 13976...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 2425, 13950, 131, 122, 119, 4929, 131, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 10838, 9173, 131, 122, 119, 4457, 117, 4...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47924,NCT03567616,2018-10-18,TERMINATED,Following results of the primary progression-f...,True,PHASE2,NON_RANDOMIZED,SEQUENTIAL,TREATMENT,"This was an open-label, multicenter study desi...",...,1,2018,-8.660254e-01,5.000000e-01,"[101, 1142, 1108, 1126, 1501, 118, 3107, 117, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 2425, 13950, 131, 122, 119, 4929, 131, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 10838, 9173, 131, 115, 1231, 16046, 5591...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
47926,NCT04596085,2020-09-16,COMPLETED,,True,,RANDOMIZED,PARALLEL,SUPPORTIVE_CARE,This is a double blind randomized placebo cont...,...,0,2020,-1.000000e+00,-1.836970e-16,"[101, 1142, 1110, 170, 2702, 7198, 7091, 2200,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 2425, 13950, 131, 122, 119, 4929, 131, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 10838, 9173, 131, 122, 119, 1851, 1201, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
47927,NCT04247802,2020-08-05,COMPLETED,,True,,RANDOMIZED,PARALLEL,TREATMENT,Backwards walking has been shown to improve ba...,...,0,2020,-8.660254e-01,-5.000000e-01,"[101, 11316, 3179, 1144, 1151, 2602, 1106, 460...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 2425, 13950, 131, 122, 119, 4929, 131, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 10838, 9173, 131, 115, 1441, 1105, 1535,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
47933,NCT03811951,2018-09-12,TERMINATED,Different study initiation,True,PHASE2,RANDOMIZED,CROSSOVER,DIAGNOSTIC,The purpose of this study is to evaluate cogni...,...,1,2018,-1.000000e+00,-1.836970e-16,"[101, 1103, 3007, 1104, 1142, 2025, 1110, 1106...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 2425, 13950, 131, 122, 119, 4929, 131, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 10838, 9173, 131, 115, 1425, 1206, 1626,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


## Feature Selection

Columns used as model input, grouped by feature type, plus the ID, date, and target columns.

In [None]:
# Feature types
# Identifier and raw date column names
id_col = 'id'
date_col = 'start_date'
# Numerical features; fit_pipeline_train fits a StandardScaler for each of
# these. The date-derived columns are currently switched off.
numerical_cols = ['stringency_index',
                  # 'year',
                  # 'month_sin',
                  # 'month_cos'
                  ]
# Categorical features; null-filled by fill_nan_categorical and later
# replaced by integer codes learned from the training split.
categorical_cols = ['phase',
                    'allocation',
                    'intervention_model',
                    'primary_purpose',
                    'dmc_oversight',
                    'fda_drug',
                    'fda_device',
                    'unapproved_device'
                    ]
# Targets: binary terminated flag (set_targets_terminated) and the capped
# adverse-event score (cap_targets_ae_score)
target_term_col = 'terminated'
target_score_col = 'ae_score'

## Handling Nulls

For robustness, replace missing categorical values with an explicit null value.

In [None]:
# For data consistency, give missing categorical values an explicit representation
def fill_nan_categorical(df, categorical_cols, fill_value="null"):
    """Replace missing values in the given columns of `df`, in place.

    For every column named in `categorical_cols`, NaN entries and the literal
    string 'NA' are both replaced by `fill_value`. The frame is modified
    directly and nothing is returned.
    """
    for name in categorical_cols:
        # Real missing values first, then the literal 'NA' string
        df[name] = df[name].fillna(fill_value).replace('NA', fill_value)

In [None]:
# fill_nan_categorical modifies each frame in place and returns nothing,
# so the results are intentionally not reassigned.
fill_nan_categorical(df_train, categorical_cols)
fill_nan_categorical(df_val, categorical_cols)
fill_nan_categorical(df_test, categorical_cols)

## Time Series Sorting

In [None]:
# def sort_data(df):
#   ''' Sort by date, chunk sequence number '''
#   df = df.sort_values(by=['start_date', 'id', 'chunk_seq'])
#   return df

# df_train = sort_data(df_train)
# df_val = sort_data(df_val)
# df_test = sort_data(df_test)

# Display
df_train

Unnamed: 0,id,start_date,status,why_stopped,hasResults,phase,allocation,intervention_model,primary_purpose,acc_text,...,terminated,year,month_sin,month_cos,intro_input_ids,intro_attention_mask,outcomes_input_ids,outcomes_attention_mask,criteria_input_ids,criteria_attention_mask
3,NCT03696576,2018-09-20,TERMINATED,"Due to the pandemic, recruitment ended earlier...",True,,RANDOMIZED,PARALLEL,TREATMENT,The larynx and vocal folds undergo many age-re...,...,1,2018,-1.000000e+00,-1.836970e-16,"[101, 1103, 2495, 15023, 1775, 1105, 5563, 173...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 2425, 13950, 131, 122, 119, 4929, 131, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 10838, 9173, 131, 115, 1425, 2625, 1137,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,NCT02400723,2018-12-05,COMPLETED,,True,,RANDOMIZED,PARALLEL,TREATMENT,"Anxiety leads to poor quality of life, avoidan...",...,0,2018,-2.449294e-16,1.000000e+00,"[101, 10507, 4501, 1106, 2869, 3068, 1104, 129...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 2425, 13950, 131, 122, 119, 4929, 131, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 10838, 9173, 131, 115, 11461, 4079, 2539...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
15,NCT04683939,2022-01-18,TERMINATED,Sponsor decision,True,PHASE1,NON_RANDOMIZED,SEQUENTIAL,TREATMENT,"This study was planned as an open-label, multi...",...,1,2022,5.000000e-01,8.660254e-01,"[101, 1142, 2025, 1108, 2919, 1112, 1126, 1501...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 2425, 13950, 131, 122, 119, 4929, 131, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 2501, 10838, 9173, 131, 1111, 1155, 2192...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
16,NCT03410914,2018-08-13,COMPLETED,,True,PHASE2,,SINGLE_GROUP,PREVENTION,Despite improvements and advances in pancreas ...,...,0,2018,-8.660254e-01,-5.000000e-01,"[101, 2693, 8313, 1105, 11823, 1107, 13316, 13...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 2425, 13950, 131, 122, 119, 4929, 131, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 10838, 9173, 131, 115, 4533, 1106, 13971...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
26,NCT04749745,2020-06-09,COMPLETED,,True,EARLY_PHASE1,RANDOMIZED,CROSSOVER,OTHER,Major depressive disorder (MDD) is a serious m...,...,0,2020,1.224647e-16,-1.000000e+00,"[101, 1558, 1260, 16568, 8936, 113, 182, 13976...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 2425, 13950, 131, 122, 119, 4929, 131, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 10838, 9173, 131, 122, 119, 4457, 117, 4...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47924,NCT03567616,2018-10-18,TERMINATED,Following results of the primary progression-f...,True,PHASE2,NON_RANDOMIZED,SEQUENTIAL,TREATMENT,"This was an open-label, multicenter study desi...",...,1,2018,-8.660254e-01,5.000000e-01,"[101, 1142, 1108, 1126, 1501, 118, 3107, 117, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 2425, 13950, 131, 122, 119, 4929, 131, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 10838, 9173, 131, 115, 1231, 16046, 5591...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
47926,NCT04596085,2020-09-16,COMPLETED,,True,,RANDOMIZED,PARALLEL,SUPPORTIVE_CARE,This is a double blind randomized placebo cont...,...,0,2020,-1.000000e+00,-1.836970e-16,"[101, 1142, 1110, 170, 2702, 7198, 7091, 2200,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 2425, 13950, 131, 122, 119, 4929, 131, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 10838, 9173, 131, 122, 119, 1851, 1201, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
47927,NCT04247802,2020-08-05,COMPLETED,,True,,RANDOMIZED,PARALLEL,TREATMENT,Backwards walking has been shown to improve ba...,...,0,2020,-8.660254e-01,-5.000000e-01,"[101, 11316, 3179, 1144, 1151, 2602, 1106, 460...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 2425, 13950, 131, 122, 119, 4929, 131, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 10838, 9173, 131, 115, 1441, 1105, 1535,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
47933,NCT03811951,2018-09-12,TERMINATED,Different study initiation,True,PHASE2,RANDOMIZED,CROSSOVER,DIAGNOSTIC,The purpose of this study is to evaluate cogni...,...,1,2018,-1.000000e+00,-1.836970e-16,"[101, 1103, 3007, 1104, 1142, 2025, 1110, 1106...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 2425, 13950, 131, 122, 119, 4929, 131, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 10838, 9173, 131, 115, 1425, 1206, 1626,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


# Model

## Preprocess Fitting on Train
Due to time series nature, do not scale year

In [None]:
#SET
# Numerical columns that fit_pipeline_train skips when fitting scalers,
# so they are left unscaled.
numerical_excl = ['year']

# Pipeline fit on training data
def fit_pipeline_train(df, numerical_cols, categorical_cols):
    """Learn the preprocessing parameters from `df` (the training split).

    Args:
        df: frame to fit on.
        numerical_cols: columns to standardize; any column listed in the
            module-level `numerical_excl` is skipped.
        categorical_cols: columns to integer-encode.

    Returns:
        A pair `(numerical_scalers, categorical_mappings)`:
        - numerical_scalers: {column: StandardScaler fitted on that column}
        - categorical_mappings: {column: {category: code}}, where codes start
          at 1 and follow the order in which categories first appear.
    """
    # One scaler per column, each fitted on a single-column DataFrame
    numerical_scalers = {
        col: StandardScaler().fit(df[[col]])
        for col in numerical_cols
        if col not in numerical_excl  # Do not scale year
    }

    # Codes start at 1 rather than 0
    categorical_mappings = {
        col: {category: code for code, category in enumerate(df[col].unique(), start=1)}
        for col in categorical_cols
    }

    return numerical_scalers, categorical_mappings  # Mappings, not encoder objects

# Fit
# Scalers and category codes are learned from the training split only; the
# same fitted objects are then applied to every split by pipeline_transform.
numerical_scalers, categorical_mappings = fit_pipeline_train(df_train, numerical_cols, categorical_cols)

In [None]:
# Pipeline transform data
def pipeline_transform(df, numerical_scalers, categorical_mappings):
    ''' Apply fitted scalers and category codes to `df`, in place.

    Every column named in `numerical_scalers` is replaced by the output of its
    scaler's `transform`; every column named in `categorical_mappings` is
    replaced by the integer code of each value, with 0 for values that are not
    in the mapping. A KeyError indicates that a column is missing from `df`.

    The same `df` object is returned. Because columns are overwritten, calling
    this twice on the same frame scales the numbers twice and turns the
    already-encoded categories into 0.
    '''
    # Numerical columns: scale one single-column frame at a time
    for col, scaler in numerical_scalers.items():
        df[col] = scaler.transform(df[[col]])

    # Categorical columns: look each value up in its mapping
    for col, mapping in categorical_mappings.items():
        def encode(value, _codes=mapping):
            return _codes.get(value, 0)  # 0 to handle unseen values

        df[col] = df[col].apply(encode)

    return df


# Transform: apply the train-fitted scalers and category codes to each split
# (train, then val, then test)
df_train, df_val, df_test = (
    pipeline_transform(split, numerical_scalers, categorical_mappings)
    for split in (df_train, df_val, df_test)
)

# Display
df_train

Unnamed: 0,id,start_date,status,why_stopped,hasResults,phase,allocation,intervention_model,primary_purpose,acc_text,...,terminated,year,month_sin,month_cos,intro_input_ids,intro_attention_mask,outcomes_input_ids,outcomes_attention_mask,criteria_input_ids,criteria_attention_mask
3,NCT03696576,2018-09-20,TERMINATED,"Due to the pandemic, recruitment ended earlier...",True,1,1,1,1,The larynx and vocal folds undergo many age-re...,...,1,2018,-1.000000e+00,-1.836970e-16,"[101, 1103, 2495, 15023, 1775, 1105, 5563, 173...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 2425, 13950, 131, 122, 119, 4929, 131, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 10838, 9173, 131, 115, 1425, 2625, 1137,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,NCT02400723,2018-12-05,COMPLETED,,True,1,1,1,1,"Anxiety leads to poor quality of life, avoidan...",...,0,2018,-2.449294e-16,1.000000e+00,"[101, 10507, 4501, 1106, 2869, 3068, 1104, 129...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 2425, 13950, 131, 122, 119, 4929, 131, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 10838, 9173, 131, 115, 11461, 4079, 2539...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
15,NCT04683939,2022-01-18,TERMINATED,Sponsor decision,True,2,2,2,1,"This study was planned as an open-label, multi...",...,1,2022,5.000000e-01,8.660254e-01,"[101, 1142, 2025, 1108, 2919, 1112, 1126, 1501...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 2425, 13950, 131, 122, 119, 4929, 131, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 2501, 10838, 9173, 131, 1111, 1155, 2192...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
16,NCT03410914,2018-08-13,COMPLETED,,True,3,3,3,2,Despite improvements and advances in pancreas ...,...,0,2018,-8.660254e-01,-5.000000e-01,"[101, 2693, 8313, 1105, 11823, 1107, 13316, 13...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 2425, 13950, 131, 122, 119, 4929, 131, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 10838, 9173, 131, 115, 4533, 1106, 13971...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
26,NCT04749745,2020-06-09,COMPLETED,,True,4,1,4,3,Major depressive disorder (MDD) is a serious m...,...,0,2020,1.224647e-16,-1.000000e+00,"[101, 1558, 1260, 16568, 8936, 113, 182, 13976...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 2425, 13950, 131, 122, 119, 4929, 131, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 10838, 9173, 131, 122, 119, 4457, 117, 4...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47924,NCT03567616,2018-10-18,TERMINATED,Following results of the primary progression-f...,True,3,2,2,1,"This was an open-label, multicenter study desi...",...,1,2018,-8.660254e-01,5.000000e-01,"[101, 1142, 1108, 1126, 1501, 118, 3107, 117, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 2425, 13950, 131, 122, 119, 4929, 131, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 10838, 9173, 131, 115, 1231, 16046, 5591...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
47926,NCT04596085,2020-09-16,COMPLETED,,True,1,1,1,8,This is a double blind randomized placebo cont...,...,0,2020,-1.000000e+00,-1.836970e-16,"[101, 1142, 1110, 170, 2702, 7198, 7091, 2200,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 2425, 13950, 131, 122, 119, 4929, 131, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 10838, 9173, 131, 122, 119, 1851, 1201, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
47927,NCT04247802,2020-08-05,COMPLETED,,True,1,1,1,1,Backwards walking has been shown to improve ba...,...,0,2020,-8.660254e-01,-5.000000e-01,"[101, 11316, 3179, 1144, 1151, 2602, 1106, 460...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 2425, 13950, 131, 122, 119, 4929, 131, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 10838, 9173, 131, 115, 1441, 1105, 1535,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
47933,NCT03811951,2018-09-12,TERMINATED,Different study initiation,True,3,1,4,6,The purpose of this study is to evaluate cogni...,...,1,2018,-1.000000e+00,-1.836970e-16,"[101, 1103, 3007, 1104, 1142, 2025, 1110, 1106...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 2425, 13950, 131, 122, 119, 4929, 131, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 10838, 9173, 131, 115, 1425, 1206, 1626,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


Save scalers and mappers

In [None]:
# # Save
# dir = '/content/drive/MyDrive/W210-Capstone-ClincalGroup/trial_risk_models'
# numerical_scaler_filename = os.path.join(dir, 'numerical_scalers.pkl')
# categorical_mapping_filename = os.path.join(dir, 'categorical_mappings.pkl')

# try:
#     with open(numerical_scaler_filename, 'wb') as f:
#         pickle.dump(numerical_scalers, f)
#     print(f"Numerical scalers saved to: {numerical_scaler_filename}")
# except Exception as e:
#     print(f"Error saving numerical scalers: {e}")

# try:
#     with open(categorical_mapping_filename, 'wb') as f:
#         pickle.dump(categorical_mappings, f)
#     print(f"Categorical mappings saved to: {categorical_mapping_filename}")
# except Exception as e:
#     print(f"Error saving categorical mappings: {e}")

## Torch Data Preparation

In [None]:
# PyTorch Dataset wrapping one trial DataFrame (one row == one sample)
class TrialDataset(Dataset):
    """Row-wise view of a trial DataFrame that yields tensors for a DataLoader.

    Each sample is a dict with the trial id, three (input_ids, attention_mask)
    pairs read from the given token columns, the categorical values as a long
    tensor, the numerical values as a float tensor and, when the corresponding
    column name was supplied, the 'targets_term' / 'targets_score' tensors.

    Args:
        dataframe: DataFrame holding every column named below.
        id_col: Column with the trial identifier (returned as-is).
        date_col: Column with the start date (stored, not read by __getitem__).
        categorical_cols: List of columns holding integer category codes.
        numerical_cols: List of columns holding numeric features.
        intro_ids_col / intro_mask_col: Token ids / attention mask columns.
        outcomes_ids_col / outcomes_mask_col: Token ids / attention mask columns.
        criteria_ids_col / criteria_mask_col: Token ids / attention mask columns.
        target_term_col: Optional column for the long-typed target.
        target_score_col: Optional column for the float-typed target.
    """

    def __init__(self, dataframe, id_col, date_col, categorical_cols, numerical_cols,
                 intro_ids_col, intro_mask_col,
                 outcomes_ids_col, outcomes_mask_col,
                 criteria_ids_col, criteria_mask_col,
                 target_term_col=None, target_score_col=None):
        self.data = dataframe
        # Identifier / metadata columns
        self.id_col, self.date_col = id_col, date_col
        # Tabular feature columns
        self.categorical_cols, self.numerical_cols = categorical_cols, numerical_cols
        # Tokenized text columns
        self.intro_ids_col, self.intro_mask_col = intro_ids_col, intro_mask_col
        self.outcomes_ids_col, self.outcomes_mask_col = outcomes_ids_col, outcomes_mask_col
        self.criteria_ids_col, self.criteria_mask_col = criteria_ids_col, criteria_mask_col
        # Optional target columns
        self.target_term_col, self.target_score_col = target_term_col, target_score_col

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):  # For dataloader
        row = self.data.iloc[idx]

        sample = {'id': row[self.id_col]}

        # Output key -> source column for every tokenized text field, in output order.
        token_fields = (
            ('intro_input_ids', self.intro_ids_col),
            ('intro_attention_mask', self.intro_mask_col),
            ('outcomes_input_ids', self.outcomes_ids_col),
            ('outcomes_attention_mask', self.outcomes_mask_col),
            ('criteria_input_ids', self.criteria_ids_col),
            ('criteria_attention_mask', self.criteria_mask_col),
        )
        for out_key, source_col in token_fields:
            sample[out_key] = torch.tensor(row[source_col], dtype=torch.long)

        sample['categorical_inputs'] = torch.tensor(
            [row[name] for name in self.categorical_cols], dtype=torch.long)
        sample['numerical_inputs'] = torch.tensor(
            [row[name] for name in self.numerical_cols], dtype=torch.float)

        # Targets are optional so the same class can serve unlabeled (new-world) data.
        if self.target_term_col is not None:
            sample['targets_term'] = torch.tensor(row[self.target_term_col], dtype=torch.long)
        if self.target_score_col is not None:
            sample['targets_score'] = torch.tensor(row[self.target_score_col], dtype=torch.float)

        return sample

In [None]:
#TEST
# Smoke test: wrap the first 100 training rows in a TrialDataset, batch them with a
# DataLoader, and print every field of the first batch to eyeball shapes and dtypes.
trial_dataset = TrialDataset(df_train.head(100),
                             id_col='id',
                             date_col='start_date',
                             categorical_cols=categorical_cols,
                             numerical_cols=numerical_cols,
                             intro_ids_col='intro_input_ids', intro_mask_col='intro_attention_mask',
                             outcomes_ids_col='outcomes_input_ids', outcomes_mask_col='outcomes_attention_mask',
                             criteria_ids_col='criteria_input_ids', criteria_mask_col='criteria_attention_mask',
                             target_term_col=target_term_col, target_score_col=target_score_col
                             )

# Create Dataloader object to view (no shuffling so the printed batch is stable)
trial_dataloader = DataLoader(trial_dataset, batch_size=10, shuffle=False)


# Check dataloader data
for batch_idx, batch in enumerate(trial_dataloader):
    print(f"Batch {batch_idx}:")
    print("  ID:", batch['id'])
    print("  Intro Input IDs (Tensor):", batch['intro_input_ids'])
    print("  Intro Attention Mask (Tensor):", batch['intro_attention_mask'])
    print("  Outcomes Input IDs (Tensor):", batch['outcomes_input_ids'])
    print("  Outcomes Attention Mask (Tensor):", batch['outcomes_attention_mask'])
    print("  Criteria Input IDs (Tensor):", batch['criteria_input_ids'])
    # Label typo fixed ("Cirteria" -> "Criteria") so it matches the other fields.
    print("  Criteria Attention Mask (Tensor):", batch['criteria_attention_mask'])
    print("  Categorical Data:", batch['categorical_inputs'])
    print("  Numerical Data:", batch['numerical_inputs'])
    print("  Targets Terminated:", batch['targets_term'])
    print("  Targets Risk Score:", batch['targets_score'])
    if batch_idx == 0: # Just print the first batch for demonstration
        break

Batch 0:
  ID: ['NCT03696576', 'NCT02400723', 'NCT04683939', 'NCT03410914', 'NCT04749745', 'NCT03864341', 'NCT05129137', 'NCT03657277', 'NCT03608826', 'NCT04615403']
  Intro Input IDs (Tensor): tensor([[  101,  1103,  2495,  ...,  1490,  7606,   102],
        [  101, 10507,  4501,  ...,  3216,  3154,   102],
        [  101,  1142,  2025,  ...,  7409, 12888,   102],
        ...,
        [  101,  1103,  2025,  ...,   131,  8071,   102],
        [  101,  1103,  3007,  ...,     0,     0,     0],
        [  101,  1103,  2025,  ...,     0,     0,     0]])
  Intro Attention Mask (Tensor): tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])
  Outcomes Input IDs (Tensor): tensor([[  101,  2425, 13950,  ...,  1107,  3455,   102],
        [  101,  2425, 13950,  ...,  1181,   114,   102],
        [  101,  2425, 13950,  ...,  1127,  44

Create (actual) model Dataset and Dataloader objects for model development


In [None]:
# Create (actual) model Dataset and Dataloader objects for model development

# Column configuration shared by the train / validation / test datasets. Keeping it
# in one place guarantees that the three splits are always built identically.
trial_dataset_kwargs = dict(
    id_col='id',
    date_col='start_date',
    categorical_cols=categorical_cols,
    numerical_cols=numerical_cols,
    intro_ids_col='intro_input_ids', intro_mask_col='intro_attention_mask',
    outcomes_ids_col='outcomes_input_ids', outcomes_mask_col='outcomes_attention_mask',
    criteria_ids_col='criteria_input_ids', criteria_mask_col='criteria_attention_mask',
    target_term_col=target_term_col, target_score_col=target_score_col,
)

train_dataset = TrialDataset(df_train, **trial_dataset_kwargs)
val_dataset = TrialDataset(df_val, **trial_dataset_kwargs)
test_dataset = TrialDataset(df_test, **trial_dataset_kwargs)

# Training batches are reshuffled every epoch: with shuffle=False the optimizer (and
# the BatchNorm layers) would see the exact same batches in the exact same order on
# every epoch. Evaluation loaders stay unshuffled so predictions line up with the
# row order of df_val / df_test.
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=128, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=128, shuffle=False)

## Create Model

In [None]:
from transformers import AutoModel
import torch.nn as nn
import torch.nn.functional as F

class AE_Score_Model(nn.Module):
    """Regression model combining three text embeddings with tabular features.

    The three tokenized text fields (intro, outcomes, criteria) are each encoded
    with the shared ``embed_model`` encoder and its ``pooler_output`` is used as
    the text embedding. The embeddings are scaled by softmax-normalized learnable
    modality weights, concatenated with per-feature-weighted categorical
    embeddings and batch-normalized, per-feature-weighted numerical inputs, and
    fed through a 3-hidden-layer MLP that outputs one value per sample.

    Args:
        num_categorical_features: Number of categorical columns, or None to
            build and use no categorical branch.
        categorical_embedding_dims: List of ``(num_embeddings, embedding_dim)``
            pairs, one per categorical column (the default list is never mutated).
        num_numerical_features: Number of numerical columns, or None to build
            and use no numerical branch.
        embed_model: Text encoder exposing ``config.hidden_size`` and returning
            an object with ``pooler_output`` when called.
    """

    def __init__(self,
                 num_categorical_features=None,
                 categorical_embedding_dims=[],
                 num_numerical_features=None,
                 embed_model=bb_model):

        super(AE_Score_Model, self).__init__()
        self.biobert = embed_model

        # One embedding table per categorical column.
        self.categorical_embeddings = nn.ModuleList([
            nn.Embedding(num_embeddings, embedding_dim)
            for num_embeddings, embedding_dim in categorical_embedding_dims
        ])
        self.num_categorical_features = num_categorical_features

        # Batch normalization over the numerical features. Only created when a
        # feature count is given: nn.BatchNorm1d(None) raises, which previously made
        # the documented None default unusable.
        self.numerical_bn = (nn.BatchNorm1d(num_numerical_features)
                             if num_numerical_features is not None else None)
        self.num_numerical_features = num_numerical_features

        # Width of the concatenated feature vector fed to the MLP.
        combined_input_dim = 3 * self.biobert.config.hidden_size  # 3 pooled text embeddings
        if num_categorical_features is not None:
            combined_input_dim += sum([dim for _, dim in categorical_embedding_dims])
        if num_numerical_features is not None:
            combined_input_dim += num_numerical_features

        # Weighting layers
        # Modality weights: one scalar per text embedding, shape (3,)
        self.text_modality_weights = nn.Parameter(torch.ones(3))
        # Per-feature categorical weights, shape (num_categorical_features,)
        self.categorical_feature_weights = (
            nn.Parameter(torch.ones(num_categorical_features))
            if num_categorical_features is not None else None)
        # Per-feature numerical weights, shape (num_numerical_features,)
        self.numerical_feature_weights = (
            nn.Parameter(torch.ones(num_numerical_features))
            if num_numerical_features is not None else None)

        # Layers
        self.dropout1 = nn.Dropout(0.1)
        self.dropout2 = nn.Dropout(0.3)
        self.linear1 = nn.Linear(combined_input_dim, 1024)  # First hidden layer
        self.bn1 = nn.BatchNorm1d(1024)
        self.linear2 = nn.Linear(1024, 512)  # Second hidden layer
        self.bn2 = nn.BatchNorm1d(512)
        self.linear3 = nn.Linear(512, 256)  # Third hidden layer
        self.bn3 = nn.BatchNorm1d(256)
        self.finallinear = nn.Linear(256, 1)  # Output layer

    def forward(self, categorical_inputs, numerical_inputs,
                intro_input_ids, intro_attention_mask,
                outcomes_input_ids, outcomes_attention_mask,
                criteria_input_ids, criteria_attention_mask):
        """Return one prediction per sample (the final layer output with its last
        dimension squeezed away).

        ``categorical_inputs`` is indexed as ``[:, i]`` per embedding table and
        ``numerical_inputs`` is passed through ``BatchNorm1d``; either one is
        ignored when the matching branch was disabled in ``__init__``.
        """
        # Encode each text field with the shared encoder; keep the pooled output.
        intro_embedding = self.biobert(
            intro_input_ids, attention_mask=intro_attention_mask).pooler_output
        outcomes_embedding = self.biobert(
            outcomes_input_ids, attention_mask=outcomes_attention_mask).pooler_output
        criteria_embedding = self.biobert(
            criteria_input_ids, attention_mask=criteria_attention_mask).pooler_output

        # Softmax keeps the three modality weights positive and summing to 1.
        modality_weights = F.softmax(self.text_modality_weights, dim=0)
        text_concat = torch.cat([
            intro_embedding * modality_weights[0],
            outcomes_embedding * modality_weights[1],
            criteria_embedding * modality_weights[2],
        ], dim=1)  # (batch, 3 * hidden_size)

        # Collect the active feature blocks. Building a list (instead of always
        # concatenating three named tensors) keeps forward() valid when the
        # categorical and/or numerical branch is disabled.
        feature_blocks = [text_concat]

        # Categorical: embed each column and scale it by its learnable weight.
        if self.num_categorical_features is not None and len(self.categorical_embeddings) > 0:
            categorical_embeds = [
                emb(categorical_inputs[:, i]) * self.categorical_feature_weights[i]
                for i, emb in enumerate(self.categorical_embeddings)
            ]
            feature_blocks.append(torch.cat(categorical_embeds, dim=1))  # (batch, sum(emb_dims))

        # Numerical: batch-normalize, then scale each feature by its learnable weight
        # (the (num_features,) weight vector broadcasts over the batch dimension).
        if self.num_numerical_features is not None:
            normalized_numerical = self.numerical_bn(numerical_inputs)
            feature_blocks.append(normalized_numerical * self.numerical_feature_weights)

        # Final concat
        combined_features = torch.cat(feature_blocks, dim=1)

        # MLP head: dropout -> (linear -> batchnorm -> relu -> dropout) x3 -> linear
        x = self.dropout1(combined_features)
        x = F.relu(self.bn1(self.linear1(x)))
        x = self.dropout2(x)
        x = F.relu(self.bn2(self.linear2(x)))
        x = self.dropout2(x)
        x = F.relu(self.bn3(self.linear3(x)))
        x = self.dropout2(x)
        logits = self.finallinear(x)
        return logits.squeeze(-1)

In [None]:
# Derive one (num_embeddings, embedding_dim) pair per categorical column from the
# training split; these pairs size the nn.Embedding tables of the model.
categorical_embedding_dims = []
for col in categorical_cols:
    # Distinct values observed in training, plus one spare slot for unseen values
    num_unique_values = 1 + len(df_train[col].unique())
    # Common heuristic: about half the cardinality, capped at 20 dimensions
    embedding_size = min((num_unique_values + 1) // 2, 20)
    categorical_embedding_dims += [(num_unique_values, embedding_size)]
    print(f"Column: {col}, Unique Values: {num_unique_values}, Embedding Size: {embedding_size}")

Column: phase, Unique Values: 7, Embedding Size: 4
Column: allocation, Unique Values: 4, Embedding Size: 2
Column: intervention_model, Unique Values: 6, Embedding Size: 3
Column: primary_purpose, Unique Values: 10, Embedding Size: 5
Column: dmc_oversight, Unique Values: 4, Embedding Size: 2
Column: fda_drug, Unique Values: 4, Embedding Size: 2
Column: fda_device, Unique Values: 4, Embedding Size: 2
Column: unapproved_device, Unique Values: 3, Embedding Size: 2


In [None]:
# Build the model: feature counts come from the column lists, embedding table sizes
# from categorical_embedding_dims; the text encoder is left at the class default.
model = AE_Score_Model(
    categorical_embedding_dims=categorical_embedding_dims,
    num_categorical_features=len(categorical_cols),
    num_numerical_features=len(numerical_cols),
)

display(model)

AE_Score_Model(
  (biobert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

## Train

In [None]:
import torch.optim as optim
from tqdm import tqdm
from sklearn.metrics import r2_score

# Loss function: mean squared error (the model regresses a continuous score).
criterion = nn.MSELoss()

# Define optimizer
optimizer = optim.AdamW(model.parameters(), lr=1e-5) # Adjust learning rate as needed
# Scheduler
# scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=3, verbose=True)

# Set the number of training epochs
num_epochs = 10

# Move model to the GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Early stopping variables to prevent overfitting
best_val_loss = float('inf')
best_model_state = None
patience = 3
patience_counter = 0

# Batch keys forwarded to the model. They match the parameter names of
# AE_Score_Model.forward, so the batch can be passed as keyword arguments.
MODEL_INPUT_KEYS = (
    'categorical_inputs', 'numerical_inputs',
    'intro_input_ids', 'intro_attention_mask',
    'outcomes_input_ids', 'outcomes_attention_mask',
    'criteria_input_ids', 'criteria_attention_mask',
)


def _forward_batch(batch):
    """Move one DataLoader batch to `device`, run the model, return (outputs, targets)."""
    model_inputs = {key: batch[key].to(device) for key in MODEL_INPUT_KEYS}
    targets = batch['targets_score'].to(device)
    return model(**model_inputs), targets


# Training loop
for epoch in range(num_epochs):

    # Training
    model.train() # Set the model to training mode
    total_loss = 0
    for batch in tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{num_epochs}"):
        # Zero the gradients
        optimizer.zero_grad()
        # Forward pass
        outputs, targets_batch = _forward_batch(batch)
        # Calculate loss
        loss = criterion(outputs, targets_batch)
        total_loss += loss.item()
        # Backward pass and optimization
        loss.backward()
        optimizer.step()

    avg_loss = total_loss / len(train_dataloader) # Loss calculated as average batch loss
    print(f"Epoch {epoch+1} completed, Average Loss: {avg_loss:.4f}")

    # Validation
    model.eval() # Set the model to evaluation mode
    total_val_loss = 0
    all_preds = []
    all_labels = []
    with torch.no_grad(): # Disable gradient calculations during validation
        for batch in tqdm(val_dataloader, desc=f"Epoch {epoch+1}/{num_epochs} (Val)"):
            # Forward pass of validation data
            outputs, targets_batch = _forward_batch(batch)
            # Calculate loss
            loss = criterion(outputs, targets_batch)
            total_val_loss += loss.item()

            # Collect raw regression outputs and targets for the R² score
            all_preds.extend(outputs.cpu().numpy())
            all_labels.extend(targets_batch.cpu().numpy())

    avg_val_loss = total_val_loss / len(val_dataloader)
    r2 = r2_score(all_labels, all_preds)

    print(f"Validation Loss: {avg_val_loss:.4f} | R² Score: {r2:.4f}")

    # Step the scheduler based on the validation loss
    # scheduler.step(avg_val_loss)

    # --- Early Stopping Check ---
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        patience_counter = 0
        # state_dict() returns references to the live parameter tensors, which keep
        # changing as training continues. Take an independent CPU copy so that the
        # snapshot really is the best epoch when it is restored below.
        best_model_state = {
            name: tensor.detach().to("cpu", copy=True)
            for name, tensor in model.state_dict().items()
        }
        print("Validation loss improved. Saving model...")
    else:
        # Uses the counter initialised above; the previous code incremented an
        # undefined `counter` if the very first epoch did not improve (e.g. NaN loss).
        patience_counter += 1
        print(f"No improvement in validation loss. Patience: {patience_counter}/{patience}")

        if patience_counter >= patience:
            print("Early stopping triggered.")
            break

#load the best model
if best_model_state is not None:
    model.load_state_dict(best_model_state)
    print(f"Loaded best model with loss score: {best_val_loss:.4f}")

print("Training finished!")

Output should be clamped to [0, 1.33] and then divided by 1.33. Investigate extremes on the upper end.

# Save Model

In [None]:
import joblib

# Pickle the entire model object (architecture + weights) next to the data files.
# Loading this file later requires the AE_Score_Model class to be importable.
joblib.dump(value=model, filename=dir + "/model_ae_score.pkl")

In [None]:
# Save only the weights (state dict); to reload, rebuild AE_Score_Model and call
# load_state_dict on the object read back from this file.
PATH = dir + '/model_ae_score_state_dict.pth'
torch.save(obj=model.state_dict(), f=PATH)

print(f"Model state dictionary saved to {PATH}")