In [1]:
import numpy as np
import os
import csv
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import fbeta_score
from tensorflow.keras import models, layers, optimizers
from tensorflow.keras.callbacks import Callback

In [2]:
# Locations of the two training shards (embeddings + ICD code labels)
embeddings_file_1 = '../data/embeddings_1.npy'
labels_file_1 = '../data/icd_codes_1.txt'
embeddings_file_2 = '../data/embeddings_2.npy'
labels_file_2 = '../data/icd_codes_2.txt'


def read_icd_codes(path):
    """Read a label file into a list of per-sample code lists.

    Every line of the file holds the ';'-separated codes of one sample.
    Whitespace around each code is removed and empty tokens are dropped, so a
    blank line or a trailing ';' produces no label at all instead of a
    spurious '' entry that MultiLabelBinarizer would turn into its own class.

    Args:
        path: Path of the text file to read.

    Returns:
        A list with one list of code strings per line of the file; a line
        without codes yields an empty list, so row alignment is preserved.
    """
    with open(path, 'r') as f:
        return [
            [code for code in (token.strip() for token in line.split(';')) if code]
            for line in f
        ]


# Load both shards and stack them along the sample axis
train_embeddings_1 = np.load(embeddings_file_1)
train_embeddings_2 = np.load(embeddings_file_2)
train_embeddings = np.concatenate([train_embeddings_1, train_embeddings_2], axis=0)

# Labels are concatenated in the same shard order as the embeddings
train_labels_1 = read_icd_codes(labels_file_1)
train_labels_2 = read_icd_codes(labels_file_2)
train_labels = train_labels_1 + train_labels_2

# Fail fast if rows and label lines do not line up: everything downstream
# pairs them by position.
if len(train_labels) != len(train_embeddings):
    raise ValueError(
        f"Found {len(train_embeddings)} embeddings but {len(train_labels)} label lines"
    )

# Multi-hot encode labels (one column per distinct code, see mlb.classes_)
mlb = MultiLabelBinarizer()
train_multi_hot_labels = mlb.fit_transform(train_labels)

In [None]:
# Boolean view of the multi-hot LABEL matrix (not the embeddings):
# entry (i, j) is True when sample i carries label mlb.classes_[j].
label_mask = train_multi_hot_labels.astype(bool)
# Alias under the original (misleading) name, kept in case other cells use it.
train_embeddings_bool = label_mask

# Map every label to the row indices of the samples that carry it.
label_dict = {
    label: np.flatnonzero(label_mask[:, j]).tolist()
    for j, label in enumerate(mlb.classes_)
}


{'A63.0': [21913, 23308, 23379, 25545, 30200, 30507, 34264, 37317, 38190, 38350, 39850, 40072, 40163, 40312, 40573, 41183, 43811, 44126, 44670, 46335, 47488, 48381, 50824, 52606, 52909, 53270, 54268, 55520, 55762, 56082, 57010, 57015, 57351, 57811, 58055, 58598, 58614, 58989, 59035, 59519, 59897, 60255, 60307, 60845, 61190, 62574, 63286, 63884, 63975, 64362, 64714, 64741, 65909, 67985, 68367, 68799, 69831, 71922, 74685, 74722, 75662, 76177, 76552, 77399, 77578, 79439, 79479, 80235, 80255, 80590, 81589, 81940, 82799, 83602, 84299, 84323, 84384, 84546, 84595, 84745, 85138, 86106, 86981, 88353, 88623, 89246, 89877, 91059, 91410, 91601, 91888, 92201, 92756, 92855, 94033, 94211, 94601, 94894, 95188, 95444, 95760, 95890, 95933, 96324, 96632, 96798, 97353, 99108, 99366, 99837, 100466, 100707, 103570, 103885, 103993, 104461, 104810, 105219, 105353, 105420, 105507, 106937, 107302, 109514, 109519, 112552, 113150, 113172, 114011, 114289, 114325, 114930, 118028, 118056, 118723, 119854, 120161, 120

In [None]:
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import fbeta_score
from tensorflow.keras import models, layers, optimizers
from tensorflow.keras.callbacks import Callback
import xgboost as xgb

# Training parameters (tune here; consumed by the model.fit call further down)
EMBEDDING_DIM = 1024          # Presumably the width of the input embeddings -- TODO confirm; not referenced by the code visible below
EPOCHS = 40                   # Number of epochs passed to model.fit
BATCH_SIZE = 1024             # Mini-batch size passed to model.fit

# Locations of the two training shards (embeddings + ICD code labels)
embeddings_file_1 = '../data/embeddings_1.npy'
labels_file_1 = '../data/icd_codes_1.txt'
embeddings_file_2 = '../data/embeddings_2.npy'
labels_file_2 = '../data/icd_codes_2.txt'

# Load both shards and stack them along the sample axis
train_embeddings_1 = np.load(embeddings_file_1)
train_embeddings_2 = np.load(embeddings_file_2)
train_embeddings = np.concatenate([train_embeddings_1, train_embeddings_2], axis=0)

# Each line holds the ';'-separated codes of one sample. Tokens are stripped
# and empty ones dropped, so a blank line or a trailing ';' gives a sample
# without labels rather than a spurious '' class in the binarizer.
label_shards = []
for labels_path in (labels_file_1, labels_file_2):
    with open(labels_path, 'r') as f:
        label_shards.append([
            [code for code in (token.strip() for token in line.split(';')) if code]
            for line in f
        ])
train_labels_1, train_labels_2 = label_shards
train_labels = train_labels_1 + train_labels_2

# Fail fast if rows and label lines do not line up: the split and the
# training below pair them purely by position.
if len(train_labels) != len(train_embeddings):
    raise ValueError(
        f"Found {len(train_embeddings)} embeddings but {len(train_labels)} label lines"
    )

# Multi-hot encode labels (one column per distinct code, see mlb.classes_)
mlb = MultiLabelBinarizer()
train_multi_hot_labels = mlb.fit_transform(train_labels)

# Positional hold-out: the first 80% of the rows train, the remainder validates.
# NOTE(review): rows are not shuffled, so the validation set is exactly the
# tail of the concatenated data (i.e. the end of the second shard) -- confirm
# that this is the intended evaluation protocol.
train_size = int(0.8 * train_embeddings.shape[0])
train_rows = slice(None, train_size)
val_rows = slice(train_size, None)

x_train = train_embeddings[train_rows]
x_val = train_embeddings[val_rows]
y_train = train_multi_hot_labels[train_rows]
y_val = train_multi_hot_labels[val_rows]

# Fit a gradient-boosted classifier on the training embeddings
xgb_params = dict(
    objective='binary:logistic',
    n_estimators=100,
    max_depth=5,
    learning_rate=0.1,
)
xgb_model = xgb.XGBClassifier(**xgb_params)
xgb_model.fit(x_train, y_train)

# Re-encode both splits with the fitted booster; the output of apply()
# becomes the input features of the neural network defined below.
x_train_xgb, x_val_xgb = (xgb_model.apply(split) for split in (x_train, x_val))

# Define the neural network model
def create_model(input_dim, hidden_layer_size=512, activation='relu', learning_rate=0.001, dropout_rate=0.3,
                 num_labels=None):
    """Build and compile a one-hidden-layer multi-label classifier.

    The network is Input -> Dense(hidden_layer_size) -> Dropout -> Dense with
    one sigmoid unit per label, compiled with Adam and binary cross-entropy.

    Args:
        input_dim: Number of input features per sample.
        hidden_layer_size: Number of units in the hidden Dense layer.
        activation: Activation of the hidden Dense layer.
        learning_rate: Learning rate handed to the Adam optimizer.
        dropout_rate: Dropout rate applied after the hidden layer.
        num_labels: Number of output units. When None (the default) the
            size is taken from the notebook-level binarizer,
            ``len(mlb.classes_)``, which is what the function always did.

    Returns:
        The compiled Keras Sequential model.
    """
    if num_labels is None:
        # Backward-compatible default: one output per class known to `mlb`.
        num_labels = len(mlb.classes_)

    model = models.Sequential()
    model.add(layers.Input(shape=(input_dim,)))
    model.add(layers.Dense(hidden_layer_size, activation=activation))
    model.add(layers.Dropout(dropout_rate))
    # Sigmoid (not softmax): every label is an independent yes/no decision.
    model.add(layers.Dense(num_labels, activation='sigmoid'))

    optimizer = optimizers.Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['binary_accuracy'])
    return model

# Define F2 Micro Score Callback
class F2MicroScore(Callback):
    """Keras callback that reports the micro-averaged F2 score after each epoch.

    At the end of every epoch the model's predictions on the held-out data
    are binarized at `threshold` and scored with `fbeta_score(beta=2,
    average='micro')`. The score is printed and also written to `logs`
    under the key 'val_f2_micro', making it available to whatever consumes
    `logs` after this callback.

    Args:
        x_val: Validation inputs passed to `model.predict`.
        y_val: Multi-hot ground-truth labels matching `x_val`.
        threshold: Probability above which a label counts as predicted.
            Defaults to 0.5, the value that used to be hard-coded.
    """

    def __init__(self, x_val, y_val, threshold=0.5):
        super().__init__()
        self.x_val = x_val
        self.y_val = y_val
        self.threshold = threshold

    def on_epoch_end(self, epoch, logs=None):
        """Compute, record and print the F2 micro score for this epoch."""
        # verbose=0 keeps the per-epoch output free of an extra progress bar.
        probabilities = self.model.predict(self.x_val, verbose=0)
        val_pred = (probabilities > self.threshold).astype(int)  # Binarize predictions
        f2 = float(fbeta_score(self.y_val, val_pred, beta=2, average='micro'))
        if logs is not None:
            # Record the score instead of only printing it.
            logs['val_f2_micro'] = f2
        print(f"Epoch {epoch + 1}: F2 Micro Score: {f2:.4f}")  # Print F2 score

# Fit the network on the XGBoost-derived features; the callback prints the
# validation F2 score after every epoch.
input_dim = x_train_xgb.shape[1]
model = create_model(input_dim=input_dim)
f2_callback = F2MicroScore(x_val_xgb, y_val)
model.fit(
    x=x_train_xgb,
    y=y_train,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_data=(x_val_xgb, y_val),
    verbose=1,
    callbacks=[f2_callback],
)

# Score the trained model once more on the validation split
val_probabilities = model.predict(x_val_xgb)
val_pred = (val_probabilities > 0.5).astype(int)
final_f2 = fbeta_score(y_val, val_pred, beta=2, average='micro')
print(f"Final F2 Micro Score: {final_f2:.4f}")