In [6]:
import numpy as np
import os
import csv
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import fbeta_score
from tensorflow.keras import models, layers, optimizers
from tensorflow.keras.callbacks import Callback

# Parameters
EMBEDDING_DIM = 1024          # Embedding dimensions given in data
EPOCHS = 35                   # Number of epochs to train for

# Load and concatenate embeddings and labels for the full training set
embeddings_file_1 = '../data/embeddings_1.npy'
labels_file_1 = '../data/icd_codes_1.txt'
embeddings_file_2 = '../data/embeddings_2.npy'
labels_file_2 = '../data/icd_codes_2.txt'

train_embeddings_1 = np.load(embeddings_file_1)
train_embeddings_2 = np.load(embeddings_file_2)
train_embeddings = np.concatenate([train_embeddings_1, train_embeddings_2], axis=0)

# Load and concatenate labels for the full training set
with open(labels_file_1, 'r') as f:
    train_labels_1 = [line.strip().split(';') for line in f]
with open(labels_file_2, 'r') as f:
    train_labels_2 = [line.strip().split(';') for line in f]
train_labels = train_labels_1 + train_labels_2

# Multi-hot encode labels
mlb = MultiLabelBinarizer()
train_multi_hot_labels = mlb.fit_transform(train_labels)

# Define F2 Micro Score Callback
class F2MicroScore(Callback):
    def __init__(self, x_val, y_val):
        super(F2MicroScore, self).__init__()
        self.x_val = x_val
        self.y_val = y_val

    def on_epoch_end(self, epoch, logs=None):
        val_pred = (self.model.predict(self.x_val) > 0.5).astype(int)  # Binarize predictions
        f2 = fbeta_score(self.y_val, val_pred, beta=2, average='micro')  # Calculate F2 micro score
        print(f"Epoch {epoch + 1}: F2 Micro Score: {f2:.4f}")  # Print F2 score

# Split the training data for validation
train_size = int(0.8 * len(train_embeddings))
x_train, x_val = train_embeddings[:train_size], train_embeddings[train_size:]
y_train, y_val = train_multi_hot_labels[:train_size], train_multi_hot_labels[train_size:]

# Define the neural network model
def create_model(hidden_layer_size=512, activation='relu', learning_rate=0.001, dropout_rate=0.3):
    model = models.Sequential()
    model.add(layers.Input(shape=(EMBEDDING_DIM,)))
    model.add(layers.Dense(hidden_layer_size, activation=activation))
    model.add(layers.Dropout(dropout_rate))
    model.add(layers.Dense(len(mlb.classes_), activation='sigmoid'))  # One neuron per label with sigmoid activation
    optimizer = optimizers.Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['binary_accuracy'])
    return model

# Define the parameter grid
param_grid = {
    'hidden_layer_size': [512],
    'batch_size': [1024],
    'learning_rate': [0.001],
    'activation': ['relu'],
    'dropout_rate': [0.3]
}

# Perform grid search manually
best_f2_score = 0
best_params = None
best_model = None

for hidden_layer_size in param_grid['hidden_layer_size']:
    for batch_size in param_grid['batch_size']:
        for learning_rate in param_grid['learning_rate']:
            for activation in param_grid['activation']:
                for dropout_rate in param_grid['dropout_rate']:
                    print(f"Training with hidden_layer_size={hidden_layer_size}, batch_size={batch_size}, learning_rate={learning_rate}, activation={activation}, dropout_rate={dropout_rate}")
                    model = create_model(hidden_layer_size=hidden_layer_size, activation=activation, learning_rate=learning_rate, dropout_rate=dropout_rate)
                    model.fit(x_train, y_train, epochs=EPOCHS, batch_size=batch_size, 
                              validation_split=0.2, verbose=1, callbacks=[F2MicroScore(x_val, y_val)])
                    val_pred = (model.predict(x_val) > 0.5).astype(int)
                    f2 = fbeta_score(y_val, val_pred, beta=2, average='micro')
                    print(f"F2 Micro Score: {f2:.4f}")
                    if f2 > best_f2_score:
                        best_f2_score = f2
                        best_params = {'hidden_layer_size': hidden_layer_size, 'batch_size': batch_size, 'learning_rate': learning_rate, 'activation': activation, 'dropout_rate': dropout_rate}
                        best_model = model

print(f"Best Parameters: {best_params}")
print(f"Best F2 Micro Score: {best_f2_score:.4f}")

Training with hidden_layer_size=512, batch_size=1024, learning_rate=0.001, activation=relu, dropout_rate=0.3
Epoch 1/35
[1m1244/1244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step
Epoch 1: F2 Micro Score: 0.0607
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 99ms/step - binary_accuracy: 0.9507 - loss: 0.1253 - val_binary_accuracy: 0.9987 - val_loss: 0.0077
Epoch 2/35
[1m1244/1244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 932us/step
Epoch 2: F2 Micro Score: 0.3163
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 83ms/step - binary_accuracy: 0.9988 - loss: 0.0063 - val_binary_accuracy: 0.9989 - val_loss: 0.0050
Epoch 3/35
[1m1244/1244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 943us/step
Epoch 3: F2 Micro Score: 0.4066
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 84ms/step - binary_accuracy: 0.9990 - loss: 0.0043 - val_binary_accuracy: 0.9990 - val_loss: 0.0040
Epoch 4/35
[1m1244/1244[

In [None]:
# Load test data and make predictions
# Load test data and apply PCA
test_embeddings = np.load('../data/test_data.npy')

# Make predictions
test_predictions = model.predict(test_embeddings, verbose=1)

# Convert predictions to label format
test_pred_labels = [
    ';'.join(sorted([mlb.classes_[i] for i, prob in enumerate(pred) if prob > 0.5]))
    for pred in test_predictions
]

from datetime import datetime

# Specify path to save predictions with a unique filename
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
save_path = os.path.join("../data", f"test_predictions_{timestamp}.csv")

# Write predictions to a CSV file in the specified format
with open(save_path, mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['id', 'labels'])  # Write header
    for idx, labels in enumerate(test_pred_labels, start=1):
        writer.writerow([idx, labels])

print(f"Predictions saved to '{save_path}'")

[1m3110/3110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 3ms/step
Predictions saved to '../data\test_predictions_20241105_092037.csv'
