In [None]:
import numpy as np
import os
import csv
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import fbeta_score
from tensorflow.keras import models, layers, optimizers
from tensorflow.keras.callbacks import Callback

# Parameters
EMBEDDING_DIM = 1024          # Embedding dimensions given in data
EPOCHS = 15

# The training set is split across two (embeddings, labels) file pairs.
# Row i of an embeddings file corresponds to line i of its labels file.
embeddings_file_1 = '../data/embeddings_1.npy'
labels_file_1 = '../data/icd_codes_1.txt'
embeddings_file_2 = '../data/embeddings_2.npy'
labels_file_2 = '../data/icd_codes_2.txt'


def _read_label_file(path):
    """Return one list of codes per line of *path* (codes are ';'-separated).

    Every line yields exactly one entry so that rows stay aligned with the
    embeddings. Empty fragments (blank line, trailing ';') are dropped so that
    the empty string never becomes a label class of its own.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return [[code for code in line.strip().split(';') if code] for line in f]


train_embeddings_1 = np.load(embeddings_file_1)
train_embeddings_2 = np.load(embeddings_file_2)

train_labels_1 = _read_label_file(labels_file_1)
train_labels_2 = _read_label_file(labels_file_2)

# Fail early on misaligned inputs: a row-count mismatch would otherwise pair
# embeddings with the wrong labels or surface later as an obscure shape error.
for _emb, _lab, _emb_path, _lab_path in (
    (train_embeddings_1, train_labels_1, embeddings_file_1, labels_file_1),
    (train_embeddings_2, train_labels_2, embeddings_file_2, labels_file_2),
):
    if len(_emb) != len(_lab):
        raise ValueError(
            f"Row count mismatch: {_emb_path} has {len(_emb)} rows "
            f"but {_lab_path} has {len(_lab)} lines"
        )

# Concatenate both parts (same order for embeddings and labels)
train_embeddings = np.concatenate([train_embeddings_1, train_embeddings_2], axis=0)
train_labels = train_labels_1 + train_labels_2

# Multi-hot encode labels: one column per distinct code seen in the training labels
mlb = MultiLabelBinarizer()
train_multi_hot_labels = mlb.fit_transform(train_labels)

# Define F2 Micro Score Callback
class F2MicroScore(Callback):
    """Keras callback that reports the micro-averaged F2 score after each epoch.

    At the end of every epoch the model's outputs for ``x_val`` are binarized
    with ``threshold`` and compared against ``y_val``.

    Args:
        x_val: Validation inputs passed to ``model.predict``.
        y_val: Multi-hot ground-truth labels for ``x_val``.
        threshold: Outputs strictly greater than this value count as a
            predicted label. Defaults to 0.5.
    """

    def __init__(self, x_val, y_val, threshold=0.5):
        super().__init__()
        self.x_val = x_val
        self.y_val = y_val
        self.threshold = threshold

    def on_epoch_end(self, epoch, logs=None):
        """Compute, record and print the F2 micro score for the finished epoch."""
        # verbose=0 keeps predict's own progress bar from interleaving with
        # the progress bar that fit() is drawing for the current epoch.
        probabilities = self.model.predict(self.x_val, verbose=0)
        val_pred = (probabilities > self.threshold).astype(int)  # Binarize predictions
        # zero_division=0 returns 0 without a warning when nothing is predicted yet
        f2 = fbeta_score(self.y_val, val_pred, beta=2, average='micro', zero_division=0)
        if logs is not None:
            # Expose the score to callbacks that run after this one
            logs['val_f2_micro'] = float(f2)
        print(f"Epoch {epoch + 1}: F2 Micro Score: {f2:.4f}")  # Print F2 score

# Hold out the last 20% of the samples for validation; the first 80% are used
# for training. The cut is positional (no shuffling) and is applied identically
# to the embeddings and to the multi-hot labels so rows stay paired.
train_size = int(0.8 * len(train_embeddings))
train_part = slice(None, train_size)
val_part = slice(train_size, None)
x_train, y_train = train_embeddings[train_part], train_multi_hot_labels[train_part]
x_val, y_val = train_embeddings[val_part], train_multi_hot_labels[val_part]

# Define the neural network model
def create_model(hidden_layer_sizes=(1024, 512), dropout_rate=0.3, learning_rate=0.001):
    """Build and compile a feed-forward multi-label classifier.

    The network takes an ``EMBEDDING_DIM``-dimensional input, applies one
    ReLU ``Dense`` layer followed by ``Dropout`` per entry of
    ``hidden_layer_sizes``, and ends with a sigmoid layer that has one unit
    per class in ``mlb.classes_``.

    Args:
        hidden_layer_sizes: Iterable with the width of each hidden layer, in
            order. The default is a tuple so it cannot be mutated between calls.
        dropout_rate: Dropout rate applied after every hidden layer.
        learning_rate: Learning rate for the Adam optimizer.

    Returns:
        The model, compiled with binary cross-entropy loss and the
        ``binary_accuracy`` metric.
    """
    model = models.Sequential()
    model.add(layers.Input(shape=(EMBEDDING_DIM,)))
    for size in hidden_layer_sizes:
        model.add(layers.Dense(size, activation='relu'))
        model.add(layers.Dropout(dropout_rate))
    # One neuron per label with sigmoid activation (independent per-label probabilities)
    model.add(layers.Dense(len(mlb.classes_), activation='sigmoid'))
    optimizer = optimizers.Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['binary_accuracy'])
    return model

# Define the parameter grid
param_grid = {
    'hidden_layer_sizes': [[2048, 1024], [1024, 512], [512, 256], [256, 128]],
    'batch_size': [512, 1024, 2048]
}

# Outputs strictly above this value count as a predicted label
PREDICTION_THRESHOLD = 0.5

# Perform grid search manually: train one model per combination and keep the
# one with the highest F2 micro score on the held-out (x_val, y_val) split.
best_f2_score = 0
best_params = None
best_model = None

for hidden_layer_sizes in param_grid['hidden_layer_sizes']:
    for batch_size in param_grid['batch_size']:
        print(f"Training with hidden_layer_sizes={hidden_layer_sizes} and batch_size={batch_size}")
        model = create_model(hidden_layer_sizes=hidden_layer_sizes)
        # Validate on the explicit hold-out set. Using validation_split here
        # would carve a second 20% out of x_train, shrinking the training data
        # and reporting val_loss on different samples than the F2 score.
        model.fit(x_train, y_train, epochs=EPOCHS, batch_size=batch_size,
                  validation_data=(x_val, y_val), verbose=1,
                  callbacks=[F2MicroScore(x_val, y_val)])
        val_pred = (model.predict(x_val) > PREDICTION_THRESHOLD).astype(int)
        f2 = fbeta_score(y_val, val_pred, beta=2, average='micro')
        print(f"F2 Micro Score: {f2:.4f}")
        # Always accept the first model so best_model is never None, even if
        # every configuration scores 0.
        if best_model is None or f2 > best_f2_score:
            best_f2_score = f2
            best_params = {'hidden_layer_sizes': hidden_layer_sizes, 'batch_size': batch_size}
            best_model = model

print(f"Best Parameters: {best_params}")
print(f"Best F2 Micro Score: {best_f2_score:.4f}")

# Load test data and make predictions
test_embeddings = np.load('../data/test_data.npy')
test_predictions = best_model.predict(test_embeddings)

# Convert each row of probabilities into a ';'-joined, sorted string of the
# labels whose probability exceeds the threshold
test_pred_labels = [
    ';'.join(sorted(mlb.classes_[row_mask]))
    for row_mask in test_predictions > PREDICTION_THRESHOLD
]

# Output path for the predictions file
downloads_path = '../data/test_predictions19.csv'

# Write predictions to a CSV file with an 'id,labels' header; ids start at 1
with open(downloads_path, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(['id', 'labels'])  # Write header
    for idx, labels in enumerate(test_pred_labels, start=1):
        writer.writerow([idx, labels])

print(f"Predictions saved to '{downloads_path}'")

Training with hidden_layer_sizes=[2048, 1024] and batch_size=512
Epoch 1/15
[1m1244/1244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 4ms/step
Epoch 1: F2 Micro Score: 0.3288
[1m249/249[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 124ms/step - binary_accuracy: 0.9807 - loss: 0.0535 - val_binary_accuracy: 0.9990 - val_loss: 0.0044
Epoch 2/15
[1m1244/1244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step
Epoch 2: F2 Micro Score: 0.5499
[1m249/249[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 114ms/step - binary_accuracy: 0.9990 - loss: 0.0035 - val_binary_accuracy: 0.9992 - val_loss: 0.0027
Epoch 3/15
[1m1244/1244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step
Epoch 3: F2 Micro Score: 0.6093
[1m249/249[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 113ms/step - binary_accuracy: 0.9993 - loss: 0.0024 - val_binary_accuracy: 0.9993 - val_loss: 0.0023
Epoch 4/15
[1m1244/1244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1