In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras import metrics  
from tensorflow.keras.layers import Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import pandas as pd
from tensorflow.keras.optimizers import Adam, RMSprop, AdamW

In [None]:
# Load the data
# Load embeddings and labels
embeddings_1 = np.load('embeddings_1.npy')
embeddings_2 = np.load('embeddings_2.npy')
embeddings = np.vstack([embeddings_1, embeddings_2])  # Combine both embedding files

In [3]:
# Load labels and convert them to multi-hot encoding
with open('icd_codes_1.txt') as f1, open('icd_codes_2.txt') as f2:
    labels_1 = [line.strip().split(';') for line in f1]
    labels_2 = [line.strip().split(';') for line in f2]
    labels = labels_1 + labels_2

In [4]:
# Create a mapping for ICD10 codes to multi-hot encoding
unique_codes = sorted(set(code for sublist in labels for code in sublist))
code_to_index = {code: idx for idx, code in enumerate(unique_codes)}
num_classes = len(unique_codes)

In [5]:
# Convert labels to multi-hot vectors
y = np.zeros((len(labels), num_classes), dtype=int)
for i, label_list in enumerate(labels):
    for code in label_list:
        y[i, code_to_index[code]] = 1

In [6]:
# Split data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(embeddings, y, test_size=0.2, random_state=42)

In [None]:
# Define the model
model = Sequential([
    Dense(512, activation='relu', input_shape=(1024,)),
    Dropout(0.3),
    Dense(256, activation='relu'),
    Dropout(0.3),
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(num_classes, activation='sigmoid')  # Sigmoid for multi-label classification
])

model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy',metrics.Precision(),metrics.Recall()])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [8]:
# Step 3: Train the model
model.fit(X_train, y_train, epochs=3, batch_size=128, validation_data=(X_val, y_val))

Epoch 1/3
[1m1244/1244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 8ms/step - accuracy: 0.1308 - loss: 0.0347 - precision: 0.0501 - recall: 0.1160 - val_accuracy: 0.4943 - val_loss: 0.0030 - val_precision: 0.7978 - val_recall: 0.5311
Epoch 2/3
[1m1244/1244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 19ms/step - accuracy: 0.4626 - loss: 0.0036 - precision: 0.7832 - recall: 0.4308 - val_accuracy: 0.5493 - val_loss: 0.0024 - val_precision: 0.8208 - val_recall: 0.6372
Epoch 3/3
[1m1244/1244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 24ms/step - accuracy: 0.5129 - loss: 0.0030 - precision: 0.8055 - recall: 0.5177 - val_accuracy: 0.5586 - val_loss: 0.0022 - val_precision: 0.8307 - val_recall: 0.6608


<keras.src.callbacks.history.History at 0x268d3e85810>

In [None]:
# Load the test embeddings
test_embeddings = np.load('test_data.npy')

In [None]:
# Make predictions on the test data
# Load the model (assuming you've already trained and saved it if needed)
y_test_pred = model.predict(test_embeddings)

[1m3110/3110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 1ms/step


In [11]:
for i in range(10000):
    if y_test_pred[i][938]>=0.42:
        print('yes')

In [None]:
# Evaluate the model
# Predict on the validation set and compute micro F2 score
y_val_pred = model.predict(X_val) > 0.43 # Convert probabilities to binary predictions
micro_f2_score = f1_score(y_val, y_val_pred, average='micro')

print(f'Micro F2 Score on validation set: {micro_f2_score:.4f}')


[1m1244/1244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
Micro F2 Score on validation set: 0.7397


In [None]:
# Convert predictions to ICD10 code labels
# Apply threshold to determine which codes to include (e.g., 0.5 threshold)
threshold = 0.42
test_labels_pred = (y_test_pred > threshold).astype(int)

In [15]:
test_labels_pred.shape

(99490, 1400)

In [None]:
# Create a submission file in the specified format
# Map indices back to ICD10 codes
index_to_code = {v: k for k, v in code_to_index.items()}

submission_data = []
for idx, label_vector in enumerate(test_labels_pred, start=1):
    # Get codes with predictions above the threshold and sort lexicographically
    codes = [index_to_code[i] for i, val in enumerate(label_vector) if val == 1]
    codes = sorted(codes)  # Sort lexicographically
    label_string = ';'.join(codes) if codes else ''  # Stitch with ';' or leave blank if no label
    submission_data.append({'id': idx, 'labels': label_string})

In [21]:
# Convert to DataFrame and save as CSV
submission_df = pd.DataFrame(submission_data)
submission_df

Unnamed: 0,id,labels
0,1,G56.21
1,2,M65.9;S83.242A
2,3,G56.01
3,4,M65.312
4,5,S83.241A;S83.281A
...,...,...
99485,99486,D12.0;K57.30;K63.5
99486,99487,K31.89
99487,99488,D12.2;D12.5;K64.8;Z12.11
99488,99489,K21.9;K29.50


In [22]:
submission_df.to_csv('submission.csv', index=False)

print("Submission file 'submission.csv' created successfully.")

Submission file 'submission.csv' created successfully.
