In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

In [2]:
# Step 1: Read and Parse ICD10 Codes from Multiple Files
unique_icd10_codes = set()

# List of files to read
icd_code_files = ['icd_codes_1.txt', 'icd_codes_2.txt']

# Read each file and collect unique ICD10 codes
for filename in icd_code_files:
    with open(filename, 'r') as f:
        for line in f:
            codes = line.strip().split(';')  # Split codes by semicolon
            unique_icd10_codes.update(codes)  # Add each code to the set

# Convert to a sorted list of unique ICD10 codes
icd10_codes = sorted(list(unique_icd10_codes))

In [3]:
# Step 2: Generate Hierarchical Vectors for Each Code
# Split ICD10 codes into hierarchical levels
primary_categories = [code[0] for code in icd10_codes]      # First character
subcategories = [code[1:3] for code in icd10_codes]         # Next two characters
suffixes = [code[3:] if len(code) > 3 else '' for code in icd10_codes]  # Remaining part, if any

# Combine hierarchical levels into a single array
hierarchy_data = np.array([primary_categories, subcategories, suffixes]).T

# Encode hierarchy levels into one-hot vectors
encoder = OneHotEncoder(sparse_output=False)
hierarchical_vectors = encoder.fit_transform(hierarchy_data)

# Step 3: Apply PCA to Reduce Dimensions for Label Embeddings
pca = PCA(n_components=5)
label_embeddings = pca.fit_transform(hierarchical_vectors)

# Map each ICD10 code to its embedding
code_to_embedding = {code: emb for code, emb in zip(icd10_codes, label_embeddings)}

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

# Load Embedding Data
X = np.concatenate([
    np.load('embeddings_1.npy'),  # shape should be (samples_1, 1024)
    np.load('embeddings_2.npy')   # shape should be (samples_2, 1024)
])

# Load and Process Labels
# Read the ICD10 labels from each file
icd_labels = []
for filename in ['icd_codes_1.txt', 'icd_codes_2.txt']:
    with open(filename, 'r') as f:
        for line in f:
            codes = line.strip().split(';')
            icd_labels.append(codes)

#Convert Labels to Multi-Hot Encoding
# Use MultiLabelBinarizer to encode ICD10 codes
mlb = MultiLabelBinarizer(classes=icd10_codes)  
y = mlb.fit_transform(icd_labels)  

In [5]:
# Split data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [31]:
from sklearn.utils import resample
import numpy as np

# Concatenate X and y along the feature axis to keep X and y together
Xy_train = np.hstack((X_train, y_train))

# Define the minimum number of samples for each label
min_samples = 10

for code_index in range(y_train.shape[1]):
    # Get samples where this specific label (ICD10 code) is present
    minority_samples = Xy_train[Xy_train[:, X_train.shape[1] + code_index] == 1]
    
    # Check if there are any minority samples and if oversampling is necessary
    if len(minority_samples) > 0 and len(minority_samples) < min_samples:
        num_to_add = min_samples - len(minority_samples)
        
        # Perform oversampling with replacement
        oversampled_samples = resample(
            minority_samples,
            replace=True,
            n_samples=num_to_add,
            random_state=42
        )
        
        # Stack the oversampled samples back into the dataset
        Xy_train = np.vstack([Xy_train, oversampled_samples])

# Split X and y back into separate variables
X_train_resampled = Xy_train[:, :X_train.shape[1]]
y_train_resampled = Xy_train[:, X_train.shape[1]:]


In [32]:
# Define the custom embedding loss function
def custom_embedding_loss(y_true, y_pred):
    # Cast y_true and y_pred to compatible dtypes
    y_true = tf.cast(y_true, tf.float32)
    y_pred = tf.cast(y_pred, tf.float32)
    
    # For each code in the label embedding, calculate the distance loss
    losses = tf.reduce_sum(tf.square(y_true - y_pred), axis=-1)
    
    # Take the mean loss across the batch
    return tf.reduce_mean(losses)

In [None]:
# Step 5: Define the Three-Layer Model
def create_model(input_dim, output_dim, learning_rate=0.0005):
    model = Sequential([
        Input(shape=(input_dim,)),
        Dense(512, activation='relu'),
        Dense(256, activation='relu'),
        Dense(128, activation='relu'),
        Dense(output_dim, activation='sigmoid')
    ])
    
    optimizer = Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss=custom_embedding_loss, metrics=['accuracy'])
    return model


input_dim = 1024  
output_dim = len(icd10_codes)  

model = create_model(input_dim, output_dim)

In [34]:
# Define callbacks
early_stopping = EarlyStopping(monitor='val_loss', patience=3)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2, min_lr=1e-6)

In [None]:
# Train the model (ensure X_train and y_train are defined)
history = model.fit(
    X_train_resampled, y_train_resampled,
    epochs=40,
    batch_size=128,
    validation_data=(X_val, y_val), 
    callbacks=[reduce_lr]
)

Epoch 1/5
[1m1252/1252[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 7ms/step - accuracy: 0.5952 - loss: 0.3477 - val_accuracy: 0.5682 - val_loss: 0.5939 - learning_rate: 1.0000e-06
Epoch 2/5
[1m1252/1252[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 6ms/step - accuracy: 0.5943 - loss: 0.3475 - val_accuracy: 0.5682 - val_loss: 0.5939 - learning_rate: 1.0000e-06
Epoch 3/5
[1m1252/1252[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 7ms/step - accuracy: 0.5921 - loss: 0.3512 - val_accuracy: 0.5683 - val_loss: 0.5940 - learning_rate: 1.0000e-06
Epoch 4/5
[1m1252/1252[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 7ms/step - accuracy: 0.5941 - loss: 0.3471 - val_accuracy: 0.5682 - val_loss: 0.5940 - learning_rate: 1.0000e-06
Epoch 5/5
[1m1252/1252[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 7ms/step - accuracy: 0.5939 - loss: 0.3490 - val_accuracy: 0.5681 - val_loss: 0.5940 - learning_rate: 1.0000e-06


In [None]:
# Load the test embeddings
test_embeddings = np.load('test_data.npy')

In [None]:
# Make predictions on the test data
y_test_pred = model.predict(test_embeddings)

[1m3110/3110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 1ms/step


In [47]:
for i in range(10000):
    if y_test_pred[i][1]>=0.4:
        print('yes')

yes


In [None]:
# Evaluate the model
# Predict on the validation set and compute micro F2 score
y_val_pred = model.predict(X_val) > 0.45 # Convert probabilities to binary predictions
micro_f2_score = f1_score(y_val, y_val_pred, average='micro')

print(f'Micro F2 Score on validation set: {micro_f2_score:.4f}')

[1m1244/1244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
Micro F2 Score on validation set: 0.8176


In [53]:
threshold = 0.45
test_labels_pred = (y_test_pred > threshold).astype(int)

In [54]:
# Define code_to_index and index_to_code mappings
code_to_index = {code: idx for idx, code in enumerate(icd10_codes)}
index_to_code = {idx: code for code, idx in code_to_index.items()}

In [None]:
# Create a submission file in the specified format
# Map indices back to ICD10 codes
index_to_code = {v: k for k, v in code_to_index.items()}

submission_data = []
for idx, label_vector in enumerate(test_labels_pred, start=1):
    # Get codes with predictions above the threshold and sort lexicographically
    codes = [index_to_code[i] for i, val in enumerate(label_vector) if val == 1]
    codes = sorted(codes)  # Sort lexicographically
    label_string = ';'.join(codes) if codes else ''  # Stitch with ';' or leave blank if no label
    submission_data.append({'id': idx, 'labels': label_string})

In [56]:
# Convert to DataFrame and save as CSV
submission_df = pd.DataFrame(submission_data)
submission_df

Unnamed: 0,id,labels
0,1,G56.21
1,2,M65.9;S83.242A
2,3,G56.01
3,4,M65.312
4,5,S83.241A;S83.281A
...,...,...
99485,99486,D12.0;D12.5;K57.30;K63.5;K64.9
99486,99487,K31.89;K90.0
99487,99488,D12.2;D12.5;K64.8;Z12.11
99488,99489,B96.81;K21.9;K29.50


In [None]:
submission_df.to_csv('hei_submission.csv', index=False)

print("Submission file 'hei_submission.csv' created successfully.")

Submission file 'submission.csv' created successfully.
