In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from tqdm import tqdm
import os



In [None]:
# Load the data
# Load embeddings and labels
embeddings_1 = np.load('embeddings_1.npy')
embeddings_2 = np.load('embeddings_2.npy')
embeddings = np.vstack([embeddings_1, embeddings_2])  # Combine both embedding files

In [3]:
# Load labels and convert them to multi-hot encoding
with open('icd_codes_1.txt') as f1, open('icd_codes_2.txt') as f2:
    labels_1 = [line.strip().split(';') for line in f1]
    labels_2 = [line.strip().split(';') for line in f2]
    labels = labels_1 + labels_2

In [4]:
# Create a mapping for ICD10 codes to multi-hot encoding
unique_codes = sorted(set(code for sublist in labels for code in sublist))
code_to_index = {code: idx for idx, code in enumerate(unique_codes)}
num_classes = len(unique_codes)

In [5]:
# Convert labels to multi-hot vectors
label_encoded = np.zeros((len(labels), num_classes), dtype=int)
for i, label_list in enumerate(labels):
    for code in label_list:
        label_encoded[i, code_to_index[code]] = 1

In [71]:
# Custom Dataset class
class HealthRecordDataset(Dataset):
    def __init__(self, embeddings, labels):
        self.embeddings = torch.FloatTensor(embeddings)
        self.labels = torch.FloatTensor(labels)
    
    def __len__(self):
        return len(self.embeddings)
    
    def __getitem__(self, idx):
        return self.embeddings[idx], self.labels[idx]

# Neural Network Architecture
class MultiLabelClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dims, output_dim, dropout_rate=0.15):
        super(MultiLabelClassifier, self).__init__()
        
        layers = []
        prev_dim = input_dim
        
        # Create hidden layers
        for hidden_dim in hidden_dims:
            layers.extend([
                nn.Linear(prev_dim, hidden_dim),
                nn.BatchNorm1d(hidden_dim),
                nn.ReLU(),
                nn.Dropout(dropout_rate)
            ])
            prev_dim = hidden_dim
        
        # Output layer
        layers.append(nn.Linear(prev_dim, output_dim))
        layers.append(nn.Sigmoid())
        
        self.model = nn.Sequential(*layers)
    
    def forward(self, x):
        return self.model(x)



In [21]:
# Training function
def train_model(model, train_loader, val_loader, criterion, optimizer, device, num_epochs, patience=8):
    train_losses = []
    val_losses = []
    best_val_loss = float('inf')
    patience_counter = 0
    
    for epoch in range(num_epochs):
        # Training phase
        model.train()
        total_train_loss = 0
        train_batches = 0
        
        for inputs, labels in tqdm(train_loader, desc=f'Epoch {epoch+1}/{num_epochs}'):
            inputs, labels = inputs.to(device), labels.to(device)
            
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            
            loss.backward()
            optimizer.step()
            
            total_train_loss += loss.item()
            train_batches += 1
        
        avg_train_loss = total_train_loss / train_batches
        train_losses.append(avg_train_loss)
        
        # Validation phase
        model.eval()
        total_val_loss = 0
        val_batches = 0
        
        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                
                total_val_loss += loss.item()
                val_batches += 1
        
        avg_val_loss = total_val_loss / val_batches
        val_losses.append(avg_val_loss)
        
        print(f'Epoch {epoch+1}/{num_epochs}:')
        print(f'Training Loss: {avg_train_loss:.4f}')
        print(f'Validation Loss: {avg_val_loss:.4f}')
        
        # Early stopping check
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            patience_counter = 0
            # Save best model
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'train_loss': avg_train_loss,
                'val_loss': avg_val_loss,
            }, 'best_model.pth')
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(f'Early stopping triggered after {epoch+1} epochs')
                break
    
    return train_losses, val_losses



In [None]:
# Main execution
def main(embeddings, label_encoded):
    
    # Set random seed for reproducibility
    torch.manual_seed(42)
    np.random.seed(42)
    
    # Split the data
    X_train, X_val, y_train, y_val = train_test_split(
        embeddings, label_encoded, test_size=0.13, random_state=101
    )
    
    # Create datasets
    train_dataset = HealthRecordDataset(X_train, y_train)
    val_dataset = HealthRecordDataset(X_val, y_val)
    
    # Create dataloaders
    train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=128, shuffle=False)
    
    # Initialize model
    input_dim = embeddings.shape[1]
    hidden_dims = [512, 256, 128]  
    output_dim = label_encoded.shape[1]  
    
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = MultiLabelClassifier(input_dim, hidden_dims, output_dim).to(device)
    
    # Define loss function and optimizer
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.01)
    
    # Train the model
    train_losses, val_losses = train_model(
        model=model,
        train_loader=train_loader,
        val_loader=val_loader,
        criterion=criterion,
        optimizer=optimizer,
        device=device,
        num_epochs=50,  # Adjust as needed
        patience=5
    )
    
    # Plot training history
    plt.figure(figsize=(10, 6))
    plt.plot(train_losses, label='Training Loss')
    plt.plot(val_losses, label='Validation Loss')
    plt.title('Training History')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    plt.savefig('training_history.png')
    plt.close()



In [None]:
# Function to make predictions
def predict(model, input_data, device):
    model.eval()
    with torch.no_grad():
        input_tensor = torch.FloatTensor(input_data).to(device)
        predictions = model(input_tensor)
    return predictions.cpu().numpy()

if __name__ == "__main__":
    
    main(embeddings, label_encoded)

Epoch 1/30: 100%|██████████| 1353/1353 [01:31<00:00, 14.74it/s]


Epoch 1/30:
Training Loss: 0.0051
Validation Loss: 0.0024


Epoch 2/30: 100%|██████████| 1353/1353 [01:32<00:00, 14.61it/s]


Epoch 2/30:
Training Loss: 0.0023
Validation Loss: 0.0020


Epoch 3/30: 100%|██████████| 1353/1353 [01:31<00:00, 14.71it/s]


Epoch 3/30:
Training Loss: 0.0020
Validation Loss: 0.0019


Epoch 4/30: 100%|██████████| 1353/1353 [01:34<00:00, 14.30it/s]


Epoch 4/30:
Training Loss: 0.0019
Validation Loss: 0.0019


Epoch 5/30: 100%|██████████| 1353/1353 [01:32<00:00, 14.64it/s]


Epoch 5/30:
Training Loss: 0.0018
Validation Loss: 0.0018


Epoch 6/30: 100%|██████████| 1353/1353 [01:33<00:00, 14.40it/s]


Epoch 6/30:
Training Loss: 0.0016
Validation Loss: 0.0018


Epoch 7/30: 100%|██████████| 1353/1353 [01:34<00:00, 14.25it/s]


Epoch 7/30:
Training Loss: 0.0015
Validation Loss: 0.0017


Epoch 8/30: 100%|██████████| 1353/1353 [01:36<00:00, 13.99it/s]


Epoch 8/30:
Training Loss: 0.0015
Validation Loss: 0.0017


Epoch 9/30: 100%|██████████| 1353/1353 [01:40<00:00, 13.48it/s]


Epoch 9/30:
Training Loss: 0.0014
Validation Loss: 0.0017


Epoch 10/30: 100%|██████████| 1353/1353 [01:38<00:00, 13.73it/s]


Epoch 10/30:
Training Loss: 0.0013
Validation Loss: 0.0017


Epoch 11/30: 100%|██████████| 1353/1353 [01:41<00:00, 13.36it/s]


Epoch 11/30:
Training Loss: 0.0012
Validation Loss: 0.0017


Epoch 12/30: 100%|██████████| 1353/1353 [01:41<00:00, 13.30it/s]


Epoch 12/30:
Training Loss: 0.0012
Validation Loss: 0.0017


Epoch 13/30: 100%|██████████| 1353/1353 [01:39<00:00, 13.58it/s]


Epoch 13/30:
Training Loss: 0.0011
Validation Loss: 0.0018


Epoch 14/30: 100%|██████████| 1353/1353 [01:41<00:00, 13.39it/s]


Epoch 14/30:
Training Loss: 0.0011
Validation Loss: 0.0018


Epoch 15/30: 100%|██████████| 1353/1353 [01:41<00:00, 13.29it/s]


Epoch 15/30:
Training Loss: 0.0010
Validation Loss: 0.0018


Epoch 16/30: 100%|██████████| 1353/1353 [01:42<00:00, 13.20it/s]


Epoch 16/30:
Training Loss: 0.0010
Validation Loss: 0.0019


Epoch 17/30: 100%|██████████| 1353/1353 [01:44<00:00, 12.99it/s]


Epoch 17/30:
Training Loss: 0.0009
Validation Loss: 0.0020
Early stopping triggered after 17 epochs


In [77]:
from sklearn.preprocessing import MultiLabelBinarizer
import pandas as pd

data_points = []
with open('icd_codes_1.txt', 'r') as f:
    for line in f:
        labels = [label.strip() for label in line.strip().split(';')]
        data_points.append(labels)
        
with open('icd_codes_2.txt', 'r') as f:
    for line in f:
        labels = [label.strip() for label in line.strip().split(';')]
        data_points.append(labels)


mlb = MultiLabelBinarizer(sparse_output=True)
target_matrix = mlb.fit_transform(data_points)
target_df = pd.DataFrame.sparse.from_spmatrix(
    target_matrix,
    columns=mlb.classes_
)

print(f"DataFrame shape: {target_df.shape}")
print(target_df.iloc[:5, :5])

DataFrame shape: (198982, 1400)
   A63.0  B07.0  B07.9  B35.1  B37.81
0      0      0      0      0       0
1      0      0      0      0       0
2      0      0      0      0       0
3      0      0      0      0       0
4      0      0      0      0       0


In [None]:
# Load the test embeddings
test_embeddings = np.load('test_data.npy')

In [None]:
import torch
import numpy as np
import pandas as pd
from torch import nn

# Define the model architecture again since we need it to load the state dict
class MultiLabelClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dims, output_dim, dropout_rate=0.15):
        super(MultiLabelClassifier, self).__init__()
        
        layers = []
        prev_dim = input_dim
        
        for hidden_dim in hidden_dims:
            layers.extend([
                nn.Linear(prev_dim, hidden_dim),
                nn.BatchNorm1d(hidden_dim),
                nn.ReLU(),
                nn.Dropout(dropout_rate)
            ])
            prev_dim = hidden_dim
        
        layers.append(nn.Linear(prev_dim, output_dim))
        layers.append(nn.Sigmoid())
        
        self.model = nn.Sequential(*layers)
    
    def forward(self, x):
        return self.model(x)

# Function to make predictions with threshold
def predict_with_threshold(model, test_embeddings, threshold=0.5, batch_size=128, device='cuda'):
    model.eval()
    predictions = []
    
    # Convert to torch tensor
    test_tensor = torch.FloatTensor(test_embeddings)
    
    # Process in batches to avoid memory issues
    with torch.no_grad():
        for i in range(0, len(test_tensor), batch_size):
            batch = test_tensor[i:i + batch_size].to(device)
            batch_preds = model(batch)
            # Convert probabilities to binary predictions using threshold
            binary_preds = (batch_preds >= threshold).float()
            predictions.append(binary_preds.cpu().numpy())
    
    # Concatenate all batch predictions
    predictions = np.vstack(predictions)
    return predictions

def main():
    # Set device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")
    
    # Model parameters (same as training)
    input_dim = 1024
    hidden_dims = [512, 256, 128]
    output_dim = 1400
    
    # Initialize model
    model = MultiLabelClassifier(input_dim, hidden_dims, output_dim).to(device)
    
    # Load the saved model
    checkpoint = torch.load('best_model.pth', map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    print("Model loaded successfully")

    # test embedding shape
    print(f"Test embeddings shape: {test_embeddings.shape}")
    
    # Make predictions
    predictions = predict_with_threshold(model, test_embeddings, threshold=0.45, device=device)
    print(f"Predictions shape: {predictions.shape}")
    
    # Save predictions to CSV
    # Convert to dataframe with column names
    df_predictions = pd.DataFrame(
        predictions,
        columns=[f'label_{i}' for i in range(predictions.shape[1])]
    )
    
    # Save to CSV
    df_predictions.to_csv('test_predictions.csv', index=False)
    print("Predictions saved to test_predictions.csv")

if __name__ == "__main__":
    main()

Using device: cpu
Model loaded successfully
Test embeddings shape: (99490, 1024)


  checkpoint = torch.load('best_model.pth', map_location=device)


Predictions shape: (99490, 1400)
Predictions saved to test_predictions.csv


In [None]:
# Create a submission file in the specified format
# Map indices back to ICD10 codes
index_to_code = {v: k for k, v in code_to_index.items()}
test_labels_pred = pd.read_csv('test_predictions.csv')

In [81]:
test_labels_pred = test_labels_pred.to_numpy()

In [82]:
test_labels_pred.shape

(99490, 1400)

In [83]:
test_labels_pred

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [84]:
submission_data = []
for idx, label_vector in enumerate(test_labels_pred, start=1):
    # Get codes with predictions above the threshold and sort lexicographically
    codes = [index_to_code[i] for i, val in enumerate(label_vector) if val == 1]
    codes = sorted(codes)  # Sort lexicographically
    label_string = ';'.join(codes) if codes else ''  # Stitch with ';' or leave blank if no label
    submission_data.append({'id': idx, 'labels': label_string})

In [85]:
# Convert to DataFrame and save as CSV
submission_df = pd.DataFrame(submission_data)
submission_df

Unnamed: 0,id,labels
0,1,G56.21
1,2,M65.9;S83.242A
2,3,G56.01
3,4,M65.311;M65.312
4,5,S83.241A;S83.281A
...,...,...
99485,99486,K57.30;K63.5;K64.9
99486,99487,K29.50;K31.89;K90.0
99487,99488,D12.2;D12.5;K64.8;Z12.11
99488,99489,B96.81;K21.9;K29.50


In [86]:
submission_df.to_csv('submission.csv', index=False)

print("Submission file 'submission.csv' created successfully.")

Submission file 'submission.csv' created successfully.
