In [None]:
import utils
import pandas as pd
import torch
import torch.nn as nn
import numpy as np

In [None]:
# Input files, given relative to the notebook's working directory.
HPO_PATH = "data/hp.obo"                            # ontology file parsed by utils.read_hpo_from_obo
LABEVENTS_HPO_PATH = "data/OUT_LABEVENTS_HPO.csv"   # CSV read into labevents_df
DIAGNOSES_HPO_PATH = "data/DIAGNOSE_ICD_hpo.csv"    # CSV read into diagnoses_df


In [None]:
# Parse the ontology file with the project helper.
hpo = utils.read_hpo_from_obo(HPO_PATH)

# Read both CSV tables. Missing cells are replaced by '' so that the later
# `!= ''` checks can treat them as "no features in this row".
labevents_df, diagnoses_df = (
    pd.read_csv(csv_path).fillna('')
    for csv_path in (LABEVENTS_HPO_PATH, DIAGNOSES_HPO_PATH)
)


In [None]:
class Subject:
    """One subject and the HPO feature identifiers collected for it.

    Both feature sets start empty and are meant to be filled by the caller
    after construction.
    """

    def __init__(self, id):
        self.id = id
        # Feature identifiers gathered from the lab-events table.
        self.labevents: set[str] = set()
        # Feature identifiers gathered from the diagnoses table.
        self.diagnoses: set[str] = set()

    def labevent_vector(self, feature_list: list[str]):
        """Return a 0/1 list, aligned with ``feature_list``, marking lab-event features."""
        return [1 if name in self.labevents else 0 for name in feature_list]

    def diagnoses_vector(self, feature_list: list[str]):
        """Return a 0/1 list, aligned with ``feature_list``, marking diagnosis features."""
        return [1 if name in self.diagnoses else 0 for name in feature_list]

    def __repr__(self) -> str:
        return f'<Subject {self.id}>'


In [None]:
# Registry of every subject seen while reading the tables, keyed by subject id.
subjects: dict[int, Subject] = dict()
# Union of all HPO feature identifiers encountered for any subject.
all_present_hpo_features: set[str] = set()


In [None]:
# Collect the HPO features attached to each subject's lab events.
# Iterating over just the two needed columns avoids building a pandas Series
# for every row (which is what iterrows does).
for subject_id, raw_features in zip(labevents_df['subject_id'],
                                    labevents_df['selected_hpo_features']):
    # Empty cells were replaced by '' when the table was loaded.
    if raw_features == '':
        continue
    # Features are ';'-separated; drop empty tokens left by stray separators
    # so that '' never becomes a feature of its own.
    hpo_features = [feature for feature in raw_features.split(';') if feature]
    if not hpo_features:
        continue
    all_present_hpo_features.update(hpo_features)
    # Create the Subject only the first time its id is seen, instead of
    # constructing a throwaway instance on every row.
    subject = subjects.get(subject_id)
    if subject is None:
        subject = subjects[subject_id] = Subject(subject_id)
    subject.labevents.update(hpo_features)


In [None]:
# Collect the HPO features attached to each subject's diagnoses.
# Iterating over just the two needed columns avoids building a pandas Series
# for every row (which is what iterrows does).
for subject_id, raw_features in zip(diagnoses_df['subject_id'],
                                    diagnoses_df['hpo_features']):
    # Empty cells were replaced by '' when the table was loaded.
    if raw_features == '':
        continue
    # Features are ';'-separated; drop empty tokens left by stray separators
    # so that '' never becomes a feature of its own.
    hpo_features = [feature for feature in raw_features.split(';') if feature]
    if not hpo_features:
        continue
    all_present_hpo_features.update(hpo_features)
    # Create the Subject only the first time its id is seen, instead of
    # constructing a throwaway instance on every row.
    subject = subjects.get(subject_id)
    if subject is None:
        subject = subjects[subject_id] = Subject(subject_id)
    subject.diagnoses.update(hpo_features)


In [None]:
# Fix the column order used by every feature vector.
# Sorting makes the order reproducible: plain set iteration order for strings
# can differ between interpreter runs (hash randomization), which would silently
# change which column each feature maps to.
feature_list = sorted(all_present_hpo_features)
# Reverse lookup: HPO feature identifier -> column index in the vectors.
hpo_to_id = {feature: i for i, feature in enumerate(feature_list)}


## Model Creation

### Dataset Creation

In [None]:
# Build one multi-hot row per subject, using the column order of feature_list:
# lab-event features are the model input, diagnosis features are the target.
input_data: list[list[int]] = [subject.labevent_vector(feature_list) for subject in subjects.values()]
target_data: list[list[int]] = [subject.diagnoses_vector(feature_list) for subject in subjects.values()]

# torch.tensor with an explicit dtype replaces the legacy torch.FloatTensor constructor.
input_tensor = torch.tensor(input_data, dtype=torch.float32)
target_tensor = torch.tensor(target_data, dtype=torch.float32)

dataset = torch.utils.data.TensorDataset(input_tensor, target_tensor)

# Split into train / validation / test (70 % / 20 % / remainder).
train_size = int(len(dataset)*0.7)
val_size = int(len(dataset)*0.2)
test_size = len(dataset)-(train_size + val_size)

# A seeded generator keeps the split identical across kernel restarts, so the
# test set is always the same held-out data.
SPLIT_SEED = 42
train_set, val_set, test_set = torch.utils.data.random_split(
    dataset,
    [train_size, val_size, test_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

### Model generation

In [None]:
# Model hyperparameters
enlarging_factor = 1.2  # hidden-layer width as a multiple of the input width
# Input and output layers both have one unit per entry of feature_list.
input_size = output_size = len(feature_list)
hidden_size = int(enlarging_factor * input_size)

In [None]:
# Pick the device the network is trained on: GPU when available, CPU otherwise.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def _xavier_linear(in_features, out_features):
    """Create a Linear layer and re-initialise its weight with Xavier-uniform."""
    layer = nn.Linear(in_features, out_features)
    nn.init.xavier_uniform_(layer.weight)
    return layer


# Layers are created and initialised one after another, in this order.
l1 = _xavier_linear(input_size, hidden_size)
l2 = _xavier_linear(hidden_size, hidden_size)
l3 = _xavier_linear(hidden_size, output_size)

# Architecture: three linear layers, Tanh between them, Sigmoid on the output.
model = nn.Sequential(l1, nn.Tanh(), l2, nn.Tanh(), l3, nn.Sigmoid())
# Move the parameters to the selected device (the cell displays the model).
model.to(device)

In [None]:
# Training hyperparameters
batch_size = 8          # batch size of the training and validation loaders
num_epochs = 20         # number of passes over the training loader
learning_rate = 0.0001  # step size handed to the optimizer
log_rhythm = 5          # print running training metrics every `log_rhythm` iterations

In [None]:
# Optimizer: Adam over all model parameters, with the beta coefficients spelled out.
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, betas=(0.9, 0.999))

In [None]:
# Loss function.
# The targets are multi-hot vectors (several features can be present at once)
# and the model already ends in a Sigmoid, so every output is an independent
# probability. Binary cross-entropy is the matching loss. CrossEntropyLoss
# would apply a softmax across the already squashed outputs and treat the
# features as mutually exclusive classes.
loss_func = nn.BCELoss()

In [None]:
# definition of accuracy function
def calc_accuracy(output, target, eps: float = .00001) -> float:
    """Return a soft per-sample hit rate, averaged over the batch.

    For every row, the square roots of the outputs are summed over the
    positions selected by ``target`` and divided by the row's target sum.
    No threshold is applied, so this is a confidence-weighted score rather
    than a count of exact matches.

    Args:
        output: 2-D numpy array of non-negative model outputs (rows = samples).
        target: 2-D numpy array of the same shape holding the 0/1 targets.
        eps: small constant added to the denominator so that rows without any
            target feature yield 0 instead of a division by zero.

    Returns:
        The mean of the per-row scores.
    """
    number_of_features = target.sum(axis=1)
    correctly_identified = (target * np.sqrt(output)).sum(axis=1)
    return np.mean(correctly_identified / (number_of_features + eps))

In [None]:
# definition of real effect function
def real_effect(outputs, targets, threshold=0.5):
    """Print a hit / miss summary for the FIRST sample of a batch.

    An output counts as a positive prediction when it is ``>= threshold``.
    The same rule is used for correct diagnoses and for false positives, so an
    output exactly equal to the threshold is never left uncounted.

    Args:
        outputs: 2-D tensor (or array) of model outputs; only row 0 is inspected.
        targets: 2-D tensor (or array) of 0/1 targets; only row 0 is inspected.
        threshold: decision boundary for a positive prediction.

    Returns:
        None. The three counters are printed.
    """
    first_output = torch.as_tensor(outputs)[0].detach()
    first_target = torch.as_tensor(targets)[0].detach()

    # Boolean masks replace the per-element Python loop.
    predicted = first_output >= threshold
    present = first_target == 1
    absent = first_target == 0

    total_to_diagnose = int(present.sum())
    correct_diagnosed = int((predicted & present).sum())
    false_negative = int((~predicted & present).sum())
    false_positive = int((predicted & absent).sum())

    print("Correct diagnoses:" f'{correct_diagnosed}/{total_to_diagnose}')
    print("False positives:" f'{false_positive}')
    print("False negatives:" f'{false_negative}\n')
    

### Creating the data pipeline

In [None]:
# Create the data loaders.
# Training batches are reshuffled each epoch; validation keeps a fixed order.
train_loader = torch.utils.data.DataLoader(
    train_set,
    batch_size=batch_size,
    shuffle=True,
)
val_loader = torch.utils.data.DataLoader(
    val_set,
    batch_size=batch_size,
    shuffle=False,
)
# The test loader yields one (shuffled) sample at a time.
test_loader = torch.utils.data.DataLoader(
    test_set,
    batch_size=1,
    shuffle=True,
)

### Training

In [None]:
# Per-epoch validation metrics, kept across the whole training run.
val_loss_history = []
val_acc_history = []
for epoch in range(num_epochs):
    print(f'[Epoch {epoch+1}/{num_epochs}]')

    # Per-iteration training metrics of the current epoch.
    train_loss_history = []
    train_acc_history = []

    model.train()
    for i, (inputs, targets) in enumerate(train_loader, 1):
        inputs, targets = inputs.to(device), targets.to(device)

        # One optimisation step.
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = loss_func(outputs, targets)
        loss.backward()
        optimizer.step()

        # Record loss and accuracy of this batch.
        train_loss_history.append(loss.item())
        train_acc_history.append(calc_accuracy(outputs.cpu().detach().numpy(), targets.cpu().detach().numpy()))

        # Every log_rhythm iterations, report the mean over the last log_rhythm batches.
        if i % log_rhythm == 0:
            train_loss = np.mean(train_loss_history[-log_rhythm:])
            train_acc = np.mean(train_acc_history[-log_rhythm:])

            print(f'[Iteration {i}]\tTRAIN      loss/acc: {train_loss:.3f}\t{train_acc:.3f}')

    # Training loss and accuracy over the whole epoch.
    train_loss = np.mean(train_loss_history)
    train_acc = np.mean(train_acc_history)
    print(f'for this epoch:\tTRAIN      loss/acc: {train_loss:.3f}\t{train_acc:.3f}')

    # Validation after the epoch: evaluation mode and no gradient tracking,
    # so no autograd graph is built for batches that are never back-propagated.
    val_losses = []
    val_accs = []
    model.eval()
    with torch.no_grad():
        for inputs, targets in val_loader:
            inputs, targets = inputs.to(device), targets.to(device)

            outputs = model(inputs)
            loss = loss_func(outputs, targets)
            val_losses.append(loss.item())
            val_accs.append(calc_accuracy(outputs.cpu().numpy(), targets.cpu().numpy()))

    # Back to training mode for the next epoch.
    model.train()

    val_loss = np.mean(val_losses)
    val_acc = np.mean(val_accs)

    # Keep both validation metrics so they can be inspected after training.
    val_loss_history.append(val_loss)
    val_acc_history.append(val_acc)
    print(f'\t\tVALIDATION loss/acc: {val_loss:.3f}\t{val_acc:.3f}')
    # Summary for the first sample of the most recently processed batch
    # (the last validation batch whenever the validation loader is non-empty).
    real_effect(outputs, targets)
    print("\n")
    

### Testing

In [None]:
# Evaluate on the test set in evaluation mode and without gradient tracking.
# The previous mode is restored afterwards so re-running the training cell
# is unaffected.
was_training = model.training
model.eval()

test_accs = []
with torch.no_grad():
    for inputs, targets in test_loader:
        inputs, targets = inputs.to(device), targets.to(device)

        outputs = model(inputs)
        test_accs.append(calc_accuracy(outputs.cpu().numpy(), targets.cpu().numpy()))
        # Per-sample summary (the test loader yields one sample per batch).
        real_effect(outputs, targets)

model.train(was_training)

test_acc = np.mean(test_accs)

print(f'Test Accuracy: {test_acc:.3f}')

# Per-feature output and target of the last test sample that was processed.
for i, feature in enumerate(feature_list):
    print(f'{feature}\t{outputs[0,i]:.2f}\t{targets[0,i]}')