In [None]:
import json
import sys
import collections
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
sys.path.append('../src')
import cb_utils

In [None]:
# Load every row of junk.cui_ndc_hcc_dataset through the project helper.
# NOTE(review): sql_query_to_df presumably returns a pandas DataFrame (later cells call
# .head() and .iterrows() on `raw`) — confirm against cb_utils.
raw = cb_utils.sql_query_to_df('SELECT * FROM junk.cui_ndc_hcc_dataset;')

In [None]:
# Row count of the raw table; it sizes the first axis of X and Y further down
n_samples = raw.shape[0]
n_samples

In [None]:
# Preview the first rows of the raw table
raw.head()

In [None]:
# raw.to_csv('data/20210823_raw.csv')

### Create lookups for CUIs and HCCs

In [None]:
# Tally how many samples' rscuis lists mention each CUI.
# Rows holding None add nothing, which is what Counter.update(None) did implicitly.
cuis = collections.Counter(
    cui
    for row_cuis in raw.rscuis
    if row_cuis is not None
    for cui in row_cuis
)

In [None]:
# Number of distinct CUIs counted above; used as the input width of X and of the model
n_cuis = len(cuis)
n_cuis  

In [None]:
# raw.categories.apply(lambda x: len(x) if x is not None else 0).describe()

In [None]:
# Distribution of the number of CUIs per sample
raw.rscuis.apply(len).describe()

In [None]:
# list(reversed(cuis.most_common(1000)))[:10]

In [None]:
# Column index -> CUI, with index 0 assigned to the most frequent CUI
cui_lookup = dict(enumerate(cui for cui, _ in cuis.most_common()))

In [None]:
# Inverse mapping: CUI -> column index
cui_idx_lookup = dict(zip(cui_lookup.values(), cui_lookup.keys()))

In [None]:
# Tally how many samples carry each HCC category.
# Samples whose categories value is None are skipped explicitly; the loop form relied on
# Counter.update(None) being a silent no-op for the same effect.
hccs = collections.Counter(
    hcc
    for row_hccs in raw.categories
    if row_hccs is not None
    for hcc in row_hccs
)

In [None]:
# Number of distinct HCC categories counted above; used as the width of Y and of the model output
n_hccs = len(hccs)
n_hccs

In [None]:
# hccs.most_common()

In [None]:
# Column index -> HCC (index 0 is the most frequent category), plus the inverse HCC -> index
hcc_lookup = dict(enumerate(hcc for hcc, _ in hccs.most_common()))
hcc_idx_lookup = dict(zip(hcc_lookup.values(), hcc_lookup.keys()))

In [None]:
# Dense multi-hot matrices: one row per sample, one column per CUI (X) or per HCC (Y).
# float32 is what the network consumes (inputs are cast to torch.float before the forward
# pass), it keeps the labels in the same precision as the logits, and it halves the memory
# of these (n_samples x n_cuis) / (n_samples x n_hccs) arrays versus NumPy's float64 default.
X = np.zeros((n_samples, n_cuis), dtype=np.float32)
Y = np.zeros((n_samples, n_hccs), dtype=np.float32)

In [None]:
# Multi-hot encode every sample into X (CUIs) and Y (HCC categories).
# enumerate() supplies the positional row number, so the fill no longer depends on raw's
# index labels being exactly 0..n_samples-1 (iterrows() yields labels, not positions).
# Zipping the two columns also avoids building a Series object per row.
for row_pos, (row_cuis, row_hccs) in enumerate(zip(raw.rscuis, raw.categories)):
    for cui in row_cuis:
        X[row_pos, cui_idx_lookup[cui]] = 1

    # categories can be None for a sample; its Y row then stays all zeros
    if row_hccs is not None:
        for hcc in row_hccs:
            Y[row_pos, hcc_idx_lookup[hcc]] = 1

In [None]:
# np.save('data/X.npy', X)
# np.save('data/Y.npy', Y)

### Train / validation / test split

In [None]:
# Hold out 20% of all samples as the test set, then take 10% of what is left as validation.
X_trainval, X_test, y_trainval, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_trainval, y_trainval, test_size=0.1, random_state=42)
# The intermediate pool is only needed for the second split; drop it to free the memory.
del X_trainval, y_trainval

### MLP

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
    

In [None]:
class AverageMeter():
    """Running weighted mean: keeps the latest value, the weighted total, the weight and their ratio."""

    def __init__(self):
        self.reset()

    def reset(self):
        # All four statistics start from zero
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        # `val` is recorded as the latest value and contributes with weight `n` to the mean
        self.sum += val * n
        self.count += n
        self.val, self.avg = val, self.sum / self.count


def train_loop(model, X_train, y_train, batch_size, loss_fn=None, opt=None):
    """Run one training epoch over sequential mini-batches and return the mean loss.

    Args:
        model: torch module to optimise; it is switched to training mode first so
            Dropout / BatchNorm layers behave as training layers.
        X_train: feature array exposing ``.shape`` and row slicing.
        y_train: target array aligned row-for-row with ``X_train``.
        batch_size: rows per mini-batch. Only full batches are used; the trailing
            ``X_train.shape[0] % batch_size`` rows are skipped.
        loss_fn: callable ``loss_fn(output, target)``. Defaults to the notebook-level
            ``criterion``.
        opt: optimizer to step. Defaults to the notebook-level ``optimizer``.

    Returns:
        Sample-weighted mean of the per-batch losses (0 when no full batch fits).
    """
    # Fall back to the notebook globals only when the caller did not pass overrides
    loss_fn = criterion if loss_fn is None else loss_fn
    opt = optimizer if opt is None else opt

    # Make the mode explicit: a previous evaluation may have left the model in eval mode
    model.train()

    losses = AverageMeter()
    last_loss = float('nan')  # stays NaN if the data is smaller than one batch
    for i in range(X_train.shape[0] // batch_size):
        batch_start = i * batch_size
        batch_end = batch_start + batch_size

        x = torch.tensor(X_train[batch_start: batch_end], dtype=torch.float)
        # Cast the targets too, so float64 label arrays match the float32 logits
        y = torch.tensor(y_train[batch_start: batch_end], dtype=torch.float)

        output = model(x)
        loss = loss_fn(output, y)
        opt.zero_grad()
        loss.backward()
        opt.step()

        last_loss = loss.item()
        losses.update(last_loss, output.shape[0])

    # Reports the loss of the final batch, not the epoch average
    print('Loss: {:.3f}'.format(last_loss))

    return losses.avg
        
def validation_loop(model, X_val, y_val, batch_size, loss_fn=None):
    """Compute the mean loss of ``model`` on a held-out set without updating it.

    The model is evaluated in eval mode (Dropout off, BatchNorm using its running
    statistics) and under ``torch.no_grad()``; the mode it was in on entry is
    restored before returning.

    Args:
        model: torch module to evaluate.
        X_val: feature array exposing ``.shape`` and row slicing.
        y_val: target array aligned row-for-row with ``X_val``.
        batch_size: maximum rows per batch; the final batch may be smaller so that
            every row contributes to the result.
        loss_fn: callable ``loss_fn(output, target)``. Defaults to the notebook-level
            ``criterion``.

    Returns:
        Sample-weighted mean of the per-batch losses (0 for an empty set).
    """
    # Fall back to the notebook global only when the caller did not pass an override
    loss_fn = criterion if loss_fn is None else loss_fn

    losses = AverageMeter()
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            # Step by batch_size so the trailing partial batch is scored as well
            for batch_start in range(0, X_val.shape[0], batch_size):
                batch_end = batch_start + batch_size

                x = torch.tensor(X_val[batch_start: batch_end], dtype=torch.float)
                # Cast the targets too, so float64 label arrays match the float32 logits
                y = torch.tensor(y_val[batch_start: batch_end], dtype=torch.float)

                output = model(x)
                losses.update(loss_fn(output, y).item(), output.shape[0])
    finally:
        # Hand the model back in the mode the caller had it in
        model.train(was_training)

    print('val Loss: {:.3f}'.format(losses.avg))
    return losses.avg

In [None]:
# Seed torch's global RNG so weight initialisation and Dropout masks are reproducible on
# Restart & Run All (the train/val/test splits already pin random_state=42).
torch.manual_seed(42)

# Two-hidden-layer MLP: multi-hot CUI vector (n_cuis) -> 32 -> 128 -> one logit per HCC (n_hccs).
# Each hidden layer is Linear -> ReLU -> BatchNorm -> Dropout (default drop probability).
mlp_model = nn.Sequential(
#     nn.BatchNorm1d(n_cuis),
    nn.Linear(n_cuis, 32),
    nn.ReLU(),
    nn.BatchNorm1d(32),
    nn.Dropout(),
    nn.Linear(32, 128),
    nn.ReLU(),
    nn.BatchNorm1d(128),
    nn.Dropout(),
    nn.Linear(128, n_hccs),
    # No final Sigmoid: the network emits raw logits and BCEWithLogitsLoss applies the sigmoid itself.
#     nn.Sigmoid()
)

# Multi-label objective: an independent binary cross-entropy per HCC output, computed on logits
criterion = nn.BCEWithLogitsLoss()
# Adam with its default hyperparameters over all model parameters
optimizer = optim.Adam(mlp_model.parameters())
# if torch.cuda.is_available():
# if torch.cuda.is_available():

In [None]:
# Training schedule
N_EPOCHS = 10
BATCH_SIZE = 256

# One entry per epoch, in order
train_losses, val_losses = [], []

for epoch in range(N_EPOCHS):
    print('EPOCH: ', epoch + 1)
    train_loss = train_loop(mlp_model, X_train, y_train, BATCH_SIZE)
    val_loss = validation_loop(mlp_model, X_val, y_val, BATCH_SIZE)
    train_losses += [train_loss]
    val_losses += [val_loss]

In [None]:
# Learning curves: one point per epoch for the training and validation loss
fig, ax = plt.subplots()
ax.plot(train_losses, label='Train')
ax.plot(val_losses, label='Val')
# Title and axis labels so the figure can be read on its own
ax.set(title='Loss per epoch', xlabel='Epoch (0-based)', ylabel='Average loss')
ax.legend();  # trailing ';' keeps the Legend repr out of the cell output


In [None]:
# Probability above which an HCC is predicted as present
PRED_THRESHOLD = 0.1

# Score the validation set in eval mode (Dropout off, BatchNorm using running statistics)
# and without building an autograd graph; then restore whatever mode the model was in.
was_training = mlp_model.training
mlp_model.eval()
with torch.no_grad():
    preds = torch.sigmoid(mlp_model(torch.tensor(X_val, dtype=torch.float)))
mlp_model.train(was_training)

labels = torch.tensor(y_val, dtype=torch.float)
pred_labels = (preds > PRED_THRESHOLD).float()

# Per-sample confusion counts over the HCC outputs (sum along axis 1):
#   pred + label == 2 -> both 1 (TP);  == 0 -> both 0 (TN)
#   pred - label == 1 -> predicted only (FP);  == -1 -> labelled only (FN)
tp = torch.sum(pred_labels + labels == 2, axis=1, dtype=torch.float)
tn = torch.sum(pred_labels + labels == 0, axis=1, dtype=torch.float)
fp = torch.sum(pred_labels - labels == 1, axis=1, dtype=torch.float)
fn = torch.sum(pred_labels - labels == -1, axis=1, dtype=torch.float)

acc = (tp + tn) / (tp + tn + fp + fn) 

# Recall is NaN for samples with no positive label (0 / 0); those are excluded from the mean below
recall = tp / (tp + fn)
# recall[recall.isnan()] = 1
'recall: ', recall[~recall.isnan()].mean(), 'acc: ', acc.mean(), fn.mean(), fn.max(), fp.mean(), fp.max(), fp.median(), fp.std(), tp.mean()