In [1]:
import torch
import torch.nn as nn
import torchvision.transforms as T
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torchvision.models import resnet50
import numpy as np
import pandas as pd
import cv2
import os
import pickle
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter
from PIL import Image
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer
from nltk.translate.bleu_score import sentence_bleu
from transformers import XLNetConfig, AutoConfig
from tokenizers import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing
from torchsummary import summary
from tensorboard.plugins import projector
from scipy.optimize import linear_sum_assignment
from utils.utils import *
from utils.assignment import *
from utils.latent_loss import *
from model.model import *

# Set Cuda

In [2]:
def format_pytorch_version(version):
    return version.split('+')[0]

TORCH_version = torch.__version__
TORCH = format_pytorch_version(TORCH_version)

def format_cuda_version(version):
    return 'cu' + version.replace('.', '')

CUDA_version = torch.version.cuda
CUDA = format_cuda_version(CUDA_version)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

# DataSet & DataLoader

In [3]:
tokenizer = ByteLevelBPETokenizer(
    os.path.abspath(os.path.join('tokenizer','vocab.json')),
    os.path.abspath(os.path.join('tokenizer','merges.txt'))
)

tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)
print(tokenizer.encode('the cardiac silhouette and mediastinum size are within normal limits').ids)
print(tokenizer.decode([1, 267, 650, 465, 317, 695, 424, 299, 426, 360, 476, 2]))

[1, 267, 650, 465, 317, 695, 424, 299, 426, 360, 476, 2]
<s>the cardiac silhouette and mediastinum size are within normal limits</s>


In [4]:
class XRayDataset(Dataset):
    def __init__(self, data, img_dir, transform, max_sentence, tokenizer, max_t): ## add max_sentence
        self.data = data
        self.img_dir = img_dir
        self.transform = transform
        self.max_sentence = max_sentence
        self.max_t = max_t
        
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        img_path = os.path.join(self.img_dir, self.data.iloc[idx, 0])
        image = Image.open(img_path).resize((256, 256))
        label = self.data.iloc[idx, 2]
        
        image = transform_img(image, self.transform)
        _, label = tokenize_report(label)
        if '' in label:
            label.remove('')
        
        id_ = torch.zeros((self.max_sentence, self.max_t)).long()
        for i, sent in enumerate(label):
            input_ids = self.tokenizer.encode(sent).ids
            for j, word in enumerate(input_ids):
                id_[i, j] = word
        
        len_ = len(label)
        len_ = torch.tensor(len_, dtype=torch.int32)
        
        return image[0], id_, len_

In [5]:
df_train = pd.read_csv('data/train_clean.csv')
df_test = pd.read_csv('data/testing_set.csv')

df_train = df_train[np.logical_not(df_train.Findings == 'startseq  endseq')]
df_test = df_test[np.logical_not(df_test.Findings == 'startseq  endseq')]

df_train = df_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

img_path = 'data/images'
max_sentence = 18
max_t = 60

transform = T.Compose([
    T.Resize(256),
    T.ToTensor(),
    T.Normalize(mean=[0.485, 0.456, 0.406],
                std=[0.229, 0.224, 0.225])
])

tokenizer = ByteLevelBPETokenizer(
    os.path.abspath(os.path.join('tokenizer','vocab.json')),
    os.path.abspath(os.path.join('tokenizer','merges.txt'))
)
tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)

vocab_size = len(tokenizer.get_vocab())

In [6]:
train_data = XRayDataset(df_train, img_path, transform, max_sentence, tokenizer, max_t)
test_data = XRayDataset(df_test, img_path, transform, max_sentence, tokenizer, max_t)
train_loader = DataLoader(train_data, batch_size=8, shuffle=False)
test_loader = DataLoader(test_data, batch_size=8, shuffle=False)

# Model

In [14]:
class SetPredictLSP(nn.Module):
    def __init__(self, device, hidden_dim=384, nhead=4, nlayers=3, max_sentence=18, vocab_size=3160, pad_token_id=0, max_t=60, dropout=0.4):
        super().__init__()
        
        self.hidden_dim = hidden_dim
        self.cnn_text = CNN_Text_SinCos(hidden_dim=hidden_dim, device=device)
        self.hungarian = MSEGCRLatentLoss()
        
        self.encoder = SentenceEncoder(pad_token_id=pad_token_id, n_hid=hidden_dim, max_t=max_t, vocab_size=vocab_size, dropout=dropout)
        self.decoder = LSPDecoder(hidden_dim=hidden_dim, vocab_size = vocab_size, pad_token_id=pad_token_id, max_t=max_t, dropout=dropout)
        
        self.mlp = MLP().to(device)
        self.mlp.load_state_dict(torch.load('model/weights/length_model_best.pt'))
        
        self.max_t = max_sentence
        
    def forward(self, X, input_id, len_, labels):
        '''
        X is image (bs, 3, H, W)
        R's shape is (bs, length, hidden_dim)
        feat will be use for the transformer decoder
        '''
        bs = X.shape[0]
        R, feat = self.cnn_text(X) 
        context_series = expand_by_lengths(feat, len_)
        
        B = self.encoder(input_id, context_series)
        R = flat_by_lengths(R, (torch.ones((bs))*20).to(device))
        
        R_pi, R_i, hung_loss = self.hungarian.forward(B, len_, R, (torch.ones(bs, dtype=int)*20).to(device))
        
        
        ce_loss, logit = self.decoder(input_id, R_pi, context_series, labels)
        
        return logit, ce_loss, hung_loss
    
    def predictLSP(self, X, input_id, len_, labels):
        bs = X.shape[0]
        R, feat = self.cnn_text(X)
        context_series = expand_by_lengths(feat, len_)
        
        B = self.encoder(input_id, context_series)
        R = flat_by_lengths(R, (torch.ones((bs))*20).to(device))
        
        R_pi, R_i, hung_loss = self.hungarian.forward(B, len_, R, (torch.ones(bs, dtype=int)*20).to(device))
        
        return R_pi
    
    def text_vec(self, X, input_id, len_, labels):
        bs = X.shape[0]
        R, feat = self.cnn_text.evaluate(X, len_[0]) 
        
        context_series = expand_by_lengths(feat, len_)
        
        B = self.encoder(input_id, context_series)
        R = flat_by_lengths(R, len_)
        
        R_pi, R_i, hung_loss = self.hungarian.forward(B, len_, R, len_)
        
        return R_pi, context_series, hung_loss
    
    def decode_sentence(self, input_ids, text_vec, context_series):
        pred = self.decoder.eval_forward(input_ids, text_vec, context_series)
        return pred

# Inference with Teacher Forcing

In [110]:
model = SetPredictLSP(device, hidden_dim=32).to(device)

model.eval()
model.load_state_dict(torch.load('model/weights/SetPredict_ver10.pt'))

<All keys matched successfully>

In [111]:
actual_test = []
predicted_test = []
LSP = []

for X, id_, len_ in test_loader:  # Iterate in batches over the training/test dataset.
    label = flat_by_lengths_max_t(id_.to(device), len_.to(device), 18)
    input_id = flat_by_lengths_max_t(id_.to(device), len_.to(device), 18)
    X = X.to(device)
    len_ = len_.to(device)

    lsp = model.predictLSP(X, input_id, len_, label)
    out, _, _ = model(X, input_id, len_, label)
    
    actual_test.append(label.cpu().detach().numpy())
    predicted_test.append(out.cpu().detach().numpy())
    LSP.append(lsp.cpu().detach().numpy())

In [112]:
predicted_test = np.concatenate(predicted_test)
actual_test = np.concatenate(actual_test)
LSP = np.concatenate(LSP)

## LSP

In [113]:
print(predicted_test.shape, actual_test.shape, LSP.shape)

(1937, 60, 3160) (1937, 60) (1937, 32)


In [12]:
predicted_sentences = []
actual_sentences = []
for id_ in range(predicted_test.shape[0]):
    ref = actual_test[id_]
    cad = predicted_test.argmax(axis=-1)[id_]

    ref_eos = np.argwhere(ref==2).min()
    cad_eos = np.argwhere(cad==2).min()

    ref_sent = tokenizer.decode(ref[1: ref_eos])
    cad_sent = tokenizer.decode(cad[0: cad_eos])
    
    predicted_sentences.append(ref_sent)
    actual_sentences.append(cad_sent)

ValueError: zero-size array to reduction operation minimum which has no identity

In [None]:
pickle.dump(predicted_sentences, open('predicted_sentences.pkl', 'wb'))
pickle.dump(actual_sentences, open('actual_sentences.pkl', 'wb'))
pickle.dump(LSP, open('lsp.pkl', 'wb'))

## Sentence

In [40]:
id_ = 1700

ref = actual_test[id_]
cad = predicted_test.argmax(axis=-1)[id_]

ref_eos = np.argwhere(ref==2).min()
cad_eos = np.argwhere(cad==2).min()

ref_sent = tokenizer.decode(ref[1: ref_eos])
cad_sent = tokenizer.decode(cad[0: cad_eos])

print("Actual Sentence: {}".format(ref_sent))
print("Predicted Sentence: {}".format(cad_sent))

score = sentence_bleu([ref_sent.split()], cad_sent.split())
print(score)

Actual Sentence: normal heart size and mediastinum
Predicted Sentence: the heart size
5.819186114595022e-155


In [114]:
scores1 = []
scores2 = []
for id_ in range(predicted_test.shape[0]):
    ref = actual_test[id_]
    cad = predicted_test.argmax(axis=-1)[id_]
    if 2 not in cad:
        cad[-1] = 2

    ref_eos = np.argwhere(ref==2).min()
    cad_eos = np.argwhere(cad==2).min()

    ref_sent = tokenizer.decode(ref[1: ref_eos])
    cad_sent = tokenizer.decode(cad[0: cad_eos])

    score1 = sentence_bleu([ref_sent.split()], cad_sent.split())
    score2 = sentence_bleu([cad_sent.split()], ref_sent.split())
    scores1.append(score1)
    scores2.append(score2)

In [115]:
print(np.array(scores1).mean())
print(np.array(scores2).mean())

0.20303155490506922
0.2028069926429635


# Inference

In [116]:
train_data = XRayDataset(df_train, img_path, transform, max_sentence, tokenizer, max_t)
test_data = XRayDataset(df_test, img_path, transform, max_sentence, tokenizer, max_t)
train_loader = DataLoader(train_data, batch_size=1, shuffle=False)
test_loader = DataLoader(test_data, batch_size=1, shuffle=False)

In [16]:
model = SetPredictLSP(device, hidden_dim=32).to(device) ## Ver10: hidden dims' 32

model.eval()
model.load_state_dict(torch.load('model/weights/SetPredict_ver10.pt')) 

<All keys matched successfully>

In [117]:
actual = []
predicted = []
len_sent = []
hung = 0


for X, id_, len_ in test_loader:  # Iterate in batches over the training/test dataset.
    label = flat_by_lengths_max_t(id_.to(device), len_.to(device), 18)
    input_id = flat_by_lengths_max_t(id_.to(device), len_.to(device), 18)
    X = X.to(device)
    len_ = len_.to(device)

    text_vec, context_series, loss_hung = model.text_vec(X, input_id, len_, label)
    bs, _ = text_vec.shape
    
    ys = torch.ones(bs, 1).type(torch.long).to(device)
    for i in range(max_t):
        out = model.decode_sentence(ys, text_vec, context_series)
        next_word = torch.unsqueeze(out[:, -1], 1).argmax(axis=-1)

        ys = torch.cat([ys, next_word], dim=1)
     
    actual.append(input_id.cpu().detach().numpy())
    predicted.append(ys.cpu().detach().numpy())
    len_sent.append(len_.cpu().detach().numpy())
    hung += loss_hung.cpu().detach().numpy()

In [118]:
predicted = np.concatenate(predicted)
actual = np.concatenate(actual)
len_sent = np.concatenate(len_sent)

In [119]:
id_ = 16

ref = actual[id_]
cad = predicted[id_][1:]

ref_eos = np.argwhere(ref==2).min()
cad_eos = np.argwhere(cad==2).min()

ref_sent = tokenizer.decode(ref[1: ref_eos])
cad_sent = tokenizer.decode(cad[0: cad_eos])

print("Actual Sentence: {}".format(ref_sent))
print("Predicted Sentence: {}".format(cad_sent))

score = sentence_bleu([ref_sent.split()], cad_sent.split())
print(score)

Actual Sentence: there is no pneumothorax or pleural effusion
Predicted Sentence: there is no pneumothorax or pleural effusion
1.0


In [120]:
actual_sent = []
predicted_sent = []
for id_ in range(predicted.shape[0]):
    ref = actual[id_]
    cad = predicted[id_][1:]

    ref_eos = np.argwhere(ref==2).min()
    cad_eos = np.argwhere(cad==2).min()

    ref_sent = tokenizer.decode(ref[1: ref_eos])
    cad_sent = tokenizer.decode(cad[0: cad_eos])

    actual_sent.append(ref_sent)
    predicted_sent.append(cad_sent)

cdf = [0]

for i in len_sent:
    cdf.append(cdf[-1]+i)

In [121]:
actual_split_sent = []
predicted_split_sent = []
for i, l in enumerate(cdf[:-1]):
    start, stop = cdf[i], cdf[i+1]
    
    predicted_buff = []
    actual_buff = []
    for j in range(start, stop):
        actual_buff.append(actual_sent[j].split())
        predicted_buff.append(predicted_sent[j].split())
    
    actual_split_sent.append(actual_buff)
    predicted_split_sent.append(predicted_buff)

In [125]:
scores1 = []
scores2 = []
for actual_report, predicted_report in list(zip(actual_split_sent, predicted_split_sent)):
    
    for sentences in actual_report:
        score = sentence_bleu(predicted_report, sentences)
        scores1.append(score)
        
    for sentences in predicted_report:
        score = sentence_bleu(actual_report, sentences)
        scores2.append(score)
    

In [126]:
scores1 = np.array(scores1)
scores2 = np.array(scores2)
print(scores1.mean())
print(scores2.mean())

0.07204745476515174
0.07517182946437352
