In [13]:
import numpy as np

import os
from pathlib import Path

In [2]:
path = Path(os.getcwd())
root = Path(path.parent.parent.absolute())

train_cuis_path = root / 'ROCO' / 'Data' / 'Train' / 'radiology' / 'cuis.txt'
test_cuis_path = root / 'ROCO' / 'Data' / 'Test' / 'radiology' / 'cuis.txt'
model_path = root / 'Models' / 'Retrained Inception'
#fetching_path = root / 'Shared Preprocessed Objects'

In [3]:
CUI_FREQUENCY_CUTOFF = 100

In [4]:
import re
def get_cuis(cuis_path):
    doc = open(cuis_path, 'r', encoding = 'utf-8').read()
    
    cuis = {}
    splitDoc = doc.split('ROCO')

    splitDoc = ['ROCO' + x for x in splitDoc][1:]
    splitDoc = [x.split('\t') for x in splitDoc]
    #splitDoc = [x.split('\n') for x in splitDoc]

    for index, line in enumerate(splitDoc):
        splitDoc[index] = [re.sub('\n', '', x) for x in line]#[x.split('\n') for x in line]

    for index, line in enumerate(splitDoc):
        splitDoc[index] = [x for x in line if x != '']

    for index, line in enumerate(splitDoc):
        cuis[line[0]] = [x for x in line[1:]]

    #splitDoc
    return cuis

In [5]:
cuis = get_cuis(train_cuis_path)
#cuis

In [6]:
def get_vocab_cuis(cuis):
    vocab = []
    for img in cuis:
        #captionSplit = caption.split(' ')
        for cui in cuis[img]:
            vocab.append(cui)
    vocab = list(set(vocab))
    return vocab 

In [7]:
cui_vocab = get_vocab_cuis(cuis)
print(len(cui_vocab))

5817


In [8]:
def reduce_cui_vocab(cuis, frequency_limit):
    cui_counts = {}
    
    for img in cuis:
        #num_sents +=1
        for cui in cuis[img]:
            if cui in cui_counts:
                cui_counts[cui] += 1#word_counts.get(w, 0) + 1
            else:
                cui_counts[cui] = 0

    vocab_reduced = [w for w in cui_counts if cui_counts[w]>=frequency_limit]

    #print('Reduced Vocabulary size: ', len(vocab_reduced))
    return vocab_reduced

In [9]:
reduced_cuis =  reduce_cui_vocab(cuis, CUI_FREQUENCY_CUTOFF)
#reduced_cuis

In [10]:
def get_cui_mapping(vocab_reduced):
    index2Cui = {}
    cui2Index = {}

    index = 0
    for w in vocab_reduced:
        cui2Index[w] = index
        index2Cui[index] = w
        index += 1

    return cui2Index, index2Cui, len(cui2Index)

In [11]:
cui2Index, index2Cui, cui_vocab_size = get_cui_mapping(reduced_cuis)
cui_vocab_size

745

In [14]:
np.save(model_path / 'cui2Index', cui2Index)
np.save(model_path / 'index2Cui', index2Cui)

In [15]:
def get_cui_matrix(cuis, vocab_reduced, vocab_size, cui2Index):
    training_pairs = {}
    counter = 0
    for img in cuis:
        training_pairs[img] = [cui2Index[cui] for cui in cuis[img] if cui in vocab_reduced]
        if len(training_pairs[img]) == 0:
            counter += 1
    
    print(str(counter) + ' of the images used have 0 CUIs after the vocabulary reduction. Out of ' 
          + str(len(training_pairs)))
    cuis_matrix = {}
    #np.zeros((len(training_pairs), vocab_size))

    for index, img in enumerate(training_pairs):
        tmp = np.zeros(vocab_size)
        for value in training_pairs[img]:
            tmp[value] = 1
        cuis_matrix[img] = tmp
        #cuis_matrix[index][training_pairs[img]] = 1

    return training_pairs, cuis_matrix

In [16]:
training_pairs, cuis_matrix = get_cui_matrix(cuis, reduced_cuis, cui_vocab_size, cui2Index)
len(cuis_matrix)

2096 of the images used have 0 CUIs after the vocabulary reduction. Out of 65450


65450

In [17]:
np.save(model_path / 'training_pairs', training_pairs)
np.save(model_path / 'cuis_matrix', cuis_matrix)