In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import os
from pathlib import Path

In [2]:
# File Structure
path = Path(os.getcwd())
root = Path(path.parent.absolute()) 

train_caption_path = root / 'ROCO' / 'Data' / 'Train' / 'radiology' / 'captions.txt'
train_image_path = root / 'ROCO' / 'Data' / 'Train' / 'radiology' / 'images'

test_caption_path = root / 'ROCO' / 'Data' / 'Test' / 'radiology' / 'captions.txt'
test_image_path = root / 'ROCO' / 'Data' / 'Test' / 'radiology' / 'images'

target_path = root / 'Shared Preprocessed Objects'

In [3]:
# Parameters that must be defined by the user!!!
FREQUENCY_CUTOFF = 20
EMBEDDING_DIM = 300 # either 200 or 300 for the current path, can be more values otherwise.
BATCH_SIZE = 32
variable_params = {}

In [4]:
# Get captions from .txt files and parse properly
def get_captions(caption_path):
    ## Reading each image file name and its associated caption
    doc = open(caption_path, 'r', encoding = 'utf-8').read()
    
    splitDoc = doc.split('\n')
    while('' in splitDoc):
        splitDoc.remove('')
    
    descriptions = {}
    for line in splitDoc:
        splitLine = line.split('\t')
        descriptions[splitLine[0]] = splitLine[1]
    
    return descriptions    

In [5]:
train_captions = get_captions(train_caption_path)
test_captions = get_captions(test_caption_path)

print(len(train_captions), len(test_captions))

65450 8179


In [6]:
import re
# Treat caption to remove symbols that will not be considered by the model
def treat_captions(data):
    for imageID, caption in data.items():
        caption = caption.lower()
        caption = re.sub(r'[^\w\s]', '', caption)
        data[imageID] = caption
    return data

In [7]:
train_captions = treat_captions(train_captions)
test_captions = treat_captions(test_captions)

In [8]:
# add start and end tokens to the captions
def add_start_end_tokens(data):
    final_data = {}
    
    for imageId, caption in data.items():
        final_data[imageId] = 'startseq ' + caption + ' endseq'
    
    return final_data

In [9]:
train_captions = add_start_end_tokens(train_captions)
test_captions = add_start_end_tokens(test_captions)

In [10]:
train_captions

{'ROCO_00002': 'startseq  computed tomography scan in axial view showing obliteration of the left maxillary sinus endseq',
 'ROCO_00003': 'startseq  bacterial contamination occurred after completion of root canal treatment in the tooth which remained with a temporary filling for 15 month endseq',
 'ROCO_00004': 'startseq  the patient had residual paralysis of the hand after poliomyelitis it was necessary to stabilize the thumb with reference to the index finger this was accomplished by placing a graft from the bone bank between the first and second metacarpals the roentgenogram shows the complete healing of the graft one year later endseq',
 'ROCO_00005': 'startseq  panoramic radiograph after immediate loading endseq',
 'ROCO_00007': 'startseq  plain abdomen xray multiple air levels at the midabdomen arrows no radiopaque shadow and no air under the diaphragm endseq',
 'ROCO_00008': 'startseq  a 3yearold child with visual difficulties axial flair image show a suprasellar lesion extendin

In [11]:
# Save preprocessed captions
np.save(target_path / 'train_captions', train_captions)
np.save(target_path / 'test_captions', test_captions)

In [12]:
# Get all words used at least once in the captions
def get_full_vocab(data):
    vocab = []
    for caption in data.values():
        captionSplit = caption.split(' ')
        for word in captionSplit:
            vocab.append(word)

    vocab = set(vocab)
    return vocab

In [13]:
full_vocab = get_full_vocab(train_captions)

len(full_vocab)

42925

In [14]:
# Get reduced vocab with only words that have a frequency larger than frequency_limit
def reduce_vocab(data, frequency_limit):
    word_counts = {}
    
    for imageId, caption in data.items():
        for w in caption.split(' '):
            word_counts[w] = word_counts.get(w, 0) + 1

    vocab_reduced = [w for w in word_counts if word_counts[w]>=frequency_limit]

    return vocab_reduced

In [15]:
reduced_vocab = reduce_vocab(train_captions, FREQUENCY_CUTOFF)
vocab_size = len(reduced_vocab)
variable_params['vocab_size'] = vocab_size + 1
vocab_size + 1

4471

In [16]:
# Create word to int correspondence
def get_word_token_correspondence(vocab_reduced):
    
    index2Word = {}
    word2Index = {}

    index = 1
    for w in vocab_reduced:
        word2Index[w] = index
        index2Word[index] = w
        index += 1
        
    return index2Word, word2Index

In [17]:
index2Word, word2Index = get_word_token_correspondence(reduced_vocab)
print(len(index2Word), len(word2Index))

4470 4470


In [18]:
np.save(target_path / 'index2Word', index2Word)
np.save(target_path / 'word2Index', word2Index)

In [19]:
# Find the maximum caption lenght with the reduced vocabulary, omiting words that do not belong to it 
def get_max_caption_len(data, word2Index):
    
    max_len = 0
    for imageId, caption in data.items():
        seq = [word2Index[word] for word in caption.split(' ') if word in word2Index]
        if len(seq) > max_len:
            max_len = len(seq)

    return max_len

In [20]:
max_caption_len = get_max_caption_len(train_captions, word2Index)
variable_params['max_caption_len'] = max_caption_len 
max_caption_len

341

In [21]:
np.save(target_path / 'variable_params', variable_params)

In [22]:
import glob
# Get all image paths that will be used for preprocessing later
def get_all_image_paths(img_path):
    images = glob.glob(str(img_path) + '\*.jpg')
    
    images_dict = {}
    for element in images:
        start_index = element.find('ROCO_')
        images_dict[element[start_index:start_index+10]] = element
    
    return images_dict

In [23]:
#This step is required because the captions may refer to images that are not in this image path
#Takes a minute
train_image_paths = get_all_image_paths(train_image_path)
test_image_paths = get_all_image_paths(test_image_path)

print(len(train_image_paths), len(test_image_paths))

100 50


In [24]:
def get_paths_from_set(data, image_paths):

    images = []
    for imageId in data:
        if imageId in image_paths:
            images.append(image_paths[imageId])
            
    return images

In [25]:
train_image_paths = get_paths_from_set(train_captions, train_image_paths)
test_image_paths = get_paths_from_set(test_captions, test_image_paths)

print(len(train_image_paths), len(test_image_paths))

100 50


In [26]:
#This is not required but I was using while writing this to waste less time. Remove after complete
np.save(target_path / 'train_image_paths', train_image_paths)
np.save(target_path / 'test_image_paths', test_image_paths)