In [3]:
import io
import json
import os
import pickle
from collections import Counter, OrderedDict
from collections import defaultdict

import numpy as np
import pandas as pd
import torch
from nltk.tokenize import sent_tokenize, word_tokenize
from torch.utils.data import Dataset
from torchtext.data.utils import get_tokenizer

In [3]:
dir_dataset = os.path.expanduser('~/Documents/master3/leomed_scratch/files_small')
fn_findings = os.path.join(dir_dataset, 'train_findings.csv')
print(os.path.exists(fn_findings))
report_findings = pd.read_csv(fn_findings)['findings']

True


## Implementing Word2Vec

In [15]:
data = ''
for sentence in report_findings:
    data+= sentence

In [17]:
print(len(data))

17950822
{'M', '0', 'f', 'G', "'", 's', 'N', '5', ',', 'i', 'Z', 'u', 'W', 'q', '&', 'P', 'O', 'V', '~', 'k', ')', 'v', 'e', '9', 'z', 'y', 'n', '4', 'g', 'U', '3', 'F', 'J', '?', 'D', '%', 'h', '"', 'r', 'd', '>', 'R', 'x', 'w', 'A', 'Y', '1', 't', 'b', '\\', 'm', 'I', '8', 'B', '(', 'E', 'S', ';', '7', '=', ':', 'T', 'Q', '/', 'p', 'a', '2', '.', ' ', 'K', 'o', 'H', 'C', 'L', 'X', 'l', '6', '-', '_', 'j', 'c'}


In [22]:
tokenizer = get_tokenizer("basic_english")
tokens = tokenizer(data)
print(len(tokens))
tokens = list(set(tokens))
print(len(tokens))

2994232
5942


In [24]:
word2idx = {w: idx for (idx, w) in enumerate(tokens)}
idx2word = {idx: w for (idx, w) in enumerate(tokens)}
vocab_size = len(tokens)

In [25]:
print(idx2word)


{0: 'replete', 1: 'do', 2: '8-cm', 3: 'deposition', 4: 'that', 5: 'straightened', 6: 'ascending', 7: 'hemidiaphraph', 8: "'", 9: '10', 10: 'congenital', 11: 'transient', 12: 'mri', 13: 'subsequent', 14: 'widest', 15: '___frontal', 16: 'subcm', 17: 'pleural-parenchymal', 18: 'inlet', 19: 'steroid', 20: 'changes/postoperative', 21: 'senile', 22: 'aneurysm', 23: 'addendum', 24: 'hernias', 25: 'artifact/hair', 26: 'adjacent/overlying', 27: 'atrioventricular', 28: 'stent', 29: 'pulse', 30: 't6', 31: 'evaluating', 32: 'at/just', 33: 'accumulated', 34: 'dots', 35: 'congestion/interstitial', 36: 'ctthere', 37: 'sitting', 38: 'reticulonudar', 39: 'wires/icd', 40: 'fissure/accessory', 41: 'impart', 42: 'locule', 43: 'explains', 44: 'perpendicular', 45: 'wall', 46: 'interposition', 47: 'indiscrete', 48: 'anterobasal', 49: 'entrance', 50: 'rhe', 51: 'obstruction', 52: 'engorgement', 53: 'intralobular', 54: 'spares', 55: 'plana', 56: 'infltrate', 57: 'anatomic', 58: 'ra/svcjunction', 59: 'crescent-

In [None]:
# todo: this: https://github.com/iffsid/mmvae/blob/public/src/datasets.py

In [4]:
class OrderedCounter(Counter, OrderedDict):
    """Counter that remembers the order elements are first encountered."""

    def __repr__(self):
        return '%s(%r)' % (self.__class__.__name__, OrderedDict(self))

    def __reduce__(self):
        return self.__class__, (OrderedDict(self),)

In [5]:
class CUBSentences(Dataset):

    def __init__(self, root_data_dir: str, split: str, transform=None, **kwargs):
        """split: 'trainval' or 'test' """

        super().__init__()
        self.data_dir = os.path.join(root_data_dir, 'cub')
        self.split = split
        self.max_sequence_length = kwargs.get('max_sequence_length', 32)
        self.min_occ = kwargs.get('min_occ', 3)
        self.transform = transform
        os.makedirs(os.path.join(root_data_dir, "lang_emb"), exist_ok=True)

        self.gen_dir = os.path.join(self.data_dir, "oc:{}_msl:{}".
                                    format(self.min_occ, self.max_sequence_length))

        if split == 'train':
            self.raw_data_path = os.path.join(self.data_dir, 'text_trainvalclasses.txt')
        elif split == 'test':
            self.raw_data_path = os.path.join(self.data_dir, 'text_testclasses.txt')
        else:
            raise Exception("Only train or test split is available")

        os.makedirs(self.gen_dir, exist_ok=True)
        self.data_file = 'cub.{}.s{}'.format(split, self.max_sequence_length)
        self.vocab_file = 'cub.vocab'

        if not os.path.exists(os.path.join(self.gen_dir, self.data_file)):
            print("Data file not found for {} split at {}. Creating new... (this may take a while)".
                  format(split.upper(), os.path.join(self.gen_dir, self.data_file)))
            self._create_data()

        else:
            self._load_data()

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sent = self.data[str(idx)]['idx']
        if self.transform is not None:
            sent = self.transform(sent)
        return sent, self.data[str(idx)]['length']

    @property
    def vocab_size(self):
        return len(self.w2i)

    @property
    def pad_idx(self):
        return self.w2i['<pad>']

    @property
    def eos_idx(self):
        return self.w2i['<eos>']

    @property
    def unk_idx(self):
        return self.w2i['<unk>']

    def get_w2i(self):
        return self.w2i

    def get_i2w(self):
        return self.i2w

    def _load_data(self, vocab=True):
        with open(os.path.join(self.gen_dir, self.data_file), 'rb') as file:
            self.data = json.load(file)

        if vocab:
            self._load_vocab()

    def _load_vocab(self):
        if not os.path.exists(os.path.join(self.gen_dir, self.vocab_file)):
            self._create_vocab()
        with open(os.path.join(self.gen_dir, self.vocab_file), 'r') as vocab_file:
            vocab = json.load(vocab_file)
        self.w2i, self.i2w = vocab['w2i'], vocab['i2w']

    def _create_data(self):
        if self.split == 'train' and not os.path.exists(os.path.join(self.gen_dir, self.vocab_file)):
            self._create_vocab()
        else:
            self._load_vocab()

        with open(self.raw_data_path, 'r') as file:
            text = file.read()
            sentences = sent_tokenize(text)

        data = defaultdict(dict)
        pad_count = 0

        for i, line in enumerate(sentences):
            words = word_tokenize(line)

            tok = words[:self.max_sequence_length - 1]
            tok = tok + ['<eos>']
            length = len(tok)
            if self.max_sequence_length > length:
                tok.extend(['<pad>'] * (self.max_sequence_length - length))
                pad_count += 1
            idx = [self.w2i.get(w, self.w2i['<exc>']) for w in tok]

            id = len(data)
            data[id]['tok'] = tok
            data[id]['idx'] = idx
            data[id]['length'] = length

        print("{} out of {} sentences are truncated with max sentence length {}.".
              format(len(sentences) - pad_count, len(sentences), self.max_sequence_length))
        with io.open(os.path.join(self.gen_dir, self.data_file), 'wb') as data_file:
            data = json.dumps(data, ensure_ascii=False)
            data_file.write(data.encode('utf8', 'replace'))

        self._load_data(vocab=False)

    def _create_vocab(self):

        assert self.split == 'train', "Vocablurary can only be created for training file."

        with open(self.raw_data_path, 'r') as file:
            text = file.read()
            sentences = sent_tokenize(text)

        occ_register = OrderedCounter()
        w2i = dict()
        i2w = dict()

        special_tokens = ['<exc>', '<pad>', '<eos>']
        for st in special_tokens:
            i2w[len(w2i)] = st
            w2i[st] = len(w2i)

        texts = []
        unq_words = []

        for i, line in enumerate(sentences):
            words = word_tokenize(line)
            occ_register.update(words)
            texts.append(words)

        for w, occ in occ_register.items():
            if occ > self.min_occ and w not in special_tokens:
                i2w[len(w2i)] = w
                w2i[w] = len(w2i)
            else:
                unq_words.append(w)

        assert len(w2i) == len(i2w)

        print("Vocablurary of {} keys created, {} words are excluded (occurrence <= {})."
              .format(len(w2i), len(unq_words), self.min_occ))

        vocab = dict(w2i=w2i, i2w=i2w)
        with io.open(os.path.join(self.gen_dir, self.vocab_file), 'wb') as vocab_file:
            data = json.dumps(vocab, ensure_ascii=False)
            vocab_file.write(data.encode('utf8', 'replace'))

        with open(os.path.join(self.gen_dir, 'cub.unique'), 'wb') as unq_file:
            pickle.dump(np.array(unq_words), unq_file)

        with open(os.path.join(self.gen_dir, 'cub.all'), 'wb') as a_file:
            pickle.dump(occ_register, a_file)

        self._load_vocab()

In [6]:
tx = lambda data: torch.Tensor(data)
maxSentLen = 32
t_data = CUBSentences('', split='train', transform=tx, max_sequence_length=maxSentLen)

Data file not found for TRAIN split at cub/oc:3_msl:32/cub.train.s32. Creating new... (this may take a while)


LookupError: 
**********************************************************************
  Resource [93mpunkt[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt/PY3/english.pickle[0m

  Searched in:
    - '/Users/Hendrik/nltk_data'
    - '/Users/Hendrik/opt/anaconda3/envs/mimic/nltk_data'
    - '/Users/Hendrik/opt/anaconda3/envs/mimic/share/nltk_data'
    - '/Users/Hendrik/opt/anaconda3/envs/mimic/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
    - ''
**********************************************************************
