<div style="font-variant: small-caps; 
      font-weight: normal; 
      font-size: 35px; 
      text-align: center; 
      padding: 15px; 
      margin: 10px;">
      Token Embedding
  </div> 
  
<div style="
      font-weight: normal; 
      font-size: 25px; 
      text-align: center; 
      padding: 15px; 
      margin: 10px;">
      Clinical Trials CTTI + ICTRP
  </div> 



  <div style="
      font-size: 15px; 
      line-height: 12px; 
      text-align: center; 
      padding: 15px; 
      margin: 10px;">
  Jean-baptiste AUJOGUE
  </div> 


  <div style=" float:right; 
      font-size: 12px; 
      line-height: 12px; 
  padding: 10px 15px 8px;">
  December 2022
  </div>

<a id="TOC"></a>

#### Table Of Content

1. [Corpus](#data) <br>
2. [Word2Vec](#w2v) <br>

# Packages

[Back to top](#plan)

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
import os
import time
import math
import re
import random
import pickle
import copy
from unidecode import unidecode
from itertools import chain
import multiprocessing

# data 
import numpy as np
import pandas as pd
from torch.utils.data import Dataset

# models
from transformers import AutoTokenizer
from gensim.models import Word2Vec, FastText, Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from mittens import GloVe

# viz
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


#### Custom paths & imports

In [3]:
path_to_repo = os.path.dirname(os.getcwd())
path_to_data = os.path.join(path_to_repo, 'datasets')
path_to_save = os.path.join(path_to_repo, 'saves', 'MLM')
path_to_src  = os.path.join(path_to_repo, 'src')

#### Constants

In [4]:
dataset_name_ctti = 'clinical trials CTTI/clinical-trials-ctti'
dataset_name_ictrp = 'clinical trials ICTRP/clinical-trials-ictrp'
final_dataset_name = os.path.join('albert-small-ctti+ictrp', 'corpus-tokenized')
base_model_name = os.path.join('albert-small-ctti+ictrp', 'tokenizer')
final_model_name = os.path.join('albert-small-ctti+ictrp', 'w2v')

<a id="data"></a>

# 1. Corpus

[Table of content](#TOC)

## 1.1 Create Clinical Trials corpus with on-the-fly tokenization

[Table of content](#TOC)

In [5]:
# load texts on RAM, and subsequently perform tokenization on-the-fly when data is streamed
class CustomTextDataset(Dataset):
    def __init__(self, text_files, tokenizer):
        self.texts = self._load_texts(text_files)
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.texts)
    
    def _load_texts(self, text_files):
        texts = []
        for text_file in text_files:
            with open(os.path.join(path_to_data, '{}.txt'.format(text_file)), 'r', encoding = 'utf-8') as f:
                texts += [t.strip() for i, t in enumerate(f.readlines()) if i < 500]
                # texts += [t.strip() for t in f.readlines()]
        return texts

    def __getitem__(self, idx):
        return self.tokenizer.tokenize(self.texts[idx])

    
    
# stream data and run tokenization
# works so-so
# potentially cut sentences at the middle, forming undesired tokens
def load_and_tokenize_texts(text_files, tokenizer):
    def readInChunks(fileObj, chunkSize=2048):
        while True:
            data = fileObj.read(chunkSize)
            if not data:
                break
            yield data

    texts = []
    for text_file in text_files:
        with open(os.path.join(path_to_data, '{}.txt'.format(text_file)), 'r', encoding = 'utf-8') as f:
            for chunk in tqdm(readInChunks(f, chunkSize = 10000000)):
                texts += [tokenizer.tokenize(t.strip()) for t in chunk.split('\n')]
    return texts

In [6]:
tokenizer = AutoTokenizer.from_pretrained(os.path.join(path_to_save, base_model_name))

In [7]:
# dataset = load_and_tokenize_texts([dataset_name_ctti, dataset_name_ictrp], tokenizer)

In [8]:
dataset = CustomTextDataset([dataset_name_ctti, dataset_name_ictrp], tokenizer)

In [9]:
dataset[0]

['▁this',
 '▁study',
 '▁will',
 '▁test',
 '▁the',
 '▁',
 'ability',
 '▁of',
 '▁extended',
 '▁release',
 '▁',
 'n',
 'i',
 'f',
 'e',
 'd',
 'i',
 'pine',
 '▁',
 '(',
 'pro',
 'cardia',
 '▁',
 'x',
 'l',
 ')',
 ',',
 '▁',
 'a',
 '▁blood',
 '▁pressure',
 '▁medication',
 ',',
 '▁to',
 '▁permit',
 '▁',
 'a',
 '▁decrease',
 '▁in',
 '▁the',
 '▁dose',
 '▁of',
 '▁glucocorticoid',
 '▁medication',
 '▁children',
 '▁take',
 '▁to',
 '▁treat',
 '▁congenital',
 '▁adrenal',
 '▁hyperplasia',
 '▁',
 '(',
 'c',
 'a',
 'h',
 ')',
 '.',
 '▁this',
 '▁protocol',
 '▁is',
 '▁',
 'designed',
 '▁to',
 '▁assess',
 '▁both',
 '▁acute',
 '▁and',
 '▁chronic',
 '▁effects',
 '▁of',
 '▁the',
 '▁calcium',
 '▁channel',
 '▁antagonist',
 ',',
 '▁',
 'n',
 'i',
 'f',
 'e',
 'd',
 'i',
 'pine',
 ',',
 '▁',
 'o',
 'n',
 '▁the',
 '▁hypo',
 't',
 'hal',
 'a',
 'mic',
 '-',
 'pituitary',
 '-',
 'a',
 'd',
 'renal',
 '▁axis',
 '▁in',
 '▁patients',
 '▁with',
 '▁congenital',
 '▁adrenal',
 '▁hyperplasia',
 '.',
 '▁the',
 '▁multicente

In [21]:
# lol = iter(dataset)

for i, a in enumerate(lol):
    print(a)
    if i == 1:
        break

['▁adults', '▁with', '▁', 'c', 'y', 'a', 'n', 'otic', '▁congenital', '▁heart', '▁disease', '▁have', '▁elevated', '▁levels', '▁of', '▁plasma', '▁pro', 'a', 'trial', '▁', 'n', 'a', 't', 'r', 'ure', 'tic', '▁peptide', '▁', '(', 'pro', 'a', 'n', 'p', ')', '▁which', '▁most', '▁likely', '▁results', '▁in', '▁chronic', '▁dehydration', ',', '▁leading', '▁to', '▁reduced', '▁oxygen', '▁transport', '▁to', '▁tissues', '▁and', '▁short', 'ness', '▁of', '▁breath', '▁with', '▁activity', '.', '▁the', '▁purpose', '▁of', '▁this', '▁study', '▁is', '▁to', '▁characterize', '▁adults', '▁with', '▁', 'c', 'y', 'a', 'n', 'otic', '▁congenital', '▁heart', '▁defects', '▁with', '▁respect', '▁to', '▁their', '▁body', '▁composition', '▁', '(', 'w', 'ate', 'r', '▁and', '▁fat', '-', 'free', '▁mass', ')', '▁and', '▁resting', '▁metabolic', '▁rates', '.', '▁the', '▁study', '▁consists', '▁of', '▁several', '▁measures', '▁of', '▁how', '▁much', '▁body', '▁water', ',', '▁fat', '▁and', '▁', 'l', 'e', 'a', 'n', '▁tissue', '▁', 'a'

<a id="w2v"></a>


# 2. Word2Vec with Skip-Gram training objective

[Table of content](#TOC)

In [12]:
sgram = Word2Vec(
    vector_size = 128, 
    window = 7, 
    min_count = 0, 
    negative = 15, 
    sg = 1,
    workers = multiprocessing.cpu_count(),
    seed = 42,
)

In [13]:
sgram.build_vocab(chain([list(tokenizer.get_vocab())], dataset))

In [14]:
base_vocab  = tokenizer.get_vocab()
sgram_vocab = sgram.wv.key_to_index

len(base_vocab), len(sgram_vocab), (set(sgram_vocab.keys()) == set(base_vocab.keys()))

(10000, 10000, True)

In [None]:
sgram.train(
    corpus_iterable = chain(iter(dataset), iter(dataset), iter(dataset)), 
    epochs = 1, 
    total_examples = len(dataset),
    start_alpha = 2.5e-2,
    end_alpha = 1e-5,
)

Exception in thread Thread-140:
Traceback (most recent call last):
  File "C:\Users\jb\miniconda3\envs\transformers_nlp\lib\threading.py", line 932, in _bootstrap_inner
    self.run()
  File "C:\Users\jb\miniconda3\envs\transformers_nlp\lib\threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "C:\Users\jb\miniconda3\envs\transformers_nlp\lib\site-packages\gensim\models\word2vec.py", line 1200, in _job_producer
    data_length = self._raw_word_count([data])
  File "C:\Users\jb\miniconda3\envs\transformers_nlp\lib\site-packages\gensim\models\word2vec.py", line 1489, in _raw_word_count
    return sum(len(sentence) for sentence in job)
  File "C:\Users\jb\miniconda3\envs\transformers_nlp\lib\site-packages\gensim\models\word2vec.py", line 1489, in <genexpr>
    return sum(len(sentence) for sentence in job)
TypeError: object of type 'iterator' has no len()


In [16]:
sgram.save(os.path.join(path_to_save, final_model_name, 'sgram'))

Evaluation

In [16]:
sgram = Word2Vec.load(os.path.join(path_to_save, final_model_name, 'sgram'))

In [17]:
np.linalg.norm(sgram.wv.vectors, axis = -1).tolist()

[2.3100714683532715,
 2.4473042488098145,
 2.5080461502075195,
 2.58563232421875,
 2.4587035179138184,
 2.328742742538452,
 2.6131718158721924,
 2.1582517623901367,
 2.2624194622039795,
 2.465782880783081,
 2.624119997024536,
 2.4085259437561035,
 2.490738868713379,
 2.342733860015869,
 2.480290651321411,
 2.6105635166168213,
 2.535289764404297,
 2.4986307621002197,
 2.3607184886932373,
 2.427175998687744,
 2.5645058155059814,
 2.610975742340088,
 2.43310284614563,
 2.4615046977996826,
 2.2460947036743164,
 2.314279079437256,
 2.9211251735687256,
 2.891680955886841,
 2.6301400661468506,
 2.714359998703003,
 2.744889736175537,
 2.707303285598755,
 2.590636968612671,
 2.6204240322113037,
 2.924564838409424,
 2.691087245941162,
 2.5775065422058105,
 2.553255319595337,
 2.844831705093384,
 2.426478147506714,
 2.633009195327759,
 2.8782989978790283,
 2.649277448654175,
 2.9100775718688965,
 2.473799467086792,
 2.8158118724823,
 2.2401747703552246,
 2.854246139526367,
 2.7214550971984863,
 2

In [18]:
sgram.wv.most_similar('ability')

[('▁impair', 0.624174952507019),
 ('▁willingness', 0.5964481234550476),
 ('▁impact', 0.5906426310539246),
 ('▁unwillingness', 0.5900910496711731),
 ('▁cooperate', 0.5784739255905151),
 ('▁comprehend', 0.5775901675224304),
 ('▁hinder', 0.5726792812347412),
 ('▁impede', 0.5725922584533691),
 ('▁comply', 0.5644869804382324),
 ('▁swallow', 0.5605961084365845)]

In [19]:
sgram.wv.most_similar('▁glucocorticoid')

[('▁steroid', 0.738760769367218),
 ('▁corticosteroid', 0.7251085042953491),
 ('▁corticosteroids', 0.7001180648803711),
 ('▁glucocorticoids', 0.6617134213447571),
 ('▁steroids', 0.639555037021637),
 ('corticosteroid', 0.6349804997444153),
 ('▁inhaled', 0.6256146430969238),
 ('▁bisphosphonate', 0.6126328110694885),
 ('steroid', 0.6027469038963318),
 ('immunosuppressiv', 0.5958155989646912)]

In [20]:
sgram.wv.most_similar('▁ibuprofen')

[('▁paracetamol', 0.8182331323623657),
 ('▁acetaminophen', 0.81548011302948),
 ('▁naproxen', 0.7549471855163574),
 ('▁indomethacin', 0.7525111436843872),
 ('paracetamol', 0.7484970688819885),
 ('acetaminophen', 0.7359393835067749),
 ('▁tramadol', 0.7208881974220276),
 ('▁ketorolac', 0.7119197845458984),
 ('▁celecoxib', 0.6817026138305664),
 ('▁diclofenac', 0.6804940700531006)]