<div style="font-variant: small-caps; 
      font-weight: normal; 
      font-size: 35px; 
      text-align: center; 
      padding: 15px; 
      margin: 10px;">
      Token Embedding
  </div> 
  
<div style="
      font-weight: normal; 
      font-size: 25px; 
      text-align: center; 
      padding: 15px; 
      margin: 10px;">
      Clinical Trials ICTRP
  </div> 



  <div style=" float:left; 
      font-size: 12px; 
      line-height: 12px; 
  padding: 10px 15px 8px;">
  Jean-baptiste AUJOGUE
  </div> 
  
  <div style=" float:right; 
      font-size: 12px; 
      line-height: 12px; 
  padding: 10px 15px 8px;">
  December 2022
  </div>

<a id="TOC"></a>

#### Table of Contents

1. [Corpus](#data) <br>
2. [Word2Vec](#w2v) <br>

# Packages

[Back to top](#TOC)

In [1]:
# Enable IPython's autoreload extension; mode 2 re-imports all modules before
# each cell execution, so edits to local source files are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import sys
import os
import time
import math
import re
import random
import pickle
import copy
from unidecode import unidecode
from itertools import chain
import multiprocessing

# data 
import numpy as np
import pandas as pd
from torch.utils.data import Dataset

# models
from transformers import AutoTokenizer
from gensim.models import Word2Vec, FastText, Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from mittens import GloVe

# viz
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


#### Custom paths & imports

In [3]:
# The repository root is taken to be the parent of the current working
# directory, i.e. the notebook is expected to run from a sub-folder of the repo.
path_to_repo = os.path.split(os.getcwd())[0]

# Sub-folders of the repository used by this notebook.
path_to_data = os.path.join(path_to_repo, 'datasets')      # input corpora
path_to_src  = os.path.join(path_to_repo, 'src')           # project sources
path_to_save = os.path.join(path_to_repo, 'saves', 'MLM')  # saved tokenizer / models

#### Constants

In [4]:
# Corpus file stem, relative to the data folder (the '.txt' extension is added at load time).
dataset_name = 'clinical trials ICTRP/clinical-trials-ictrp'

# Both the input tokenizer and the output embeddings live under the same
# 'albert-small-ictrp' folder, relative to the save folder.
base_model_name, final_model_name = (
    os.path.join('albert-small-ictrp', leaf) for leaf in ('tokenizer', 'w2v')
)

<a id="data"></a>

# 1. Corpus

[Table of content](#TOC)

## 1.1 Load Clinical Trials corpus

[Table of content](#TOC)

In [5]:
# Read the corpus: one text per line, surrounding whitespace removed.
corpus_file = os.path.join(path_to_data, '{}.txt'.format(dataset_name))
with open(corpus_file, mode = 'r', encoding = 'utf-8') as f:
    # Iterating over the handle yields the same lines as f.readlines()
    # without first materialising an intermediate list of raw lines.
    texts = [line.strip() for line in f]

In [6]:
# Number of texts loaded from the corpus file (one per line).
len(texts)

1081670

## 1.2 Tokenize corpus

[Table of content](#TOC)

In [7]:
# Load the previously saved tokenizer from <path_to_save>/<base_model_name>.
tokenizer_dir = os.path.join(path_to_save, base_model_name)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir)

In [8]:
# Split every text into its list of sub-word tokens; tqdm reports progress
# (this pass over the full corpus is slow, see the timing in the output below).
dataset = list(map(tokenizer.tokenize, tqdm(texts)))

100%|███████████████████████████████████████████████████████████████████████| 1081670/1081670 [21:07<00:00, 853.37it/s]


In [9]:
# Inspect the token sequence of the first text as a quick check of the tokenizer output.
dataset[0]

['▁capable',
 '▁of',
 '▁giving',
 '▁signed',
 '▁informed',
 '▁consent',
 '.',
 '▁has',
 '▁received',
 ',',
 '▁been',
 '▁intolerant',
 '▁to',
 ',',
 '▁or',
 '▁been',
 '▁ineligible',
 '▁for',
 '▁all',
 '▁treatment',
 '▁options',
 '▁proven',
 ',',
 '▁to',
 '▁confer',
 '▁clinical',
 '▁benefit',
 '.',
 '▁measurable',
 '▁disease',
 '▁per',
 '▁response',
 '▁evaluation',
 '▁criteria',
 '▁in',
 '▁solid',
 '▁tumors',
 '▁',
 '(',
 'recist',
 ')',
 '▁',
 '1',
 '.',
 '1',
 '.',
 '▁eastern',
 '▁cooperative',
 '▁oncology',
 '▁group',
 '▁',
 '(',
 'ecog',
 ')',
 '▁performance',
 '▁status',
 '▁',
 '(',
 'p',
 's',
 ')',
 '▁of',
 '▁',
 '0',
 '▁or',
 '▁',
 '1',
 '.',
 '▁adequate',
 '▁organ',
 '▁function',
 '.',
 '▁male',
 '▁individuals',
 '▁and',
 '▁female',
 '▁individuals',
 '▁of',
 '▁childbearing',
 '▁potential',
 '▁who',
 '▁engage',
 '▁in',
 ',',
 '▁heterosexual',
 '▁intercourse',
 '▁must',
 '▁agree',
 '▁to',
 '▁use',
 '▁methods',
 '▁of',
 '▁contraception',
 '.',
 '▁female',
 '▁participants',
 '▁are',

<a id="w2v"></a>


# 2. Word2Vec with Skip-Gram training objective

[Table of content](#TOC)

In [10]:
# Untrained Word2Vec model configured for the skip-gram objective.
# NOTE(review): per the gensim documentation, `seed` alone does not make a run
# fully reproducible when several worker threads are used (that needs workers = 1
# and a fixed PYTHONHASHSEED) — confirm whether exact reproducibility matters here.
sgram = Word2Vec(
    sg = 1,                                 # 1 = skip-gram training objective
    vector_size = 128,                      # embedding dimension
    window = 7,                             # context window size
    negative = 15,                          # negative samples per positive example
    min_count = 0,                          # no frequency cut-off: keep every token
    seed = 42,
    workers = multiprocessing.cpu_count(),  # one training thread per CPU core
)

In [11]:
# Prepend the whole tokenizer vocabulary as one pseudo-sentence so that every
# token gets an embedding entry (min_count is 0), even if it never occurs in
# the tokenized corpus.
vocab_sentence = list(tokenizer.get_vocab())
sgram.build_vocab([vocab_sentence] + dataset)

In [12]:
# Compare the tokenizer vocabulary with the Word2Vec vocabulary:
# displays both sizes and whether the two token sets coincide.
base_vocab = tokenizer.get_vocab()
sgram_vocab = sgram.wv.key_to_index

(
    len(base_vocab),
    len(sgram_vocab),
    set(sgram_vocab) == set(base_vocab),
)

(15000, 15000, True)

In [13]:
# Train on the tokenized corpus only (the vocabulary pseudo-sentence used in
# build_vocab is not part of `dataset`), hence total_examples = len(dataset).
# The learning rate goes from start_alpha down to end_alpha over the 3 epochs.
sgram.train(
    corpus_iterable = dataset,
    total_examples = len(dataset),
    epochs = 3,
    start_alpha = 0.025,
    end_alpha = 0.00001,
)

(600782472, 908639268)

In [14]:
# Persist the trained skip-gram model as <path_to_save>/<final_model_name>/sgram.
sgram_dir = os.path.join(path_to_save, final_model_name)
# Create the target folder first: nothing earlier in the notebook creates it,
# and a missing folder would make the save fail after the (long) training run.
os.makedirs(sgram_dir, exist_ok = True)
sgram.save(os.path.join(sgram_dir, 'sgram'))

#### Evaluation

In [15]:
# Reload the saved skip-gram model from disk, so the evaluation cells below
# can be run without repeating the training.
sgram_file = os.path.join(path_to_save, final_model_name, 'sgram')
sgram = Word2Vec.load(sgram_file)

In [16]:
# Euclidean norm of each embedding vector (one value per vocabulary entry),
# displayed as a plain Python list.
embedding_norms = np.linalg.norm(sgram.wv.vectors, axis = -1)
embedding_norms.tolist()

[1.7227306365966797,
 1.8393648862838745,
 1.9135942459106445,
 2.096674919128418,
 1.9929710626602173,
 1.8200165033340454,
 1.8425917625427246,
 2.1484005451202393,
 2.278388261795044,
 1.9871857166290283,
 2.1129376888275146,
 2.4063568115234375,
 1.9516831636428833,
 2.530545473098755,
 2.0679008960723877,
 2.2181642055511475,
 2.5778214931488037,
 2.1268270015716553,
 2.284655809402466,
 2.182380199432373,
 2.623713254928589,
 2.0911061763763428,
 2.6132137775421143,
 2.2124714851379395,
 1.9270639419555664,
 2.1363606452941895,
 2.244401216506958,
 2.4619646072387695,
 2.2495038509368896,
 2.169473886489868,
 2.8109073638916016,
 2.4892654418945312,
 3.243286609649658,
 2.5276334285736084,
 2.607361078262329,
 3.1745095252990723,
 2.2061665058135986,
 2.8597164154052734,
 2.06711483001709,
 2.1849875450134277,
 2.6179184913635254,
 2.3931705951690674,
 2.382831573486328,
 1.8428137302398682,
 3.8257932662963867,
 2.5819013118743896,
 2.481210231781006,
 3.314995765686035,
 2.4382

In [17]:
# Nearest neighbours of the piece 'ability' written without the '▁' prefix
# (presumably the word-internal form of the token — compare with '▁ability').
sgram.wv.most_similar('ability')

[('▁suit', 0.607435941696167),
 ('hipaa', 0.5790230631828308),
 ('▁account', 0.5598636865615845),
 ('▁accept', 0.5549246072769165),
 ('willingness', 0.5481088161468506),
 ('able', 0.546738862991333),
 ('▁ability', 0.5212465524673462),
 ('▁authorization', 0.5116440653800964),
 ('inability', 0.5065059661865234),
 ('▁unsuit', 0.5012784004211426)]

In [18]:
# Nearest neighbours of the '▁'-prefixed piece '▁ability'
# (presumably the word-initial form of the token).
sgram.wv.most_similar('▁ability')

[('▁capabilit', 0.6521344780921936),
 ('▁adherence', 0.6283731460571289),
 ('▁inability', 0.623848021030426),
 ('▁willingness', 0.6222561001777649),
 ('▁abilities', 0.6123645901679993),
 ('▁understand', 0.6104001402854919),
 ('able', 0.6043904423713684),
 ('▁cooperate', 0.5877724885940552),
 ('▁unable', 0.577839732170105),
 ('▁compromise', 0.5760576725006104)]

In [19]:
# Qualitative check of the embeddings on a domain-specific token.
sgram.wv.most_similar('▁glucocorticoid')

[('▁corticosteroid', 0.8717933297157288),
 ('▁steroid', 0.8510860800743103),
 ('▁corticosteroids', 0.8165186643600464),
 ('▁glucocorticoids', 0.8050146698951721),
 ('▁steroids', 0.7718686461448669),
 ('▁corticosteriod', 0.7310996055603027),
 ('corticosteroids', 0.7232717871665955),
 ('prednisone', 0.7184717059135437),
 ('▁cortisone', 0.7105651497840881),
 ('corticosteroid', 0.6903935670852661)]

In [20]:
# Qualitative check of the embeddings on a drug-name token.
sgram.wv.most_similar('▁ibuprofen')

[('▁naproxen', 0.8725085258483887),
 ('▁acetaminophen', 0.8494886159896851),
 ('▁paracetamol', 0.8399044275283813),
 ('▁celecoxib', 0.7945672869682312),
 ('▁aspirin', 0.7849840521812439),
 ('▁nsaids', 0.7812800407409668),
 ('▁indomethacin', 0.7700745463371277),
 ('ibuprofen', 0.7685044407844543),
 ('tylenol', 0.7618494033813477),
 ('paracetamol', 0.753754734992981)]

In [21]:
# Qualitative check of the embeddings on another drug-name token.
sgram.wv.most_similar('▁paracetamol')

[('▁acetaminophen', 0.9163859486579895),
 ('paracetamol', 0.8523569703102112),
 ('acetaminophen', 0.8480719327926636),
 ('▁ibuprofen', 0.8399044275283813),
 ('tylenol', 0.7450748085975647),
 ('▁nsaids', 0.7364432215690613),
 ('▁tramadol', 0.72121661901474),
 ('▁naproxen', 0.716267466545105),
 ('▁acetylsalicylic', 0.7008892297744751),
 ('oxycodone', 0.6847322583198547)]

[Table of content](#TOC)