<div style="font-variant: small-caps; 
      font-weight: normal; 
      font-size: 35px; 
      text-align: center; 
      padding: 15px; 
      margin: 10px;">
      Token Embedding
  </div> 
  
<div style="
      font-weight: normal; 
      font-size: 25px; 
      text-align: center; 
      padding: 15px; 
      margin: 10px;">
      Clinical Trials CTTI
  </div> 



  <div style="
      font-size: 15px; 
      line-height: 12px; 
      text-align: center; 
      padding: 15px; 
      margin: 10px;">
  Jean-baptiste AUJOGUE
  </div> 


  <div style=" float:right; 
      font-size: 12px; 
      line-height: 12px; 
  padding: 10px 15px 8px;">
  December 2022
  </div>

<a id="TOC"></a>

#### Table of Contents

1. [Corpus](#data) <br>
2. [Word2Vec](#w2v) <br>

# Packages

[Back to top](#TOC)

In [1]:
# Enable IPython's autoreload extension for this kernel.
%load_ext autoreload
# Mode 2: re-import all modules (except those excluded with %aimport) before
# each cell execution, so edits to local source files are picked up without
# restarting the kernel.
%autoreload 2

In [2]:
import sys
import os
import time
import math
import re
import random
import pickle
import copy
from unidecode import unidecode
from itertools import chain
import multiprocessing

# data 
import numpy as np
import pandas as pd
from torch.utils.data import Dataset

# models
from transformers import AutoTokenizer
from gensim.models import Word2Vec, FastText, Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from mittens import GloVe

# viz
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


#### Custom paths & imports

In [3]:
# The repository root is taken to be the parent of the current working directory.
path_to_repo = os.path.dirname(os.getcwd())

# Top-level folders of the repository: raw/processed corpora and source code.
path_to_data, path_to_src = (
    os.path.join(path_to_repo, folder) for folder in ('datasets', 'src')
)

# Tokenizer and embedding artefacts are read from / written to saves/MLM.
path_to_save = os.path.join(path_to_repo, 'saves', 'MLM')

#### Constants

In [4]:
# Corpus files, relative to `path_to_data` and without the '.txt' extension
# (the extension is appended where the files are opened).
dataset_name_ctti = 'clinical trials CTTI/clinical-trials-ctti'
dataset_name_ictrp = 'clinical trials ICTRP/clinical-trials-ictrp'
final_dataset_name = 'clinical-trials-ctti+ictrp-tokenized'

# Model artefacts, relative to `path_to_save`: both live under the same
# model folder, one sub-folder for the tokenizer and one for the embeddings.
_model_folder = 'albert-small-ctti+ictrp'
base_model_name, final_model_name = (
    os.path.join(_model_folder, leaf) for leaf in ('tokenizer', 'w2v')
)

<a id="data"></a>

# 1. Corpus

[Table of content](#TOC)

## 1.1 Load and tokenize Clinical Trials corpus

[Table of content](#TOC)

In [5]:
# Load the previously saved tokenizer from <path_to_save>/<base_model_name>.
tokenizer_dir = os.path.join(path_to_save, base_model_name)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir)

In [6]:
# Tokenize the CTTI corpus: each line of the text file is one document, which
# is stripped of surrounding whitespace and split into sub-word tokens.
ctti_path = os.path.join(path_to_data, dataset_name_ctti + '.txt')
dataset = []
with open(ctti_path, 'r', encoding = 'utf-8') as f:
    # readlines() materialises the file so that tqdm can display a total.
    for line in tqdm(f.readlines()):
        dataset.append(tokenizer.tokenize(line.strip()))

100%|█████████████████████████████████████████████████████████████████████████| 430108/430108 [12:41<00:00, 564.62it/s]


In [7]:
# Tokenize the ICTRP corpus the same way and add its documents to `dataset`.
# NOTE: this extends `dataset` in place, so running the cell twice adds the
# ICTRP documents twice.
ictrp_path = os.path.join(path_to_data, dataset_name_ictrp + '.txt')
with open(ictrp_path, 'r', encoding = 'utf-8') as f:
    dataset.extend(
        tokenizer.tokenize(line.strip()) for line in tqdm(f.readlines())
    )

100%|█████████████████████████████████████████████████████████████████████████| 774632/774632 [29:41<00:00, 434.93it/s]


In [8]:
# Persist the tokenized corpus: one document per line, tokens separated by a
# single space, so the tokenization step does not have to be repeated.
tokenized_path = os.path.join(path_to_data, final_dataset_name + '.txt')
with open(tokenized_path, 'w', encoding = 'utf-8') as f:
    f.writelines(' '.join(tokens) + '\n' for tokens in tqdm(dataset))

100%|██████████████████████████████████████████████████████████████████████| 1204740/1204740 [04:04<00:00, 4930.51it/s]


In [6]:
# Reload the tokenized corpus from disk: one document per line, tokens
# separated by single spaces (the format written by the cell above).
tokenized_path = os.path.join(path_to_data, '{}.txt'.format(final_dataset_name))
with open(tokenized_path, 'r', encoding = 'utf-8') as f:
    # Remove the line terminator before splitting: otherwise the last token of
    # every document keeps a trailing '\n' and no longer matches any entry of
    # the tokenizer vocabulary. Empty pieces are dropped so that a blank line
    # (an empty document) is restored as [] rather than [''].
    dataset = [
        [token for token in line.rstrip('\n').split(' ') if token]
        for line in f
    ]

In [9]:
# Number of tokenized documents currently held in `dataset`.
len(dataset)

1204740

In [10]:
# Shuffle the corpus in place with a dedicated generator seeded at 42, which
# leaves the global `random` state untouched.
# NOTE: the shuffle is applied to the current order of `dataset`, so running
# this cell a second time produces a different order than running it once.
shuffler = random.Random(42)
shuffler.shuffle(dataset)

In [11]:
# Display the first document of the corpus as its list of sub-word tokens.
dataset[0]

['▁autologous',
 '▁platelet',
 '-',
 'secret',
 'e',
 'd',
 '▁growth',
 '▁factors',
 '▁',
 '(',
 'g',
 'f',
 's',
 ')',
 '▁may',
 '▁have',
 '▁therapeutic',
 '▁effects',
 '▁in',
 '▁osteoarthritis',
 '▁',
 '(',
 'o',
 'a',
 ')',
 '▁cap',
 's',
 'ular',
 '▁joints',
 '▁via',
 '▁multiple',
 '▁mechanisms',
 '.',
 '▁the',
 '▁aim',
 '▁is',
 '▁to',
 '▁examine',
 '▁the',
 '▁effect',
 '▁of',
 '▁',
 'a',
 '▁platelet',
 '-',
 'derived',
 '▁preparation',
 '▁',
 'rich',
 '▁in',
 '▁growth',
 '▁factors',
 '▁',
 '(',
 'p',
 'r',
 'g',
 'f',
 's',
 ')',
 '▁in',
 '▁',
 'o',
 'a',
 '▁of',
 '▁the',
 '▁knee',
 '.',
 '▁autologous',
 '▁platelet',
 '-',
 'secret',
 'e',
 'd',
 '▁growth',
 '▁factors',
 '▁',
 '(',
 'g',
 'f',
 's',
 ')',
 '▁may',
 '▁have',
 '▁therapeutic',
 '▁effects',
 '▁in',
 '▁osteoarthritis',
 '▁',
 '(',
 'o',
 'a',
 ')',
 '▁cap',
 's',
 'ular',
 '▁joints',
 '▁via',
 '▁multiple',
 '▁mechanisms',
 '.',
 '▁the',
 '▁investigators',
 '▁aim',
 '▁is',
 '▁to',
 '▁examine',
 '▁the',
 '▁effect',
 '▁of

<a id="w2v"></a>


# 2. Word2Vec with Skip-Gram training objective

[Table of content](#TOC)

In [12]:
# Hyper-parameters of the embedding model, gathered in one place.
sgram_params = dict(
    vector_size = 128,                      # dimension of each token vector
    window = 7,                             # context window size
    min_count = 0,                          # no frequency cut-off on tokens
    negative = 15,                          # number of negative samples
    sg = 1,                                 # 1 selects the skip-gram objective
    workers = multiprocessing.cpu_count(),  # one worker thread per CPU core
    # NOTE(review): per the gensim documentation, a seed only gives a fully
    # reproducible run when combined with workers = 1 - confirm whether exact
    # reproducibility is required here.
    seed = 42,
)

# Untrained model: vocabulary and weights are set up in the following cells.
sgram = Word2Vec(**sgram_params)

In [13]:
# Build the vocabulary from the corpus, preceded by one artificial sentence
# listing every token of the tokenizer vocabulary, so that tokens never seen
# in the corpus are still registered in the embedding model.
vocab_sentence = list(tokenizer.get_vocab())
sgram.build_vocab([vocab_sentence] + dataset)

In [14]:
# Sanity check: the embedding vocabulary should contain exactly the tokens of
# the tokenizer vocabulary.
base_vocab  = tokenizer.get_vocab()
sgram_vocab = sgram.wv.key_to_index

# Displayed as (tokenizer vocab size, embedding vocab size, same token set?).
same_tokens = set(sgram_vocab) == set(base_vocab)
(len(base_vocab), len(sgram_vocab), same_tokens)

(10000, 10000, True)

In [15]:
# Training schedule: 3 passes over the corpus, with the learning rate going
# from `start_alpha` down to `end_alpha`.
train_config = dict(
    epochs = 3,
    total_examples = len(dataset),
    start_alpha = 2.5e-2,
    end_alpha = 1e-5,
)

# The value returned by train() is displayed as the cell output.
sgram.train(corpus_iterable = dataset, **train_config)

(1064327670, 1604217048)

In [16]:
# Save the trained model under <path_to_save>/<final_model_name>/sgram.
sgram_dir = os.path.join(path_to_save, final_model_name)
# Make sure the target folder exists first, so that the save cannot fail on a
# missing directory after the long training run.
os.makedirs(sgram_dir, exist_ok = True)
sgram.save(os.path.join(sgram_dir, 'sgram'))

#### Evaluation

In [17]:
# Reload the saved model from disk, so that the evaluation cells below can be
# run without repeating the training.
sgram_path = os.path.join(path_to_save, final_model_name, 'sgram')
sgram = Word2Vec.load(sgram_path)

In [18]:
# Euclidean norm of every embedding vector (one value per vocabulary entry),
# displayed as a plain Python list.
vector_norms = np.linalg.norm(sgram.wv.vectors, axis = -1)
vector_norms.tolist()

[1.661415934562683,
 1.8044352531433105,
 1.7737877368927002,
 1.731158971786499,
 1.9444323778152466,
 1.747165560722351,
 2.1701714992523193,
 2.172295093536377,
 1.756604790687561,
 1.8055170774459839,
 1.8654581308364868,
 2.2003121376037598,
 2.1785576343536377,
 1.9131866693496704,
 1.9558976888656616,
 1.9573966264724731,
 1.889706015586853,
 2.005681276321411,
 1.9547982215881348,
 2.0792462825775146,
 2.199103832244873,
 1.9541656970977783,
 2.628870725631714,
 2.1095569133758545,
 2.3959546089172363,
 2.6430001258850098,
 2.001065731048584,
 2.2410402297973633,
 2.296562671661377,
 2.267765998840332,
 2.3426425457000732,
 2.320300817489624,
 2.2782065868377686,
 2.3724582195281982,
 2.424999952316284,
 2.2701351642608643,
 2.555596351623535,
 2.0512855052948,
 2.2812366485595703,
 2.2759218215942383,
 2.2804629802703857,
 2.042482614517212,
 2.0761919021606445,
 2.4718127250671387,
 2.382612943649292,
 2.8160367012023926,
 3.156343460083008,
 2.8419461250305176,
 2.4858782291

In [19]:
# Most similar tokens to 'ability'.
# NOTE(review): this query has no leading '▁', unlike the two queries that
# follow it - presumably it targets the word-internal piece rather than the
# word-initial token; confirm this is intended.
sgram.wv.most_similar('ability')

[('▁willingness', 0.6743863224983215),
 ('able', 0.6733582615852356),
 ('▁unable', 0.6733198165893555),
 ('▁unwillingness', 0.6716748476028442),
 ('capacity', 0.6055564284324646),
 ('▁understand', 0.5387355089187622),
 ('ibility', 0.5354493856430054),
 ('▁proficiency', 0.534579873085022),
 ('▁comprehend', 0.5335730910301208),
 ('▁give', 0.5327818989753723)]

In [20]:
# Most similar tokens to the word-initial token '▁glucocorticoid'.
sgram.wv.most_similar('▁glucocorticoid')

[('▁steroid', 0.8789478540420532),
 ('▁corticosteroid', 0.8772727251052856),
 ('▁glucocorticoids', 0.832581639289856),
 ('▁corticosteroids', 0.8273091912269592),
 ('▁steroids', 0.8083666563034058),
 ('▁cortisone', 0.7489993572235107),
 ('corticosteroid', 0.7333958148956299),
 ('corticoid', 0.7296726703643799),
 ('▁prednisolone', 0.7218935489654541),
 ('prednisone', 0.7129298448562622)]

In [21]:
# Most similar tokens to the word-initial token '▁ibuprofen'.
sgram.wv.most_similar('▁ibuprofen')

[('▁acetaminophen', 0.8838450312614441),
 ('▁naproxen', 0.8788578510284424),
 ('▁paracetamol', 0.8755892515182495),
 ('▁tramadol', 0.7887517809867859),
 ('▁ketorolac', 0.7791204452514648),
 ('paracetamol', 0.7788904309272766),
 ('▁aspirin', 0.767105758190155),
 ('acetaminophen', 0.7649925947189331),
 ('▁diclofenac', 0.7593932151794434),
 ('▁indomethacin', 0.7480955719947815)]

[Table of content](#TOC)