<div style="font-variant: small-caps; 
      font-weight: normal; 
      font-size: 35px; 
      text-align: center; 
      padding: 15px; 
      margin: 10px;">
      Token Embedding
  </div> 
  
<div style="
      font-weight: normal; 
      font-size: 25px; 
      text-align: center; 
      padding: 15px; 
      margin: 10px;">
      Clinical Trials CTTI
  </div> 



  <div style="
      font-size: 15px; 
      line-height: 12px; 
      text-align: center; 
      padding: 15px; 
      margin: 10px;">
  Jean-baptiste AUJOGUE
  </div> 


  <div style=" float:right; 
      font-size: 12px; 
      line-height: 12px; 
  padding: 10px 15px 8px;">
  December 2022
  </div>

<a id="TOC"></a>

#### Table of Contents

1. [Corpus](#data) <br>
2. [Word2Vec](#w2v) <br>

# Packages

[Back to top](#TOC)

In [1]:
# Enable IPython's autoreload extension in mode 2, so that edits made to
# imported modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import sys
import os
import time
import math
import re
import random
import pickle
import copy
from unidecode import unidecode
from itertools import chain
import multiprocessing

# data 
import numpy as np
import pandas as pd
from torch.utils.data import Dataset

# models
from transformers import AutoTokenizer
from gensim.models import Word2Vec, FastText, Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from mittens import GloVe

# viz
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


#### Custom paths & imports

In [3]:
# Repository root = parent directory of the current working directory.
path_to_repo = os.path.dirname(os.getcwd())

# Sub-folders of the repository: raw datasets, saved models, source code.
path_to_data, path_to_save, path_to_src = (
    os.path.join(path_to_repo, *parts)
    for parts in (('datasets',), ('saves', 'MLM'), ('src',))
)

#### Constants

In [None]:
# Corpus file, relative to the data folder and without its '.txt' extension.
dataset_name = 'clinical trials CTTI/clinical-trials-ctti'

# The tokenizer (input) and the Word2Vec model (output) share one model
# folder, relative to the save folder.
_model_dir = 'albert-small-ctti'
base_model_name = os.path.join(_model_dir, 'tokenizer')
final_model_name = os.path.join(_model_dir, 'w2v')

<a id="data"></a>

# 1. Corpus

[Table of content](#TOC)

## 1.1 Load Clinical Trials corpus

[Table of content](#TOC)

In [5]:
# Read the corpus file line by line; each line becomes one text, with
# leading/trailing whitespace removed.
corpus_file = os.path.join(path_to_data, dataset_name + '.txt')
with open(corpus_file, 'r', encoding='utf-8') as f:
    texts = [line.strip() for line in f]

In [6]:
# Number of texts (one per line of the corpus file) that were loaded.
len(texts)

430108

## 1.2 Tokenize corpus

[Table of content](#TOC)

In [7]:
# Load the tokenizer stored under <path_to_save>/<base_model_name>.
tokenizer_dir = os.path.join(path_to_save, base_model_name)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir)

In [8]:
# Split every text into its list of tokens; tqdm displays progress over the corpus.
dataset = list(map(tokenizer.tokenize, tqdm(texts)))

100%|█████████████████████████████████████████████████████████████████████████| 430108/430108 [14:10<00:00, 505.84it/s]


In [9]:
# Inspect the token sequence produced for the first text of the corpus.
dataset[0]

['▁this',
 '▁study',
 '▁will',
 '▁test',
 '▁the',
 '▁',
 'ability',
 '▁of',
 '▁extended',
 '▁release',
 '▁nifedip',
 'in',
 'e',
 '▁',
 '(',
 'pro',
 'cardia',
 '▁',
 'x',
 'l',
 ')',
 ',',
 '▁',
 'a',
 '▁blood',
 '▁pressure',
 '▁medication',
 ',',
 '▁to',
 '▁permit',
 '▁',
 'a',
 '▁decrease',
 '▁in',
 '▁the',
 '▁dose',
 '▁of',
 '▁glucocorticoid',
 '▁medication',
 '▁children',
 '▁take',
 '▁to',
 '▁treat',
 '▁congenital',
 '▁adrenal',
 '▁hyperplasia',
 '▁',
 '(',
 'c',
 'a',
 'h',
 ')',
 '.',
 '▁this',
 '▁protocol',
 '▁is',
 '▁',
 'designed',
 '▁to',
 '▁assess',
 '▁both',
 '▁acute',
 '▁and',
 '▁chronic',
 '▁effects',
 '▁of',
 '▁the',
 '▁calcium',
 '▁channel',
 '▁antagonist',
 ',',
 '▁nifedip',
 'in',
 'e',
 ',',
 '▁on',
 '▁the',
 '▁hypothalamic',
 '-',
 'pituitary',
 '-',
 'a',
 'd',
 'renal',
 '▁axis',
 '▁in',
 '▁patients',
 '▁with',
 '▁congenital',
 '▁adrenal',
 '▁hyperplasia',
 '.',
 '▁the',
 '▁multicenter',
 '▁trial',
 '▁is',
 '▁compose',
 'd',
 '▁of',
 '▁two',
 '▁phases',
 '▁and',


<a id="w2v"></a>


# 2. Word2Vec with Skip-Gram training objective

[Table of content](#TOC)

In [22]:
# Untrained Word2Vec model; the vocabulary is built and the weights are
# trained in the following cells.
# NOTE(review): with several worker threads, the seed alone may not make
# training fully reproducible — confirm against the gensim documentation.
sgram = Word2Vec(
    sg=1,             # skip-gram training objective
    vector_size=128,  # size of each embedding vector
    window=7,         # context window size
    negative=15,      # number of negative samples
    min_count=0,      # no frequency cut-off: keep every token
    seed=42,
    workers=multiprocessing.cpu_count(),
)

In [23]:
# Prepend the whole tokenizer vocabulary as one extra pseudo-sentence, so that
# every tokenizer token is seen at least once when the vocabulary is built
# (the model uses min_count = 0), even if it never occurs in the corpus.
vocab_sentence = list(tokenizer.get_vocab())
sgram.build_vocab([vocab_sentence] + dataset)

In [24]:
# Sanity check: compare the tokenizer vocabulary with the Word2Vec vocabulary.
base_vocab = tokenizer.get_vocab()
sgram_vocab = sgram.wv.key_to_index

# Displayed as (tokenizer vocab size, Word2Vec vocab size, same set of tokens?).
len(base_vocab), len(sgram_vocab), set(sgram_vocab) == set(base_vocab)

(10000, 10000, True)

In [25]:
# Train on the tokenized corpus only (the vocabulary pseudo-sentence used when
# building the vocabulary is not part of `dataset`), with the learning rate
# going from start_alpha to end_alpha.
sgram.train(
    corpus_iterable=dataset,
    total_examples=len(dataset),
    epochs=10,
    start_alpha=2.5e-2,
    end_alpha=1e-5,
)

(1457092676, 2196382400)

In [26]:
# The model is written to <path_to_save>/<final_model_name>/sgram.
# Create the target folder first: nothing earlier in the notebook creates it,
# and saving into a missing folder would fail.
sgram_dir = os.path.join(path_to_save, final_model_name)
os.makedirs(sgram_dir, exist_ok=True)
sgram.save(os.path.join(sgram_dir, 'sgram'))

#### Evaluation

In [27]:
# Reload the skip-gram model from the file it was saved to.
sgram_file = os.path.join(path_to_save, final_model_name, 'sgram')
sgram = Word2Vec.load(sgram_file)

In [28]:
# Norm of the embedding matrix taken along its last axis, shown as a plain
# Python list (presumably one value per vocabulary token — TODO confirm the
# layout of `wv.vectors`).
np.linalg.norm(sgram.wv.vectors, axis = -1).tolist()

[1.4907070398330688,
 1.6635342836380005,
 1.7196722030639648,
 1.48988938331604,
 1.8163033723831177,
 1.79470956325531,
 1.6517850160598755,
 1.6456834077835083,
 2.0286712646484375,
 2.1574385166168213,
 1.897613525390625,
 2.019554615020752,
 1.7978315353393555,
 1.7077692747116089,
 2.109459638595581,
 1.933674693107605,
 1.8845109939575195,
 2.3361778259277344,
 1.9730669260025024,
 2.210378408432007,
 2.3938069343566895,
 1.9208117723464966,
 2.128443717956543,
 1.9382851123809814,
 2.221428155899048,
 2.3255093097686768,
 1.949000597000122,
 2.009927749633789,
 2.269688367843628,
 2.2436838150024414,
 2.074592351913452,
 1.938167691230774,
 1.9612109661102295,
 1.9347811937332153,
 2.2629809379577637,
 2.143207550048828,
 2.1711978912353516,
 2.2300586700439453,
 2.0631351470947266,
 2.0664076805114746,
 2.1501216888427734,
 2.4426355361938477,
 2.367422103881836,
 2.2790002822875977,
 2.2468159198760986,
 2.074310779571533,
 2.3314919471740723,
 2.101431369781494,
 2.074890375

In [29]:
# Tokens most similar to 'ability'. The query has no '▁' prefix because the
# tokenizer emits it as the separate tokens '▁' and 'ability' (see dataset[0]).
sgram.wv.most_similar('ability')

[('able', 0.6986899375915527),
 ('▁unable', 0.629711925983429),
 ('▁comprehend', 0.5709015130996704),
 ('▁capabilit', 0.5701678991317749),
 ('▁willing', 0.5700328946113586),
 ('▁unwilling', 0.5630849599838257),
 ('▁impossibilit', 0.5611531734466553),
 ('▁understand', 0.5533663034439087),
 ('ness', 0.536178469657898),
 ('▁capacit', 0.5293818712234497)]

In [30]:
# Tokens most similar to the drug-class token '▁glucocorticoid'.
sgram.wv.most_similar('▁glucocorticoid')

[('▁corticosteroids', 0.8446471095085144),
 ('steroids', 0.8386294841766357),
 ('▁corticosteroid', 0.8272319436073303),
 ('▁steroid', 0.8144977688789368),
 ('▁bisphosphonate', 0.7192035913467407),
 ('▁prednisolone', 0.6764853000640869),
 ('▁nsaids', 0.6602618098258972),
 ('▁statin', 0.6520041823387146),
 ('mmunosuppressant', 0.6507323384284973),
 ('▁retinoid', 0.6436989903450012)]

In [31]:
# Tokens most similar to the drug-name token '▁ibuprofen'.
sgram.wv.most_similar('▁ibuprofen')

[('▁acetaminophen', 0.9028555154800415),
 ('▁tramadol', 0.787520706653595),
 ('▁gabapentin', 0.771825909614563),
 ('▁pregabalin', 0.7492136359214783),
 ('▁diclofenac', 0.739313542842865),
 ('acetaminophen', 0.7373422980308533),
 ('▁indomethacin', 0.7343679666519165),
 ('▁oxycodon', 0.7338364720344543),
 ('▁ketorol', 0.7164152264595032),
 ('coxib', 0.6944490075111389)]

[Table of content](#TOC)