In [1]:
from jax_mlm_helpers import build_vocabulary
from jax_mlm_helpers import apply_random_masking
from jax_mlm_helpers import pad_and_crop_to_maximum_length
from utils_display import pc

In [2]:
import spacy
from collections import Counter
# Load spaCy's "en_core_web_sm" pipeline once; it is used further down to
# split the article into sentences (via `.sents`) and to iterate over words.
nlp = spacy.load("en_core_web_sm")
from transformers import AutoTokenizer

In [3]:
# Read the raw article into memory.
# The encoding is spelled out so that decoding does not depend on the
# platform's default locale encoding (e.g. cp1252 on Windows).
# NOTE(review): assumes the dataset file is stored as UTF-8 — confirm.
with open("local_datasets/wikipedia_man_o_war.txt", "r", encoding="utf-8") as fid:
    text = fid.read()

In [4]:
# Flatten the article onto a single line, then squeeze every run of spaces
# down to one space by repeating the replacement until nothing changes.
text = text.replace("\n", " ")
while True:
    squeezed = text.replace("  ", " ")
    if squeezed == text:
        break
    text = squeezed

In [5]:
# Let spaCy segment the normalised text into sentences, then list each
# sentence next to its position so examples can be picked by index later.
sentences = list(nlp(text).sents)
for index, sentence in enumerate(sentences):
    pc(index, sentence)

[34m0[0m: Portuguese man o' war
[34m1[0m: The Portuguese man o' war (Physalia physalis), also known as the man-of-war or bluebottle, is a marine hydrozoan found in the Atlantic, Indian, and Pacific oceans.
[34m2[0m: While it is typically considered the only species in its genus, Physalia, and family, Physaliidae, genetic evidence suggests there may be more.
[34m3[0m: Although it superficially resembles a jellyfish, the Portuguese man o' war is in fact a siphonophore.
[34m4[0m: Like all siphonophores, it is a colonial organism, made up of many smaller units called zooids.
[34m5[0m: Although they are morphologically quite different, all of the zooids in a single specimen are genetically identical.
[34m6[0m: These different types of zooids fulfill specialized functions, such as hunting, digestion and reproduction, and together they allow the colony to operate as a single individual.
[34m7[0m: The man o' war is part of the neuston, organisms that live on the surface of the 

In [6]:
# Load the pretrained tokenizer for the chosen checkpoint and turn every
# sentence into its list of (sub-)word tokens; `tokens` is a list of lists,
# one inner list per sentence.
model_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

tokens = [tokenizer.tokenize(str(sentence)) for sentence in sentences]
print(tokens[0])

['portuguese', 'man', 'o', "'", 'war']


In [7]:
# Build the two lookup tables (token -> index and index -> token) from the
# tokenised sentences.
# NOTE(review): build_vocabulary comes from jax_mlm_helpers and is not shown
# here; later cells look up "[MASK]" and "[PAD]" in dico_word2index, so it is
# expected to register those special tokens — confirm against the helper.
dico_word2index, dico_index2word = build_vocabulary(texts=tokens)

In [8]:
# Count how often each spaCy token (compared by its string form) occurs
# across all sentences. Counter replaces the hand-written dict bookkeeping
# and avoids rebinding one name from a dict to a list.
word_counts = Counter(str(word) for sentence in sentences for word in sentence)

# most_common() yields (word, count) pairs sorted by decreasing count; words
# with equal counts keep their first-seen order, exactly as the previous
# stable sort with reverse=True did.
words_frequency = word_counts.most_common()

number_of_examples = 20
pc("{} most frequent words".format(number_of_examples), words_frequency[:number_of_examples])
pc("{} least frequent words".format(number_of_examples), words_frequency[-number_of_examples:])

[34m20 most frequent words[0m: [('the', 209), (',', 177), ('.', 134), ('of', 102), ('and', 85), ('to', 67), ('a', 65), ('in', 57), ('war', 49), ('is', 47), ('man', 46), ('o', 46), ("'", 45), ('The', 43), ('as', 31), ('-', 29), ('by', 29), ('(', 28), (')', 28), ('or', 27)]
[34m20 least frequent words[0m: [('assumptions', 1), ('calculate', 1), ('excluding', 1), ('2021', 1), ('Lee', 1), ('provide', 1), ('parameterisation', 1), ('considering', 1), ('similarities', 1), ('sailboat', 1), ('allowed', 1), ('compute', 1), ('hydrodynamic', 1), ('aerodynamic', 1), ('equilibrium', 1), ('condition', 1), ('create', 1), ('generalised', 1), ('course', 1), ('current', 1)]


In [9]:
# Pick one example sentence and encode it as a list of vocabulary indices.
input_sentence = sentences[14]
word_tokens = tokenizer.tokenize(str(input_sentence))

# Tokens missing from the vocabulary must fall back to the *index* of the
# unknown token. The previous fallback was the string '[UNK]' itself, which
# would put a str into a list of ints and break the array code downstream.
# NOTE(review): assumes build_vocabulary registers "[UNK]" alongside
# "[MASK]" and "[PAD]" — confirm against jax_mlm_helpers.
unknown_index = dico_word2index["[UNK]"]
input_indices = [dico_word2index.get(w, unknown_index) for w in word_tokens]

pc("Input sentence", input_sentence)
pc("Word tokens", word_tokens)
pc("Input indices", input_indices)

[34mInput sentence[0m: The genus name Physalia and species name physalis are both derived from the Greek word physalis, meaning "bubble" or "bladder".
[34mWord tokens[0m: ['the', 'genus', 'name', 'ph', '##ys', '##alia', 'and', 'species', 'name', 'ph', '##ys', '##alis', 'are', 'both', 'derived', 'from', 'the', 'greek', 'word', 'ph', '##ys', '##alis', ',', 'meaning', '"', 'bubble', '"', 'or', '"', 'bladder', '"', '.']
[34mInput indices[0m: [3, 153, 165, 26, 27, 48, 7, 39, 165, 26, 27, 82, 32, 166, 293, 28, 3, 488, 489, 26, 27, 82, 4, 294, 65, 490, 65, 23, 65, 109, 65, 5]


In [10]:
# Settings for the masked-language-modelling example below.
masking_probability = 0.15  # passed to apply_random_masking as masking_probability
label_for_unmasked_values = -100  # label written at positions that were not masked
maximum_sequence_length = 50  # target length used when padding / cropping
mask_index = dico_word2index["[MASK]"]  # vocabulary index of the mask token
pad_index = dico_word2index["[PAD]"]  # vocabulary index of the padding token

In [11]:
# Randomly mask the encoded sentence. The helper returns four values: the
# input indices, a 0/1 mask, the indices with masked positions replaced by
# mask_index, and the labels (original index where masked, otherwise
# label_for_unmasked_values).
# NOTE(review): no seed is passed here, so the masked positions may differ
# between runs — confirm how apply_random_masking draws its randomness.
masking_outputs = apply_random_masking(
    input_indices=input_indices,
    index_for_masked_values=mask_index,
    label_for_unmasked_values=label_for_unmasked_values,
    masking_probability=masking_probability,
)
input_indices, mask, masked_indices, labels = masking_outputs

print("Before padding / cropping")
for title, values in (
    ("Input indices", input_indices),
    ("Mask", mask),
    ("Masked indices", masked_indices),
    ("Labels", labels),
):
    pc(title, values)

An NVIDIA GPU may be present on this machine, but a CUDA-enabled jaxlib is not installed. Falling back to cpu.


Before padding / cropping
[34mInput indices[0m: [  3 153 165  26  27  48   7  39 165  26  27  82  32 166 293  28   3 488
 489  26  27  82   4 294  65 490  65  23  65 109  65   5]
[34mMask[0m: [0 1 0 1 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0]
[34mMasked indices[0m: [  3   0 165   0  27   0   7  39 165  26  27  82  32 166   0  28   3 488
 489  26   0  82   4   0  65 490  65  23  65 109  65   5]
[34mLabels[0m: [-100  153 -100   26 -100   48 -100 -100 -100 -100 -100 -100 -100 -100
  293 -100 -100 -100 -100 -100   27 -100 -100  294 -100 -100 -100 -100
 -100 -100 -100 -100]


In [12]:
# Bring all four sequences to maximum_sequence_length by padding / cropping.
input_indices = pad_and_crop_to_maximum_length(
    input_indices,
    padding_value=pad_index,
    maximum_sequence_length=maximum_sequence_length,
)
# Padded positions receive 1 in the mask.
# NOTE(review): 1 is also the value that flags masked tokens in the mask
# printed before padding — confirm that marking padding this way is intended.
mask = pad_and_crop_to_maximum_length(
    mask,
    padding_value=1,
    maximum_sequence_length=maximum_sequence_length,
)
masked_indices = pad_and_crop_to_maximum_length(
    masked_indices,
    padding_value=pad_index,
    maximum_sequence_length=maximum_sequence_length,
)
# Pad labels with the same "unmasked" label used by apply_random_masking
# instead of a hard-coded -100, so the two cannot drift apart when the
# configuration value changes.
labels = pad_and_crop_to_maximum_length(
    labels,
    padding_value=label_for_unmasked_values,
    maximum_sequence_length=maximum_sequence_length,
)

print("After padding / cropping")
pc("Input indices", input_indices)
pc("Mask", mask)
pc("Masked indices", masked_indices)
pc("Labels", labels)

After padding / cropping
[34mInput indices[0m: [  3 153 165  26  27  48   7  39 165  26  27  82  32 166 293  28   3 488
 489  26  27  82   4 294  65 490  65  23  65 109  65   5   1   1   1   1
   1   1   1   1   1   1   1   1   1   1   1   1   1   1]
[34mMask[0m: [0 1 0 1 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1]
[34mMasked indices[0m: [  3   0 165   0  27   0   7  39 165  26  27  82  32 166   0  28   3 488
 489  26   0  82   4   0  65 490  65  23  65 109  65   5   1   1   1   1
   1   1   1   1   1   1   1   1   1   1   1   1   1   1]
[34mLabels[0m: [-100  153 -100   26 -100   48 -100 -100 -100 -100 -100 -100 -100 -100
  293 -100 -100 -100 -100 -100   27 -100 -100  294 -100 -100 -100 -100
 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100
 -100 -100 -100 -100 -100 -100 -100 -100]
