In [1]:
!pip install -U transformers
!pip install -U emoji
!pip install -U ipywidgets # interactive browser controls for Jupyter notebooks

Collecting transformers
  Downloading transformers-4.49.0-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
Downloading transformers-4.49.0-py3-none-any.whl (10.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m33.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.48.3
    Uninstalling transformers-4.48.3:
      Successfully uninstalled transformers-4.48.3
Successfully installed transformers-4.49.0
Collecting emoji
  Downloading emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Downloading emoji-2.14.1-py3-none-any.whl (590 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m590.6/590.6 kB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji
Successfully installed emoji-2.14.1
Collecti

## Libraries

In [2]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel, RobertaTokenizer, RobertaModel
from transformers import AutoModel, AutoTokenizer

## Which device?

In [3]:
if torch.backends.mps.is_available():  # Mac M? GPU
    device = torch.device("mps")
elif torch.cuda.is_available():  # Nvidia GPU
    device = torch.device("cuda")
else:  # CPU
    device = torch.device("cpu")
print(device)

cuda


###EXAMPLE 1: Tokenization with BERT Tokenizer

In [4]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# The max_length parameter depends on the  texts' length in the dataset

# Examples of the tokenization performed by BERT·
example_text = ["Yo it shirt he gave New York was funny.",
                "my husband is sick, homemade chicken soup loading.",
                "Witch raises wind to break up enemy’s lumber pound.",
                "i got a new shirt at work at it is the WORST material ever",
                "wave is so poor the girls can’t even dress up for Halloween",
                "The theory of paint indicates feelings of isolation in society.",
                "Wish sometimes I had access to the ever elusive cock carousel..",
                "off probation tomorrow ima be a free woman again, FUCK THE SYSTEM"]


tokenized_text = tokenizer.tokenize(example_text[0])
print(tokenized_text)
tokenized_text = tokenizer.tokenize(example_text[1])
print(tokenized_text)
bert_input = tokenizer(example_text, padding="max_length", max_length = 30,truncation=True, return_tensors="pt")
# Transform tokens to vocabulary indexes
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

#BERT WORKS WITH THESE REPRESENTATIONS OBTAINED FOR THE TOKENS
print(bert_input["input_ids"])
print(bert_input["token_type_ids"])
print(bert_input["attention_mask"])

#To transform the sequence of token indices to words in the texts
example_text = tokenizer.decode(bert_input.input_ids[0])
print(example_text)
example_text = tokenizer.decode(bert_input.input_ids[1])
print(example_text)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

['yo', 'it', 'shirt', 'he', 'gave', 'new', 'york', 'was', 'funny', '.']
['my', 'husband', 'is', 'sick', ',', 'homemade', 'chicken', 'soup', 'loading', '.']
tensor([[  101, 10930,  2009,  3797,  2002,  2435,  2047,  2259,  2001,  6057,
          1012,   102,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0],
        [  101,  2026,  3129,  2003,  5305,  1010, 25628,  7975, 11350, 10578,
          1012,   102,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0],
        [  101,  6965, 13275,  3612,  2000,  3338,  2039,  4099,  1521,  1055,
         13891,  9044,  1012,   102,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0],
        [  101,  1045,  2288,  1037,  2047,  3797,  2012,  2147,  2012,  2009,
          2003,  1996,  5409,  3430,  2412,   102, 

###EXAMPLE 2: Obtaining contextualized word embeddings using the pre-trained BERT model

In [5]:
tokenizer=BertTokenizer.from_pretrained("bert-base-uncased")

#The max_length parameter depends on the  texts' length in the dataset

example_text = ["Yo it shirt he gave New York was funny.",
                "my husband is sick, homemade chicken soup loading.",
                "Witch raises wind to break up enemy’s lumber pound.",
                "i got a new shirt at work at it is the WORST material ever",
                "wave is so poor the girls can’t even dress up for Halloween",
                "The theory of paint indicates feelings of isolation in society.",
                "Wish sometimes I had access to the ever elusive cock carousel..",
                "off probation tomorrow ima be a free woman again, FUCK THE SYSTEM"]

bert_input = tokenizer(example_text, padding="max_length", max_length = 100,
                      truncation=True, return_tensors="pt")

model = BertModel.from_pretrained("bert-base-uncased")

#Setting the evaluation mode, this option does not make gradient updating
model.eval()

# Send the data to mps, cuda or cpu
bert_input = bert_input.to(device)
model.to(device)
with torch.no_grad():
    outputs = model(**bert_input)
    # Transformers models always return tuples.
    # Here, the first element corresponds to the vectors in the output of the last BERT layer.
    encoded_layers = outputs[0]
    print(encoded_layers.size())

    # Here, we obtain the embedding of the CLS tokens for each input text.
    # This representation serves as a contextual embedding of the texts.
    cls_vector = encoded_layers[:,0,:]
    print(cls_vector.size())
    i=1
    for vect in cls_vector.cpu().detach().numpy():
       print(f"This is the CLS vector for the document D{i}", vect)
       i+=1
    # Vector associated with the CLS token of the first text in the entry.
    cls_vector = cls_vector.cpu().detach().numpy()[0]
    print(len(cls_vector))

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

torch.Size([8, 100, 768])
torch.Size([8, 768])
This is the CLS vector for the document D1 [-4.81657311e-03 -1.68286279e-01 -1.61941364e-01  2.94780254e-01
 -4.74430770e-01 -1.31986514e-01 -1.55701056e-01  7.05417037e-01
  2.30654806e-01 -1.04499899e-01  3.68017942e-01 -2.60411084e-01
 -8.50365907e-02  4.12882268e-01  6.49188012e-02  1.68695357e-02
 -3.82843435e-01  1.00729957e-01  1.51795745e-02  8.66137296e-02
 -1.60618196e-03 -8.37803409e-02 -1.34181321e-01 -1.80882663e-01
  3.67154903e-03 -2.84712523e-01  8.25745333e-03 -3.29370558e-01
  1.38740927e-01  5.25028050e-01  4.81629185e-02  3.30331385e-01
 -2.16747105e-01 -5.04435956e-01  1.86968986e-02 -2.29070276e-01
  1.01204328e-01 -1.50575981e-01  1.99357808e-01 -2.18662784e-01
 -2.03818008e-01  1.08661085e-01  1.49584264e-01 -3.22613716e-01
 -1.24597371e-01 -4.15497899e-01 -3.40324092e+00  1.37040794e-01
 -1.27590492e-01 -2.20760569e-01  2.29760557e-01 -4.31368083e-01
 -1.12350341e-02  4.07518357e-01  4.28484410e-01  4.89108384e-01


Support for third party widgets will remain active for the duration of the session. To disable support:

In [None]:
from google.colab import output
output.enable_custom_widget_manager()

###EXAMPLE 3: Tokenization with RoBERTa Tokenizer

In [8]:
tokenizer= RobertaTokenizer.from_pretrained("roberta-base")

#Examples of the tokenization performed by RoBERTa
example_text = ["Yo it shirt he gave New York was funny.",
                "my husband is sick, homemade chicken soup loading.",
                "Witch raises wind to break up enemy’s lumber pound.",
                "i got a new shirt at work at it is the WORST material ever",
                "wave is so poor the girls can’t even dress up for Halloween",
                "The theory of paint indicates feelings of isolation in society.",
                "Wish sometimes I had access to the ever elusive cock carousel..",
                "off probation tomorrow ima be a free woman again, FUCK THE SYSTEM"]

#The max_length parameter depends on the  texts' length in the dataset


tokenized_text = tokenizer.tokenize(example_text[0])
print(tokenized_text)
tokenized_text = tokenizer.tokenize(example_text[1])
print(tokenized_text)
roberta_input = tokenizer(example_text, padding="max_length", max_length = 30,
                      truncation=True, return_tensors="pt")

# Transform tokens to vocabulary indexes
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
#RoBERTa WORKS WITH THESE REPRESENTATIONS OBTAINED FOR THE TOKENS
print(roberta_input["input_ids"])
print(roberta_input["attention_mask"])
#To transform the sequence of token indices to words in the texts
example_text = tokenizer.decode(roberta_input.input_ids[0])
print(example_text)
example_text = tokenizer.decode(roberta_input.input_ids[1])
print(example_text)

['Yo', 'Ġit', 'Ġshirt', 'Ġhe', 'Ġgave', 'ĠNew', 'ĠYork', 'Ġwas', 'Ġfunny', '.']
['my', 'Ġhusband', 'Ġis', 'Ġsick', ',', 'Ġhomemade', 'Ġchicken', 'Ġsoup', 'Ġloading', '.']
tensor([[    0, 33543,    24,  6399,    37,   851,   188,   469,    21,  6269,
             4,     2,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1],
        [    0,  4783,  1623,    16,  4736,     6, 17798,  5884, 14532, 16761,
             4,     2,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1],
        [    0,   771,  3239,  7700,  2508,     7,  1108,    62,  8636,    17,
            27,    29, 24829,  6881,     4,     2,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1],
        [    0,   118,   300,    10,    92,  6399,    23,   173,    23,    24,
            16,     5, 31534,  4014,

###EXAMPLE 4: Obtaining contextualized word embeddings using the pre-trained RoBERTa model

In [9]:
tokenizer=RobertaTokenizer.from_pretrained("roberta-base")

example_text = ["Yo it shirt he gave New York was funny.",
                "my husband is sick, homemade chicken soup loading.",
                "Witch raises wind to break up enemy’s lumber pound.",
                "i got a new shirt at work at it is the WORST material ever",
                "wave is so poor the girls can’t even dress up for Halloween",
                "The theory of paint indicates feelings of isolation in society.",
                "Wish sometimes I had access to the ever elusive cock carousel..",
                "off probation tomorrow ima be a free woman again, FUCK THE SYSTEM"]

#The max_length parameter depends on the  texts' length in the dataset

roberta_input = tokenizer(example_text, padding="max_length", max_length = 50,
                      truncation=True, return_tensors="pt")
model = RobertaModel.from_pretrained("roberta-base")
#Setting the evaluation mode, this option does not make gradient updating
model.eval()

# Send the data to mps, cuda or cpu
roberta_input = roberta_input.to(device)
model.to(device)

with torch.no_grad():
   outputs = model(**roberta_input)
   # Transformers models always return tuples.
   # Here, the first element corresponds to the vectors in the output of the last RoBERTa layer.
   encoded_layers = outputs[0]
   print(encoded_layers.size())
   #Here we obtain the embedding of the CLS tokens for each input text.
   #This representation serves as a contextual embedding of the texts.
   cls_vector = encoded_layers[:,0,:]
   print(cls_vector.size())
   #Vector associated with the CLS token of the first text in the entry.
   cls_vector = cls_vector.cpu().detach().numpy()[0]
   print(len(cls_vector))

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


torch.Size([8, 50, 768])
torch.Size([8, 768])
768


###EXAMPLE 5: Do it in Spanish. Tokenization with BETO Tokenizer

In [10]:
tokenizer= BertTokenizer.from_pretrained("dccuchile/bert-base-spanish-wwm-uncased")

#The max_length parameter depends on the  texts' length in the dataset
#Examples of the tokenization performed by BETO·
example_text = ["Cada vez más ganas de ponerme rubia tonta",
                "Dios cómo odio a los tíos que hacen manspreading",
                "La frente muy alta, la lengua muy larga y la falda muy corta",
                "Mis respetos pa la mujeres no pueden ser tan sorneras ajajajaj",
                "No gusto de mujer sumisa, me gusta mi mujer toposa y toxica😌",
                "Las mujeres rápidas y los caballos lentos arruinarán tu vida.",
                "hoy dejo d ser varón trans para elegir el camino de la misandria",
                "mucho feminismo pero te le tiras a vagos con novia jsjsja mi vida"]
tokenized_text = tokenizer.tokenize(example_text[0])
print(tokenized_text)
tokenized_text = tokenizer.tokenize(example_text[1])
print(tokenized_text)
beto_input = tokenizer(example_text,padding="max_length", max_length = 20,
                      truncation=True, return_tensors="pt")


# Transform tokens to vocabulary indexes
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
#BETO WORKS WITH THESE REPRESENTATIONS OBTAINED FOR THE TOKENS
print(beto_input["input_ids"])
print(beto_input["token_type_ids"])
print(beto_input["attention_mask"])
#To transform the sequence of token indices to words in the texts
example_text = tokenizer.decode(beto_input.input_ids[0])
print(example_text)
example_text = tokenizer.decode(beto_input.input_ids[1])
print(example_text)

tokenizer_config.json:   0%|          | 0.00/310 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/248k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/134 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/486k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/650 [00:00<?, ?B/s]

['cada', 'vez', 'más', 'ganas', 'de', 'ponerme', 'rubia', 'tonta']
['dios', 'cómo', 'odio', 'a', 'los', 'tíos', 'que', 'hacen', 'mans', '##pre', '##adi', '##ng']
tensor([[    4,  1748,  1434,  1186,  7716,  1009, 13203, 15456, 10833,     5,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1],
        [    4,  1645,  1475,  4894,  1012,  1067, 13353,  1041,  3451, 22197,
          3203, 14051, 22396,     5,     1,     1,     1,     1,     1,     1],
        [    4,  1032,  2767,  1355,  3531,  1019,  1032,  4002,  1355,  5163,
          1040,  1032, 20466,  1355,  7350,     5,     1,     1,     1,     1],
        [    4,  1285, 30397,  1188,  1032,  2209,  1054,  1948,  1170,  1370,
          2965,  3568,  1021,  8613, 30956,  1431,  1431, 30981,     5,     1],
        [    4,  1054,  4944,  1009,  1626,  5216,  2785,  1019,  1094,  2331,
          1136,  1626, 17628,  1949,  1040,     3,     5,     1,     1,     1],
        [    4,  1085,  2209, 23110,  1040,

###EXAMPLE 6: Obtaining contextualized word embeddings using the pre-trained BETO model

In [14]:
modelname = "dccuchile/bert-base-spanish-wwm-uncased"

tokenizer=BertTokenizer.from_pretrained(modelname)

example_text = ["Cada vez más ganas de ponerme rubia tonta",
                "Dios cómo odio a los tíos que hacen manspreading",
                "La frente muy alta, la lengua muy larga y la falda muy corta",
                "Mis respetos pa la mujeres no pueden ser tan sorneras ajajajaj",
                "No gusto de mujer sumisa, me gusta mi mujer toposa y toxica😌",
                "Las mujeres rápidas y los caballos lentos arruinarán tu vida.",
                "hoy dejo d ser varón trans para elegir el camino de la misandria",
                "mucho feminismo pero te le tiras a vagos con novia jsjsja mi vida"]

#The max_length parameter depends on the  texts' length in the dataset

beto_input = tokenizer(example_text, padding="max_length", max_length = 50, truncation=True, return_tensors="pt")

model = BertModel.from_pretrained(modelname)
#Setting the evaluation mode, this option does not make gradient updating
model.eval()

# Send the data to mps, cuda or cpu
beto_input = beto_input.to(device)
model.to(device)

with torch.no_grad():
    outputs = model(**beto_input)
    # Transformers models always return tuples.
    # Here, the first element corresponds to the vectors in the output of the last BETO layer.
    encoded_layers = outputs[0]
    print(encoded_layers.size())
    #Here we obtain the embedding of the CLS tokens for each input text.
    #This representation serves as a contextual embedding of the texts.
    cls_vector = encoded_layers[:,0,:]
    print(cls_vector.size())
    #Vector associated with the CLS token of the first text in the entry.
    cls_vector = cls_vector.cpu().detach().numpy()[0]
    print(len(cls_vector))

outputs[0].shape, outputs[1].shape
outputs.last_hidden_state
outputs.pooler_output

Some weights of BertModel were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-uncased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


torch.Size([8, 50, 768])
torch.Size([8, 768])
768


tensor([[-0.0710,  0.2814, -0.5905,  ...,  0.2550, -0.3860, -0.1612],
        [ 0.3384, -0.1886, -0.4892,  ...,  0.2903, -0.1287,  0.2523],
        [ 0.4271,  0.0325, -0.0673,  ..., -0.0491, -0.4599,  0.6828],
        ...,
        [ 0.0885,  0.5217, -0.4717,  ..., -0.3405, -0.1553,  0.5835],
        [ 0.0009, -0.3102, -0.2553,  ...,  0.2261, -0.0835, -0.0375],
        [ 0.0414, -0.0412, -0.3780,  ..., -0.3067, -0.2024,  0.1136]],
       device='cuda:0')

###EXAMPLE 7: Using batches

In [20]:
modelname = "dccuchile/bert-base-spanish-wwm-uncased"

tokenizer=BertTokenizer.from_pretrained(modelname)
example_text = ["Cada vez más ganas de ponerme rubia tonta",
                "Dios cómo odio a los tíos que hacen manspreading",
                "La frente muy alta, la lengua muy larga y la falda muy corta",
                "Mis respetos pa la mujeres no pueden ser tan sorneras ajajajaj",
                "No gusto de mujer sumisa, me gusta mi mujer toposa y toxica😌",
                "Las mujeres rápidas y los caballos lentos arruinarán tu vida.",
                "hoy dejo d ser varón trans para elegir el camino de la misandria",
                "mucho feminismo pero te le tiras a vagos con novia jsjsja mi vida"]
#The max_length parameter depends on the  texts' length in the dataset

batch_size=2
tensor_list=[]
for i in range(0, len(example_text), batch_size):
    batch = example_text[i:i+batch_size]
    input = tokenizer(batch, padding="max_length", max_length = 50, truncation=True, return_tensors="pt")
    model.eval()
    model.to(device)
    input = input.to(device)
    with torch.no_grad():
      outputs = model(**input)
      # Transformers models always return tuples.
      # Here, the first element corresponds to the vectors in the output of the last BETO layer.
      encoded_layers = outputs[0]
      print(encoded_layers.size())
      #Here we obtain the embedding of the CLS tokens for each input text.
      #This representation serves as a contextual embedding of the texts.
      cls_vector = encoded_layers[:,0,:]
      #Vector associated with the CLS token of the first text in the entry.
    tensor_list.append(cls_vector)
cls_vector = torch.cat(tensor_list).cpu()#.detach().numpy()[0]
print(cls_vector.size())
print(len(cls_vector))



torch.Size([2, 50, 768])
torch.Size([2, 50, 768])
torch.Size([2, 50, 768])
torch.Size([2, 50, 768])
torch.Size([8, 768])
8


torch.Size([2, 768])