In [2]:
# Mount Google Drive inside the Colab runtime at /content/drive so the data
# files stored on Drive can be read by the following cells.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [3]:
import torch

# Pick the compute device once: CUDA when PyTorch can see a GPU, CPU otherwise.
# Later cells use `device` to place tensors.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
We will use the GPU: Tesla K80


In [4]:
!ls drive/My\ Drive/Colab\ Notebooks/bert

bert_cazzeggio.ipynb
BERT_Fine_Tuning_Sentence_Classification.ipynb
minitraining.tsv.gz
tweet_features_text_tokens_padded.csv.gz


In [None]:
# --- Notebook configuration -------------------------------------------------

# Folder (relative to the working directory) that holds the data files.
BERT_PATH = "drive/My Drive/Colab Notebooks/bert"
# Gzipped CSV read by `read_tweets_list`.
PATH = "/".join((BERT_PATH, "tweet_features_text_tokens_padded.csv.gz"))
# Name given to the CSV column that contains the token-id lists.
COLUMN_NAME = "raw_tweet_features_text_tokens"

# Read at most N_ROWS rows, CHUNKSIZE rows at a time.
N_ROWS, CHUNKSIZE = 1000, 10

# Token id used for padding.
PAD = 0
# Padded length of every tweet (presumably fixed when the CSV was produced — TODO confirm).
MAX_LENGTH = 179
# Name of the pretrained vocabulary / checkpoint.
VOCAB = 'bert-base-multilingual-cased'

In [6]:
!pip install transformers



In [None]:
import pandas as pd
import numpy as np

In [17]:
from transformers import BertTokenizer

# Load the pretrained tokenizer identified by VOCAB; it is used below to
# inspect the vocabulary and to decode token ids back into text.
tokenizer = BertTokenizer.from_pretrained(VOCAB)

In [None]:
# Number of entries in the tokenizer's vocabulary mapping.
vocab_size = len(tokenizer.vocab)

In [19]:
vocab_size

119547

In [20]:
tokenizer.vocab['[PAD]']

0

## Load tweets from csv file as lists of int tokens

In [None]:
# Helpers that turn the textual form of a token list, e.g. "[101, 2023, 0]",
# into a Python list of integers.
def f_to_int(x):
    """Convert a single token (string or number) to int."""
    return int(x)


def f_int(x):
    """Parse a string such as "[101, 2023, 0]" into a list of ints.

    Square brackets and spaces are removed, then the remainder is split on
    commas. An empty list ("[]" or "") yields [] instead of raising
    ValueError on int('').
    """
    stripped = x.replace('[', '').replace(']', '').replace(' ', '')
    if not stripped:
        return []
    return [f_to_int(token) for token in stripped.split(',')]

In [None]:
def read_tweets_list(path):
    """Read the gzipped CSV at `path` and return its tweets as lists of ints.

    The file is read in chunks of CHUNKSIZE rows and limited to the first
    N_ROWS rows; every value of the COLUMN_NAME column is parsed with
    `f_int` and collected, in file order, into one flat list of tweets.
    """
    reader = pd.read_csv(
        path,
        chunksize=CHUNKSIZE,
        names=[COLUMN_NAME],
        nrows=N_ROWS,
        header=None,
        index_col=0,
        compression='gzip',
    )

    parsed_tweets = []
    for frame in reader:
        # Each cell holds the textual form of a token-id list.
        parsed_tweets.extend(f_int(raw) for raw in frame[COLUMN_NAME])

    return parsed_tweets

In [None]:
tokenized_tweets = read_tweets_list(PATH)

In [None]:
n_tweets = len(tokenized_tweets)

In [26]:
n_tweets

1000

## Check that at least one tweet has a last token different from PAD

In [27]:
# Existence check: report once whether any tweet fills the whole padded
# length, i.e. ends with a real token instead of PAD. Empty tweets are
# skipped so that `tweet[-1]` cannot raise IndexError.
if any(tweet and tweet[-1] != PAD for tweet in tokenized_tweets):
    print('Ok')
else:
    print('No tweet ends with a non-PAD token')

Ok


## Decode all the tweets in the list

In [None]:
decoded_tweets = []

In [None]:
# Decode every token-id list back into text. The list is rebuilt from scratch
# so that re-running this cell does not append duplicates to a previous run.
decoded_tweets = [tokenizer.decode(t) for t in tokenized_tweets]

In [30]:
len(decoded_tweets)

1000

## Find tweets containing the word "obama" or "president"

In [31]:
# Print the index of every decoded tweet that mentions either keyword
# (case-sensitive substring match).
for position, text in enumerate(decoded_tweets):
    if "obama" in text or "president" in text:
        print(position)

754
813


## Create the model from a pre-trained one

In [32]:
from transformers import BertModel, AdamW, BertConfig

# Load the bare pretrained BERT encoder (BertModel has no task-specific head
# on top) for the checkpoint named by VOCAB.
model = BertModel.from_pretrained(
    VOCAB,  # multilingual, cased checkpoint
)

# Place the model on the device selected earlier (GPU when available, CPU
# otherwise) instead of calling .cuda() unconditionally, which fails on
# machines without a GPU.
model.to(device)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(119547, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
         

In [33]:
# Materialise the model's (name, tensor) parameter pairs so they can be sliced.
params = list(model.named_parameters())

print('The BERT model has {:} different named parameters.\n'.format(len(params)))

# Each section is a header line plus the slice of parameters listed under it.
sections = (
    ('==== Embedding Layer ====\n', params[0:5]),
    ('\n==== First Transformer ====\n', params[5:21]),
    ('\n==== Output Layer ====\n', params[-2:]),
)

for header, section_params in sections:
    print(header)
    for name, tensor in section_params:
        # Parameter name left-aligned, shape tuple right-aligned.
        print("{:<55} {:>12}".format(name, str(tuple(tensor.size()))))

The BERT model has 199 different named parameters.

==== Embedding Layer ====

embeddings.word_embeddings.weight                       (119547, 768)
embeddings.position_embeddings.weight                     (512, 768)
embeddings.token_type_embeddings.weight                     (2, 768)
embeddings.LayerNorm.weight                                   (768,)
embeddings.LayerNorm.bias                                     (768,)

==== First Transformer ====

encoder.layer.0.attention.self.query.weight               (768, 768)
encoder.layer.0.attention.self.query.bias                     (768,)
encoder.layer.0.attention.self.key.weight                 (768, 768)
encoder.layer.0.attention.self.key.bias                       (768,)
encoder.layer.0.attention.self.value.weight               (768, 768)
encoder.layer.0.attention.self.value.bias                     (768,)
encoder.layer.0.attention.output.dense.weight             (768, 768)
encoder.layer.0.attention.output.dense.bias                   

## Create inputs tensor

In [34]:
decoded_tweets[754]  # it contains the word "president"

"[CLS] Tot i només enlairar - se 25 minuts, el president espanyol va fer que l'helicòpter de la Guardia Civil estigués aturat fins a 3 hores perquè va arribar tard. https : / / t. co / o2eqbq4Rzr [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]"

In [35]:
decoded_tweets[813]  # it contains the word "obama"

'[CLS] RT @ MackSports : Justice Roberts refusal to read @ RandPaul all but confirms that obama holdout # EricCiarmarella was in fact the whistleblowe [UNK] [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]'

In [None]:
# Switch the model to evaluation mode (e.g. dropout layers become inactive).
model.eval()

tweet1 = tokenized_tweets[754]
tweet2 = tokenized_tweets[813]

# Build the batch of input ids, one row per tweet. The following cells display
# and use `inputs`, so it has to be created here.
inputs = torch.tensor([tweet1, tweet2])

In [48]:
inputs

tensor([[  101, 22768,   177, 20840, 10110, 31181, 13859,   118, 10126, 10258,
         44156, 10107,   117, 10125, 12931, 51306, 10321, 13658, 10121,   180,
           112, 68431, 11130, 11243, 38384, 10104, 10109, 43397, 14906, 23933,
         60409, 10160, 85918, 12980,   169,   124, 59968, 27698, 10321, 26774,
         14613,   119, 14120,   131,   120,   120,   188,   119, 11170,   120,
           183, 10729, 10112, 11703, 10457, 11703, 11011, 11273, 10305, 10129,
           102,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,  

## Create attention mask

In [None]:
# Attention mask with one row per tweet: 1 for real tokens, 0 for PAD tokens
# (PAD == 0).
masks = torch.tensor([
    [int(token_id > 0) for token_id in tweet]
    for tweet in (tweet1, tweet2)
])

In [50]:
masks.shape

torch.Size([2, 179])

In [51]:
masks

tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

## Move tensors to the selected device (GPU if available)

In [None]:
# Transfer both the input ids and the attention mask to the chosen device.
inputs, masks = inputs.to(device), masks.to(device)

## Get model outputs

In [None]:
# Inference only: run the forward pass without autograd so no computation
# graph is built or kept in memory for the returned tensors.
with torch.no_grad():
    outputs = model(input_ids=inputs, attention_mask=masks)

In [54]:
outputs[0][0].size()

torch.Size([179, 768])

In [55]:
outputs[0][1].size()

torch.Size([179, 768])

In [56]:
outputs[1][0].size()

torch.Size([768])

In [57]:
outputs[1][1].size()

torch.Size([768])

#### The output is a tuple with 2 elements:
* the first element is a tensor holding, for each input tweet, one vector of 768 elements per input token ---> for each input we have MAX_LENGTH vectors (one for each token), each containing 768 values (refer to this link http://jalammar.github.io/a-visual-guide-to-using-bert-for-the-first-time/, in the "Flowing Through DistilBERT" section)
* the second element is a tensor containing a single embedding vector (768 elements) for each input tweet

In [58]:
outputs

(tensor([[[-0.1751, -0.2052, -0.2639,  ...,  0.1913,  0.1934, -0.0669],
          [-0.3554, -0.8039,  0.7193,  ...,  0.4739,  0.9444,  0.4237],
          [ 0.6647, -0.9813,  0.3694,  ..., -0.0213,  1.3031, -0.0144],
          ...,
          [-0.1600, -0.5701,  0.2982,  ...,  0.5372,  0.3211, -0.3287],
          [-0.4301, -0.2024,  0.1352,  ...,  0.6117,  0.4735, -0.0979],
          [ 0.2527, -0.4959,  0.2744,  ...,  0.4732,  0.0835, -0.3619]],
 
         [[-0.0919, -0.1342, -0.6339,  ...,  0.3822, -0.0128,  0.0816],
          [-0.1758, -0.4905, -0.2090,  ...,  0.9553,  0.1433, -0.5804],
          [-0.3628, -0.5117,  0.4214,  ...,  0.0951,  0.4035, -0.4778],
          ...,
          [ 0.0929, -0.4169, -0.2635,  ...,  0.5439, -0.2696,  0.3367],
          [ 0.0016, -0.3117, -0.5787,  ...,  0.7612, -0.1398,  0.2072],
          [-0.0534, -0.2989, -0.7207,  ...,  0.8064, -0.1651,  0.2182]]],
        device='cuda:0', grad_fn=<NativeLayerNormBackward>),
 tensor([[ 0.3146, -0.1470,  0.1554,  ..

## Compute similarity between the two embedding vectors

In [None]:
# The second element of the model output holds one embedding vector per tweet.
emb1, emb2 = outputs[1][0], outputs[1][1]

In [60]:
torch.dot(emb1, emb2)  # dot product

tensor(67.6011, device='cuda:0', grad_fn=<DotBackward>)

In [61]:
cosine = torch.nn.CosineSimilarity(dim=0)

cosine(emb1, emb2)  # cosine similarity

tensor(0.9173, device='cuda:0', grad_fn=<DivBackward0>)

## Similarity between two other tweets

In [62]:
decoded_tweets[2]

'[CLS] SNCタルコフ 部 門 企 業 説 明 会 始 めます. https : / / t. co / iVjbBwKXHc [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]'

In [63]:
decoded_tweets[200]

"[CLS] Şuan Meclis'te bu yaşanıyor... Depremin araştırılmasını reddettiler Saray inşaatını görüşecekler. https : / / t. co / BKWxFLJKWh https : / / t. co / IvISJdc8gr [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]"

In [None]:
# Select the second pair of tweets to compare.
tweet1, tweet2 = tokenized_tweets[2], tokenized_tweets[200]

In [None]:
# Batch of input ids, one row per tweet.
inputs = torch.tensor([tweet1, tweet2])

# Attention mask: 1 for real tokens, 0 for PAD tokens (PAD == 0).
masks = torch.tensor([
    [int(token_id > 0) for token_id in tweet]
    for tweet in (tweet1, tweet2)
])

# Move both tensors to the selected device.
inputs = inputs.to(device)
masks = masks.to(device)

# Inference only: run the forward pass without autograd so no computation
# graph is built or kept in memory.
with torch.no_grad():
    outputs = model(input_ids=inputs, attention_mask=masks)

# The second element of the model output holds one embedding vector per tweet.
emb1 = outputs[1][0]
emb2 = outputs[1][1]

In [66]:
torch.dot(emb1, emb2)  # dot product

tensor(17.2633, device='cuda:0', grad_fn=<DotBackward>)

In [67]:
cosine = torch.nn.CosineSimilarity(dim=0)

cosine(emb1, emb2)  # cosine similarity

tensor(0.4414, device='cuda:0', grad_fn=<DivBackward0>)