# 1. Loading Pre-Trained BERT

In [None]:
# Install the package that provides the `pytorch_pretrained_bert` module imported in the next cell.
# NOTE(review): the version is not pinned, so a fresh run may pull a different release.
!pip install pytorch-pretrained-bert --quiet

[K     |████████████████████████████████| 123 kB 5.2 MB/s 
[K     |████████████████████████████████| 132 kB 45.7 MB/s 
[K     |████████████████████████████████| 79 kB 2.4 MB/s 
[K     |████████████████████████████████| 8.7 MB 61.0 MB/s 
[K     |████████████████████████████████| 138 kB 53.9 MB/s 
[K     |████████████████████████████████| 127 kB 60.1 MB/s 
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
datascience 0.10.6 requires folium==0.2.1, but you have folium 0.8.3 which is incompatible.[0m
[?25h

In [None]:
import torch
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM

# Load the tokenizer (vocabulary) that belongs to the `bert-base-cased`
# checkpoint. Note this is the cased checkpoint, not a multilingual one.
# `do_lower_case=False` is passed explicitly: the cased vocabulary needs the
# original capitalisation, and stating it here avoids the library having to
# override the default (and warn about it) on our behalf.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)

The pre-trained model you are loading is a cased model but you have not set `do_lower_case` to False. We are setting `do_lower_case=False` for you but you may want to check this behavior.
100%|██████████| 213450/213450 [00:00<00:00, 2126395.54B/s]


# 2. Sentence Tokenization


BERT provides its own tokenizer, which we imported above. Let's see how it handles the sentence below.

In [None]:
# Example sentence, wrapped in the [CLS] and [SEP] marker tokens.
text = "Agregar mucha salsa molcajeteada, limones, y totopos ... gracias "
marked_text = f"[CLS] {text} [SEP]"

# Split the marked sentence into tokens, then look up each token's
# index in the tokenizer's vocabulary.
tokenized_text = tokenizer.tokenize(marked_text)
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

# One segment id per token: the whole input is labelled as segment 1.
segments_ids = [1 for _ in tokenized_text]

# Show how the sentence was split.
print(tokenized_text)

['[CLS]', 'A', '##g', '##re', '##gar', 'much', '##a', 'sa', '##ls', '##a', 'm', '##ol', '##ca', '##jet', '##ead', '##a', ',', 'limo', '##nes', ',', 'y', 'to', '##top', '##os', '.', '.', '.', 'g', '##rac', '##ias', '[SEP]']


# 3. Extracting Embeddings 



In [None]:
# Turn the id lists into tensors with a leading batch dimension of size 1,
# i.e. each tensor has shape [1 x num_tokens].
tokens_tensor = torch.tensor(indexed_tokens).unsqueeze(0)
segments_tensors = torch.tensor(segments_ids).unsqueeze(0)

# Fetch the pre-trained weights for the same checkpoint as the tokenizer.
model = BertModel.from_pretrained('bert-base-cased')

# Switch to evaluation mode so the dropout layers are inactive during the
# forward pass. As the cell's last expression this also displays the model.
model.eval()

100%|██████████| 404400730/404400730 [00:11<00:00, 36367788.89B/s]


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(28996, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): BertLayerNorm()
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): BertLayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (intermediate): BertIntermediate(
          (dense): Lin

In [None]:
# Forward pass without gradient tracking. The first return value holds the
# hidden states of every encoder layer; the second one is not needed here.
with torch.no_grad():
    encoded_layers, _ = model(tokens_tensor, segments_tensors)

# Build a single [tokens x layers x features] tensor:
#   stack   -> [layers x batch x tokens x features] (new leading dimension)
#   squeeze -> drop the size-1 batch dimension
#   permute -> put the token dimension first
token_embeddings = (
    torch.stack(encoded_layers, dim=0)
    .squeeze(dim=1)
    .permute(1, 0, 2)
)

token_embeddings.size()

torch.Size([31, 12, 768])

## 3.1 Word Vectors

There are many methods to extract the word vectors from BERT. A simple solution is to create the word vectors by summing together the last four layers.




In [None]:
# `token_embeddings` is a [num_tokens x num_layers x hidden_size] tensor
# (31 x 12 x 768 for the example sentence).

# Build one vector per token: iterating over the tensor yields each token's
# [num_layers x hidden_size] slice, whose last four layers are summed
# element-wise into a single [hidden_size] vector.
token_vecs_sum = [layers[-4:].sum(dim=0) for layers in token_embeddings]

print('Shape is: %d x %d' % (len(token_vecs_sum), len(token_vecs_sum[0])))

Shape is: 31 x 768


In [None]:
# Peek at the first 15 components of the summed vector for the token at index 3.
token_vecs_sum[3][:15]

tensor([ 4.9341, -5.9615, -1.0122,  4.8388,  1.4223, -0.4690, -0.3854,  2.5350,
        -2.0847, -1.9974, -2.5810, -0.1155,  6.2884, -1.1851, -3.7856])

## 3.2 Sentence Vectors


To get a single vector for the entire sentence there are multiple application-dependent strategies, but a simple approach is to average each token's vector from the second-to-last hidden layer, producing a single vector of length 768.

In [None]:
# `encoded_layers` is a sequence with one [batch x tokens x hidden] tensor
# per encoder layer (12 x 1 x 31 x 768 for the example sentence).

# Select the second-to-last layer, as described in the text above, and take
# the only sentence in the batch. Index -2 is used instead of a hard-coded
# position (index 11 is the *last* of the 12 layers, not the second to last).
# `token_vecs` has shape [tokens x hidden].
token_vecs = encoded_layers[-2][0]

# Average over the token dimension to get one vector for the whole sentence.
sentence_embedding = torch.mean(token_vecs, dim=0)

In [None]:
# Report the size of the averaged sentence vector.
print("Our final sentence embedding vector of shape:", sentence_embedding.shape)

Our final sentence embedding vector of shape: torch.Size([768])


In [None]:
# Display the sentence embedding computed above.
# NOTE(review): this prints the entire vector; a slice would keep the output short.
sentence_embedding

tensor([ 1.5676e-01,  9.0309e-02,  2.3519e-01,  2.7717e-01, -2.8722e-02,
         2.5228e-01, -2.7609e-02,  1.5562e-01, -1.9589e-01,  4.9751e-02,
        -1.1397e-02,  4.3979e-01, -2.2737e-01,  2.9419e-01, -1.5648e-02,
        -1.0231e-01,  1.2727e-01,  3.0744e-01, -1.7944e-01,  1.5354e-01,
         5.0796e-02, -9.8759e-02, -2.8601e-01,  7.6843e-02, -1.8348e-01,
        -6.3703e-01, -9.1637e-02,  4.0376e-01, -2.4961e-01,  1.4805e-01,
        -2.0682e-02,  5.5793e-03, -8.4749e-02,  2.7743e-02, -1.0231e-02,
         2.0917e-01, -7.3548e-02,  2.4858e-01, -2.3741e-01, -3.5060e-02,
         2.1387e-01, -3.2657e-02, -1.7652e-01,  4.3070e-01,  2.9573e-01,
        -2.4952e-01,  5.1670e-02, -1.6528e-01, -5.4836e-01,  1.1053e-01,
         1.8437e-01,  1.3851e-01, -1.0443e-02, -8.8564e-02,  4.7423e-01,
        -1.6720e-01, -2.8161e-01, -2.8278e-01,  1.6190e-02,  6.1298e-02,
        -2.6301e-01,  3.9287e-01,  2.6594e-01,  1.0703e-01,  1.5469e-01,
         1.4666e-01,  1.2119e-01, -1.1021e-01,  2.7

# BERT Embeddings for FakeNews Dataset Articles

In [None]:
import pandas as pd

# The model's position-embedding table has 512 entries, so an input
# (including the [CLS] and [SEP] markers) may not be longer than that.
MAX_SEQ_LEN = 512


def embed_article(text, layer=-2):
    """Return a single BERT vector representing `text`.

    The text is tokenized with the notebook's `tokenizer`, wrapped in
    [CLS] / [SEP], truncated to the model's maximum input length, passed
    through `model`, and the chosen hidden layer is averaged over all tokens.

    Args:
        text: The article text to embed.
        layer: Index into the per-layer hidden states returned by the model.
            Defaults to -2 (second-to-last layer), the strategy described in
            the "Sentence Vectors" section.

    Returns:
        A 1-D tensor whose length is the model's hidden size.
    """
    # Leave room for the two marker tokens within the length limit.
    body_tokens = tokenizer.tokenize(text)[:MAX_SEQ_LEN - 2]
    tokens = ["[CLS]"] + body_tokens + ["[SEP]"]

    # Shape [1 x num_tokens]: a batch containing just this article.
    token_ids = torch.tensor([tokenizer.convert_tokens_to_ids(tokens)])
    # All-ones segment ids, mirroring the example sentence in Section 2 so
    # that article vectors are produced the same way as the example vector.
    segment_ids = torch.ones_like(token_ids)

    with torch.no_grad():
        layers, _ = model(token_ids, segment_ids)

    # layers[layer] is [1 x num_tokens x hidden]; average over the tokens.
    return torch.mean(layers[layer][0], dim=0)


train_df = pd.read_csv("./train.tsv", sep="\t", header=None)
data = train_df.values

# Column 2 of each row is used as the article text.
articles = [article[2] for article in data]

# Embed every article from its own text. Previously the loop re-averaged the
# example sentence's `token_vecs`, so all articles received the same vector.
article_embeddings = [embed_article(article_text) for article_text in articles]

print(article_embeddings[0])

tensor([ 1.5676e-01,  9.0309e-02,  2.3519e-01,  2.7717e-01, -2.8722e-02,
         2.5228e-01, -2.7609e-02,  1.5562e-01, -1.9589e-01,  4.9751e-02,
        -1.1397e-02,  4.3979e-01, -2.2737e-01,  2.9419e-01, -1.5648e-02,
        -1.0231e-01,  1.2727e-01,  3.0744e-01, -1.7944e-01,  1.5354e-01,
         5.0796e-02, -9.8759e-02, -2.8601e-01,  7.6843e-02, -1.8348e-01,
        -6.3703e-01, -9.1637e-02,  4.0376e-01, -2.4961e-01,  1.4805e-01,
        -2.0682e-02,  5.5793e-03, -8.4749e-02,  2.7743e-02, -1.0231e-02,
         2.0917e-01, -7.3548e-02,  2.4858e-01, -2.3741e-01, -3.5060e-02,
         2.1387e-01, -3.2657e-02, -1.7652e-01,  4.3070e-01,  2.9573e-01,
        -2.4952e-01,  5.1670e-02, -1.6528e-01, -5.4836e-01,  1.1053e-01,
         1.8437e-01,  1.3851e-01, -1.0443e-02, -8.8564e-02,  4.7423e-01,
        -1.6720e-01, -2.8161e-01, -2.8278e-01,  1.6190e-02,  6.1298e-02,
        -2.6301e-01,  3.9287e-01,  2.6594e-01,  1.0703e-01,  1.5469e-01,
         1.4666e-01,  1.2119e-01, -1.1021e-01,  2.7