# Load BookCorpus

In [None]:
import joblib
import timeit
# Measure the wall-clock time spent deserializing the corpus from disk.
start = timeit.default_timer()
Bookcorpus = joblib.load('bookcorpus_path')
stop = timeit.default_timer()
load_seconds = stop - start
print('time load BookCorpus (Sec): ', load_seconds)

# Initialize BERT and text processing libraries

In [4]:
import torch
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM
from transformers import AutoTokenizer
# The tokenizer and the encoder are both loaded from the same
# 'bert-base-uncased' checkpoint name.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
model = model.to('cuda')  # run the encoder on the GPU
# Switch to evaluation mode (disables dropout, so the forward pass is
# deterministic). Kept as the cell's last expression so the notebook still
# displays the module summary.
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): BertLayerNorm()
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): BertLayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (intermediate): BertIntermediate(
          (dense): Lin

In [5]:
from sklearn.metrics.pairwise import cosine_similarity
import math
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import tagset_mapping, map_tag
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
import numpy as np
from sklearn.decomposition import PCA
import random
import warnings
warnings.filterwarnings("ignore") 

# Collecting word embeddings and their properties

In [None]:
# BERT's position-embedding table has 512 rows, so one input sequence
# (including [CLS] and [SEP]) may contain at most this many WordPieces.
MAX_BERT_TOKENS = 512


def _word_piece_layout(words):
    """Split every word into WordPieces.

    Returns ``(pieces, spans)`` where ``pieces`` is the full model input
    (``[CLS]`` + all WordPieces + ``[SEP]``) and ``spans[i]`` lists the
    positions in ``pieces`` that belong to ``words[i]``, last piece first.
    A span is empty if the tokenizer produced no piece for that word.
    """
    pieces = ['[CLS]']
    spans = []
    for word in words:
        first = len(pieces)
        pieces.extend(tokenizer.tokenize(word))
        # Descending order keeps the summation order used for averaging stable.
        spans.append(list(range(len(pieces) - 1, first - 1, -1)))
    pieces.append('[SEP]')
    return pieces, spans


def _accumulate_sentence(sentence, stats, device):
    """Embed one sentence with BERT and fold its words into ``stats``.

    ``stats`` maps ``(word, universal POS tag, 'STOP' | 'NON_STOP')`` to
    ``[count, sum of embeddings, sum of l2-norms]`` and is updated in place.
    Sentences that are too long for BERT, or that contain no usable word,
    are skipped.

    Returns ``False`` only when the BERT forward pass raised; ``True``
    otherwise (including when the sentence was skipped).
    """
    if len(sentence.split(' ')) >= MAX_BERT_TOKENS:  # too long for BERT
        return True
    tokens = gensim.utils.simple_preprocess(sentence, min_len=2)  # tokenize
    if not tokens:  # nothing to embed
        return True

    pieces, spans = _word_piece_layout(tokens)
    # The whitespace check above does not bound the WordPiece count, so check
    # the real input length before it can overflow the position embeddings.
    if len(pieces) > MAX_BERT_TOKENS:
        return True

    # create token id and segment id, then convert them to tensors
    indexed_tokens = tokenizer.convert_tokens_to_ids(pieces)
    segments_ids = [1] * len(pieces)
    tokens_tensor = torch.tensor([indexed_tokens]).to(device)
    segments_tensors = torch.tensor([segments_ids]).to(device)

    # BERT forward propagation (best effort: a failing sentence is skipped).
    # ``Exception`` rather than a bare ``except`` so Ctrl-C still stops the run.
    try:
        with torch.no_grad():
            encoded_layers, _ = model(tokens_tensor, segments_tensors)
    except Exception:
        return False

    # Layer index 11 (the final layer of a 12-layer BERT-base encoder), batch
    # item 0. Copied to the CPU once per sentence instead of once per piece.
    last_layer = encoded_layers[11][0].to('cpu')

    tagged = nltk.pos_tag(tokens)  # part of speech tagging
    for word, (_, ptb_tag), span in zip(tokens, tagged, spans):
        if not span:  # no WordPiece for this word -> no embedding to average
            continue
        word_pos = map_tag('en-ptb', 'universal', ptb_tag)
        if word in gensim.parsing.preprocessing.STOPWORDS:
            word_stop = 'STOP'
        else:
            word_stop = 'NON_STOP'

        # word embedding = mean of the word's WordPiece embeddings
        emb = sum([last_layer[piece_index] for piece_index in span]) / len(span)
        norm = np.linalg.norm(emb)  # l2-norm

        key = (word, word_pos, word_stop)
        entry = stats.get(key)
        if entry is None:
            stats[key] = [1, emb, norm]
        else:
            entry[0] += 1
            entry[1] += emb
            entry[2] += norm
    return True


prop_dict = dict()  # collecting results:
# keys = (word, part of speech, word type)
# values = [frequency, SUM of word embeddings, SUM of l2-norms]
# NOTE: the last two entries are running sums; divide them by the frequency
# to obtain the embedding centroid and the average l2-norm.
device = next(model.parameters()).device  # feed inputs to wherever the model lives
failed_sentences = 0  # sentences whose forward pass raised
start1 = timeit.default_timer()
corpus_index = -1  # index of the dataset (stays -1 if the corpus is empty)
for corpus_index, sentence in enumerate(Bookcorpus):
    if not _accumulate_sentence(sentence, prop_dict, device):
        failed_sentences += 1
    # print the iteration
    if (corpus_index+1)%500000==0:
        stop1 = timeit.default_timer()
        print(corpus_index+1, '// run time (Sec): ', stop1-start1)
        start1 = timeit.default_timer()

if failed_sentences:
    # Surface what the original bare ``except`` used to hide.
    print(failed_sentences, 'sentences skipped because the BERT forward pass failed')

# Save the results

In [None]:
# Persist the collected statistics; the call stays the cell's last expression
# so its return value is still displayed by the notebook.
joblib.dump(value=prop_dict, filename='path_to_save_the_results')