# Prepare static BERT embeddings for intrinsic evaluation
Copyright (C) 2021 ServiceNow, Inc.

BERT embeds tokens/words dynamically with context. However, we would still like to find a reasonable way to obtain static embeddings for words and use the intrinsic evaluation tasks to compare it with the GloVe embeddings. The approach we adopt is that we feed in [CLS] + word + [SEP] as the input, then perform average pooling on the last layer hidden states that correspond to the word. We use the averaged vector of dim (768, ) as the representation of the word.

Note that since our BERT models are trained on sentences, using single words as input may not generate as powerful embeddings as it could with full sentences.

In [97]:
import pathlib
REPO_DIR = pathlib.Path(__name__).parent.absolute().parent
print(REPO_DIR)

/nrcan_p2/workspace/tianyi/nrcan_p2


In [2]:
import sys
sys.path.append(str(REPO_DIR))

In [3]:
# output directory of the embedding map
OUTPUT_FOLDER = '/nrcan_p2/data/07_model_output/bert_geology_evaluation'
EMBEDDING_FOLDER = '/nrcan_p2/data/06_models/bert_nrcan_embeddings'

In [4]:
%load_ext autoreload
%autoreload 2

In [61]:
def load_words(words):
    words = [word.lower().split() for l in words for word in l]
    words = set([w for s in words for w in s])
    return words

In [5]:
from nrcan_p2.evaluation.load_test_data import (
    load_analogy_data, load_similarity_data, load_nearest_neighbour_data, load_word_cluster_data)

In [6]:
analogy_tests = load_analogy_data(REPO_DIR / 'nrcan_p2/evaluation/2020_ElementAI_Test_CJML_02_Dec-16-2020 - Analogy.csv')
display(analogy_tests)

Analogy data contains: 66 analogies in 9 themes


Unnamed: 0,Task,Theme,a,x,b,y
0,Analogy,Geology,Abitibi,Superior,Stikinia,Cordilleran
1,Analogy,Geology,Core,Mantle,Phenocryst,Rim
2,Analogy,Geology,Core,Inner,Crust,Outer
3,Analogy,Geology,Deposition,Sedimentation,Crystallization,Magmatism
4,Analogy,Geology,Diverging,Ridge,Converging,Mountain
...,...,...,...,...,...,...
61,Analogy,Geology_Petrology_Sedimentary,Turbidite,Marine,Sandstone,Fluvial
62,Analogy,Geology_Structural,Gouge,Unconsolidated,Cataclasite,Consolidated
63,Analogy,Geology_Structural,Norma,Extension,Reverse,Compression
64,Analogy,Geology_Structural,Plunge,Lineation,Strike,Planar


In [62]:
analogy_words = analogy_tests[["a", "x", "b", "y"]].values.tolist()
analogy_words = load_words(analogy_words)
analogy_words

{'abitibi',
 'actinolite',
 'albite',
 'alkaline',
 'amphibole',
 'amphibolite',
 'andesite',
 'angular',
 'anisotropic',
 'anorthite',
 'anticline',
 'aphanitic',
 'archean',
 'arenite',
 'arkose',
 'asthenosphere',
 'basalt',
 'basanite',
 'basin',
 'batholith',
 'biotite',
 'black',
 'blueschist',
 'breccia',
 'calcite',
 'carbonate',
 'cataclasite',
 'chalcopyrite',
 'chemical',
 'chlorite',
 'clastic',
 'clay',
 'coarse',
 'cold',
 'compression',
 'concordant',
 'conductive',
 'conglomerate',
 'consolidated',
 'convecting',
 'converging',
 'cooling',
 'copper',
 'cordilleran',
 'core',
 'crust',
 'crystallization',
 'cyclosilicate',
 'deep',
 'deposition',
 'detritus',
 'diamond',
 'diopside',
 'discordant',
 'diverging',
 'drumlin',
 'dyke',
 'dykes',
 'earth',
 'eon',
 'epidote',
 'erosion',
 'eruption',
 'extension',
 'extrusive',
 'fast',
 'fault',
 'feldspar',
 'felsic',
 'fine',
 'fluid',
 'fluvial',
 'fold',
 'foliated',
 'fossil',
 'geochemistry',
 'glacier',
 'gneiss',
 '

In [8]:
relatedness_tests = load_similarity_data(REPO_DIR / 'nrcan_p2/evaluation/2020_ElementAI_Test_CJML_03_Feb-23-2020-Relatedness.csv')
display(relatedness_tests)

Relatedness data contains: 249 relatedness examples in 14 themes


Unnamed: 0,Task,Theme,Include synonym,Spelling variation,y,a,x (Words with decreasing similarity ->),Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,l
0,Relatedness,Earth System Science,No,,,Age,Uncertainty,,,,...,,,,,,,,,,"[Age, Uncertainty]"
1,Relatedness,Earth System Science,No,,,Air,Stratosphere,,,,...,,,,,,,,,,"[Air, Stratosphere]"
2,Relatedness,Earth System Science,No,,,Alkali,Potassium,,,,...,,,,,,,,,,"[Alkali, Potassium]"
3,Relatedness,Earth System Science,No,,,Alpine,Mountain,,,,...,,,,,,,,,,"[Alpine, Mountain]"
4,Relatedness,Earth System Science,No,,,Atom,Nucleus,,,,...,,,,,,,,,,"[Atom, Nucleus]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
244,Relatedness,Methods,No,,,Magnetic,Susceptibility,,,,...,,,,,,,,,,"[Magnetic, Susceptibility]"
245,Relatedness,Methods,No,,,Plasma,Laser,,,,...,,,,,,,,,,"[Plasma, Laser]"
246,Relatedness,Methods,No,,,Reference,Standard,,,,...,,,,,,,,,,"[Reference, Standard]"
247,Relatedness,Methods,No,,,Seismic,Wave,,,,...,,,,,,,,,,"[Seismic, Wave]"


In [63]:
relatedness_words = relatedness_tests[["x (Words with decreasing similarity ->)", "a"]].values.tolist()
relatedness_words = load_words(relatedness_words)
relatedness_words

{'abitibi',
 'abrasion',
 'acicular',
 'age',
 'air',
 'albite',
 'alkali',
 'almandite',
 'alpine',
 'alteration',
 'aluminim',
 'amphibolite',
 'analysis',
 'angular',
 'anorthite',
 'anorthosite',
 'anthropogenic',
 'arc',
 'archean',
 'arcuate',
 'arenite',
 'argillic',
 'arkose',
 'ash',
 'athabasca',
 'atom',
 'auerole',
 'augite',
 'barometer',
 'barrier',
 'basalt',
 'base',
 'bedding',
 'biology',
 'biotite',
 'bladed',
 'blueschist',
 'boulder',
 'boundary',
 'breccia',
 'burial',
 'buttress',
 'calcsilicate',
 'carbonate',
 'cataclasite',
 'categorical',
 'cementation',
 'chalk',
 'change',
 'charnokite',
 'chert',
 'chlorine',
 'chloritic',
 'chrome',
 'clay',
 'cleavage',
 'climate',
 'coarse',
 'coast',
 'cobalt',
 'cobble',
 'compaction',
 'complex',
 'component',
 'conductive',
 'conduit',
 'conglomerate',
 'contact',
 'contemporary',
 'continuous',
 'copper',
 'coquine',
 'cordillera',
 'crystal',
 'cumulate',
 'curved',
 'dacite',
 'death',
 'deformed',
 'density',
 '

In [10]:
nn_tests = load_nearest_neighbour_data(REPO_DIR / 'nrcan_p2/evaluation/2020_ElementAI_Test_CJML_03_Jan-4-2020-NearestNeighbours.csv')
display(nn_tests)

Nearest neighbor data contains: 5 words


Unnamed: 0,ID,Word
0,1,Earth
1,2,Exploration
2,3,Environment
3,4,Climate
4,5,Hazard


In [64]:
nn_words = nn_tests[["Word"]].values.tolist()
nn_words = load_words(nn_words)
nn_words

{'climate', 'earth', 'environment', 'exploration', 'hazard'}

In [53]:
cluster_tests = load_word_cluster_data(REPO_DIR / 'nrcan_p2/evaluation/2020_ElementAI_Test_CJML_03_Jan-4-2020-WordClusters.csv')
display(cluster_tests)

Word cluster data contains: 16 clusters and 900 total words


Unnamed: 0,Cluster,Word
0,Alteration types,argillic
1,Alteration types,albitic
2,Alteration types,alunitic
3,Alteration types,calcsilicate
4,Alteration types,carbonate
...,...,...
895,Exploration activity,sampling
896,Exploration activity,subsurface
897,Exploration activity,surface
898,Exploration activity,systematic


In [65]:
cluster_words = cluster_tests[["Word"]].values.tolist()
cluster_words = load_words(cluster_words)
cluster_words

{'abyssal',
 'accretion',
 'accumulation',
 'acid',
 'acidic',
 'aeolian',
 'agate',
 'aggregate',
 'albitic',
 'alkali',
 'alluvial',
 'alteration',
 'alumina',
 'aluminium',
 'aluminosilicate',
 'alunite',
 'alunitic',
 'amazonite',
 'amber',
 'amethyst',
 'amphibolite',
 'amygdule',
 'andalusite',
 'andesite',
 'angle',
 'angular',
 'anhedral',
 'anhydrite',
 'anisotropic',
 'anorthosite',
 'anorthositic',
 'anoxic',
 'anthophyllite',
 'anthracite',
 'anthropogenic',
 'antimony',
 'apatite',
 'aphanite',
 'aplite',
 'aquamarine',
 'arc',
 'arch',
 'arcuate',
 'arenite',
 'argillic',
 'arid',
 'arkosic',
 'arsenic',
 'asbestos',
 'ash',
 'asphalt',
 'assessment',
 'augen',
 'auger',
 'autoclast',
 'axiolite',
 'backreef',
 'banding',
 'barium',
 'barrier',
 'baryte',
 'basalt',
 'basanite',
 'basic',
 'basin',
 'bathyal',
 'bauxite',
 'beach',
 'bedding',
 'beneficiation',
 'bentonite',
 'beryl',
 'beryllium',
 'bioclast',
 'bioclastic',
 'biogenic',
 'biological',
 'bismuth',
 'bitu

In [66]:
all_words = set(list(analogy_words) + list(relatedness_words) + list(nn_words) + list(cluster_words))
len(all_words)

1115

In [68]:
# check if all entries are single words
for word in all_words:
    if len(word.split()) > 1:
        print(word)

In [79]:
from functions import bert
import pandas as pd

In [83]:
# read the model map in
models_df = pd.read_csv(f"{OUTPUT_FOLDER}/bert_geology_evaluation/BERT_MODEL_MAP.csv")
models_df.head()

Unnamed: 0,dataset,pipeline,tokenizer,path,dim
0,a,v1,pretrained,/nrcan_p2/mlflow_data/42/83d34150748641b89a44b...,786
1,ab,v1,pretrained,/nrcan_p2/mlflow_data/40/902f7aba032a402a92705...,786
2,abd,v1,pretrained,/nrcan_p2/mlflow_data/49/dbc77f67097b4937adcb9...,786
3,a,80,pretrained,/nrcan_p2/mlflow_data/51/9212f6a912664e3d9c949...,786
4,ab,80,pretrained,/nrcan_p2/mlflow_data/52/16b01e423ebe4f08a0111...,786


In [71]:
def create_text_file(vocabulary, filename, model, tokenizer):

    """
    Creates a text file containing a word and its BERT embedding on each line.
    To be loaded into a KeyedVectors object to store keys/vectors.
    """

    assert filename.split('.')[-1] == 'txt', "Filename must be a .txt file"

    f = open(filename, 'w')
    # Write first line for word2vec format
    f.write(str(len(vocabulary)) + ' ' + str(bert_model.config.hidden_size))
    f.write('\n')
    # Write words and embeddings
    for i, word in enumerate(vocabulary):
#         print(i)
        f.write(word + ' ')
        # Create BERT embedding
        vector = bert.bert_embedding(word, tokenizer, model, to_numpy=True)
        for v in vector:
            f.write(str(v) + ' ')
        f.write('\n')

    f.close()

In [93]:
for index, row in models_df.iterrows():
    print(index, f"Creating embeddding file for {row.path}")
    bert_model, bert_tokenizer, bert_config = bert.load_bert(model_name_or_path=row.path)
    model_file = f"{EMBEDDING_FOLDER}/bert.{row.dataset}.{row.pipeline}.{row.tokenizer}.{row.dim}.txt"
    create_text_file(all_words, model_file, bert_model, bert_tokenizer)
    models_df["embedding_path"].iloc[index] = model_file

0 Creating embeddding file for /nrcan_p2/mlflow_data/42/83d34150748641b89a44b74a1beaf799/model/best_model
1 Creating embeddding file for /nrcan_p2/mlflow_data/40/902f7aba032a402a927059958a38e079/model/best_model
2 Creating embeddding file for /nrcan_p2/mlflow_data/49/dbc77f67097b4937adcb914e8895ba2c/model/best_model
3 Creating embeddding file for /nrcan_p2/mlflow_data/51/9212f6a912664e3d9c949cc35db9d60b/model/best_model
4 Creating embeddding file for /nrcan_p2/mlflow_data/52/16b01e423ebe4f08a0111809eef7830e/model/best_model
5 Creating embeddding file for /nrcan_p2/mlflow_data/53/f6d9eefc176241e0b63654c63e97f901/model/best_model
6 Creating embeddding file for /nrcan_p2/mlflow_data/55/97683cfa22c145faa3e0d1cc64d2d22f/model/best_model
7 Creating embeddding file for /nrcan_p2/mlflow_data/54/9583ad61ff32407bb799716cfe197902/model/best_model
8 Creating embeddding file for /nrcan_p2/mlflow_data/58/48c9caa04c6f490d84645e64f8abf99b/model/best_model
9 Creating embeddding file for /nrcan_p2/mlflo

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [95]:
models_df.embedding_path.tolist()[-2]

'/nrcan_p2/data/06_models/bert_nrcan_embeddings/bert.ab.90.geo500.786.txt'

In [96]:
# write embedding map to csv
models_df[["dataset", "pipeline", "tokenizer", "embedding_path", "dim"]].to_csv(f"{OUTPUT_FOLDER}/BERT_EMBEDDING_MAP.csv")