In [1]:
import json
import pickle
import torch
import os
import numpy as np


# *** Select a preprocessing method
def sel_preprocessing(_input="stanza"):
    # 1. Stanza
    if _input.lower() == "stanza":
        import stanza
        return stanza.Pipeline('en')
    
    elif _input.lower() == "spacy":
        # 2. Spacy
        import spacy
        return spacy.load("en_core_web_sm")
        
        
# *** Select a pre-trained Word Embeddings
def sel_pretrained(_input="bert"):
    # 1. BERT
    if _input.lower() == "bert":
        from transformers import BertConfig, BertModel, BertTokenizerFast, BertForSequenceClassification, AdamW, BertPreTrainedModel
        PRETRAINED_BERT = "bert-base-uncased"
        return BertModel.from_pretrained(PRETRAINED_BERT), BertTokenizerFast.from_pretrained(PRETRAINED_BERT)

    # 2. Elmo
    elif _input.lower() == "elmo":
        from allennlp.commands.elmo import ElmoEmbedder
        return None, ElmoEmbedder(cuda_device=0)

    # 3. Glove
    elif _input.lower() == "glove42b":
        glove_PATH = "./glove/glove.42B.300d.pkl"
        with open(glove_PATH, "rb") as file:
            glove = pickle.load(file)
        return None, glove
    
    elif _input.lower() == "glove840b":
        glove_PATH = "./glove/glove.840B.300d.pkl"
        with open(glove_PATH, "rb") as file:
            glove = pickle.load(file)
        return None, glove

In [2]:
class Clause_feature:
    def __init__(self, tk_idx, tk, offset, pos, ner, dep, bert_offset):
        #self.c_idx = c_idx
        self.tk_idx = tk_idx
        self.tk = tk
        self.offset = offset
        self.pos = pos
        self.ner = ner
        self.dep = dep
        self.bert_offset = bert_offset

In [3]:
def tokenization_processing(_datasample, nlp, _pipeline='stanza'):
    cur_clause_feature = []
    # Spacy
    if _pipeline == "spacy":
        doc = nlp(_datasample)
        for tk_idx, tk in enumerate(doc):
            cur_tk = tk.text
            cur_id = tk_idx+1
            cur_head = tk.head.i
            
            cur_tk_start_offset = tk.idx
            cur_tk_end_offset = tk.idx + len(tk)
            
            cur_pos = tk.pos_
            cur_ner = tk.ent_type_
            cur_dep = tk.dep_
            gov_idx = (cur_id, cur_dep, cur_head)
            
            cur_dep_triple = (cur_id, cur_dep, cur_head)
                    
            cur_clause_feature.append(Clause_feature(cur_id, cur_tk, (cur_tk_start_offset, cur_tk_end_offset), cur_pos, cur_ner, cur_dep_triple, ''))
    # Stanza
    elif _pipeline == "stanza":
        doc = nlp(_datasample)
        for sen in doc.sentences:
            for tk in sen.tokens:
                tk_infor_dict = tk.to_dict()[0]
                cur_tk = tk_infor_dict["text"]        
                cur_id = tk_infor_dict['id']
                cur_head = tk_infor_dict['head']

                offsets = tk_infor_dict['misc'].split("|")
                cur_tk_start_offset = int(offsets[0].split("=")[1])
                cur_tk_end_offset = int(offsets[1].split("=")[1])

                cur_pos = tk_infor_dict["xpos"]
                cur_ner = tk_infor_dict["ner"]

                cur_dep = tk_infor_dict["deprel"]
                gov_idx = tk_infor_dict["head"]

                cur_dep_triple = (cur_id, cur_dep, cur_head)

                cur_clause_feature.append(Clause_feature(cur_id, cur_tk, (cur_tk_start_offset, cur_tk_end_offset), cur_pos, cur_ner, cur_dep_triple, ''))
    return cur_clause_feature

In [19]:
_pipeline = "spacy"
emb = "bert"

nlp = sel_preprocessing(_pipeline)
model, embedder = sel_pretrained(emb)

In [5]:
sample_text = """Officials are set to announce details of B.C.'s latest restart plan on Tuesday as daily case counts continue to trend downward and hours after the last round of "circuit breaker" restrictions expired."""

In [6]:
my_result = tokenization_processing(sample_text, nlp, _pipeline)
for x in my_result:
    print("{}\t{}\t{}\t{}\t{}\t{}".format(x.tk_idx, x.tk, x.offset, x.pos, x.dep, x.bert_offset))

1	Officials	(0, 9)	NOUN	(1, 'nsubjpass', 2)	
2	are	(10, 13)	VERB	(2, 'auxpass', 2)	
3	set	(14, 17)	VERB	(3, 'ROOT', 2)	
4	to	(18, 20)	PART	(4, 'aux', 4)	
5	announce	(21, 29)	VERB	(5, 'xcomp', 2)	
6	details	(30, 37)	NOUN	(6, 'dobj', 4)	
7	of	(38, 40)	ADP	(7, 'prep', 5)	
8	B.C.	(41, 45)	PROPN	(8, 'poss', 11)	
9	's	(45, 47)	PART	(9, 'case', 7)	
10	latest	(48, 54)	ADJ	(10, 'amod', 11)	
11	restart	(55, 62)	NOUN	(11, 'compound', 11)	
12	plan	(63, 67)	NOUN	(12, 'pobj', 6)	
13	on	(68, 70)	ADP	(13, 'prep', 4)	
14	Tuesday	(71, 78)	PROPN	(14, 'pobj', 12)	
15	as	(79, 81)	ADP	(15, 'mark', 18)	
16	daily	(82, 87)	ADJ	(16, 'amod', 16)	
17	case	(88, 92)	NOUN	(17, 'compound', 17)	
18	counts	(93, 99)	NOUN	(18, 'nsubj', 18)	
19	continue	(100, 108)	VERB	(19, 'advcl', 4)	
20	to	(109, 111)	PART	(20, 'aux', 20)	
21	trend	(112, 117)	VERB	(21, 'xcomp', 18)	
22	downward	(118, 126)	ADV	(22, 'advmod', 20)	
23	and	(127, 130)	CCONJ	(23, 'cc', 21)	
24	hours	(131, 136)	NOUN	(24, 'conj', 21)	
25	after	(137, 142)	ADP	(2

# version 1 - Bert -> input_ids, attention_mask, token_types_ids

In [13]:
def embedding2preprocessed(datasample, nlp, _pipeline, embedder, emb="bert"):
    preprocessed_result = tokenization_processing(datasample, nlp, _pipeline)
    
    if emb == "bert":
        # bert tokenized result
        tokenized_result = embedder(datasample, return_offsets_mapping=True)

        bert2preprocessed_offset = []
        for p_idx, p_result in enumerate(preprocessed_result):
            p_offset = p_result.offset
            p_start_offset = p_offset[0]
            p_end_offset = p_offset[1]

            cur_offset_list = []
            for b_idx, b_offset in enumerate(tokenized_result['offset_mapping'][1:-1]):
                b_start_offset = b_offset[0]
                b_end_offset = b_offset[1]

                # offset of the bert tokenized token is in the preprocessed offset
                if b_start_offset >= p_start_offset and b_end_offset <= p_end_offset:
                    if len(cur_offset_list) == 0:
                        cur_offset_list = [b_idx+1]
                    else:
                        cur_offset_list.append(b_idx+1)


            p_result.bert_offset = cur_offset_list

        return {'input_ids':tokenized_result['input_ids'],
                 'attention_mask':tokenized_result['attention_mask'],
                 'token_type_ids':tokenized_result['token_type_ids'],
                 'preprocessed_offset_match':[x.bert_offset for x in preprocessed_result],
                 'preprocessed_dep': [x.dep for x in preprocessed_result]}
    
    elif emb =="elmo":
        elmo_embedding = embedder.embed_sentence([x.tk for x in preprocessed_result])[0]
        elmo_embedding = elmo_embedding.cpu().clone().detach()
        
        return {'elmo_emb':elmo_embedding,
                 'preprocessed_offset_match':[[x.tk_idx] for x in preprocessed_result],
                 'preprocessed_dep': [x.dep for x in preprocessed_result]}
    
    elif emb == 'glove':
        glove_emb = []
        
        for x in preprocessed_result:
            cur_tk = x.tk.lower()
            if cur_tk in embedder:
                cur_emb = embedder[cur_tk]
            else:
                cur_emb = np.zeros(300)
            glove_emb.append(cur_emb)
        glove_emb = torch.tensor(glove_emb)
        
        return {'glove_emb':glove_emb,
                 'preprocessed_offset_match':[[x.tk_idx] for x in preprocessed_result],
                 'preprocessed_dep': [x.dep for x in preprocessed_result]}

In [15]:
_pipeline = "spacy"
emb = "bert"

nlp = sel_preprocessing(_pipeline)
embedder = sel_pretrained(emb)

preprocessed_bert_input_result = embedding2preprocessed(sample_text, nlp, _pipeline, embedder, emb=emb)
print(preprocessed_bert_input_result['input_ids'])
print(preprocessed_bert_input_result['attention_mask'])
print(preprocessed_bert_input_result['token_type_ids'])
print(preprocessed_bert_input_result['preprocessed_offset_match'])
print(preprocessed_bert_input_result['preprocessed_dep'])

[101, 4584, 2024, 2275, 2000, 14970, 4751, 1997, 1038, 1012, 1039, 1012, 1005, 1055, 6745, 23818, 2933, 2006, 9857, 2004, 3679, 2553, 9294, 3613, 2000, 9874, 14047, 1998, 2847, 2044, 1996, 2197, 2461, 1997, 1000, 4984, 24733, 1000, 9259, 13735, 1012, 102]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[[1], [2], [3], [4], [5], [6], [7], [8, 9, 10, 11], [12, 13], [14], [15], [16], [17], [18], [19], [20], [21], [22], [23], [24], [25], [26], [27], [28], [29], [30], [31], [32], [33], [34], [35], [36], [37], [38], [39], [40]]
[(1, 'nsubjpass', 2), (2, 'auxpass', 2), (3, 'ROOT', 2), (4, 'aux', 4), (5, 'xcomp', 2), (6, 'dobj', 4), (7, 'prep', 5), (8, 'poss', 11), (9, 'case', 7), (10, 'amod', 11), (11, 'compound', 11), (12, 'pobj', 6), (13, 'prep', 4), (14, 'pobj', 12), (15, 'mark', 18), (1

# version 3: frozen embeddings

In [4]:
def embedding2preprocessed(datasample, nlp, _pipeline, embedder, model, emb="bert"):
    # preprocessed result
    preprocessed_result = tokenization_processing(datasample, nlp, _pipeline)
    
    if emb == "bert":
        # bert tokenized result
        tokenized_result = embedder(datasample, return_offsets_mapping=True, return_tensors='pt')
        bert_embedding = model(input_ids=tokenized_result['input_ids'],
                              attention_mask=tokenized_result['attention_mask'],
                              token_type_ids=tokenized_result['token_type_ids'])[0]
        
        bert_embedding = torch.squeeze(bert_embedding, 0)

        bert2preprocessed_offset = []
        for p_idx, p_result in enumerate(preprocessed_result):
            p_offset = p_result.offset
            p_start_offset = p_offset[0]
            p_end_offset = p_offset[1]

            cur_offset_list = []
            for b_idx, b_offset in enumerate(tokenized_result['offset_mapping'][1:-1]):
                b_start_offset = b_offset[0]
                b_end_offset = b_offset[1]

                # offset of the bert tokenized token is in the preprocessed offset
                if b_start_offset >= p_start_offset and b_end_offset <= p_end_offset:
                    if len(cur_offset_list) == 0:
                        cur_offset_list = [b_idx+1]
                    else:
                        cur_offset_list.append(b_idx+1)

            p_result.bert_offset = cur_offset_list

        return {'bert_emb':bert_embedding,
                 'preprocessed_offset_match':[x.bert_offset for x in preprocessed_result],
                 'preprocessed_dep': [x.dep for x in preprocessed_result]}
    
    elif emb =="elmo":
        elmo_embedding = embedder.embed_sentence([x.tk for x in preprocessed_result])[0]
        elmo_embedding = torch.tensor(elmo_embedding)
        
        return {'elmo_emb':elmo_embedding,
                 'preprocessed_offset_match':[[x.tk_idx] for x in preprocessed_result],
                 'preprocessed_dep': [x.dep for x in preprocessed_result]}
    
    elif 'glove' in emb:
        glove_emb = []
        
        for x in preprocessed_result:
            cur_tk = x.tk.lower()
            if cur_tk in embedder:
                cur_emb = embedder[cur_tk]
            else:
                cur_emb = np.zeros(300)
            glove_emb.append(cur_emb)
        glove_emb = torch.tensor(glove_emb)
        
        return {'glove_emb':glove_emb,
                 'preprocessed_offset_match':[[x.tk_idx] for x in preprocessed_result],
                 'preprocessed_dep': [x.dep for x in preprocessed_result]}

In [6]:
sample_text = """Officials are set to announce details of B.C.'s latest restart plan on Tuesday as daily case counts continue to trend downward and hours after the last round of "circuit breaker" restrictions expired."""

In [7]:
_pipeline = "spacy"
emb = "bert"

nlp = sel_preprocessing(_pipeline)
model, embedder = sel_pretrained(emb)

preprocessed_bert_input_result = embedding2preprocessed(sample_text, nlp, _pipeline, embedder, model, emb)
print(preprocessed_bert_input_result['bert_emb'])
print(preprocessed_bert_input_result['preprocessed_offset_match'])
print(preprocessed_bert_input_result['preprocessed_dep'])

tensor([[ 0.0033, -0.2353,  0.7526,  ...,  0.0147,  0.5923,  0.0296],
        [ 0.3014, -0.0032, -0.1142,  ..., -0.2120,  0.8723,  0.0863],
        [ 0.6135, -0.1681,  0.3704,  ..., -0.2918,  0.3635, -0.3355],
        ...,
        [ 0.2405, -0.1017,  0.4480,  ...,  0.2065, -0.1057,  0.0970],
        [ 0.4867,  0.1187,  0.1823,  ...,  0.2124, -0.1039, -0.2119],
        [ 0.4026,  0.3242,  0.3778,  ...,  0.2004, -0.3061, -0.2883]],
       grad_fn=<SqueezeBackward1>)
[[1], [2], [3], [4], [5], [6], [7], [8, 9, 10, 11], [12, 13], [14], [15], [16], [17], [18], [19], [20], [21], [22], [23], [24], [25], [26], [27], [28], [29], [30], [31], [32], [33], [34], [35], [36], [37], [38], [39], [40]]
[(1, 'nsubjpass', 2), (2, 'auxpass', 2), (3, 'ROOT', 2), (4, 'aux', 4), (5, 'xcomp', 2), (6, 'dobj', 4), (7, 'prep', 5), (8, 'poss', 11), (9, 'case', 7), (10, 'amod', 11), (11, 'compound', 11), (12, 'pobj', 6), (13, 'prep', 4), (14, 'pobj', 12), (15, 'mark', 18), (16, 'amod', 16), (17, 'compound', 17), (18

In [8]:
_pipeline = "spacy"
emb = "elmo"

nlp = sel_preprocessing(_pipeline)
model, embedder = sel_pretrained(emb)

preprocessed_bert_input_result = embedding2preprocessed(sample_text, nlp, _pipeline, embedder, model, emb)
print(preprocessed_bert_input_result['elmo_emb'])
print(preprocessed_bert_input_result['preprocessed_offset_match'])
print(preprocessed_bert_input_result['preprocessed_dep'])

tensor([[ 0.3627,  0.5986, -0.5457,  ...,  0.2147,  0.2961, -0.2138],
        [-0.0312,  0.0804, -0.2824,  ...,  0.0382,  0.4789,  0.0865],
        [-0.0687, -0.0184,  0.8037,  ...,  0.3777, -0.1891,  0.1027],
        ...,
        [-0.3913,  0.2549, -0.0263,  ...,  0.0189,  0.5630,  1.2168],
        [-0.5887,  1.0959, -0.0608,  ..., -0.5536,  0.3853,  0.7061],
        [-0.8872, -0.2004, -1.0601,  ..., -0.2655,  0.2115,  0.1977]])
[[1], [2], [3], [4], [5], [6], [7], [8], [9], [10], [11], [12], [13], [14], [15], [16], [17], [18], [19], [20], [21], [22], [23], [24], [25], [26], [27], [28], [29], [30], [31], [32], [33], [34], [35], [36]]
[(1, 'nsubjpass', 2), (2, 'auxpass', 2), (3, 'ROOT', 2), (4, 'aux', 4), (5, 'xcomp', 2), (6, 'dobj', 4), (7, 'prep', 5), (8, 'poss', 11), (9, 'case', 7), (10, 'amod', 11), (11, 'compound', 11), (12, 'pobj', 6), (13, 'prep', 4), (14, 'pobj', 12), (15, 'mark', 18), (16, 'amod', 16), (17, 'compound', 17), (18, 'nsubj', 18), (19, 'advcl', 4), (20, 'aux', 20), 

In [9]:
_pipeline = "spacy"
emb = "glove42b"

nlp = sel_preprocessing(_pipeline)
model, embedder = sel_pretrained(emb)

preprocessed_bert_input_result = embedding2preprocessed(sample_text, nlp, _pipeline, embedder, model, emb)
print(preprocessed_bert_input_result['glove_emb'])
print(preprocessed_bert_input_result['preprocessed_offset_match'])
print(preprocessed_bert_input_result['preprocessed_dep'])

tensor([[-0.2091,  0.1485,  0.0992,  ...,  0.2235, -0.0291, -0.1808],
        [-0.3214,  0.1153,  0.0094,  ..., -0.2877,  0.2200,  0.1956],
        [-0.3182,  0.1454, -0.1936,  ...,  0.1443, -0.0095, -0.1202],
        ...,
        [ 0.0442,  0.1085,  0.2111,  ..., -0.5376, -0.5948,  0.0439],
        [-0.3451, -0.2852,  0.0509,  ..., -0.3698, -0.5903,  0.0912],
        [ 0.1088,  0.0022,  0.2221,  ..., -0.2970,  0.1594, -0.1490]],
       dtype=torch.float64)
[[1], [2], [3], [4], [5], [6], [7], [8], [9], [10], [11], [12], [13], [14], [15], [16], [17], [18], [19], [20], [21], [22], [23], [24], [25], [26], [27], [28], [29], [30], [31], [32], [33], [34], [35], [36]]
[(1, 'nsubjpass', 2), (2, 'auxpass', 2), (3, 'ROOT', 2), (4, 'aux', 4), (5, 'xcomp', 2), (6, 'dobj', 4), (7, 'prep', 5), (8, 'poss', 11), (9, 'case', 7), (10, 'amod', 11), (11, 'compound', 11), (12, 'pobj', 6), (13, 'prep', 4), (14, 'pobj', 12), (15, 'mark', 18), (16, 'amod', 16), (17, 'compound', 17), (18, 'nsubj', 18), (19, 'a

In [10]:
_pipeline = "spacy"
emb = "glove840b"

nlp = sel_preprocessing(_pipeline)
model, embedder = sel_pretrained(emb)

preprocessed_bert_input_result = embedding2preprocessed(sample_text, nlp, _pipeline, embedder, model, emb)
print(preprocessed_bert_input_result['glove_emb'])
print(preprocessed_bert_input_result['preprocessed_offset_match'])
print(preprocessed_bert_input_result['preprocessed_dep'])

tensor([[-0.2082, -0.2510,  0.5810,  ...,  0.1680,  0.2455, -0.0169],
        [-0.1986, -0.0628, -0.3661,  ..., -0.5845,  0.2788, -0.2621],
        [ 0.3637,  0.0710, -0.0676,  ..., -0.0436, -0.0298, -0.2876],
        ...,
        [ 0.2196, -0.1482, -0.3005,  ...,  0.1036,  0.5923,  0.4303],
        [ 0.5271, -0.5270,  0.0362,  ...,  0.2272,  0.2620,  0.0738],
        [ 0.0120,  0.2075, -0.1258,  ...,  0.1387, -0.3605, -0.0350]],
       dtype=torch.float64)
[[1], [2], [3], [4], [5], [6], [7], [8], [9], [10], [11], [12], [13], [14], [15], [16], [17], [18], [19], [20], [21], [22], [23], [24], [25], [26], [27], [28], [29], [30], [31], [32], [33], [34], [35], [36]]
[(1, 'nsubjpass', 2), (2, 'auxpass', 2), (3, 'ROOT', 2), (4, 'aux', 4), (5, 'xcomp', 2), (6, 'dobj', 4), (7, 'prep', 5), (8, 'poss', 11), (9, 'case', 7), (10, 'amod', 11), (11, 'compound', 11), (12, 'pobj', 6), (13, 'prep', 4), (14, 'pobj', 12), (15, 'mark', 18), (16, 'amod', 16), (17, 'compound', 17), (18, 'nsubj', 18), (19, 'a