In [1]:
import numpy as np
import pandas as pd
import os
from model import HierAttModel
import gensim
import torch
from gensim.models import Word2Vec
from word_embeder import MyTokenizer
import json
from sklearn import preprocessing
from nltk.tokenize import sent_tokenize, word_tokenize

In [46]:
## backgoud color control
from IPython.display import Markdown, display
def printmd(string):
    display(Markdown(string))
def mk_weight_string(str_list, w_list):
    temp_str = []
    for string, weight in zip(str_list, w_list):
        temp_str += ['<span style="background-color:rgba(255,0,0,' +str(weight)+ ');">' + string + '</span>']
    return " ".join(temp_str)


In [3]:
word2vec_config_path = "model/config.json"
word2vec_model_path = "model/word2vec.model"
HAN_mdoel_path = "model/1/model4.pwf"
HAN_config_path = "model/1/config.json"
tokenizer_name = "word_tokenizer"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
class Struct:
    def __init__(self, **entries):
        self.__dict__.update(entries)

In [5]:
##Load word2vec config
with open(word2vec_config_path, 'r') as f:
    word2vec_config = json.load(f)
word2vec_config = Struct(**word2vec_config)

##Load word2vec model
word2vec_model = Word2Vec.load(word2vec_model_path)
embedding = word2vec_model.wv.vectors
dict_size = len(embedding)
index2word = word2vec_model.wv.index2word
word_vec_dim = word2vec_model.vector_size

#Insert Unknown Token
unknown_word = preprocessing.normalize(np.random.rand(1, word_vec_dim))
embedding = torch.from_numpy(np.concatenate([unknown_word, embedding], axis=0).astype(np.float))
index2word = ['[UNK]'] + index2word
dict_size += 1
word2index = {text : index for index, text in enumerate(index2word)}

In [6]:
##Load tokenizer
tokenizer = MyTokenizer(tokenizer_name)

In [7]:
##Load HAN config
with open(HAN_config_path, 'r') as f:
    HAN_config = json.load(f)
HAN_config = Struct(**HAN_config)

##Load HAN model
model = HierAttModel(input_size=dict_size,
                     word_vec_dim=word_vec_dim,
                     hidden_size=HAN_config.hidden_size,
                     num_class=4,
                     running_size=HAN_config.running_size,
                     n_layers=HAN_config.n_layers,
                     device=device
                     ).to(device)
model.set_embedding(embedding)
check_point = torch.load(HAN_mdoel_path)
model.load_state_dict(check_point["model"])

IncompatibleKeys(missing_keys=[], unexpected_keys=[])

In [57]:
doc="""
BERT is a method of pre-training language representations, meaning that we train a general-purpose "language understanding" model on a large text corpus (like Wikipedia), and then use that model for downstream NLP tasks that we care about (like question answering). BERT outperforms previous methods because it is the first unsupervised, deeply bidirectional system for pre-training NLP.
"""

In [58]:
tokens = [[word for word in
                           tokenizer.tokenize(sentences, lemma=False)] for sentences in sent_tokenize(doc)]
temp_index = [[word2index.get(word) if word2index.get(word) else 0 for word in
                           tokenizer.tokenize(sentences)] for sentences in sent_tokenize(doc)]
for sentence in temp_index:
    ##Even though there is no word after preprocess procedure, must put something like "[UNK]" to run machine
    if len(sentence) == 0:
        sentence.extend([0])

temp_sent_len = len(temp_index)
temp_word_len = [len(sent) for sent in temp_index]

max_sent_len = temp_sent_len
max_word_len = max(temp_word_len)

for sent in temp_index:
    if len(sent) < max_word_len:
        extended_words = [0 for _ in range(max_word_len - len(sent))]
        sent.extend(extended_words)

if len(temp_index) < max_sent_len:
    extended_sentences = [[0 for _ in range(max_word_len)] for _ in
                          range(max_sent_len - len(temp_index))]
    temp_index.extend(extended_sentences)

temp_index = [sentences[:max_word_len] for sentences in temp_index][:max_sent_len]

if len(temp_word_len) < max_sent_len:
    extended_word_len = [0 for _ in range(self.max_sent_len - len(temp_word_len))]
    temp_word_len.extend(extended_word_len)
temp_word_len = temp_word_len[:max_sent_len]

temp_index = torch.tensor(temp_index)
temp_sent_len = torch.tensor(temp_sent_len)
temp_word_len = torch.tensor(temp_word_len)

In [59]:
temp_index = temp_index.unsqueeze(0).to(device)
temp_sent_len = temp_sent_len.unsqueeze(0).to(device)
temp_word_len = temp_word_len.unsqueeze(0).to(device)
y_hat, weights, sent_weights = model(temp_index, temp_sent_len, temp_word_len)
ps = torch.exp(y_hat)
top_p, top_class = ps.topk(1, dim=1)
weights = weights.tolist()
print(top_class)

tensor([[1]], device='cuda:0')


In [60]:
for sent, weight in zip(tokens, weights):
    temp_str = mk_weight_string(sent, weight)
    printmd(temp_str)

<span style="background-color:rgba(255,0,0,0.013922115787863731);">BERT</span> <span style="background-color:rgba(255,0,0,0.022177787497639656);">is</span> <span style="background-color:rgba(255,0,0,0.022495588287711143);">a</span> <span style="background-color:rgba(255,0,0,0.021066157147288322);">method</span> <span style="background-color:rgba(255,0,0,0.019460387527942657);">of</span> <span style="background-color:rgba(255,0,0,0.015218591317534447);">pre-training</span> <span style="background-color:rgba(255,0,0,0.015542146749794483);">language</span> <span style="background-color:rgba(255,0,0,0.014453896321356297);">representations</span> <span style="background-color:rgba(255,0,0,0.020396994426846504);">,</span> <span style="background-color:rgba(255,0,0,0.018746059387922287);">meaning</span> <span style="background-color:rgba(255,0,0,0.023213043808937073);">that</span> <span style="background-color:rgba(255,0,0,0.02284332923591137);">we</span> <span style="background-color:rgba(255,0,0,0.018137194216251373);">train</span> <span style="background-color:rgba(255,0,0,0.01934722438454628);">a</span> <span style="background-color:rgba(255,0,0,0.01467967126518488);">general-purpose</span> <span style="background-color:rgba(255,0,0,0.02086767368018627);">``</span> <span style="background-color:rgba(255,0,0,0.01817169040441513);">language</span> <span style="background-color:rgba(255,0,0,0.015617145225405693);">understanding</span> <span style="background-color:rgba(255,0,0,0.01802458055317402);">''</span> <span style="background-color:rgba(255,0,0,0.021063458174467087);">model</span> <span style="background-color:rgba(255,0,0,0.01882774382829666);">on</span> <span style="background-color:rgba(255,0,0,0.02261020615696907);">a</span> <span style="background-color:rgba(255,0,0,0.022140467539429665);">large</span> <span style="background-color:rgba(255,0,0,0.017904847860336304);">text</span> <span style="background-color:rgba(255,0,0,0.013319865800440311);">corpus</span> <span style="background-color:rgba(255,0,0,0.022732404991984367);">(</span> <span style="background-color:rgba(255,0,0,0.024310944601893425);">like</span> <span style="background-color:rgba(255,0,0,0.02019471861422062);">Wikipedia</span> <span style="background-color:rgba(255,0,0,0.027495251968503);">)</span> <span style="background-color:rgba(255,0,0,0.027174893766641617);">,</span> <span style="background-color:rgba(255,0,0,0.025396117940545082);">and</span> <span style="background-color:rgba(255,0,0,0.02264234609901905);">then</span> <span style="background-color:rgba(255,0,0,0.02600013092160225);">use</span> <span style="background-color:rgba(255,0,0,0.026192210614681244);">that</span> <span style="background-color:rgba(255,0,0,0.024438904598355293);">model</span> <span style="background-color:rgba(255,0,0,0.02153320610523224);">for</span> <span style="background-color:rgba(255,0,0,0.016504479572176933);">downstream</span> <span style="background-color:rgba(255,0,0,0.014806346036493778);">NLP</span> <span style="background-color:rgba(255,0,0,0.01667374186217785);">tasks</span> <span style="background-color:rgba(255,0,0,0.02145756408572197);">that</span> <span style="background-color:rgba(255,0,0,0.020830722525715828);">we</span> <span style="background-color:rgba(255,0,0,0.020226502791047096);">care</span> <span style="background-color:rgba(255,0,0,0.01943289488554001);">about</span> <span style="background-color:rgba(255,0,0,0.022003985941410065);">(</span> <span style="background-color:rgba(255,0,0,0.024233434349298477);">like</span> <span style="background-color:rgba(255,0,0,0.02220672369003296);">question</span> <span style="background-color:rgba(255,0,0,0.014752235263586044);">answering</span> <span style="background-color:rgba(255,0,0,0.025241073220968246);">)</span> <span style="background-color:rgba(255,0,0,0.02327110804617405);">.</span>

<span style="background-color:rgba(255,0,0,0.049713362008333206);">BERT</span> <span style="background-color:rgba(255,0,0,0.051862701773643494);">outperforms</span> <span style="background-color:rgba(255,0,0,0.05578713119029999);">previous</span> <span style="background-color:rgba(255,0,0,0.056381069123744965);">methods</span> <span style="background-color:rgba(255,0,0,0.061875294893980026);">because</span> <span style="background-color:rgba(255,0,0,0.06484333425760269);">it</span> <span style="background-color:rgba(255,0,0,0.0698554590344429);">is</span> <span style="background-color:rgba(255,0,0,0.06352352350950241);">the</span> <span style="background-color:rgba(255,0,0,0.05695470795035362);">first</span> <span style="background-color:rgba(255,0,0,0.046802420169115067);">unsupervised</span> <span style="background-color:rgba(255,0,0,0.059989552944898605);">,</span> <span style="background-color:rgba(255,0,0,0.05552929267287254);">deeply</span> <span style="background-color:rgba(255,0,0,0.04709845036268234);">bidirectional</span> <span style="background-color:rgba(255,0,0,0.05561910942196846);">system</span> <span style="background-color:rgba(255,0,0,0.054653946310281754);">for</span> <span style="background-color:rgba(255,0,0,0.04529276117682457);">pre-training</span> <span style="background-color:rgba(255,0,0,0.04206357151269913);">NLP</span> <span style="background-color:rgba(255,0,0,0.06215429678559303);">.</span>

In [45]:
##Color Sample
printmd('<span style="background-color: #FFFF00">Marked text</span>')
printmd('<span style="background-color:rgba(255,0,0,0.5);">Marked text</span>')

<span style="background-color: #FFFF00">Marked text</span>

<span style="background-color:rgba(255,0,0,0.5);">Marked text</span>

In [20]:
printmd("**bold and blue**", color="blue")

TypeError: printmd() got an unexpected keyword argument 'color'