In [1]:
!nvidia-smi

Tue Feb 23 09:18:37 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.102.04   Driver Version: 450.102.04   CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  GeForce RTX 208...  Off  | 00000000:01:00.0 Off |                  N/A |
| 27%   29C    P8    16W / 250W |  10451MiB / 11019MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  GeForce RTX 208...  Off  | 00000000:02:00.0 Off |                  N/A |
| 27%   28C    P8     1W / 250W |      6MiB / 11019MiB |      0%      Default |
|       

In [2]:
# Expose only physical GPU 1 to this process. The nvidia-smi output above shows
# GPU 0 with 10451MiB / 11019MiB already in use while GPU 1 is almost idle.
# This is set before the TensorFlow import in the next cell.
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [3]:
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [4]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [5]:
tf.__version__

'2.3.0'

In [6]:
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [7]:
import numpy as np
from pathlib import Path
import pickle

from collections import Counter
from gensim.models import Word2Vec, FastText

from sklearn.model_selection import train_test_split

## 0. preprocessing

### Build the pretrained word2vec embedding matrix

In [8]:
# load w2v model
# Pretrained word2vec checkpoints (CBOW and skip-gram variants).
w2v_model_cb = Word2Vec.load("/home/MOFdictionary/libs/word2vec/word2vec_cbow/word2vec_cbow.model")
w2v_model_sg = Word2Vec.load("/home/MOFdictionary/libs/word2vec/word2vec_skipgram/word2vec_skipgram.model")

# The fastText checkpoints go through FastText.load (FastText is imported above)
# rather than Word2Vec.load, so the loader matching the saved model class is used.
# NOTE(review): assumes these files were saved by gensim's FastText, as the paths suggest.
ft_model_cb = FastText.load("/home/MOFdictionary/libs/word2vec/fasttext_cbow/fasttext_cbow.model")
ft_model_sg = FastText.load("/home/MOFdictionary/libs/word2vec/fasttext_skipgram/fasttext_skipgram.model")

# Embedding source used by the rest of the notebook: skip-gram word2vec.
wv_model = w2v_model_sg

In [9]:
len(wv_model.wv.vocab)

105075

In [10]:
# Read the embedding width from the loaded vectors instead of hard-coding 100,
# so switching `wv_model` to a model of another size keeps the shapes consistent.
dim_embedding = wv_model.wv.vectors.shape[1]
# Two all-zero rows are prepended, shifting every model vector down by two rows;
# the vocabulary list built in the next cell puts "<PAD>" and "<UNK>" at ids 0 and 1.
embedding_matrix = tf.concat([tf.zeros((2, dim_embedding)), wv_model.wv.vectors], axis=0)
embedding_matrix.shape

TensorShape([105077, 100])

In [11]:
# Vocabulary list: the two special tokens first, then the model's words in index order.
words = ["<PAD>", "<UNK>", *wv_model.wv.index2word]
len(words)

105077

### make vocab

In [12]:
# Forward (word -> id) and reverse (id -> word) lookups; an id is the word's
# position in `words`.
word2id = {token: idx for idx, token in enumerate(words)}

id2word = dict(enumerate(words))

In [13]:
#pickle.dump(word2id, open("/home/MOFdictionary/libs/mer/vocab/word2id","wb"))
#pickle.dump(id2word, open("/home/MOFdictionary/libs/mer/vocab/id2word","wb"))

In [14]:
def get_bio_tags(tags):
    """Convert a sequence of raw entity tags into BIO tag ids.

    Raw tags: 1 = pre, 2 = etc, 3 = tar; every other value is "outside".
    BIO ids:  0 = O, 1 = B-tar, 2 = I-tar, 3 = B-pre, 4 = I-pre,
              5 = B-etc, 6 = I-etc.

    A token opens an entity (B-) when it is the first token or when the
    previous raw tag differs from its own; otherwise it continues the
    entity (I-).

    Returns a list of BIO ids with one entry per input tag.
    """
    # raw tag -> (B- id, I- id)
    begin_inside = {1: (3, 4), 2: (5, 6), 3: (1, 2)}

    converted = []
    previous = None  # nothing precedes the first token, so it can only be B- or O
    for current in tags:
        pair = begin_inside.get(current)
        if pair is None:
            converted.append(0)
        else:
            begin_id, inside_id = pair
            converted.append(inside_id if previous == current else begin_id)
        previous = current

    return converted

In [15]:
# Read every annotated sentence from the dataset directory and build three
# parallel lists: raw tokens, token ids, and BIO tag ids.
sents = []
map_sents = []
bio_tags = []
# sorted(): glob does not promise a stable file order, and the seeded
# train/test split below is only reproducible if the sentence order is fixed.
for filepath in sorted(Path("/home/MOFdictionary/example/Dataset_MER/").glob("*.sav")):
    # NOTE: pickle can execute arbitrary code while loading; only read trusted files.
    with open(filepath, "rb") as dataset_file:  # close the handle deterministically
        ners = pickle.load(dataset_file)
    for ner in ners: #ners = [[(word,tag),(word,tag),,,],[],,,]
        if not ner:
            # an empty sentence cannot be unzipped into (words, tags); skip it
            continue
        # Use a dedicated name: `words` is the global vocabulary list and must
        # not be overwritten by the tokens of a single sentence.
        sent_words, tags = zip(*ner)
        sents.append(sent_words)
        map_sents.append([word2id.get(token, 1) for token in sent_words])  # 1 = "<UNK>"
        bio_tags.append(get_bio_tags(tags))

In [16]:
# Fixed sentence length used for padding, and the number of BIO tag ids (0-6).
max_length, num_tags = 100, 7

# Token ids and tag ids get identical padding settings so positions stay aligned.
x_data, y_data = (
    pad_sequences(sequences, maxlen=max_length, padding="post")
    for sequences in (map_sents, bio_tags)
)
#y_data = to_categorical(y_data, num_classes=num_tags) # make one-hot sequence

print(x_data.shape, y_data.shape)

(11173, 100) (11173, 100)


In [17]:
# 80 % of the sentences go to training; the held-out 20 % is then halved into
# test and validation sets. Both splits use random_state=42.
x_train, x_test_val, y_train, y_test_val = train_test_split(x_data, y_data, test_size=0.2, random_state=42)
x_test, x_val, y_test, y_val = train_test_split(x_test_val, y_test_val, test_size=0.5, random_state=42)
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

(8938, 100) (1117, 100) (8938, 100) (1117, 100)


# 2. character Embedding

In [19]:
## making charlist
# The backslash is spelled "\\" (an escaped backslash). The previous "\|" was an
# invalid escape sequence that only worked because Python keeps unknown escapes
# verbatim (and warns about them); the resulting string is unchanged.
# NOTE: "-" occurs twice in this list, so it receives the id of its second
# occurrence and one id stays unused. It is left as-is so existing ids do not shift.
charlist = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 ~†‡°·-,;.!?:’/\\|_@#$%ˆ&*˜‘+-=()[]{}<>"

# char2idx: ids 0 and 1 are reserved for "PAD" and "UNK"; characters start at 2
char2id = {"PAD": 0, "UNK": 1}
char2id.update((char, position + 2) for position, char in enumerate(charlist))
print(f"# of charlist = {len(char2id)}")

# of charlist = 101


In [20]:
#pickle.dump(char2id, open("/home/MOFdictionary/libs/mer/vocab/char2id","wb"))

In [21]:
def get_pad_map_char_sequences(x_data, char_max_len=30):
    """Build the character-id tensor for a batch of padded word-id sentences.

    Every word id is looked up in the global ``id2word`` table and its
    characters are mapped through the global ``char2id`` table (unknown
    characters become 1). Word id 0 is the sentence padding id and yields an
    all-zero character row.

    Parameters
    ----------
    x_data : 2-D array of word ids, one row per sentence.
    char_max_len : int, default 30
        Number of character ids stored per word; shorter words are
        zero-padded at the end.

    Returns
    -------
    numpy.ndarray of int32 with shape
    (x_data.shape[0], x_data.shape[1], char_max_len).
    """
    x_data = np.asarray(x_data)
    # Integer zeros instead of np.empty (float64): the values are embedding
    # indices, and float64 inputs make Keras emit a dtype autocast warning.
    x_data_char = np.zeros((x_data.shape[0], x_data.shape[1], char_max_len), dtype="int32")

    for i, sent in enumerate(x_data):
        map_sent = []
        for word in sent:
            if word == 0:
                # Padding word: test the id directly instead of comparing against
                # the hard-coded character ids of the literal "<PAD>".
                map_word = [0]
            else:
                map_word = [char2id.get(char, 1) for char in id2word[word]]
            map_sent.append(map_word)

        x_data_char[i] = pad_sequences(map_sent, padding="post", maxlen=char_max_len)

    return x_data_char

In [22]:
# Character-level inputs for the three splits, in the order train / test / validation.
x_train_char, x_test_char, x_val_char = map(
    get_pad_map_char_sequences, (x_train, x_test, x_val)
)
x_train_char.shape, x_test_char.shape, x_val_char.shape

((8938, 100, 30), (1117, 100, 30), (1118, 100, 30))

# Keras

In [18]:
from tf2crf import CRF, ModelWithCRFLoss
from tensorflow.keras.layers import Input, Embedding, Bidirectional, RNN, LSTMCell, TimeDistributed, Concatenate, Dense

In [19]:
def mask_scores(y_true, y_pred, lens_text):
    """Score tag predictions on the real (non-padding) tokens only.

    Parameters
    ----------
    y_true : 2-D array-like of gold tag ids, one row per sentence.
    y_pred : array-like or tensor of predicted tag ids, same shape as ``y_true``.
    lens_text : per-sentence number of real tokens; positions at or beyond
        this length are excluded from every score.

    Returns
    -------
    tuple ``(accuracy, precision, recall, f1)``: accuracy is a scalar from
    ``tf.keras.metrics.Accuracy`` weighted by the mask; the other three are
    per-class arrays from scikit-learn (``average=None``).
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # The mask width follows the data instead of a hard-coded 100, so the
    # function also works for other padded sequence lengths.
    mask = tf.sequence_mask(lens_text, maxlen=y_true.shape[1])

    accuracy_metric = tf.keras.metrics.Accuracy()
    acc = accuracy_metric(y_true, y_pred, sample_weight=mask)

    # precision, recall, f1 for multi-class, computed on unmasked tokens only
    keep = mask.numpy()
    masked_y_true = y_true[keep]
    masked_y_pred = y_pred[keep]
    precision = precision_score(masked_y_true, masked_y_pred, average=None)
    recall = recall_score(masked_y_true, masked_y_pred, average=None)
    f1 = f1_score(masked_y_true, masked_y_pred, average=None)
    return acc.numpy(), precision, recall, f1

## <span style="color:red">BILSTMCRF</span>

In [20]:
# Word-level BiLSTM-CRF tagger: frozen pretrained embeddings -> BiLSTM ->
# per-token dense projection to tag scores -> CRF layer.
# Vocabulary size and embedding width are taken from the embedding matrix.
num_vocab = embedding_matrix.shape[0]
dim_embedding = embedding_matrix.shape[1]
max_length = 100
num_tags = 7

crf = CRF(dtype=tf.float32)

# One sentence = 100 word ids.
input_word = Input(shape=(100,))

# Embedding weights come from `embedding_matrix` and are not trained;
# mask_zero=True marks id 0 as masked for the layers that follow.
x = Embedding(input_dim=num_vocab, output_dim=dim_embedding, input_length=max_length, weights=[embedding_matrix], trainable=False, mask_zero=True)(input_word)
x = Bidirectional(RNN(LSTMCell(units=50, recurrent_dropout=0.3),return_sequences=True))(x)
# Project every time step to one score per tag before the CRF.
x = TimeDistributed(Dense(num_tags))(x)
x = crf(x)

# The CRF layer returns several outputs: y_pred, x, lens_text, chain_kernel
output_ = x #  y_pred, x, lens_text, chain_kernel

base_model = tf.keras.Model(input_word,output_)
base_model.summary()
# Wrap the functional model with tf2crf's ModelWithCRFLoss.
bilstmcrf = ModelWithCRFLoss(base_model)

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 100)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 100, 100)          10507700  
_________________________________________________________________
bidirectional (Bidirectional (None, 100, 100)          60400     
_________________________________________________________________
time_distributed (TimeDistri (None, 100, 7)            707       
_________________________________________________________________
crf (CRF)                    ((None, 100), (None, 100, 51        
Total params: 10,568,858
Trainable params: 61,156
Non-trainable params: 10,507,702
_________________________________________________________________


In [21]:
bilstmcrf.compile(optimizer=tf.keras.optimizers.Adam())

In [22]:
bilstmcrf.fit(x_train, y_train, batch_size=100, epochs=50, validation_data=(x_val, y_val))

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x7f22f84432e8>

In [23]:
y_pred, _, lens_text, _ = bilstmcrf(x_test)
mask_scores(y_test, y_pred, lens_text)

(0.99201393,
 array([0.99829794, 0.81793478, 0.57446809, 0.93447293, 0.9       ,
        0.88142292, 0.87179487]),
 array([0.99495335, 0.94357367, 0.9       , 0.93447293, 0.91304348,
        0.91393443, 0.90666667]),
 array([0.99662284, 0.87627365, 0.7012987 , 0.93447293, 0.90647482,
        0.89738431, 0.88888889]))

## <span style="color:red">BILSTMCRF-Char</span>

In [25]:
# Word + character BiLSTM-CRF tagger: a per-word character BiLSTM produces a
# 50-d vector that is concatenated with the frozen word embedding before the
# sentence-level BiLSTM and the CRF layer.

# word model
num_vocab = embedding_matrix.shape[0]
dim_embedding = embedding_matrix.shape[1]
max_length = 100
num_tags = 7


# Char model
# The backslash is spelled "\\": the previous "\|" was an invalid escape sequence
# that Python kept verbatim (with a warning). The string value is unchanged.
charlist = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 ~†‡°·-,;.!?:’/\\|_@#$%ˆ&*˜‘+-=()[]{}<>"
char_max_len = 30
char_dim_embedding = 100

crf = CRF(dtype=tf.float32)

###
# Each word is a row of `char_max_len` character ids (was a duplicated literal 30);
# the sentence length is left open.
input_char = Input(shape=(None, char_max_len))

# +2 leaves room for the two reserved ids (0 and 1) below the character ids.
x_char = Embedding(input_dim=len(charlist)+2, output_dim=char_dim_embedding, input_length=char_max_len, trainable=True, mask_zero=True)(input_char)
# The character BiLSTM runs once per word (TimeDistributed over the sentence axis).
x_char = TimeDistributed(Bidirectional(RNN(LSTMCell(units=50))))(x_char)
x_char = TimeDistributed(Dense(units=50))(x_char)
# Word model
input_word = Input(shape=(None,))

# Pretrained word vectors, frozen; id 0 is masked.
x = Embedding(input_dim=num_vocab, output_dim=dim_embedding, input_length=max_length, weights=[embedding_matrix], trainable=False, mask_zero=True)(input_word)

# Concat
# Join word and character features along the feature axis.
x = Concatenate(axis=-1)([x, x_char])


x = Bidirectional(RNN(LSTMCell(units=50, recurrent_dropout=0.3),return_sequences=True))(x)
x = TimeDistributed(Dense(num_tags))(x)
x = crf(x)

output_ = x #  y_pred, x, lens_text, chain_kernel
###
# Input order is (word ids, character ids); fit and inference calls must match it.
base_model = tf.keras.Model([input_word, input_char], output_)
base_model.summary()
bilstmcrf_char = ModelWithCRFLoss(base_model)

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, None, 30)]   0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, None, 30, 100 10200       input_1[0][0]                    
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
time_distributed (TimeDistribut (None, None, 100)    60400       embedding[0][0]                  
_______________________________________________________________________________________

In [26]:
bilstmcrf_char.compile(optimizer=tf.keras.optimizers.Adam())

In [28]:
# Checkpoint callback that writes to `checkpoint_filepath` during training.
# NOTE(review): save_best_only=False, so presumably `monitor` / `mode` do not
# select which checkpoint is kept — confirm against the ModelCheckpoint docs,
# and confirm that 'crf_loss_val' is a metric name the model actually logs.
checkpoint_filepath = "./keras/CallBacks/checkpoint"
checkpoint = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_filepath, mode='min', monitor='crf_loss_val', save_best_only=False)
callbacks_list = [checkpoint]

In [29]:
bilstmcrf_char.fit((x_train, x_train_char), y_train, batch_size=100, epochs=100, validation_data=((x_val, x_val_char), y_val), callbacks=callbacks_list)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100

KeyboardInterrupt: 

In [None]:
y_pred, _, lens_text, _ = bilstmcrf_char((x_test, x_test_char))
mask_scores(y_test, y_pred, lens_text)

In [None]:
bilstmcrf_char.load_weights(checkpoint_filepath)

In [30]:
y_pred, _, lens_text, _ = bilstmcrf_char((x_test, x_test_char))
mask_scores(y_test, y_pred, lens_text)



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



(0.99343276,
 array([0.99736943, 0.89156627, 0.81818182, 0.94252874, 0.91428571,
        0.91286307, 0.85333333]),
 array([0.99690416, 0.92789969, 0.9       , 0.93447293, 0.92753623,
        0.90163934, 0.85333333]),
 array([0.99713674, 0.9093702 , 0.85714286, 0.93848355, 0.92086331,
        0.90721649, 0.85333333]))

In [31]:
bilstmcrf_char.save("./keras/bilstmcrf_char_tmp")

Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
INFO:tensorflow:Assets written to: ./keras/bilstmcrf_char_tmp/assets


In [57]:
#load_model = tf.keras.models.load_model("./keras/bilstmcrf_char")

In [54]:
# NOTE(review): `load_model` is only assigned in the commented-out cell above,
# so this cell raises NameError on a fresh kernel (see the recorded output).
# That cell also reads "./keras/bilstmcrf_char" while the save cell writes
# "./keras/bilstmcrf_char_tmp" — confirm which path is intended.
y_pred, _, lens_text, _ = load_model((x_test, x_test_char))
mask_scores(y_test, y_pred, lens_text)

NameError: name 'load_model' is not defined

In [57]:
a

(0.99449575,
 array([0.99790595, 0.95806452, 0.84375   , 0.92565598, 0.85263158,
        0.8793456 , 0.88489209]),
 array([0.99794734, 0.93838863, 0.85714286, 0.93245228, 0.83505155,
        0.89958159, 0.86013986]),
 array([0.99792664, 0.9481245 , 0.8503937 , 0.9290417 , 0.84375   ,
        0.8893485 , 0.87234043]))

# Kfold