In [1]:
!nvidia-smi

Tue Jun 22 14:08:31 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.119.03   Driver Version: 450.119.03   CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  GeForce RTX 208...  Off  | 00000000:01:00.0 Off |                  N/A |
| 27%   31C    P8    16W / 250W |     26MiB / 11019MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  GeForce RTX 208...  Off  | 00000000:02:00.0 Off |                  N/A |
| 27%   31C    P8     1W / 250W |  10450MiB / 11019MiB |      0%      Default |
|       

In [2]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [3]:
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [4]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [5]:
tf.__version__

'2.3.0'

In [6]:
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [7]:
import numpy as np
from pathlib import Path
import pickle

from collections import Counter
from gensim.models import Word2Vec, FastText

from sklearn.model_selection import train_test_split

## 0. preprocessing

### build the pretrained_word2vec 

In [8]:
# load w2v model
w2v_model_cb = Word2Vec.load("/home/MOFdictionary/libs/word2vec/word2vec_cbow/word2vec_cbow.model")
w2v_model_sg = Word2Vec.load("/home/MOFdictionary/libs/word2vec/word2vec_skipgram/word2vec_skipgram.model")

ft_model_cb = Word2Vec.load("/home/MOFdictionary/libs/word2vec/fasttext_cbow/fasttext_cbow.model")
ft_model_sg = Word2Vec.load("/home/MOFdictionary/libs/word2vec/fasttext_skipgram/fasttext_skipgram.model")

wv_model = w2v_model_sg

In [9]:
len(wv_model.wv.vocab)

105075

In [10]:
dim_embedding = 100
embedding_matrix = tf.concat([tf.zeros((2, dim_embedding)), wv_model.wv.vectors], axis=0)
embedding_matrix.shape

TensorShape([105077, 100])

In [11]:
words = ["<PAD>","<UNK>"] + wv_model.wv.index2word
len(words)

105077

### make vocab

In [12]:
word2id = {}
for i, word in enumerate(words):
    word2id[word] = i
    
id2word = {}
for i, word in enumerate(words):
    id2word[i] = word

In [13]:
#pickle.dump(word2id, open("/home/MOFdictionary/libs/mer/vocab/word2id","wb"))
#pickle.dump(id2word, open("/home/MOFdictionary/libs/mer/vocab/id2word","wb"))

In [14]:
def get_bio_tags(tags):
    
    bio_tags = []
    
    for i, tag in enumerate(tags):
        if i == 0: # B-
            
            if tag == 1: #pre
                bio_tag = 3 #B-pre
                
            elif tag == 2: #etc
                bio_tag = 5 #B-Etc
                
            elif tag == 3: #tar
                bio_tag = 1 #B-tar
                
            else:
                bio_tag = 0
        
        else:
            if tag == 1: #pre
                if tags[i-1] == 1:
                    bio_tag = 4 #I-pre
                else:
                    bio_tag = 3 #B-pre
                    
            elif tag == 2: #etc
                if tags[i-1] == 2:
                    bio_tag = 6 #I-etc
                else:
                    bio_tag = 5 #B-etc
                    
            elif tag == 3: #tar
                if tags[i-1] == 3:
                    bio_tag = 2 # I-tar
                else:
                    bio_tag = 1 # B-atr
                    
            else:
                bio_tag = 0
        
        bio_tags.append(bio_tag)
    
    return bio_tags         

In [15]:
sents = []
map_sents = []
bio_tags = []
for filepath in Path("/home/MOFdictionary/example/Dataset_MER/").glob("*.sav"):
    ners = pickle.load(open(filepath,"rb"))
    for ner in ners: #ners = [[(word,tag),(word,tag),,,],[],,,]
        words, tags = zip(*ner)
        sents.append(words)
        map_sents.append(list(map(lambda x: word2id[x] if x in word2id else 1, words)))
        bio_tags.append(get_bio_tags(tags)) 

In [16]:
max_length = 100
num_tags = 7

x_data = pad_sequences(map_sents, maxlen=max_length, padding="post")
y_data = pad_sequences(bio_tags, maxlen=max_length, padding="post")
#y_data = to_categorical(y_data, num_classes=num_tags) # make one-hot sequence

print(x_data.shape, y_data.shape)

(11173, 100) (11173, 100)


In [17]:
x_train, x_test_val, y_train, y_test_val = train_test_split(x_data, y_data, test_size=0.2, random_state=42)
x_test, x_val, y_test, y_val = train_test_split(x_test_val, y_test_val, test_size=0.5, random_state=42)
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

(8938, 100) (1117, 100) (8938, 100) (1117, 100)


# 2. character Embedding

In [18]:
## making charlist
charlist = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 ~†‡°·-,;.!?:’/\|_@#$%ˆ&*˜‘+-=()[]{}<>"

# char2idx
char2id = {}

char2id["PAD"] = 0
char2id["UNK"] = 1
for i, char in enumerate(charlist):
    char2id[char] = i+2
print(f"# of charlist = {len(char2id)}")

# of charlist = 101


In [19]:
#pickle.dump(char2id, open("/home/MOFdictionary/libs/mer/vocab/char2id","wb"))

In [20]:
def get_pad_map_char_sequences(x_data):
    x_data_char  = np.empty([x_data.shape[0], x_data.shape[1], 30])
    for i, sent in enumerate(x_data):

        map_sent = []

        for word in sent:

            map_word = list(map(lambda x : char2id[x] if x in char2id else 1, id2word[word]))
            if map_word == [100,43, 28, 31,101]: #<PAD> -> 0
                map_word = [0]
            map_sent.append(map_word)

        x_data_char[i] = pad_sequences(map_sent,padding="post",maxlen=30)
    x_data_char.shape
    return x_data_char

In [21]:
x_train_char = get_pad_map_char_sequences(x_train)
x_test_char = get_pad_map_char_sequences(x_test)
x_val_char = get_pad_map_char_sequences(x_val)
x_train_char.shape, x_test_char.shape, x_val_char.shape

((8938, 100, 30), (1117, 100, 30), (1118, 100, 30))

# Keras

In [45]:
from tf2crf import CRF, ModelWithCRFLoss
from tensorflow.keras.layers import Input, Embedding, Bidirectional, RNN, LSTMCell, TimeDistributed, Concatenate, Dense
from copy import deepcopy

In [23]:
def mask_scores(y_true, y_pred, lens_text):
    mask = tf.sequence_mask(lens_text, maxlen=100)
    m = tf.keras.metrics.Accuracy()
    acc = m(y_true, y_pred, sample_weight=mask)
    # precision, recall, f1 for multi-class
    
    masked_y_true = y_true[mask]
    masked_y_pred = y_pred[mask]
    accuracy = accuracy_score(masked_y_true, masked_y_pred)
    precision = precision_score(masked_y_true, masked_y_pred, average=None)
    recall = recall_score(masked_y_true, masked_y_pred, average=None)
    f1 = f1_score(masked_y_true, masked_y_pred, average=None)
    return acc.numpy(), precision, recall, f1

# CONLL scoring

In [24]:
load_model = tf.keras.models.load_model("./keras/bilstmcrf_char")

In [25]:
y_pred, _, lens_text, _ = load_model((x_test, x_test_char))
mask_scores(y_test, y_pred, lens_text)

(0.99343276,
 array([0.99736943, 0.89156627, 0.81818182, 0.94252874, 0.91428571,
        0.91286307, 0.85333333]),
 array([0.99690416, 0.92789969, 0.9       , 0.93447293, 0.92753623,
        0.90163934, 0.85333333]),
 array([0.99713674, 0.9093702 , 0.85714286, 0.93848355, 0.92086331,
        0.90721649, 0.85333333]))

In [214]:
def conll_score(masked_y_true, masked_y_pred, labels):
    
    # true
    cond = tf.logical_or(masked_y_true==labels[0], masked_y_true==labels[1])
    l = tf.where(cond, 1, 0)
    idx_list_true = np.where(l == 1)[0] 
    
    list_true = []
    remove_idx = []
    for i, idx in enumerate(idx_list_true):

        t = (idx, 0)
        
        if i <= len(idx_list_true) -2 and idx_list_true[i+1] - idx_list_true[i] == 1:
            t = (idx, 1)
            remove_idx.append(i+1)

            if i <= len(idx_list_true) -3 and idx_list_true[i+2] - idx_list_true[i] == 2:
                t = (idx, 2)
                remove_idx.append(i+2)
                
                if i <= len(idx_list_true) -4 and idx_list_true[i+2] - idx_list_true[i] == 3:
                    t = (idx, 3)
                    remove_idx.append(i+3)


        list_true.append(t)    
    final_true = np.delete(np.array(list_true), list(set(remove_idx)), axis=0)
    
    # pred
    cond = tf.logical_or(masked_y_pred==labels[0], masked_y_pred==labels[1])
    l = tf.where(cond, 1, 0)
    idx_list_pred = np.where(l == 1)[0]
    
    list_pred = []
    remove_idx = []
    for i, idx in enumerate(idx_list_pred):

        t = (idx, 0)

        if i <= len(idx_list_pred) -2 and idx_list_pred[i+1] - idx_list_pred[i] == 1:
            t = (idx, 1)
            remove_idx.append(i+1)

            if i <= len(idx_list_pred) -3 and idx_list_pred[i+2] - idx_list_pred[i] == 2:
                t = (idx, 2)
                remove_idx.append(i+2)
                
                if i <= len(idx_list_pred) -4 and idx_list_pred[i+2] - idx_list_pred[i] == 3:
                    t = (idx, 3)
                    remove_idx.append(i+3)

        list_pred.append(t)    
    final_pred = np.delete(np.array(list_pred), list(set(remove_idx)), axis=0)
    
    # precision
    ans = 0
    for pred in final_pred:
        for true in final_true:
            if all(true == pred):
                ans += 1
    precision = ans/len(final_pred)
    # recall
    ans = 0
    for true in final_true:
        for pred in final_pred:
            if all(true == pred):
                ans += 1
    recall = ans / len(final_true)
    
    f1 = 2 * precision * recall / (precision + recall)
    return precision, recall, f1

In [215]:
conll_score(masked_y_true, masked_y_pred, labels=[1,2])

(0.8905775075987842, 0.9272151898734177, 0.9085271317829458)

In [216]:
conll_score(masked_y_true, masked_y_pred, labels=[3,4])

(0.9337175792507204, 0.9230769230769231, 0.9283667621776505)

In [217]:
conll_score(masked_y_true, masked_y_pred, labels=[5,6])

(0.9053497942386831, 0.9016393442622951, 0.9034907597535935)