# 使用GRU_CRF训练数据

In [3]:
# 使用Google的colab调用文件时需要添加路径，将文件放入指定的位置才能识别到
# from google.colab import drive
# drive.mount('/content/drive') 

# import sys
# sys.path.append('/content/drive/MyDrive/data/')

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import gc
from gru_crf import BiGRUCRF
# 使用GPU
gpus = tf.config.experimental.list_physical_devices(device_type="GPU")
tf.config.experimental.set_visible_devices(devices=gpus[0], device_type="GPU")

 The versions of TensorFlow you are currently using is 2.5.0 and is not supported. 
Some things might work, some things might not.
If you were to encounter a bug, do not file an issue.
If you want to make sure you're using a tested and supported configuration, either change the TensorFlow version or the TensorFlow Addons's version. 
You can find the compatibility matrix in TensorFlow Addon's readme:
https://github.com/tensorflow/addons


# 数据准备

In [2]:
# 加载数据
X = []
feaX = []
y = []

def data_read(line:str):
    lx, lfea, ly = eval(line)
    X.append(lx)
    feaX.append(lfea)
    y.append(ly)

In [3]:
%%time
# 读取数据并将提取出数据中的原始文本、文本特征及其对应的标签
# readline方法
# with open("label_old_only_work_project.txt", encoding='utf-8') as f:
#     line = f.readline()
#     while line:
#         data_read(line)
#         line = f.readline()
        
# readlines方法
with open("label_old_only_work_project.txt", encoding='utf-8') as f:
    for line in f.readlines():
        data_read(line)

# X[0], feaX[0], y[0]
len(feaX), len(y)
# X[:3]

CPU times: total: 3.66 s
Wall time: 3.63 s


(905, 905)

In [4]:
# # 数据提取
# X = [[
#         X[i][j] for j in range(len(y[i])) if y[i][j] != 'O'
#     ] for i in range(len(y)) if 'project-B' in y[i] or 'project-I' in y[i] or 'work-B' in y[i] or 'work-I' in y[i]
# ]

# feaX = [[
#         feaX[i][j] for j in range(len(y[i])) if y[i][j] != 'O'
#     ] for i in range(len(y)) if 'project-B' in y[i] or 'project-I' in y[i] or 'work-B' in y[i] or 'work-I' in y[i]
# ]

# y = [[
#         y[i][j] for j in range(len(y[i])) if y[i][j] != 'O'
#     ] for i in range(len(y)) if 'project-B' in y[i] or 'project-I' in y[i] or 'work-B' in y[i] or 'work-I' in y[i]
# ]

# # 验证数据提取正确
# X_len = np.array([len(i) for i in X])
# feaX_len = np.array([len(i) for i in feaX])
# y_len = np.array([len(i) for i in y])
# print('验证数据提取正确：', max(X_len - feaX_len)==0, max(feaX_len - y_len)==0, max(X_len - y_len)==0)
# print('提取后数据数量：', len(feaX), len(y))

In [5]:
%%time
#X = [[
#        X[i][j] for j in range(len(y[i])) if y[i][j] != 'O'
#    ] for i in range(len(y)) if 'project-B' in y[i] or 'work-B' in y[i]
#]
#
#feaX = [[
#        feaX[i][j] for j in range(len(y[i])) if y[i][j] != 'O'
#    ] for i in range(len(y)) if 'project-B' in y[i] or 'work-B' in y[i]
#]
#
#y = [[
#        y[i][j] for j in range(len(y[i])) if y[i][j] != 'O'
#    ] for i in range(len(y)) if 'project-B' in y[i] or 'work-B' in y[i] 
#]

X_new = []
feaX_new = []
y_new = []
for i in range(len(y)):
    if 'project-B' in y[i] or 'work-B' in y[i]:
        X_new.append([X[i][j] for j in range(len(y[i])) if y[i][j] != 'O'])
        feaX_new.append([feaX[i][j] for j in range(len(y[i])) if y[i][j] != 'O'])
        y_new.append([y[i][j] for j in range(len(y[i])) if y[i][j] != 'O'])

CPU times: total: 62.5 ms
Wall time: 52 ms


In [6]:
type(feaX_new)

list

In [7]:
# # get target encode dictionary
# tag2id = set()
# for i in y:
#     tag2id.update(i)
# tag2id = list(tag2id)
# tag2id.sort()
# tag2id = {k:v for v, k in enumerate(tag2id)}
# # 添加特征字符标签
# # tag2id['<start>'] = len(tag2id)
# tag2id['<PAD>'] = len(tag2id)

# # label encoding
# y_new = [[tag2id[token] for token in sequence] for sequence in y]

# tag2id #, y[0]

In [4]:
# 构建数据标签字典
tag2id = set()
for i in y:
    tag2id.update(i)
tag2id = list(tag2id)
tag2id.sort()
tag2id = {k:v+1 for v, k in enumerate(tag2id)}
# 添加特征字符标签
# tag2id['<start>'] = len(tag2id)
tag2id['<PAD>'] = 0

# label encoding
y_new = [[tag2id[token] for token in sequence] for sequence in y]

tag2id #, y[0]

{'O': 1, 'project-B': 2, 'project-I': 3, 'work-B': 4, 'work-I': 5, '<PAD>': 0}

In [9]:
# # seqence padding
# max_len = max([len(i) for i in y])
# fea_dim = len(feaX[0][0])

# feaX = np.array(
#     [i + [[0] * fea_dim] * (max_len - len(i)) for i in feaX]
# )
# y = np.array(
#     [i + [tag2id['<PAD>']] * (max_len - len(i)) for i in y]
# )

# feaX.shape, y.shape

In [5]:
# seqence padding
max_len = max([len(i) for i in y])
fea_dim = len(feaX[0][0])

feaX = np.array(
    [i + [[0] * fea_dim] * (max_len - len(i)) for i in feaX]
)
y = np.array(
    [i + [tag2id['<PAD>']] * (max_len - len(i)) for i in y]
)

feaX.shape, y.shape

((905, 1123, 24), (905, 1123))

In [8]:
from gru_crf import *
model = BiGRUCRF(len(tag2id) - 1)
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
              loss=tf.keras.losses.CategoricalCrossentropy(),
              metrics=[tf.keras.metrics.AUC(),
                       tf.keras.metrics.CategoricalAccuracy(),
                       tf.keras.metrics.Precision(),
                       tf.keras.metrics.Recall()])
model.fit(feaX_new, 
          y_new,
          batch_size = 40,
          epochs = 300,
          validation_split = 0.2,
          validation_freq = 20
         )
model.summary()

Epoch 1/300


OperatorNotAllowedInGraphError: in user code:

    C:\Users\86183\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\keras\engine\training.py:855 train_function  *
        return step_function(self, iterator)
    D:\common_tools\common_tools\nlp\model\gru_crf_model\gru_crf.py:34 call  *
        X, mask = inputs
    C:\Users\86183\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\framework\ops.py:520 __iter__
        self._disallow_iteration()
    C:\Users\86183\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\framework\ops.py:513 _disallow_iteration
        self._disallow_when_autograph_enabled("iterating over `tf.Tensor`")
    C:\Users\86183\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\framework\ops.py:489 _disallow_when_autograph_enabled
        raise errors.OperatorNotAllowedInGraphError(

    OperatorNotAllowedInGraphError: iterating over `tf.Tensor` is not allowed: AutoGraph did convert this function. This might indicate you are trying to use an unsupported feature.


In [11]:
# # 数据划分
# feaX_train, feaX_valid, y_train, y_valid = train_test_split(
#     feaX, y, test_size=0.2, random_state=42
# )

# feaX_train.shape, y_train.shape, feaX_valid.shape, y_valid.shape

((712, 612, 24), (712, 612), (179, 612, 24), (179, 612))

In [6]:
# 数据划分
feaX_train, feaX_valid, y_train, y_valid = train_test_split(feaX, 
                                                            y, 
                                                            test_size=0.2, 
                                                            random_state=39)

feaX_train.shape, y_train.shape, feaX_valid.shape, y_valid.shape

((724, 1123, 24), (724, 1123), (181, 1123, 24), (181, 1123))

In [7]:
# 数据集生成
def data_generater(X, y, batch_size, tag2id, is_mask=True):
    sample_num = X.shape[0]
    data = []
    if is_mask:
        for i in range(int(sample_num/batch_size) + 1):
            if i * batch_size < sample_num:
                data.append([
                    tf.constant(X[i * batch_size : (i + 1) * batch_size])
                    , tf.constant(y[i * batch_size : (i + 1) * batch_size])
                    , tf.constant(np.where(y[i * batch_size : (i + 1) * batch_size] == tag2id['<PAD>'], 0., 1.))
                ])
    else:
        for i in range(int(sample_num/batch_size) + 1):
            if i * batch_size < sample_num:
                data.append([
                    tf.constant(X[i * batch_size : (i + 1) * batch_size])
                    , tf.constant(y[i * batch_size : (i + 1) * batch_size])
                ])
    return data


batch_size = 40
train_data = data_generater(feaX_train, y_train, batch_size, tag2id)
valid_data = [
    feaX_valid
    , y_valid
    , tf.constant(np.where(y_valid == tag2id['<PAD>'], 0., 1.))
]

# len(train_data), train_data[0], train_data[-1]
np.shape(train_data)

  , tf.constant(np.where(y[i * batch_size : (i + 1) * batch_size] == tag2id['<PAD>'], 0., 1.))
  , tf.constant(np.where(y_valid == tag2id['<PAD>'], 0., 1.))
  result = asarray(a).shape


(19, 3)

In [8]:
from gru_crf import *
model = BiGRUCRF(len(tag2id) - 1)
# model.load_weights('./checkpoints/best_model')
model.train(train_data, valid_data, epochs=70, verbose=0)

step:   0%|                                                                                     | 0/19 [00:00<?, ?it/s]


InvalidArgumentError: slice index 1 of dimension 0 out of bounds. [Op:StridedSlice] name: bi_grucrf/bidirectional/forward_gru/strided_slice/

In [50]:
# feaX_new = np.array([i for i in feaX_new])
# y_new = np.array([i for i in y_new])


Epoch 1/300


OperatorNotAllowedInGraphError: in user code:

    C:\Users\86183\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\keras\engine\training.py:855 train_function  *
        return step_function(self, iterator)
    D:\common_tools\common_tools\nlp\model\gru_crf_model\gru_crf.py:34 call  *
        X, mask = inputs
    C:\Users\86183\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\framework\ops.py:520 __iter__
        self._disallow_iteration()
    C:\Users\86183\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\framework\ops.py:513 _disallow_iteration
        self._disallow_when_autograph_enabled("iterating over `tf.Tensor`")
    C:\Users\86183\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\framework\ops.py:489 _disallow_when_autograph_enabled
        raise errors.OperatorNotAllowedInGraphError(

    OperatorNotAllowedInGraphError: iterating over `tf.Tensor` is not allowed: AutoGraph did convert this function. This might indicate you are trying to use an unsupported feature.
