# 使用GRU_CRF训练数据

In [3]:
# 使用Google的colab调用文件时需要添加路径，将文件放入指定的位置才能识别到
# from google.colab import drive
# drive.mount('/content/drive') 

# import sys
# sys.path.append('/content/drive/MyDrive/data/')

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import gc
from gru_crf import BiGRUCRF
# Use the GPU: restrict TensorFlow to the first physical GPU when one exists.
# Indexing gpus[0] unconditionally raises IndexError on a machine without a
# GPU, so the call is guarded and TensorFlow keeps its default devices then.
gpus = tf.config.experimental.list_physical_devices(device_type="GPU")
if gpus:
    tf.config.experimental.set_visible_devices(devices=gpus[0], device_type="GPU")
else:
    print("No GPU detected; TensorFlow will use its default devices.")

 The versions of TensorFlow you are currently using is 2.5.0 and is not supported. 
Some things might work, some things might not.
If you were to encounter a bug, do not file an issue.
If you want to make sure you're using a tested and supported configuration, either change the TensorFlow version or the TensorFlow Addons's version. 
You can find the compatibility matrix in TensorFlow Addon's readme:
https://github.com/tensorflow/addons


# 数据准备

In [2]:
# Load data: module-level accumulators filled by data_read(), one entry per
# parsed line of the input file.
X = []      # raw text of each sample
feaX = []   # text features of each sample
y = []      # labels of each sample


def data_read(line: str) -> None:
    """Parse one line of the labelled-data file and append it to X, feaX and y.

    Each line must be the Python-literal representation of a 3-item sequence
    ``(text, features, labels)``.

    The line is parsed with ``ast.literal_eval`` rather than ``eval`` so that
    the contents of the data file can never execute arbitrary code; only
    literals (strings, numbers, lists, tuples, dicts, ...) are accepted.

    Args:
        line: one line read from the data file (a trailing newline is fine).

    Raises:
        ValueError: if the line is not a plain literal or does not unpack
            into exactly three items.
        SyntaxError: if the line is not valid Python literal syntax.
    """
    import ast  # local import keeps this cell self-contained

    lx, lfea, ly = ast.literal_eval(line)
    X.append(lx)
    feaX.append(lfea)
    y.append(ly)

In [3]:
%%time
# 读取数据并将提取出数据中的原始文本、文本特征及其对应的标签
# readline方法
# with open("label_old_only_work_project.txt", encoding='utf-8') as f:
#     line = f.readline()
#     while line:
#         data_read(line)
#         line = f.readline()
        
# Stream the file line by line instead of materialising every line with
# readlines(), and skip blank lines (e.g. a trailing empty line), which
# cannot be parsed as a record.
with open("label_old_only_work_project.txt", encoding='utf-8') as f:
    for line in f:
        if line.strip():
            data_read(line)

# X[0], feaX[0], y[0]
# Number of loaded feature sequences and label sequences (should match).
len(feaX), len(y)
# X[:3]

CPU times: total: 3.67 s
Wall time: 3.7 s


(905, 905)

In [4]:
# # 数据提取
# X = [[
#         X[i][j] for j in range(len(y[i])) if y[i][j] != 'O'
#     ] for i in range(len(y)) if 'project-B' in y[i] or 'project-I' in y[i] or 'work-B' in y[i] or 'work-I' in y[i]
# ]

# feaX = [[
#         feaX[i][j] for j in range(len(y[i])) if y[i][j] != 'O'
#     ] for i in range(len(y)) if 'project-B' in y[i] or 'project-I' in y[i] or 'work-B' in y[i] or 'work-I' in y[i]
# ]

# y = [[
#         y[i][j] for j in range(len(y[i])) if y[i][j] != 'O'
#     ] for i in range(len(y)) if 'project-B' in y[i] or 'project-I' in y[i] or 'work-B' in y[i] or 'work-I' in y[i]
# ]

# # 验证数据提取正确
# X_len = np.array([len(i) for i in X])
# feaX_len = np.array([len(i) for i in feaX])
# y_len = np.array([len(i) for i in y])
# print('验证数据提取正确：', max(X_len - feaX_len)==0, max(feaX_len - y_len)==0, max(X_len - y_len)==0)
# print('提取后数据数量：', len(feaX), len(y))

In [5]:
%%time
#X = [[
#        X[i][j] for j in range(len(y[i])) if y[i][j] != 'O'
#    ] for i in range(len(y)) if 'project-B' in y[i] or 'work-B' in y[i]
#]
#
#feaX = [[
#        feaX[i][j] for j in range(len(y[i])) if y[i][j] != 'O'
#    ] for i in range(len(y)) if 'project-B' in y[i] or 'work-B' in y[i]
#]
#
#y = [[
#        y[i][j] for j in range(len(y[i])) if y[i][j] != 'O'
#    ] for i in range(len(y)) if 'project-B' in y[i] or 'work-B' in y[i] 
#]

# Keep only the samples whose labels contain an entity start tag
# ('project-B' or 'work-B'); inside those samples, drop every position
# labelled 'O' from the text, the features and the labels alike.
X_new, feaX_new, y_new = [], [], []
for idx, tags in enumerate(y):
    if not ('project-B' in tags or 'work-B' in tags):
        continue
    # Positions to keep are computed once and reused for all three lists.
    kept = [pos for pos, tag in enumerate(tags) if tag != 'O']
    X_new.append([X[idx][pos] for pos in kept])
    feaX_new.append([feaX[idx][pos] for pos in kept])
    y_new.append([tags[pos] for pos in kept])

CPU times: total: 62.5 ms
Wall time: 52 ms


In [6]:
# Inspect the container type of the filtered features (the recorded output
# shows a plain list at this point, i.e. before padding into an array).
type(feaX_new)

list

In [7]:
# # get target encode dictionary
# tag2id = set()
# for i in y:
#     tag2id.update(i)
# tag2id = list(tag2id)
# tag2id.sort()
# tag2id = {k:v for v, k in enumerate(tag2id)}
# # 添加特征字符标签
# # tag2id['<start>'] = len(tag2id)
# tag2id['<PAD>'] = len(tag2id)

# # label encoding
# y_new = [[tag2id[token] for token in sequence] for sequence in y]

# tag2id #, y[0]

In [8]:
# Build the label dictionary: every distinct tag found in y_new, sorted,
# mapped to consecutive integer ids starting at 0.
tag2id = {
    tag: idx
    for idx, tag in enumerate(sorted({token for sequence in y_new for token in sequence}))
}
# Append the special padding tag as the last id.
# tag2id['<start>'] = len(tag2id)
tag2id['<PAD>'] = len(tag2id)

# Label encoding: replace every tag string by its integer id.
# NOTE: y_new is overwritten here, so this cell is meant to run exactly once
# after the filtering cell.
y_new = [[tag2id[token] for token in sequence] for sequence in y_new]

tag2id  # , y[0]

{'project-B': 0, 'project-I': 1, 'work-B': 2, 'work-I': 3, '<PAD>': 4}

In [9]:
# # sequence padding
# max_len = max([len(i) for i in y])
# fea_dim = len(feaX[0][0])

# feaX = np.array(
#     [i + [[0] * fea_dim] * (max_len - len(i)) for i in feaX]
# )
# y = np.array(
#     [i + [tag2id['<PAD>']] * (max_len - len(i)) for i in y]
# )

# feaX.shape, y.shape

In [10]:
# Sequence padding: right-pad every sample to the length of the longest
# label sequence so the data can be stacked into dense arrays.
max_len = max(len(sequence) for sequence in y_new)
fea_dim = len(feaX_new[0][0])  # feature-vector width, read from the first token

# Features are padded with all-zero vectors of width fea_dim.
feaX_new = np.array([
    sequence + [[0] * fea_dim for _ in range(max_len - len(sequence))]
    for sequence in feaX_new
])
# Labels are padded with the id of the '<PAD>' tag.
y_new = np.array([
    sequence + [tag2id['<PAD>'] for _ in range(max_len - len(sequence))]
    for sequence in y_new
])

feaX_new.shape, y_new.shape

((891, 612, 24), (891, 612))

In [11]:
# NOTE(review): the recorded output of this cell is an
# OperatorNotAllowedInGraphError raised at `X, mask = inputs` inside
# BiGRUCRF.call (gru_crf.py:34): fit() is given only the feature array, while
# call() unpacks its input into two values. The run that produced per-epoch
# metrics in this notebook goes through `model.train(...)` instead.
from gru_crf import *
# The model is built with len(tag2id) - 1, i.e. the tag count without '<PAD>'.
model = BiGRUCRF(len(tag2id) - 1)
# NOTE(review): y_new holds integer tag ids, whereas CategoricalCrossentropy
# is normally fed one-hot targets — confirm the loss matches the model output.
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
              loss=tf.keras.losses.CategoricalCrossentropy(),
              metrics=[tf.keras.metrics.AUC(),
                       tf.keras.metrics.CategoricalAccuracy(),
                       tf.keras.metrics.Precision(),
                       tf.keras.metrics.Recall()])
# Train on the padded arrays; the last 20% of the samples are held out for
# validation, which is evaluated every 20 epochs.
model.fit(feaX_new, 
          y_new,
          batch_size = 40,
          epochs = 300,
          validation_split = 0.2,
          validation_freq = 20
         )
model.summary()

Epoch 1/300


OperatorNotAllowedInGraphError: in user code:

    C:\Users\86183\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\keras\engine\training.py:855 train_function  *
        return step_function(self, iterator)
    D:\common_tools\common_tools\nlp\model\gru_crf_model\gru_crf.py:34 call  *
        X, mask = inputs
    C:\Users\86183\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\framework\ops.py:520 __iter__
        self._disallow_iteration()
    C:\Users\86183\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\framework\ops.py:513 _disallow_iteration
        self._disallow_when_autograph_enabled("iterating over `tf.Tensor`")
    C:\Users\86183\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\framework\ops.py:489 _disallow_when_autograph_enabled
        raise errors.OperatorNotAllowedInGraphError(

    OperatorNotAllowedInGraphError: iterating over `tf.Tensor` is not allowed: AutoGraph did convert this function. This might indicate you are trying to use an unsupported feature.


In [11]:
# # 数据划分
# feaX_train, feaX_valid, y_train, y_valid = train_test_split(
#     feaX, y, test_size=0.2, random_state=42
# )

# feaX_train.shape, y_train.shape, feaX_valid.shape, y_valid.shape

((712, 612, 24), (712, 612), (179, 612, 24), (179, 612))

In [22]:
# Train/validation split: hold out 20% of the padded samples, using a fixed
# random_state so the split is reproducible.
feaX_train, feaX_valid, y_train, y_valid = train_test_split(
    feaX_new, y_new, test_size=0.2, random_state=39
)

feaX_train.shape, y_train.shape, feaX_valid.shape, y_valid.shape

((712, 612, 24), (712, 612), (179, 612, 24), (179, 612))

In [54]:
# Dataset generation
def data_generater(X, y, batch_size, tag2id, is_mask=True):
    """Slice X and y into consecutive mini-batches of TensorFlow constants.

    Args:
        X: array sliced along its first axis; must expose ``.shape[0]``.
        y: label array aligned with X along the first axis. It is compared
            element-wise with the padding id, so it should be a NumPy array.
        batch_size: number of samples per batch; the final batch is smaller
            when the sample count is not a multiple of batch_size.
        tag2id: tag-to-id mapping; only ``tag2id['<PAD>']`` is read, and only
            when is_mask is True.
        is_mask: when True, every batch also carries a mask that is 0.0 where
            y equals the padding id and 1.0 everywhere else.

    Returns:
        A list of batches in sample order. Each batch is
        ``[X_batch, y_batch, mask]`` when is_mask is True, otherwise
        ``[X_batch, y_batch]``. No empty batch is ever produced.

    Raises:
        ValueError: if batch_size is not a positive number.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size!r}")

    sample_num = X.shape[0]
    data = []
    # Stepping by batch_size visits exactly the non-empty slices, so no
    # separate bounds check is needed.
    for start in range(0, sample_num, batch_size):
        stop = start + batch_size
        y_batch = y[start:stop]
        batch = [tf.constant(X[start:stop]), tf.constant(y_batch)]
        if is_mask:
            batch.append(
                tf.constant(np.where(y_batch == tag2id['<PAD>'], 0., 1.))
            )
        data.append(batch)
    return data


batch_size = 40
# Training data: list of [features, labels, mask] mini-batches.
train_data = data_generater(feaX_train, y_train, batch_size, tag2id)
# Validation data is kept in one piece: [features, labels, mask], where the
# mask is 0.0 at '<PAD>' positions and 1.0 elsewhere.
valid_data = [
    feaX_valid,
    y_valid,
    tf.constant(np.where(y_valid == tag2id['<PAD>'], 0., 1.)),
]

# (number of batches, items per batch). Plain len() is used instead of
# np.shape(train_data): the batches are tensors of unequal size, so turning
# the nested list into an ndarray triggers NumPy's ragged-sequence warning.
len(train_data), (len(train_data[0]) if train_data else 0)

  result = asarray(a).shape


(18, 3)

In [None]:
# BiGRUCRF is already imported explicitly in the notebook's import cell, so
# the wildcard `from gru_crf import *` (which hides where names come from)
# is not repeated here.
# The model is built with len(tag2id) - 1, i.e. the tag count without '<PAD>'.
model = BiGRUCRF(len(tag2id) - 1)
# model.load_weights('./checkpoints/best_model')
# Custom training loop of the model: mini-batches for training, the held-out
# split for the per-epoch validation metrics.
model.train(train_data, valid_data, epochs=1000, verbose=0)

step: 100%|████████████████████████████████████████████████████████████████████████████| 18/18 [00:57<00:00,  3.19s/it]


epoch  0   tra_loss: 62.274944   val_loss: 53.83756   val_accuracy: 0.64296913   val_precision: 0.17974022   val_recall: 0.2507615   val_F1score: 0.2093925566363784 



step: 100%|████████████████████████████████████████████████████████████████████████████| 18/18 [00:54<00:00,  3.05s/it]


epoch  1   tra_loss: 51.34507   val_loss: 44.641464   val_accuracy: 0.7042248   val_precision: 0.391568   val_recall: 0.37051392   val_F1score: 0.38075012917492235 



step: 100%|████████████████████████████████████████████████████████████████████████████| 18/18 [00:54<00:00,  3.04s/it]


epoch  2   tra_loss: 42.297287   val_loss: 36.497276   val_accuracy: 0.7353556   val_precision: 0.38637492   val_recall: 0.42012668   val_F1score: 0.40254453524305295 



step: 100%|████████████████████████████████████████████████████████████████████████████| 18/18 [00:54<00:00,  3.01s/it]


epoch  3   tra_loss: 36.006386   val_loss: 32.449966   val_accuracy: 0.75344884   val_precision: 0.8438911   val_recall: 0.48232883   val_F1score: 0.6138242927657531 



step: 100%|████████████████████████████████████████████████████████████████████████████| 18/18 [00:54<00:00,  3.02s/it]


epoch  4   tra_loss: 32.099884   val_loss: 27.772625   val_accuracy: 0.7700423   val_precision: 0.8410914   val_recall: 0.55495906   val_F1score: 0.6687026067245783 



step: 100%|████████████████████████████████████████████████████████████████████████████| 18/18 [00:55<00:00,  3.08s/it]


epoch  5   tra_loss: 28.967339   val_loss: 25.524666   val_accuracy: 0.7841485   val_precision: 0.8448491   val_recall: 0.63619757   val_F1score: 0.7258258048293311 



step: 100%|████████████████████████████████████████████████████████████████████████████| 18/18 [00:54<00:00,  3.05s/it]


epoch  6   tra_loss: 26.753773   val_loss: 24.080782   val_accuracy: 0.7952321   val_precision: 0.82785743   val_recall: 0.7110343   val_F1score: 0.7650115919267042 



step: 100%|████████████████████████████████████████████████████████████████████████████| 18/18 [00:55<00:00,  3.06s/it]


epoch  7   tra_loss: 25.26316   val_loss: 22.468882   val_accuracy: 0.80474997   val_precision: 0.82931817   val_recall: 0.7545549   val_F1score: 0.7901720593558325 



step: 100%|████████████████████████████████████████████████████████████████████████████| 18/18 [00:55<00:00,  3.08s/it]


epoch  8   tra_loss: 23.761389   val_loss: 21.366882   val_accuracy: 0.81295395   val_precision: 0.8386726   val_recall: 0.7823441   val_F1score: 0.809529635785032 



step: 100%|████████████████████████████████████████████████████████████████████████████| 18/18 [00:55<00:00,  3.09s/it]


epoch  9   tra_loss: 22.376078   val_loss: 19.868315   val_accuracy: 0.8199091   val_precision: 0.8456394   val_recall: 0.79831225   val_F1score: 0.8212945845094209 



step: 100%|████████████████████████████████████████████████████████████████████████████| 18/18 [00:55<00:00,  3.07s/it]


epoch  10   tra_loss: 20.695417   val_loss: 18.677567   val_accuracy: 0.82651883   val_precision: 0.8574819   val_recall: 0.824233   val_F1score: 0.8405287669844161 



step: 100%|████████████████████████████████████████████████████████████████████████████| 18/18 [00:55<00:00,  3.08s/it]


epoch  11   tra_loss: 19.60167   val_loss: 17.22255   val_accuracy: 0.832445   val_precision: 0.87176526   val_recall: 0.83716786   val_F1score: 0.8541163448881256 



step: 100%|████████████████████████████████████████████████████████████████████████████| 18/18 [00:55<00:00,  3.11s/it]


epoch  12   tra_loss: 17.936087   val_loss: 15.935725   val_accuracy: 0.83821315   val_precision: 0.8890503   val_recall: 0.85396254   val_F1score: 0.8711532254039605 



step: 100%|████████████████████████████████████████████████████████████████████████████| 18/18 [00:56<00:00,  3.12s/it]


epoch  13   tra_loss: 16.709751   val_loss: 14.967337   val_accuracy: 0.843146   val_precision: 0.8969526   val_recall: 0.85916793   val_F1score: 0.8776537934609059 



step: 100%|████████████████████████████████████████████████████████████████████████████| 18/18 [00:55<00:00,  3.09s/it]


epoch  14   tra_loss: 15.454198   val_loss: 13.88638   val_accuracy: 0.8480274   val_precision: 0.9031533   val_recall: 0.87367517   val_F1score: 0.8881696784684668 



step: 100%|████████████████████████████████████████████████████████████████████████████| 18/18 [00:55<00:00,  3.10s/it]


epoch  15   tra_loss: 14.595917   val_loss: 12.216866   val_accuracy: 0.8527542   val_precision: 0.9209064   val_recall: 0.8691614   val_F1score: 0.8942860068111811 



step: 100%|████████████████████████████████████████████████████████████████████████████| 18/18 [00:55<00:00,  3.10s/it]


epoch  16   tra_loss: 13.203693   val_loss: 10.891265   val_accuracy: 0.8576718   val_precision: 0.92712116   val_recall: 0.8892587   val_F1score: 0.9077953302302064 



step: 100%|████████████████████████████████████████████████████████████████████████████| 18/18 [00:55<00:00,  3.09s/it]


epoch  17   tra_loss: 12.049265   val_loss: 9.6383095   val_accuracy: 0.8625133   val_precision: 0.93149006   val_recall: 0.90432173   val_F1score: 0.9177048338769275 



step: 100%|████████████████████████████████████████████████████████████████████████████| 18/18 [00:55<00:00,  3.11s/it]


epoch  18   tra_loss: 11.051776   val_loss: 9.048557   val_accuracy: 0.8671298   val_precision: 0.9352381   val_recall: 0.9182775   val_F1score: 0.9266802132996589 



step: 100%|████████████████████████████████████████████████████████████████████████████| 18/18 [00:55<00:00,  3.10s/it]


epoch  19   tra_loss: 10.408105   val_loss: 9.017732   val_accuracy: 0.87131995   val_precision: 0.9345613   val_recall: 0.92800134   val_F1score: 0.9312697760583517 



step:  94%|███████████████████████████████████████████████████████████████████████▊    | 17/18 [00:51<00:03,  3.02s/it]

In [50]:
# feaX_new = np.array([i for i in feaX_new])
# y_new = np.array([i for i in y_new])


Epoch 1/300


OperatorNotAllowedInGraphError: in user code:

    C:\Users\86183\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\keras\engine\training.py:855 train_function  *
        return step_function(self, iterator)
    D:\common_tools\common_tools\nlp\model\gru_crf_model\gru_crf.py:34 call  *
        X, mask = inputs
    C:\Users\86183\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\framework\ops.py:520 __iter__
        self._disallow_iteration()
    C:\Users\86183\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\framework\ops.py:513 _disallow_iteration
        self._disallow_when_autograph_enabled("iterating over `tf.Tensor`")
    C:\Users\86183\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\framework\ops.py:489 _disallow_when_autograph_enabled
        raise errors.OperatorNotAllowedInGraphError(

    OperatorNotAllowedInGraphError: iterating over `tf.Tensor` is not allowed: AutoGraph did convert this function. This might indicate you are trying to use an unsupported feature.
