# 使用GRU_CRF训练数据

In [1]:
import sys
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import gc
from gru_crf import LinerCRF

In [2]:
# 使用Google的colab调用文件时需要添加路径，将文件放入指定的位置才能识别到
# from google.colab import drive
# drive.mount('/content/drive') 
# sys.path.append('/content/drive/MyDrive/data/')

In [3]:
# 使用GPU
# gpus = tf.config.experimental.list_physical_devices(device_type="GPU")
# tf.config.experimental.set_visible_devices(devices=gpus[0], device_type="GPU")

# 数据准备

In [2]:
# 加载数据
X = []
feaX = []
y = []

def data_read(line:str):
    lx, lfea, ly = eval(line)
    X.append(lx)
    feaX.append(lfea)
    y.append(ly)

In [3]:
%%time
# 读取数据并将提取出数据中的原始文本、文本特征及其对应的标签
# readline方法
# with open("label_old_only_work_project.txt", encoding='utf-8') as f:
#     line = f.readline()
#     while line:
#         data_read(line)
#         line = f.readline()
        
# readlines方法
with open("label_old_only_work_project.txt", encoding='utf-8') as f:
    for line in f.readlines():
        data_read(line)

# X[0], feaX[0], y[0]
len(feaX), len(y)
# X[:3]

CPU times: user 3.74 s, sys: 131 ms, total: 3.88 s
Wall time: 3.89 s


(905, 905)

In [4]:
# 数据提取
X = [[
        X[i][j] for j in range(len(y[i])) if y[i][j] != 'O'
    ] for i in range(len(y)) if 'project-B' in y[i] or 'project-I' in y[i] or 'work-B' in y[i] or 'work-I' in y[i]
]

feaX = [[
        feaX[i][j] for j in range(len(y[i])) if y[i][j] != 'O'
    ] for i in range(len(y)) if 'project-B' in y[i] or 'project-I' in y[i] or 'work-B' in y[i] or 'work-I' in y[i]
]

y = [[
        y[i][j] for j in range(len(y[i])) if y[i][j] != 'O'
    ] for i in range(len(y)) if 'project-B' in y[i] or 'project-I' in y[i] or 'work-B' in y[i] or 'work-I' in y[i]
]

# 验证数据提取正确
X_len = np.array([len(i) for i in X])
feaX_len = np.array([len(i) for i in feaX])
y_len = np.array([len(i) for i in y])
print('验证数据提取正确：', max(X_len - feaX_len)==0, max(feaX_len - y_len)==0, max(X_len - y_len)==0)
print('提取后数据数量：', len(feaX), len(y))


验证数据提取正确： True True True
提取后数据数量： 891 891


In [6]:
%%time
#X = [[
#        X[i][j] for j in range(len(y[i])) if y[i][j] != 'O'
#    ] for i in range(len(y)) if 'project-B' in y[i] or 'work-B' in y[i]
#]
#
#feaX = [[
#        feaX[i][j] for j in range(len(y[i])) if y[i][j] != 'O'
#    ] for i in range(len(y)) if 'project-B' in y[i] or 'work-B' in y[i]
#]
#
#y = [[
#        y[i][j] for j in range(len(y[i])) if y[i][j] != 'O'
#    ] for i in range(len(y)) if 'project-B' in y[i] or 'work-B' in y[i] 
#]

X_new = []
feaX_new = []
y_new = []
for i in range(len(y)):
    if 'project-B' in y[i] or 'work-B' in y[i]:
        X_new.append([X[i][j] for j in range(len(y[i])) if y[i][j] != 'O'])
        feaX_new.append([feaX[i][j] for j in range(len(y[i])) if y[i][j] != 'O'])
        y_new.append([y[i][j] for j in range(len(y[i])) if y[i][j] != 'O'])

CPU times: user 68 ms, sys: 3.42 ms, total: 71.4 ms
Wall time: 70.2 ms


In [5]:
# get target encode dictionary
tag2id = set()
for i in y:
    tag2id.update(i)
tag2id = list(tag2id)
tag2id.sort()
tag2id = {k:v for v, k in enumerate(tag2id)}
# 添加特征字符标签
# tag2id['<start>'] = len(tag2id)
tag2id['<PAD>'] = len(tag2id)

# label encoding
y = [[tag2id[token] for token in sequence] for sequence in y]

tag2id #, y[0]

{'project-B': 0, 'project-I': 1, 'work-B': 2, 'work-I': 3, '<PAD>': 4}

In [7]:
# 构建数据标签字典
tag2id = set()
for i in y_new:
    tag2id.update(i)
tag2id = list(tag2id)
tag2id.sort()
tag2id = {k:v for v, k in enumerate(tag2id)}
# 添加特征字符标签
# tag2id['<start>'] = len(tag2id)
tag2id['<PAD>'] = len(tag2id)

# label encoding
y_new = [[tag2id[token] for token in sequence] for sequence in y_new]

tag2id #, y[0]

{'project-B': 0, 'project-I': 1, 'work-B': 2, 'work-I': 3, '<PAD>': 4}

In [6]:
# seqence padding
max_len = max([len(i) for i in y])
fea_dim = len(feaX[0][0])

feaX = np.array(
    [i + [[0] * fea_dim] * (max_len - len(i)) for i in feaX]
)
y = np.array(
    [i + [tag2id['<PAD>']] * (max_len - len(i)) for i in y]
)

feaX.shape, y.shape

((891, 612, 24), (891, 612))

In [8]:
# seqence padding
max_len = max([len(i) for i in y_new])
fea_dim = len(feaX_new[0][0])

feaX_new = np.array(
    [i + [[0] * fea_dim] * (max_len - len(i)) for i in feaX_new]
)
y_new = np.array(
    [i + [tag2id['<PAD>']] * (max_len - len(i)) for i in y_new]
)

feaX_new.shape, y_new.shape

((891, 612, 24), (891, 612))

In [7]:
# 数据划分
feaX_train, feaX_valid, y_train, y_valid = train_test_split(
    feaX, y, test_size=0.2, random_state=42
)

feaX_train.shape, y_train.shape, feaX_valid.shape, y_valid.shape

((712, 612, 24), (712, 612), (179, 612, 24), (179, 612))

In [9]:
# 数据划分
feaX_train, feaX_valid, y_train, y_valid = train_test_split(feaX_new, 
                                                            y_new, 
                                                            test_size=0.2, 
                                                            random_state=39)

feaX_train.shape, y_train.shape, feaX_valid.shape, y_valid.shape

((712, 612, 24), (712, 612), (179, 612, 24), (179, 612))

In [8]:
# 数据集生成
def data_generater(X, y, batch_size, tag2id, is_mask=True):
    sample_num = X.shape[0]
    data = []
    if is_mask:
        for i in range(int(sample_num/batch_size) + 1):
            if i * batch_size < sample_num:
                data.append([
                    tf.constant(X[i * batch_size : (i + 1) * batch_size])
                    , tf.constant(y[i * batch_size : (i + 1) * batch_size])
                    , tf.constant(np.where(y[i * batch_size : (i + 1) * batch_size] == tag2id['<PAD>'], 0., 1.))
                ])
    else:
        for i in range(int(sample_num/batch_size) + 1):
            if i * batch_size < sample_num:
                data.append([
                    tf.constant(X[i * batch_size : (i + 1) * batch_size])
                    , tf.constant(y[i * batch_size : (i + 1) * batch_size])
                ])
    return data


batch_size = 40
train_data = data_generater(feaX_train, y_train, batch_size, tag2id)
valid_data = [
    feaX_valid
    , y_valid
    , tf.constant(np.where(y_valid == tag2id['<PAD>'], 0., 1.))
]

len(train_data), train_data[0], train_data[-1]

2022-05-26 20:24:33.127322: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


(18,
 [<tf.Tensor: shape=(40, 612, 24), dtype=float64, numpy=
  array([[[0.        , 0.        , 0.        , ..., 1.        ,
           0.17241379, 0.07692308],
          [0.        , 0.        , 0.        , ..., 1.        ,
           0.17931034, 0.15384615],
          [0.        , 0.        , 0.        , ..., 2.        ,
           0.1862069 , 0.23076923],
          ...,
          [0.        , 0.        , 0.        , ..., 0.        ,
           0.        , 0.        ],
          [0.        , 0.        , 0.        , ..., 0.        ,
           0.        , 0.        ],
          [0.        , 0.        , 0.        , ..., 0.        ,
           0.        , 0.        ]],
  
         [[0.        , 0.        , 0.        , ..., 1.        ,
           0.12621359, 0.09090909],
          [0.        , 0.        , 0.        , ..., 1.        ,
           0.13106796, 0.18181818],
          [0.        , 0.        , 0.        , ..., 1.        ,
           0.13592233, 0.27272727],
          ...,
    

In [10]:
from gru_crf import *
model = LinerCRF(len(tag2id) - 1)
# model.load_weights('./checkpoints/best_model')
model.train(train_data, valid_data, epochs=1000, verbose=0)

step:   0%|                                              | 0/18 [00:02<?, ?it/s]


InvalidArgumentError: indices[24479] = 97920 is not in [0, 97920) [Op:GatherV2]