In [1]:
# 读入数据
import json
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from model import BiLSTM_CRF
import torch.optim as optim
import torch
from train import train, test
from metric import f1_score

START_TAG = "<START>"
STOP_TAG = "<STOP>"
EMBEDDING_DIM = 128
HIDDEN_DIM = 64

# 序列字典 
tag2id = {
    "O": 0, 
    "B-NAME": 1, 
    "I-NAME": 2,
    "B-NOTIONAL": 4, 
    "I-NOTIONAL": 5,
    "B-TICKER": 6, 
    "I-TICKER": 7,
    START_TAG: 8, 
    STOP_TAG: 9,
    '[CLS]': 10,
    '[SEP]': 11
}

def extract(raw_data):
    # 改造数据
    # BIO Tagging
    all_data = []
    x = []
    y = []
    for d in raw_data:
        text = d['text'].split()
        text = [_t.lower() for _t in text]
        sen_label = [] #['o'] * len(text)
        labels = d['label']
        char_label = ['O'] * len(d['text'])
        for la in labels:
            start_index = la[1][0]
            end_index = la[1][1]
            char_label[start_index] = 'B-' + la[0]
            for i in range(start_index + 1, end_index):
                char_label[i] = 'I-' + la[0]
        # print(char_label)
        sen_label.append(char_label[0])
        for j in range (0,len(d['text'])):
            if d['text'][j] == ' ' and j != len(d['text']) - 1 and j != 0:
                # print(j)
                sen_label.append(char_label[j+1])

        x.append(text)
        y.append(sen_label)
        tri = (text,sen_label)
        # print(tri)
        all_data.append(tri)

    x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.3,random_state=0)
    x_dev,x_test,y_dev,y_test = train_test_split(x_test,y_test,test_size=0.5,random_state=0)

    print("x_train is:"); print(len(x_train))
    print("y_train is:"); print(len(y_train))
    print("x_dev is:");  print(len(x_test))
    print("y_dev is:");  print(len(y_test))
    print("x_test is:");  print(len(x_test))
    print("y_test is:");  print(len(y_test))
    """
    word_to_ix = {} # 词表
    for sentence, tags in all_data:
        for word in sentence:
            if word not in word_to_ix:
                word_to_ix[word] = len(word_to_ix)
    """
    
    word_to_ix = {}
    for sentence in x_train:
        for word in sentence:
            if word not in word_to_ix:
                word_to_ix[word] = len(word_to_ix)
    
    np.save('vocab.npy', word_to_ix)
    return x_train, y_train, x_dev, y_dev, x_test, y_test, word_to_ix

In [2]:
from data_loader import NERDataset

f = open('data.json') 
# 数据部分
data = json.load(f)
print('--------Data load--------')
device = 'cuda' if torch.cuda.is_available() else 'cpu' 

print(device)
x_train, y_train, x_dev, y_dev, x_test, y_test, word2id = extract(data)

unk_id = len(word2id)
print(unk_id)
word2id['<unk1>'] = unk_id # 未知词

train_dataset = NERDataset(x_train, y_train, word2id, tag2id, unk_id)
dev_dataset = NERDataset(x_dev, y_dev, word2id, tag2id, unk_id)
test_dataset = NERDataset(x_test, y_test, word2id, tag2id, unk_id)

train_loader = DataLoader(train_dataset, batch_size=32,
                            shuffle=True, collate_fn=train_dataset.collate_fn)
dev_loader = DataLoader(dev_dataset, batch_size=32,
                            shuffle=True, collate_fn=train_dataset.collate_fn)
test_loader = DataLoader(test_dataset, batch_size=32,
                        shuffle=True, collate_fn=test_dataset.collate_fn)

--------Data load--------
cuda
x_train is:
2100
y_train is:
2100
x_dev is:
450
y_dev is:
450
x_test is:
450
y_test is:
450
[([0, 1, 2, 3, 4, 5], [0, 1, 2, 0, 4, 6])]
-------- Process Done! --------
[([26, 27, 3238, 444, 11], [0, 0, 4, 6, 0])]
-------- Process Done! --------
[([23, 3238, 929, 6], [0, 4, 6, 0])]
-------- Process Done! --------


In [3]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1' # 下面老是报错 shape 不一致

model = BiLSTM_CRF(
    embedding_dim = EMBEDDING_DIM, # 128
    hidden_dim = HIDDEN_DIM, # 64
    vocab_size = len(word2id),
    tag_to_ix = tag2id 
)

print("--------model define--------")
model.to(device)

optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.9)
model

--------model define--------


BiLSTM_CRF(
  (embedding): Embedding(3239, 128)
  (lstm): LSTM(128, 32, num_layers=2, batch_first=True, dropout=0.5, bidirectional=True)
  (hidden2tag): Linear(in_features=64, out_features=11, bias=True)
  (crf): CRF(num_tags=11)
)

In [4]:
train(train_loader, dev_loader, word2id, tag2id, model, optimizer, device, scheduler )
print("--------train over--------")

  score = torch.where(mask[i].unsqueeze(1), next_score, score)
100%|██████████| 66/66 [00:03<00:00, 18.43it/s]


epoch: 1, train loss: 66.59553400675456
epoch: 1, f1 score: 0.9621621621621622, dev loss: 10.199355061848959
--------Save best model!--------


100%|██████████| 66/66 [00:04<00:00, 14.75it/s]


epoch: 2, train loss: 9.225349368471088
epoch: 2, f1 score: 0.994059405940594, dev loss: 4.851529947916666
--------Save best model!--------


100%|██████████| 66/66 [00:04<00:00, 15.68it/s]


epoch: 3, train loss: 5.7612910559683135
epoch: 3, f1 score: 0.9955467590301831, dev loss: 4.149214426676433
--------Save best model!--------


100%|██████████| 66/66 [00:04<00:00, 15.26it/s]


epoch: 4, train loss: 4.302227540449663
epoch: 4, f1 score: 0.9965363681345869, dev loss: 3.356384023030599
--------Save best model!--------


100%|██████████| 66/66 [00:04<00:00, 14.82it/s]


epoch: 5, train loss: 3.7093129591508345
epoch: 5, f1 score: 0.9965363681345869, dev loss: 3.1289708455403646


100%|██████████| 66/66 [00:04<00:00, 14.36it/s]


epoch: 6, train loss: 3.053154974272757
epoch: 6, f1 score: 0.9965363681345869, dev loss: 3.2501347859700522


100%|██████████| 66/66 [00:04<00:00, 15.55it/s]


epoch: 7, train loss: 2.561188567768444
epoch: 7, f1 score: 0.9930555555555556, dev loss: 3.6088437398274738


100%|██████████| 66/66 [00:04<00:00, 15.62it/s]


epoch: 8, train loss: 2.08862879782012
epoch: 8, f1 score: 0.9965363681345869, dev loss: 2.5972941080729166


100%|██████████| 66/66 [00:04<00:00, 15.48it/s]


epoch: 9, train loss: 1.7705114538019353
epoch: 9, f1 score: 0.9965363681345869, dev loss: 2.715096028645833


100%|██████████| 66/66 [00:04<00:00, 15.52it/s]


epoch: 10, train loss: 1.5808453704371597
epoch: 10, f1 score: 0.9965363681345869, dev loss: 2.50779291788737


100%|██████████| 66/66 [00:04<00:00, 15.19it/s]


epoch: 11, train loss: 1.3388647310661548
epoch: 11, f1 score: 0.9890329012961117, dev loss: 3.676182810465495


100%|██████████| 66/66 [00:04<00:00, 15.79it/s]


epoch: 12, train loss: 1.2056514855587122
epoch: 12, f1 score: 0.994047619047619, dev loss: 2.8602032979329426


100%|██████████| 66/66 [00:04<00:00, 15.66it/s]


epoch: 13, train loss: 0.9682432376977169
epoch: 13, f1 score: 0.9950445986124876, dev loss: 2.734649149576823


100%|██████████| 66/66 [00:04<00:00, 14.41it/s]


epoch: 14, train loss: 0.9008352539756082
epoch: 14, f1 score: 0.9910447761194029, dev loss: 3.0294520060221353
Best val f1: 0.9965363681345869
Training Finished!
--------train over--------


In [6]:

results = test(test_loader, word2id, tag2id, model, device)
df = pd.DataFrame.from_dict(results['f1_labels'], orient='index',columns=['f1_score'])
df = df.reset_index().rename(columns = {'index':'label'})

print('最终测试集f1分数： {}'.format(results['f1']))
print("各标签识别情况")
df

最终测试集f1分数： 0.9955686853766617
各标签识别情况


Unnamed: 0,label,f1_score
0,NAME,0.995816
1,NOTIONAL,0.992161
2,TICKER,0.998888
