In [1]:
import torch
import torch.nn as nn
import time
import BiLSTM_CRF
from utils import *
from NER_dataset import *
from tag_mapping import *
from word_mapping import *
from tqdm import tqdm

# The original import order is kept on purpose: the wildcard imports above can
# shadow earlier names, so explicit imports that must win come after them.
import csv  # used by the prediction cells (csv.writer); previously only reachable via a wildcard import
import os
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"

In [2]:
# Data preparation and preprocessing
train_data_path = 'data/train.csv'
valid_data_path = 'data/dev.csv'

# Build the training split first, then the validation split.
train_data, valid_data = NER_dataset(train_data_path), NER_dataset(valid_data_path)

# The word mapping is built from the training split only.
TagMapping = tag_mapping()
WordMapping = word_mapping(train_data)

# Pass the tag and word encode mappings to each split (train first, then validation).
for _split in (train_data, valid_data):
    _split.get_tag_mapping(TagMapping.encode_mapping)
    _split.get_word_mapping(WordMapping.encode_mapping)

# Sizes reused by the model constructor and as padding indices (size - 1) in later cells.
tag_size, vocab_size = TagMapping.num_tag, WordMapping.num_word

In [3]:
# Model construction and training (run 1: embed_size=256, hidden_size=256)
dropout = 0.5
embed_size = 256
hidden_size = 256
batch_size = 32
max_epoch = 20
lr = 0.001
clip_max_norm = 5.0

model_save_path = './model/model_1.pth'
# Make sure the checkpoint directory exists before training starts, so a missing
# folder cannot make the first save fail after a whole epoch of work.
os.makedirs(os.path.dirname(model_save_path), exist_ok=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = BiLSTM_CRF.BiLSTMCRF(tag_size, vocab_size, dropout, embed_size, hidden_size).to(device)
# Re-initialise every parameter: names containing 'weight' get N(0, 0.01),
# all other parameters are set to zero.
for name, param in model.named_parameters():
    if 'weight' in name:
        nn.init.normal_(param.data, 0, 0.01)
    else:
        nn.init.constant_(param.data, 0)

optimizer = torch.optim.Adam(model.parameters(), lr=lr)

train_losses = []  # one entry per epoch: mean training loss over the epoch
valid_losses = []  # one entry per epoch: value returned by compute_valid_loss
print('start training...')
for epoch in range(max_epoch):
    # Explicitly (re-)enable training mode each epoch. NOTE(review): compute_valid_loss
    # may leave the model in eval mode, which would silently disable dropout.
    model.train()
    num_iter = 0
    epoch_loss_sum = 0.0  # sum of per-sentence losses seen this epoch
    epoch_num_sent = 0    # number of per-sentence losses accumulated
    for sentences, tags in tqdm(batch_iter(train_data, batch_size=batch_size)):
        num_iter += 1
        # Pad with the last vocabulary / tag index.
        sentences, sent_lengths = pad(sentences, vocab_size - 1, device)
        tags, _ = pad(tags, tag_size - 1, device)

        optimizer.zero_grad()
        batch_loss = model(sentences, tags, sent_lengths)  # shape: (b,) per the original author's note
        loss = batch_loss.mean()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=clip_max_norm)
        optimizer.step()

        epoch_loss_sum += batch_loss.detach().sum().item()
        epoch_num_sent += batch_loss.numel()

        if num_iter % 100 == 0:
            print('Epoch: %d, Iter: %d, Loss: %.4f' % (epoch, num_iter, loss.item()))

    # Record the epoch-average loss rather than the last mini-batch loss, which is
    # a noisy single sample (and undefined if the iterator yielded no batch).
    train_losses.append(epoch_loss_sum / epoch_num_sent if epoch_num_sent else float('nan'))
    valid_losses.append(compute_valid_loss(model, valid_data, batch_size, device, vocab_size, tag_size))
    # Intermediate checkpoint every second epoch (suffix appended after '.pth').
    if epoch % 2 == 0:
        model.save(model_save_path + f'_{epoch}')


model.save(model_save_path)
plot_losses(train_losses, valid_losses, 'losses_1.png')

start training...


48it [00:07,  6.05it/s]


KeyboardInterrupt: 

In [None]:
# Write one (word, predicted tag) row per validation token, then plot the confusion matrix.
output_path = 'data/dev_pred_1.csv'

with open(output_path, 'w', encoding='utf-8', newline='') as pred_file:
    pred_writer = csv.writer(pred_file)
    # Header kept as-is; presumably the column names show_confusion_matrix expects - verify.
    pred_writer.writerow(['word', 'expected'])
    model.eval()
    with torch.no_grad():
        # The tags stored with each validation item are not needed for prediction.
        for word_ids, _gold_tags in valid_data:
            # Predict one sentence at a time: wrap it in a single-element batch.
            batch, lengths = pad([word_ids], vocab_size - 1, device)
            raw_pred = model.predict(batch, lengths)
            words = WordMapping.decode(word_ids)
            labels = TagMapping.decode(raw_pred[0])
            pred_writer.writerows(zip(words, labels))

show_confusion_matrix(valid_data_path, output_path, tag_size, TagMapping, 'confusion_matrix_1')

In [None]:
# Model construction and training (run 2: embed_size=100, hidden_size=256)
dropout = 0.5
embed_size = 100
hidden_size = 256
batch_size = 32
max_epoch = 20
lr = 0.001
clip_max_norm = 5.0

model_save_path = './model/model_2.pth'
# Make sure the checkpoint directory exists before training starts, so a missing
# folder cannot make the first save fail after a whole epoch of work.
os.makedirs(os.path.dirname(model_save_path), exist_ok=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = BiLSTM_CRF.BiLSTMCRF(tag_size, vocab_size, dropout, embed_size, hidden_size).to(device)
# Re-initialise every parameter: names containing 'weight' get N(0, 0.01),
# all other parameters are set to zero.
for name, param in model.named_parameters():
    if 'weight' in name:
        nn.init.normal_(param.data, 0, 0.01)
    else:
        nn.init.constant_(param.data, 0)

optimizer = torch.optim.Adam(model.parameters(), lr=lr)

train_losses = []  # one entry per epoch: mean training loss over the epoch
valid_losses = []  # one entry per epoch: value returned by compute_valid_loss
print('start training...')
for epoch in range(max_epoch):
    # Explicitly (re-)enable training mode each epoch. NOTE(review): compute_valid_loss
    # may leave the model in eval mode, which would silently disable dropout.
    model.train()
    num_iter = 0
    epoch_loss_sum = 0.0  # sum of per-sentence losses seen this epoch
    epoch_num_sent = 0    # number of per-sentence losses accumulated
    for sentences, tags in tqdm(batch_iter(train_data, batch_size=batch_size)):
        num_iter += 1
        # Pad with the last vocabulary / tag index.
        sentences, sent_lengths = pad(sentences, vocab_size - 1, device)
        tags, _ = pad(tags, tag_size - 1, device)

        optimizer.zero_grad()
        batch_loss = model(sentences, tags, sent_lengths)  # shape: (b,) per the original author's note
        loss = batch_loss.mean()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=clip_max_norm)
        optimizer.step()

        epoch_loss_sum += batch_loss.detach().sum().item()
        epoch_num_sent += batch_loss.numel()

        if num_iter % 100 == 0:
            print('Epoch: %d, Iter: %d, Loss: %.4f' % (epoch, num_iter, loss.item()))

    # Record the epoch-average loss rather than the last mini-batch loss, which is
    # a noisy single sample (and undefined if the iterator yielded no batch).
    train_losses.append(epoch_loss_sum / epoch_num_sent if epoch_num_sent else float('nan'))
    valid_losses.append(compute_valid_loss(model, valid_data, batch_size, device, vocab_size, tag_size))
    # Intermediate checkpoint every second epoch (suffix appended after '.pth').
    if epoch % 2 == 0:
        model.save(model_save_path + f'_{epoch}')


model.save(model_save_path)
plot_losses(train_losses, valid_losses, 'losses_2.png')

# Write one (word, predicted tag) row per validation token, then plot the confusion matrix.
output_path = 'data/dev_pred_2.csv'

with open(output_path, 'w', encoding='utf-8', newline='') as pred_file:
    pred_writer = csv.writer(pred_file)
    # Header kept as-is; presumably the column names show_confusion_matrix expects - verify.
    pred_writer.writerow(['word', 'expected'])
    model.eval()
    with torch.no_grad():
        # The tags stored with each validation item are not needed for prediction.
        for word_ids, _gold_tags in valid_data:
            # Predict one sentence at a time: wrap it in a single-element batch.
            batch, lengths = pad([word_ids], vocab_size - 1, device)
            raw_pred = model.predict(batch, lengths)
            words = WordMapping.decode(word_ids)
            labels = TagMapping.decode(raw_pred[0])
            pred_writer.writerows(zip(words, labels))

show_confusion_matrix(valid_data_path, output_path, tag_size, TagMapping, 'confusion_matrix_2')

In [None]:
# Model construction and training (run 3: embed_size=256, hidden_size=512)
dropout = 0.5
embed_size = 256
hidden_size = 512
batch_size = 32
max_epoch = 20
lr = 0.001
clip_max_norm = 5.0

model_save_path = './model/model_3.pth'
# Make sure the checkpoint directory exists before training starts, so a missing
# folder cannot make the first save fail after a whole epoch of work.
os.makedirs(os.path.dirname(model_save_path), exist_ok=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = BiLSTM_CRF.BiLSTMCRF(tag_size, vocab_size, dropout, embed_size, hidden_size).to(device)
# Re-initialise every parameter: names containing 'weight' get N(0, 0.01),
# all other parameters are set to zero.
for name, param in model.named_parameters():
    if 'weight' in name:
        nn.init.normal_(param.data, 0, 0.01)
    else:
        nn.init.constant_(param.data, 0)

optimizer = torch.optim.Adam(model.parameters(), lr=lr)

train_losses = []  # one entry per epoch: mean training loss over the epoch
valid_losses = []  # one entry per epoch: value returned by compute_valid_loss
print('start training...')
for epoch in range(max_epoch):
    # Explicitly (re-)enable training mode each epoch. NOTE(review): compute_valid_loss
    # may leave the model in eval mode, which would silently disable dropout.
    model.train()
    num_iter = 0
    epoch_loss_sum = 0.0  # sum of per-sentence losses seen this epoch
    epoch_num_sent = 0    # number of per-sentence losses accumulated
    for sentences, tags in tqdm(batch_iter(train_data, batch_size=batch_size)):
        num_iter += 1
        # Pad with the last vocabulary / tag index.
        sentences, sent_lengths = pad(sentences, vocab_size - 1, device)
        tags, _ = pad(tags, tag_size - 1, device)

        optimizer.zero_grad()
        batch_loss = model(sentences, tags, sent_lengths)  # shape: (b,) per the original author's note
        loss = batch_loss.mean()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=clip_max_norm)
        optimizer.step()

        epoch_loss_sum += batch_loss.detach().sum().item()
        epoch_num_sent += batch_loss.numel()

        if num_iter % 100 == 0:
            print('Epoch: %d, Iter: %d, Loss: %.4f' % (epoch, num_iter, loss.item()))

    # Record the epoch-average loss rather than the last mini-batch loss, which is
    # a noisy single sample (and undefined if the iterator yielded no batch).
    train_losses.append(epoch_loss_sum / epoch_num_sent if epoch_num_sent else float('nan'))
    valid_losses.append(compute_valid_loss(model, valid_data, batch_size, device, vocab_size, tag_size))
    # Intermediate checkpoint every second epoch (suffix appended after '.pth').
    if epoch % 2 == 0:
        model.save(model_save_path + f'_{epoch}')


model.save(model_save_path)
plot_losses(train_losses, valid_losses, 'losses_3.png')

# Write one (word, predicted tag) row per validation token, then plot the confusion matrix.
output_path = 'data/dev_pred_3.csv'

with open(output_path, 'w', encoding='utf-8', newline='') as pred_file:
    pred_writer = csv.writer(pred_file)
    # Header kept as-is; presumably the column names show_confusion_matrix expects - verify.
    pred_writer.writerow(['word', 'expected'])
    model.eval()
    with torch.no_grad():
        # The tags stored with each validation item are not needed for prediction.
        for word_ids, _gold_tags in valid_data:
            # Predict one sentence at a time: wrap it in a single-element batch.
            batch, lengths = pad([word_ids], vocab_size - 1, device)
            raw_pred = model.predict(batch, lengths)
            words = WordMapping.decode(word_ids)
            labels = TagMapping.decode(raw_pred[0])
            pred_writer.writerows(zip(words, labels))

show_confusion_matrix(valid_data_path, output_path, tag_size, TagMapping, 'confusion_matrix_3')