# Model Training and Testing
**Agend**

0. Config

1. Model Training/模型训练

2. Model Testing /模型测试

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
notebook_dir = "/content/drive/My Drive/Colab Notebooks/RelationClassifyCNN/"

In [3]:
import sys
sys.path.append(notebook_dir)

In [4]:
import torch
import os
import random
import numpy as np
import torch.nn as nn
import torch.optim as optim
from evaluate import Eval
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
def set_seed(seed=1234):
    os.environ['PYTHONHASHSEED'] = '{}'.format(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)  # set seed for cpu
    torch.cuda.manual_seed(seed)  # set seed for current gpu
    torch.cuda.manual_seed_all(seed)  # set seed for all gpu

In [6]:
set_seed()

In [7]:
class Config(object):
    def __init__(self):
        pass

In [8]:
config = Config()

In [9]:
config.cuda = 0

In [10]:
config.device = None
if config.cuda >= 0 and torch.cuda.is_available():
    config.device = torch.device('cuda:{}'.format(config.cuda))
else:
    config.device = torch.device('cpu')

In [11]:
config.data_dir = notebook_dir+"data"

In [12]:
config.output_dir = notebook_dir+"output"

In [13]:
config.embedding_path = notebook_dir+"embedding/glove.6B.50d.txt"

In [14]:
config.word_dim = 50

In [15]:
config.model_name = "CNN"
config.mode=1
config.seed = 666
config.cuda =0
config.epoch=5

config.dropout=0.3
config.batch_size=64
config.lr=0.001
config.max_len=96
config.pos_dis=20
config.pos_dim=5
config.hidden_size=64

# hyper parameters for cnn
config.filter_num=32
config.window=3
config.L2_decay=0.0001


In [16]:
config.model_dir = os.path.join(config.output_dir,config.model_name)
if not os.path.exists(config.model_dir):
    os.makedirs(config.model_dir)

In [17]:
for key in config.__dict__:
    print(key, end=' = ')
    print(config.__dict__[key])

cuda = 0
device = cpu
data_dir = /content/drive/My Drive/Colab Notebooks/RelationClassifyCNN/data
output_dir = /content/drive/My Drive/Colab Notebooks/RelationClassifyCNN/output
embedding_path = /content/drive/My Drive/Colab Notebooks/RelationClassifyCNN/embedding/glove.6B.50d.txt
word_dim = 50
model_name = CNN
mode = 1
seed = 666
epoch = 5
dropout = 0.3
batch_size = 64
lr = 0.001
max_len = 96
pos_dis = 20
pos_dim = 5
hidden_size = 64
filter_num = 32
window = 3
L2_decay = 0.0001
model_dir = /content/drive/My Drive/Colab Notebooks/RelationClassifyCNN/output/CNN


In [18]:
from dataset import processor,WordEmbeddingLoader,RelationLoader,SemEvalDateset,SemEvalDataLoader

#### 1) Data Preprocess

In [19]:
data_processor=processor()

In [20]:
# sample_size = 2000
sample_size = None

In [21]:
#train data
data_processor.convert(notebook_dir+"data/TRAIN_FILE.TXT",notebook_dir+'data/train.json',sample_size)

100%|██████████| 8000/8000 [00:02<00:00, 3499.20it/s]


In [22]:
# Test data
data_processor.convert(notebook_dir+"data/TEST_FILE_FULL.TXT",notebook_dir+'data/test.json',sample_size)

100%|██████████| 2717/2717 [00:00<00:00, 3563.35it/s]


#### 2) Load WordEmbedding

In [23]:
emd_loader=WordEmbeddingLoader(config)
word2id, word_vec=emd_loader.load_embedding()

#### 3) Load Relation Map

In [24]:
rel_loader=RelationLoader(config)
rel2id, id2rel, class_num=rel_loader.get_relation()

#### 4) Load Train Data

In [25]:
# # ! head data/relation2id.txt
# from collections import namedtuple
# conf = namedtuple('conf',['max_len','pos_dis','data_dir'])
# config = conf(32,5,"data")
filename='train.json'
train_dataset =SemEvalDateset(filename, rel2id, word2id,config)

In [26]:
next(iter(train_dataset))

((array([[[400001,    279,     20,    980,   1070,     32,     48,   2606,
             3251,      7,     30,  40634,  11465,      4,  13874,   2623,
                3,      0,      0,      0,      0,      0,      0,      0,
                0,      0,      0,      0,      0,      0,      0,      0,
                0,      0,      0,      0,      0,      0,      0,      0,
                0,      0,      0,      0,      0,      0,      0,      0,
                0,      0,      0,      0,      0,      0,      0,      0,
                0,      0,      0,      0,      0,      0,      0,      0,
                0,      0,      0,      0,      0,      0,      0,      0,
                0,      0,      0,      0,      0,      0,      0,      0,
                0,      0,      0,      0,      0,      0,      0,      0,
                0,      0,      0,      0,      0,      0,      0,      0],
          [     9,     10,     11,     12,     13,     14,     15,     16,
               17,     1

#### 5) Pytorch Data Loader

In [27]:
data_loader = SemEvalDataLoader(rel2id, word2id, config)

### 1) Train Function

In [28]:
def train(model, criterion, loader, config):
    
    train_loader, dev_loader, _ = loader
    
    optimizer = optim.Adam(model.parameters(), lr=config.lr, weight_decay=config.L2_decay)

    print(model)
    print('traning model parameters:')
    for name, param in model.named_parameters():
        if param.requires_grad:
            print('%s :  %s' % (name, str(param.data.shape)))
    print('--------------------------------------')
    print('start to train the model ...')

    eval_tool = Eval(config)
    min_f1 = -float('inf')
    for epoch in range(1, config.epoch+1):
        for step, (data, label) in enumerate(train_loader):
            model.train()

            sent_feat = data[0].to(config.device)
            lex_feat = data[1].to(config.device)
            data = (sent_feat, lex_feat)
            #data = data.to(config.device)
            label = label.to(config.device)

            optimizer.zero_grad()
            logits = model(data)
            loss = criterion(logits, label)
            loss.backward()
            optimizer.step()

        _, train_loss, _ = eval_tool.evaluate(model, criterion, train_loader)
        f1, dev_loss, _ = eval_tool.evaluate(model, criterion, dev_loader)

        print('[%03d] train_loss: %.3f | dev_loss: %.3f | macro f1 on dev: %.4f'
              % (epoch, train_loss, dev_loss, f1), end=' ')
        if f1 > min_f1:
            min_f1 = f1
            torch.save(model.state_dict(), os.path.join(config.model_dir, 'model.pkl'))
            print('>>> save models!')
        else:
            print()


def test(model, criterion, loader, config):
    print('--------------------------------------')
    print('start test ...')
    _, _, test_loader = loader
    model.load_state_dict(torch.load(os.path.join(config.model_dir, 'model.pkl')))
    eval_tool = Eval(config)
    f1, test_loss, predict_label = eval_tool.evaluate(model, criterion, test_loader)
    print('test_loss: %.3f | macro f1 on test:  %.4f' % (test_loss, f1))
    return predict_label

In [29]:
train_loader, dev_loader = None, None
if config.mode == 1:  # train mode
    train_loader = data_loader.get_train()
    dev_loader = data_loader.get_dev()
    
test_loader = data_loader.get_test()
loader = [train_loader, dev_loader, test_loader]

In [30]:
#### import model
from model import CNN

In [31]:
class_num

19

In [32]:
model = CNN(word_vec=word_vec, class_num=class_num, config=config)
model = model.to(config.device)
criterion = nn.CrossEntropyLoss()

In [33]:

def print_result(predict_label, id2rel, start_idx=8001):
    with open(notebook_dir+'output/predicted_result.txt', 'w', encoding='utf-8') as fw:
        for i in range(0, predict_label.shape[0]):
            # print('{}\t{}\n'.format(start_idx+i, id2rel[int(predict_label[i])]))
            fw.write('{}\t{}\n'.format(start_idx+i, id2rel[int(predict_label[i])]))

In [34]:
if config.mode == 1:  # train mode
    train(model, criterion, loader, config)
predict_label = test(model, criterion, loader, config)
print_result(predict_label, id2rel)

CNN(
  (word_embedding): Embedding(400002, 50)
  (pos1_embedding): Embedding(43, 5)
  (pos2_embedding): Embedding(43, 5)
  (conv): Conv2d(1, 32, kernel_size=(3, 60), stride=(1, 1), padding=(1, 0), bias=False)
  (maxpool): MaxPool2d(kernel_size=(96, 1), stride=(96, 1), padding=0, dilation=1, ceil_mode=False)
  (tanh): Tanh()
  (dropout): Dropout(p=0.3, inplace=False)
  (linear): Linear(in_features=32, out_features=64, bias=False)
  (dense): Linear(in_features=364, out_features=19, bias=False)
)
traning model parameters:
word_embedding.weight :  torch.Size([400002, 50])
pos1_embedding.weight :  torch.Size([43, 5])
pos2_embedding.weight :  torch.Size([43, 5])
conv.weight :  torch.Size([32, 1, 3, 60])
linear.weight :  torch.Size([64, 32])
dense.weight :  torch.Size([19, 364])
--------------------------------------
start to train the model ...
[001] train_loss: 1.636 | dev_loss: 1.701 | macro f1 on dev: 0.5131 >>> save models!
[002] train_loss: 1.148 | dev_loss: 1.266 | macro f1 on dev: 0.6

In [38]:
test_data,label = next(iter(dev_loader))

In [41]:
test_pred = model(test_data)

In [46]:
test_pred = np.argmax(test_pred.detach().numpy(),1)

In [49]:
test_pred

array([15, 12, 12,  7,  2,  3,  0,  0,  0, 15,  7, 17,  7,  0,  9,  0,  7,
        0, 14, 17, 15,  5, 17,  0, 10,  0,  0,  0, 15,  0,  1, 15,  3,  2,
        3,  4, 14,  5, 14,  2,  0,  4,  0,  7,  9,  5,  5,  5,  0,  0,  7,
        0, 14, 14,  0,  9,  7,  2,  0,  4, 10, 17,  0, 13])

In [51]:
label = label.numpy()

In [53]:
acuracy = sum(test_pred==label)/len(label)

In [54]:
acuracy

0.609375