In [1]:
from fastNLP import DataSet
import jieba
import pickle
import torch
from fastNLP import Vocabulary



# 1. Preprocessing the data

In [43]:
#Preprocessing the data.
def cnli_preprocessing(path):
    ds = DataSet.read_csv('cnli_data/cnli_train_1.0.txt', 
                      headers=('No', 'Premise', 'Hypotheses', 'Relationship'), sep='\t')
    ds.apply(lambda x: list(jieba.cut(x['Premise'])), new_field_name='premise_words')
    ds.apply(lambda x: list(jieba.cut(x['Hypotheses'])), new_field_name='hypotheses_words')
    def labeling(ins):
        label_str = ins['Relationship'].strip()
        if label_str == 'entailment':
            return 0
        elif label_str == 'neutral':
            return 1
        elif label_str == 'contradiction':
            return 2
        else:
            return -1
    ds.apply(labeling, new_field_name='label', is_target=True)
    return ds

In [44]:
ds_train = cnli_preprocessing('cnli_data/cnli_train_1.0.txt')
ds_dev = cnli_preprocessing('cnli_data/cnli_dev_1.0.txt')
ds_test = cnli_preprocessing('cnli_data/cnli_test_labeled.txt')

In [50]:
with open('cnli_data/cn_ds_train.pkl','wb') as file:
    pickle.dump(ds_train, file)
with open('cnli_data/cn_ds_dev.pkl','wb') as file:
    pickle.dump(ds_dev, file)
with open('cnli_data/cn_ds_test.pkl','wb') as file:
    pickle.dump(ds_test, file)

# 2. Build the vocabulary
Considering that we need all the words to be embedded, words in dev set and test set should be used here.

In [53]:
vocab = Vocabulary(min_freq=2)
ds_train.apply(lambda x: [vocab.add(word) for word in x['premise_words']+x['hypotheses_words']])
ds_dev.apply(lambda x: [vocab.add(word) for word in x['premise_words']+x['hypotheses_words']])
ds_test.apply(lambda x: [vocab.add(word) for word in x['premise_words']+x['hypotheses_words']])

[[None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None],
 [None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None],
 [None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None],
 [None, None, None, None, None, None, None, None, None, None, None, None],
 [None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None],
 [None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None],
 [None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None],
 [None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,

## Get the length of the vocabulary

In [56]:
len(vocab)

31978

# 3. Turn the words into index

In [57]:
# index句子, Vocabulary.to_index(word)
ds_train.apply(lambda x: [vocab.to_index(word) for word in x['premise_words']], 
                 new_field_name='p_seq', is_input=True)
ds_train.apply(lambda x: [vocab.to_index(word) for word in x['hypotheses_words']], 
                 new_field_name='h_seq', is_input=True)
ds_dev.apply(lambda x: [vocab.to_index(word) for word in x['premise_words']], 
                 new_field_name='p_seq', is_input=True)
ds_dev.apply(lambda x: [vocab.to_index(word) for word in x['hypotheses_words']], 
                 new_field_name='h_seq', is_input=True)
ds_test.apply(lambda x: [vocab.to_index(word) for word in x['premise_words']], 
                 new_field_name='p_seq', is_input=True)
ds_test.apply(lambda x: [vocab.to_index(word) for word in x['hypotheses_words']], 
                 new_field_name='h_seq', is_input=True)

In [58]:
with open('cnli_data/cn_ds_train_new.pkl','wb') as file:
    pickle.dump(ds_train, file)
with open('cnli_data/cn_ds_dev_new.pkl','wb') as file:
    pickle.dump(ds_dev, file)
with open('cnli_data/cn_ds_test_new.pkl','wb') as file:
    pickle.dump(ds_test, file)

In [61]:
ds_train[3]

{'No': BE6364,
'Premise': 酒吧后面的两个大女人。,
'Hypotheses': 女人在酒吧内。,
'Relationship': entailment,
'premise_words': ['酒吧', '后面', '的', '两个', '大', '女人', '。'],
'hypotheses_words': ['女人', '在', '酒吧', '内', '。'],
'label': 0,
'p_seq': [334, 194, 3, 20, 137, 16, 2],
'h_seq': [16, 4, 334, 396, 2]}

# 4. Load the pre-loaded word embedding matrix
We use load_embedding to load Chinese Word vector, this would return a pytorch Tensor.

In [65]:
from fastNLP.io.embed_loader import EmbedLoader

In [76]:
# emb_dim = 300
# emb_file = './cnli_data/sgns.target.word-word.dynwin5.thr10.neg5.dim300.iter5'
# emb_matrix = EmbedLoader().load_embedding(emb_dim=emb_dim, emb_file=emb_file, 
#                                          emb_type='glove', vocab=vocab)
# with open('./cnli_data/cn_emb_matrix.pkl', 'wb') as file:
#     pickle.dump(emb_matrix, file)
with open('./cnli_data/cn_emb_matrix.pkl', 'rb') as file:
    emb_matrix= pickle.load(file)

# 5. Load model.py
Build two models. One uses wordembedding, the other does not.

In [116]:
import model

In [None]:
mymodel = model.ESIM(hidden_size = 300, embeds_dim = 300, linear_size = 300, num_word = len(vocab))
mymodel.load_pretrained_glove(emb_matrix[0].numpy())

In [117]:
mymodel_no_embedding = model.ESIM(hidden_size = 300, embeds_dim = 300, linear_size = 300, num_word = len(vocab))

# 6.1 Build trainer and train model with word embedding

In [2]:
from fastNLP import CrossEntropyLoss
from fastNLP import AccuracyMetric
from fastNLP import Trainer
from fastNLP.core.optimizer import Adam

In [106]:
trainer = Trainer(model=mymodel,
                  train_data=ds_train,
                  dev_data=ds_dev,
                  loss=CrossEntropyLoss(pred="pred", target="label"),
                  metrics=AccuracyMetric(),
                  optimizer=Adam(lr=0.0004, weight_decay=0), 
                  n_epochs=10,
                  use_cuda=True)
trainer.train()

training epochs started 2019-01-16 10-53-01


HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=12330), HTML(value='')), layout=Layout(displa…

Epoch 1/10. Step:1233/12330. AccuracyMetric: acc=0.957504
Epoch 2/10. Step:2466/12330. AccuracyMetric: acc=0.957124
Epoch 3/10. Step:3699/12330. AccuracyMetric: acc=0.954715
Epoch 4/10. Step:4932/12330. AccuracyMetric: acc=0.952458
Epoch 5/10. Step:6165/12330. AccuracyMetric: acc=0.956667
Epoch 6/10. Step:7398/12330. AccuracyMetric: acc=0.958645
Epoch 7/10. Step:8631/12330. AccuracyMetric: acc=0.958163
Epoch 8/10. Step:9864/12330. AccuracyMetric: acc=0.952991
Epoch 9/10. Step:11097/12330. AccuracyMetric: acc=0.956845
Epoch 10/10. Step:12330/12330. AccuracyMetric: acc=0.955602


# 6.2 Build trainer and train model without word embedding

In [120]:
#Using no embedding ESIM
trainer_no_embedding = Trainer(model=mymodel_no_embedding,
                  train_data=ds_train,
                  dev_data=ds_dev,
                  loss=CrossEntropyLoss(pred="pred", target="label"),
                  metrics=AccuracyMetric(),
                  optimizer=Adam(lr=0.0004, weight_decay=0), 
                  n_epochs=30,
                  use_cuda=True)
trainer_no_embedding.train()

training epochs started 2019-01-16 12-16-40


HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=36990), HTML(value='')), layout=Layout(displa…

Epoch 1/30. Step:1233/36990. AccuracyMetric: acc=0.862573
Epoch 2/30. Step:2466/36990. AccuracyMetric: acc=0.868506
Epoch 3/30. Step:3699/36990. AccuracyMetric: acc=0.874439
Epoch 4/30. Step:4932/36990. AccuracyMetric: acc=0.87842
Epoch 5/30. Step:6165/36990. AccuracyMetric: acc=0.882882
Epoch 6/30. Step:7398/36990. AccuracyMetric: acc=0.884404
Epoch 7/30. Step:8631/36990. AccuracyMetric: acc=0.885342
Epoch 8/30. Step:9864/36990. AccuracyMetric: acc=0.886889
Epoch 9/30. Step:11097/36990. AccuracyMetric: acc=0.893887
Epoch 10/30. Step:12330/36990. AccuracyMetric: acc=0.8948
Epoch 11/30. Step:13563/36990. AccuracyMetric: acc=0.899059
Epoch 12/30. Step:14796/36990. AccuracyMetric: acc=0.896955
Epoch 13/30. Step:16029/36990. AccuracyMetric: acc=0.903319
Epoch 14/30. Step:17262/36990. AccuracyMetric: acc=0.905576
Epoch 15/30. Step:18495/36990. AccuracyMetric: acc=0.901088
Epoch 16/30. Step:19728/36990. AccuracyMetric: acc=0.905271
Epoch 17/30. Step:20961/36990. AccuracyMetric: acc=0.908086


# 7. Test models

In [2]:
from fastNLP import Tester

In [121]:
tester = Tester(data=ds_test, model=mymodel_no_embedding, metrics=AccuracyMetric())
acc = tester.test()

[tester] 
AccuracyMetric: acc=0.921043


In [5]:
tester = Tester(data=ds_test, model=mymodel, metrics=AccuracyMetric())
acc = tester.test()

[tester] 
AccuracyMetric: acc=0.952103


# 8. Save models

In [None]:
torch.save(mymodel, 'cnli_data/cnli_model.pkl')
torch.save(mymodel_no_embedding, 'cnli_data/cnli_model_embedding.pkl')