In [1]:
from fastNLP import DataSet
from fastNLP import Instance
from fastNLP import Vocabulary



In [51]:
data_train=DataSet.read_csv("data_train.txt",headers=('a','text','label'))
data_test=DataSet.read_csv("data_test.txt",headers=('a','text','label'))

将文本小写，将label转化为INT格式

In [53]:
data_train.apply(lambda x: int(x['label']),new_field_name='label')
data_train.apply(lambda x: x['text'].lower(), new_field_name='text')
data_test.apply(lambda x: int(x['label']),new_field_name='label')
data_test.apply(lambda x: x['text'].lower(), new_field_name='text')


分词

In [54]:
def split_sent(instance):
    return instance['text'].split()

In [55]:
data_train.apply(split_sent,new_field_name='description_words')
data_test.apply(split_sent,new_field_name='description_words')

In [56]:
data_train

DataSet({'a': 0 type=str,
'text': wall st. bears claw back into the black (reuters) reuters - short-sellers  wall street's dwindling\band of ultra-cynics  are seeing green again. type=str,
'label': 1 type=int,
'description_words': ['wall', 'st.', 'bears', 'claw', 'back', 'into', 'the', 'black', '(reuters)', 'reuters', '-', 'short-sellers', 'wall', "street's", 'dwindling\\band', 'of', 'ultra-cynics', 'are', 'seeing', 'green', 'again.'] type=list},
{'a': 1 type=str,
'text': carlyle looks toward commercial aerospace (reuters) reuters - private investment firm carlyle group \which has a reputation for making well-timed and occasionally\controversial plays in the defense industry  has quietly placed\its bets on another part of the market. type=str,
'label': 1 type=int,
'description_words': ['carlyle', 'looks', 'toward', 'commercial', 'aerospace', '(reuters)', 'reuters', '-', 'private', 'investment', 'firm', 'carlyle', 'group', '\\which', 'has', 'a', 'reputation', 'for', 'making', 'well-time

分词后的长度

In [57]:
data_train.apply(lambda x: len(x['description_words']),new_field_name='description_seq_len')
data_test.apply(lambda x: len(x['description_words']),new_field_name='description_seq_len')

计算最长的长度

In [58]:
max_seq_len_train=0
max_seq_len_test=0
for i in range (len(data_train)):
    if(data_train[i]['description_seq_len'] > max_seq_len_train):
        max_seq_len_train = data_train[i]['description_seq_len']
    else:
        pass
for i in range (len(data_test)):
    if(data_test[i]['description_seq_len'] > max_seq_len_test):
        max_seq_len_test = data_test[i]['description_seq_len']
    else:
        pass

In [59]:
max_sentence_length = max_seq_len_train
if (max_seq_len_test > max_sentence_length):
    max_sentence_length = max_seq_len_test
print ('max_sentence_length:',max_sentence_length)

max_sentence_length: 177


将分词作为输入

In [60]:
data_train.set_input("description_words")
data_test.set_input("description_words")

设置label为输出

In [61]:
data_train.set_target("label")
data_test.set_target("label")

根据训练集来建立词典

In [62]:
vocab = Vocabulary(min_freq=2)
data_train.apply(lambda x:[vocab.add(word) for word in x['description_words']])
vocab.build_vocab()

将词典的index索引替换单词

In [63]:
data_train.apply(lambda x: [vocab.to_index(word) for word in x['description_words']],new_field_name='description_words')
data_test.apply(lambda x: [vocab.to_index(word) for word in x['description_words']],new_field_name='description_words')

In [65]:
data_train

DataSet({'a': 0 type=str,
'text': wall st. bears claw back into the black (reuters) reuters - short-sellers  wall street's dwindling\band of ultra-cynics  are seeing green again. type=str,
'label': 1 type=int,
'description_words': [386, 455, 1663, 15976, 102, 54, 2, 839, 29, 91, 10, 50794, 386, 6893, 1, 5, 50795, 35, 3937, 760, 2619] type=list,
'description_seq_len': 21 type=int},
{'a': 1 type=str,
'text': carlyle looks toward commercial aerospace (reuters) reuters - private investment firm carlyle group \which has a reputation for making well-timed and occasionally\controversial plays in the defense industry  has quietly placed\its bets on another part of the market. type=str,
'label': 1 type=int,
'description_words': [17218, 1011, 794, 1220, 4211, 29, 91, 10, 889, 761, 307, 17218, 89, 31956, 20, 4, 4650, 9, 528, 50796, 7, 1, 2128, 6, 2, 504, 253, 20, 3938, 1, 6809, 8, 189, 323, 5, 2, 1290] type=list,
'description_seq_len': 37 type=int},
{'a': 2 type=str,
'text': oil and economy cloud

将长度不满足最长长度的句子用“0”来填充

In [26]:
def padding_words(data):
    for i in range(len(data)):
        if data[i]['description_seq_len'] <= max_sentence_length:
            padding = [0] * (max_sentence_length - data[i]['description_seq_len'])
            data[i]['description_words'] += padding
        else:
            pass
    return data

In [66]:
data_train= padding_words(data_train)
data_test = padding_words(data_test)
data_train.apply(lambda x: len(x['description_words']), new_field_name='description_seq_len')
data_test.apply(lambda x: len(x['description_words']), new_field_name='description_seq_len')

In [67]:
data_train

DataSet({'a': 0 type=str,
'text': wall st. bears claw back into the black (reuters) reuters - short-sellers  wall street's dwindling\band of ultra-cynics  are seeing green again. type=str,
'label': 1 type=int,
'description_words': [386, 455, 1663, 15976, 102, 54, 2, 839, 29, 91, 10, 50794, 386, 6893, 1, 5, 50795, 35, 3937, 760, 2619, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] type=list,
'description_seq_len': 177 type=int},
{'a': 1 type=str,
'text': carlyle looks toward commercial aerospace (reuters) reuters - private investment firm carlyle group \which has a reputati

文档重命名

In [79]:
data_train.rename_field("description_words","description_word_seq")
data_train.rename_field("label","label_seq")
data_test.rename_field("description_words","description_word_seq")
data_test.rename_field("label","label_seq")

KeyError: 'DataSet has no field named description_words.'

In [80]:

data_train.set_input("description_word_seq")
data_test.set_input("description_word_seq")
data_train.set_target("label_seq")
data_test.set_target("label_seq")

In [70]:
data_train

DataSet({'a': 0 type=str,
'text': wall st. bears claw back into the black (reuters) reuters - short-sellers  wall street's dwindling\band of ultra-cynics  are seeing green again. type=str,
'description_seq_len': 177 type=int,
'description_word_seq': [386, 455, 1663, 15976, 102, 54, 2, 839, 29, 91, 10, 50794, 386, 6893, 1, 5, 50795, 35, 3937, 760, 2619, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] type=list,
'label_seq': 1 type=int},
{'a': 1 type=str,
'text': carlyle looks toward commercial aerospace (reuters) reuters - private investment firm carlyle group \which has a r

In [69]:
print("dataset processed successfully!")

dataset processed successfully!


模型实现

In [71]:
import torch
import torch.nn as nn

class ResnetBlock(nn.Module):
    def __init__(self, channel_size):
        super(ResnetBlock, self).__init__()

        self.channel_size = channel_size
        self.maxpool = nn.Sequential(
            nn.ConstantPad1d(padding=(0, 1), value=0),
            nn.MaxPool1d(kernel_size=3, stride=2)
        )
        self.conv = nn.Sequential(
            nn.BatchNorm1d(num_features=self.channel_size),
            nn.ReLU(),
            nn.Conv1d(self.channel_size, self.channel_size, kernel_size=3, padding=1),

            nn.BatchNorm1d(num_features=self.channel_size),
            nn.ReLU(),
            nn.Conv1d(self.channel_size, self.channel_size, kernel_size=3, padding=1),
        )

    def forward(self, x):
        x_shortcut = self.maxpool(x)
        x = self.conv(x_shortcut)
        x = x + x_shortcut
        return x


class DPCNN(nn.Module):
    def __init__(self,max_features,word_embedding_dimension,max_sentence_length,num_classes):
        super(DPCNN, self).__init__()
        self.max_features = max_features
        self.embed_size = word_embedding_dimension
        self.maxlen = max_sentence_length
        self.num_classes=num_classes
        self.channel_size = 250

        self.embedding = nn.Embedding(self.max_features, self.embed_size)
        torch.nn.init.normal_(self.embedding.weight.data,mean=0,std=0.01)
        self.embedding.weight.requires_grad = True

        # region embedding
        self.region_embedding = nn.Sequential(
            nn.Conv1d(self.embed_size, self.channel_size, kernel_size=3, padding=1),
            nn.BatchNorm1d(num_features=self.channel_size),
            nn.ReLU(),
            nn.Dropout(0.2)
        )

        self.conv_block = nn.Sequential(
            nn.BatchNorm1d(num_features=self.channel_size),
            nn.ReLU(),
            nn.Conv1d(self.channel_size, self.channel_size, kernel_size=3, padding=1),
            nn.BatchNorm1d(num_features=self.channel_size),
            nn.ReLU(),
            nn.Conv1d(self.channel_size, self.channel_size, kernel_size=3, padding=1),
        )

训练

In [75]:
word_embedding_dimension = 300
num_classes = 4
pickle_path = 'result/'

In [84]:
from fastNLP import Trainer
from copy import deepcopy
from fastNLP.core.losses import CrossEntropyLoss
from fastNLP.core.metrics import AccuracyMetric
from fastNLP.core.optimizer import Adam
from fastNLP.core.utils import save_pickle


# load model
model=DPCNN(max_features=len(vocab),word_embedding_dimension=word_embedding_dimension,max_sentence_length = max_sentence_length,num_classes=num_classes)

# define loss and metric
loss = CrossEntropyLoss(pred="output",target="label_seq")
metric = AccuracyMetric(pred="predict", target="label_seq")

# train model with train_data,and val model witst_data
# embedding=300 gaussian init，weight_decay=0.0001, lr=0.001，epoch=5
trainer=Trainer(model=model,train_data=data_train,dev_data=data_test,loss=loss,metrics=metric,save_path=None,batch_size=32,n_epochs=5,optimizer=Adam(lr=0.001, weight_decay=0.0001))
trainer.train()

# save pickle
save_pickle(model,pickle_path=pickle_path,file_name='new_model.pkl')

input fields after batch(if batch size is 2):
	description_word_seq: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2, 177]) 
target fields after batch(if batch size is 2):
	label_seq: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) 



NotImplementedError: 