In [4]:
## 导入库和config
import csv
import random
import numpy as np
from utls import shuffle_and_split,Bag,Dataloader,Ngram
import layer

# Seed Python's and NumPy's global RNGs so runs started from a fresh kernel are repeatable.
random.seed(42)
np.random.seed(42)


# configs: None means "not used" (the consuming cell keeps its own default);
# any other value overrides the config of the cells below.
ONLY_LONG_SENTENCE = None       # use only whole sentences (True) or sentence fragments as well (default: False)

TESTSET_RATE = None             # test-set proportion (default: 0.2)
VALIDSET_RATE = None            # validation-set proportion (default: 0.3*0.8)
TRAINSET_RATE = None            # training-set proportion (default: 0.7*0.8)

VOCAB_LENGTH = None             # maximum total vocabulary length for bow or Ngram (default: 400)
USE_NGRAM = None                # whether to use Ngram instead of bow (default: False)
MAXN = None                     # largest N for the ngram (default: 2)

BATCH_SIZE = None               # batch size of the train / validation / test loaders (default: 1000)

流程：

1.处理数据<br>
2.分割数据集<br>
3.初始化word2vec<br>
4.用数据集和word2vec设置dataloader

In [None]:
## Read the data and apply some light preprocessing
# Use whole sentences only, or sentence fragments as well (the config override wins when set)
only_long_sentence = False if ONLY_LONG_SENTENCE is None else ONLY_LONG_SENTENCE


# Read the data.
# Forward slashes keep the relative path valid on both Windows and POSIX systems
# (the backslash form only resolves on Windows). newline='' is what the csv module
# asks for so that the reader itself handles line endings.
with open('./data/train.tsv', newline='') as f:
    tsvreader = csv.reader(f, delimiter='\t')
    # csv.reader yields [] for blank lines; skip them so the int() conversion
    # further down cannot fail with an IndexError on an empty row.
    temp = [row for row in tsvreader if row]
print("文件预览：")
print(temp[:5])
print(temp[-5:])

# Trim: drop the first row (presumably the header line -- it is not parsed as data)
data = temp[1:]
if only_long_sentence:
    print("config: 只保留完整句子")
    # Keep a row only when its column 1 differs from the previous row's column 1,
    # i.e. keep the first row of every run of equal column-1 values.
    data = [row for i, row in enumerate(data)
            if i == 0 or row[1] != data[i-1][1]]
else:
    print("config: 使用全部句子")

# Text -> numbers: columns 0, 1 and 3 are parsed as ints, column 2 is left as text.
# Rows are updated in place.
for row in data:
    row[0] = int(row[0])
    row[1] = int(row[1])
    row[3] = int(row[3])

print("data预览：")
print(*data[:3],sep='\n')


In [None]:
## Split the dataset
testset_rate = 0.2 if TESTSET_RATE is None else TESTSET_RATE
trainset_rate = 0.7*0.8 if TRAINSET_RATE is None else TRAINSET_RATE
validset_rate = 0.3*0.8 if VALIDSET_RATE is None else VALIDSET_RATE

# The three rates are only used relative to each other (they are normalised below),
# so they need not sum to 1 -- but they must be non-negative, and train+valid must be
# positive or the second split ratio would be a division by zero.
if min(testset_rate, trainset_rate, validset_rate) < 0 \
        or (trainset_rate + validset_rate) <= 0:
    raise ValueError(
        "dataset rates must be non-negative and train+valid must be positive, got "
        "test=%r train=%r valid=%r" % (testset_rate, trainset_rate, validset_rate))


# First carve off the test share of the whole data, then divide what is left
# between training and validation.
test_data, temp = shuffle_and_split(data,
                        testset_rate/(testset_rate+trainset_rate+validset_rate))
train_data, valid_data = shuffle_and_split(temp,
                        trainset_rate/(trainset_rate+validset_rate))

data_set = (train_data, test_data, valid_data)
print("test_data length:",len(test_data))
print("train_data length:",len(train_data))
print("valid_data length:",len(valid_data))

In [None]:
## Create word2vec (the text -> vector encoder)
vocab_length = 400 if VOCAB_LENGTH is None else VOCAB_LENGTH
use_ngram = False if USE_NGRAM is None else USE_NGRAM
maxN = 2 if MAXN is None else MAXN


# Column 2 of every row is the text; the encoder is built from the training split only.
sentences_train = [row[2] for row in train_data]
if not use_ngram:
    # use bow
    word2vec = Bag(sentences_train, vocab_length)
else:
    # use ngram: the total length budget is divided evenly by maxN before being
    # handed to Ngram (presumably one share per n -- confirm against utls.Ngram)
    gram_vocab_len = vocab_length//maxN
    word2vec = Ngram(sentences_train, maxN, gram_vocab_len)
# Size reported by the encoder; used later as the model's input length.
inputlen = word2vec.get_vocab_size()

print("使用bow" if not use_ngram else "使用Ngram")
print("词表总长度为:",inputlen)

In [None]:
## Set up the dataloaders
batch_size = 1000 if BATCH_SIZE is None else BATCH_SIZE


# Input pipeline: encode the text with word2vec, then insert a middle axis of
# length 1 so the result has shape (x.shape[0], 1, -1).
input_transformer = [
    word2vec.trans_to_tensor,
    lambda x:np.reshape(x,(x.shape[0],1,-1))
                   ]
# Label pipeline: convert the labels to an int32 array.
label_transformer = [lambda x:np.array(x,dtype=np.int32)]


def _column(rows, index):
    """Return a list holding element ``index`` of every row in ``rows``."""
    return [row[index] for row in rows]


# Row layout used here: column 0 -> debug value, column 2 -> input text, column 3 -> label.
input_train = _column(train_data, 2)
label_train = _column(train_data, 3)
debug_train = _column(train_data, 0)

input_valid = _column(valid_data, 2)
label_valid = _column(valid_data, 3)
debug_valid = _column(valid_data, 0)

input_test = _column(test_data, 2)
label_test = _column(test_data, 3)

train_loader = Dataloader(batch_size,
                          input_train, label_train, debug_train,
                          input_transformer, label_transformer)
valid_loader = Dataloader(batch_size,
                          input_valid, label_valid, debug_valid,
                          input_transformer, label_transformer)
# The test loader gets no debug column, so the transformers are passed by keyword.
test_loader = Dataloader(batch_size, input_test, label_test,
                         input_transform=input_transformer, label_transform=label_transformer)

print("      batch数量, batch大小")
print("train:",len(train_loader),train_loader._batchsize())
print("valid:",len(valid_loader),valid_loader._batchsize())
print("test :",len(test_loader),test_loader._batchsize())

In [None]:
## Model
input_lenght = inputlen   # (sic) misspelled name; the model construction at the end of this cell reads it
output_length = 5         # output size passed to the model (predictions are an argmax over this axis)
param = None              # NOTE(review): not read anywhere in the visible notebook


class MyLinearModel:
    """A linear layer followed by a combined softmax / cross-entropy layer.

    The methods are tightly coupled: for every batch call ``forward`` first,
    then ``getloss``, then ``backward``.
    """
    def __init__(self,input,output,params=None):
        """Build the two layers and set up the linear layer's parameters.

        Args:
            input: input size handed to ``layer.Linear``; also stored as ``input_len``.
            output: output size handed to ``layer.Linear``.
            params: optional sized sequence that is unpacked into
                ``Linear.load_param``. When it is None or empty the linear
                layer initialises its own parameters via ``init_param``.
        """
        self.linear = layer.Linear(input,output)
        self.softmax = layer.SoftmaxAndCrossEntropy()
        self.input_len = input

        # Explicit None / emptiness test instead of plain truthiness: ``if params:``
        # raises ValueError when params is a numpy array with more than one element.
        if params is None or len(params) == 0:
            self.linear.init_param()
        else:
            self.linear.load_param(*params)

    def forward(self,x):
        """Pass ``x`` through the linear layer and then the softmax layer; return the softmax output."""
        x = self.linear.forward(x)
        x = self.softmax.forward(x)
        return x
    def getloss(self,label):
        """Return the loss the softmax/cross-entropy layer reports for ``label``."""
        return self.softmax.get_loss(label)
    def backward(self, learning_rate):
        """Back-propagate through both layers, then update the linear layer with ``learning_rate``."""
        mid_stream = self.softmax.backward()
        # The linear layer's own return value is not needed: nothing sits before it.
        self.linear.backward(mid_stream)
        self.linear.update_param(learning_rate)

# Instantiate the model with freshly initialised parameters (no params passed).
model = MyLinearModel(input=input_lenght, output=output_length)

In [None]:
## Training
epoch = 300               # number of passes over the training loader
lr = 0.03                 # learning rate handed to model.backward
valid_interval = 200      # run validation every this many training batches
display_interval = 50     # print training diagnostics every this many training batches


def _count_correct(model, loader):
    """Run ``model`` over every batch of ``loader``.

    Returns:
        (right_num, sample_num): the number of argmax predictions equal to the
        label, and the number of predictions actually made. Counting real
        predictions keeps the accuracy right even if the last batch is shorter
        than the nominal batch size.
    """
    right_num = 0
    sample_num = 0
    for batch_idx in range(len(loader)):
        batch_input, batch_label, _debug = loader[batch_idx]
        batch_outp = model.forward(batch_input)
        batch_ans = np.argmax(batch_outp.reshape(-1, batch_outp.shape[-1]), axis=1)
        right_num += np.sum(batch_ans == batch_label)
        sample_num += batch_ans.shape[0]
    return right_num, sample_num


trainnum = 0              # total number of training batches processed so far
for ep in range(epoch):
    batch_num = len(train_loader)
    for i in range(batch_num):
        input_tensor, label, debug = train_loader[i]
        # forward -> getloss -> backward: the order the model requires
        soft_outp = model.forward(input_tensor)
        model.getloss(label)
        model.backward(lr)
        trainnum += 1
        if trainnum % display_interval == 0:
            # flatten to (samples, classes) for printing and argmax
            soft_outp = soft_outp.reshape(-1, soft_outp.shape[-1])
            print("batch:",i)
            print("softmax的结果:",soft_outp[:5],sep='\n')
            print("softmax预测:",np.argmax(soft_outp[:5],axis=1))
            print("lable:",label[:5])

            ans = np.argmax(soft_outp,axis=1)
            right = np.sum(ans==label)
            print("accuracy: ",right/ans.shape[0])
        if trainnum % valid_interval == 0:
            # Done in a helper so the validation pass cannot overwrite the
            # training loop's own variables (i, input_tensor, label, soft_outp).
            valid_right_num, valid_sample_num = _count_correct(model, valid_loader)
            print("---------------------------------")
            print("验证Accuracy:",
                valid_right_num/max(valid_sample_num, 1))
            print("---------------------------------")

In [None]:
## Testing
test_batch_num = len(test_loader)
test_accuracy = 0       # running count of correct predictions (name kept from the original cell)
test_sample_num = 0     # number of predictions actually made
for i in range(test_batch_num):
    input_tensor, lable, debug = test_loader[i]
    soft_outp = model.forward(input_tensor)
    # flatten to (samples, classes) and take the most probable class per sample
    ans = np.argmax(soft_outp.reshape(-1,soft_outp.shape[-1]), axis=1)
    test_accuracy += np.sum(ans == lable)
    test_sample_num += ans.shape[0]
# Divide by the samples really evaluated rather than batches * nominal batch size,
# which over-counts whenever the final batch is shorter than the rest.
print("测试Accuracy:",
      test_accuracy/max(test_sample_num, 1))