In [8]:
## 导入库和config
import csv
import random
import numpy as np
from utls import shuffle_and_split,Bag,Dataloader
import layer

# Seed both the stdlib and NumPy global random generators with the same fixed value.
random.seed(42)
np.random.seed(42)

USE_PHRASE = False # False: keep only whole sentences; True: also keep the phrase fragments

流程：

1.读取并处理数据<br>
2.初始化词表（此处使用词袋模型 Bag）<br>
3.分割数据集<br>
4.用数据集和词表设置dataloader

In [None]:
## Read the data and apply light preprocessing
# Forward slashes resolve on both Windows and POSIX; the previous
# r'.\data\train.tsv' form only resolved on Windows.  newline='' is what the
# csv module asks for so that it can handle line endings itself.
# NOTE(review): no encoding is given, so the platform default is used —
# confirm the file's encoding and pin it explicitly.
with open('./data/train.tsv', newline='') as f:
    tsvreader = csv.reader(f, delimiter='\t')
    temp = list(tsvreader)
print(temp[:5])
print(temp[-5:])

# Skip the first row (presumably the header — the rows after it are the ones
# whose columns 1 and 3 get converted to int below).
data = temp[1:]
if not USE_PHRASE:
    # Keep a row only when its column 1 differs from the previous row's,
    # i.e. the first row of every run of equal column-1 values.
    data = [row for i, row in enumerate(data)
            if i == 0 or row[1] != data[i - 1][1]]
print(*data[:5],sep='\n')

# Columns 1 and 3 are read as strings; convert them to int in place.
for row in data:
    row[1] = int(row[1])
    row[3] = int(row[3])
print(*data[:5],sep='\n')

In [None]:
## Build the bag-of-words vocabulary
# Column 2 of every row is the text; only the first 512 texts feed the vocabulary.
# NOTE(review): the data loaders receive all of `data`, not just these 512
# rows — confirm that Bag/Dataloader cope with words missing from the vocabulary.
sentences = [row[2] for row in data][:512]

vocab = Bag(sentences)
vocab.init_vocab()
inputlen = vocab.get_vocab_len()
inputlen

In [None]:
## Data loaders
train_rate = 0.8
batch_size = 48

train_data, valid_data = shuffle_and_split(data, train_rate)
print(len(train_data), len(valid_data))

# The validation loader is built exactly like the training one because its
# labels are needed as well.
train_loader, valid_loader = (
    Dataloader(batch_size, vocab, subset) for subset in (train_data, valid_data)
)

In [None]:
## 模型
class MyLinearModel:
    """A ``layer.Linear`` stage followed by a ``layer.SoftmaxAndCrossEntropy`` stage.

    The methods are tightly coupled and must be called in order:
    ``forward`` first, then ``getloss``, then ``backward``.
    """

    def __init__(self, input, output, lr):
        """Create both stages and initialise the linear stage's parameters.

        Args:
            input: required size of the second dimension of every forward batch.
            output: output size handed to ``layer.Linear``.
            lr: learning rate passed to ``update_param`` in ``backward``.
        """
        self.linear = layer.Linear(input, output)
        self.softmax = layer.SoftmaxAndCrossEntropy()
        self.learningrate = lr
        self.input_len = input
        self.last_input_batchsize = None  # set by forward(), checked by getloss()

        self.linear.init_param()

    def forward(self, x):
        """Return the softmax-stage output for ``x``.

        ``x`` must be a 2-D ``np.ndarray`` whose second dimension equals
        ``input_len``; its first dimension is remembered for ``getloss``.
        """
        assert isinstance(x, np.ndarray)
        assert x.ndim == 2 and x.shape[1] == self.input_len
        self.last_input_batchsize = x.shape[0]
        return self.softmax.forward(self.linear.forward(x))

    def getloss(self, label):
        """Return the loss for ``label``.

        The first dimension of ``label`` must equal the batch size seen by the
        most recent ``forward`` call.
        """
        assert self.last_input_batchsize == label.shape[0]
        return self.softmax.get_loss(label)

    def backward(self):
        """Back-propagate through both stages, then update the linear parameters."""
        self.linear.backward(self.softmax.backward())
        self.linear.update_param(self.learningrate)

# Learning rate and model instance; the input width is the vocabulary length
# computed earlier, and the output size is 5.
lr = 1e-4
model = MyLinearModel(input=inputlen, output=5, lr=lr)

In [None]:
## Training
epoch = 10

for ep in range(epoch):
    batch_num = train_loader.get_batch_nums()
    for i in range(batch_num):
        input_tensor, labels = train_loader[i]
        probs = model.forward(input_tensor)
        # The model requires forward -> getloss -> backward in that order, so
        # getloss is called even though its return value is not used here.
        model.getloss(labels)
        model.backward()

        if i % 10 != 0:
            continue

        # Every 10th batch: show a few outputs/labels and this batch's accuracy.
        probs = probs.reshape(-1, probs.shape[-1])
        print("batch:", i)
        print("softmax的结果:", probs[:5], sep='\n')
        print("lable:", labels[:5])

        predicted = np.argmax(probs, axis=1)
        n_correct = np.sum(predicted == labels)
        print("accuracy：", n_correct / predicted.shape[0])

# Validation: accuracy over every batch served by valid_loader.
valid_batch_num = valid_loader.get_batch_nums()
valid_accuracy = 0      # running count of correct predictions (divided only when printed)
valid_sample_count = 0  # number of predictions actually made
for i in range(valid_batch_num):
    input_tensor, lable = valid_loader[i]
    soft_outp = model.forward(input_tensor)
    ans = np.argmax(soft_outp.reshape(-1,soft_outp.shape[-1]), axis=1)
    valid_accuracy += np.sum(ans == lable)
    # Count the real batch length: dividing by valid_batch_num * batch_size
    # understates accuracy whenever a batch (typically the last one) holds
    # fewer than batch_size samples.
    valid_sample_count += ans.shape[0]

if valid_sample_count:
    print(f"Validation Accuracy: {valid_accuracy / valid_sample_count:.4f}")
else:
    # Avoid a ZeroDivisionError when the loader yields no batches.
    print("Validation Accuracy: n/a (no validation samples)")