In [1]:
## 导入库和config
import csv
import random
import numpy as np
from utls import shuffle_and_split,Bag,Dataloader
import layer

# Seed both random number generators (stdlib `random` and NumPy's global
# generator) with one shared value so repeated runs draw the same numbers.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

流程：

1.处理数据<br>
2.分割数据集<br>
3.初始化词袋模型（变量名为word2vec，实际是Bag词袋，而非word2vec词向量）<br>
4.用数据集和word2vec设置dataloader

In [2]:
## Load the raw data and apply light preprocessing
# False: use every row (full sentences and phrase fragments).
# True: keep only the first row of each SentenceId, which the preview shows
# to be the complete sentence.
ONLY_LONG_SENTENCE = False
# Forward slashes resolve on Windows and POSIX alike; the previous
# r'.\data\train.tsv' literal only worked on Windows.
DATA_PATH = './data/train.tsv'


# Read the whole TSV into memory.
# newline='' is what the csv module requires of the file object, and an
# explicit encoding avoids depending on the platform's locale default.
# NOTE(review): assumes the file is UTF-8 encoded - confirm.
with open(DATA_PATH, encoding='utf-8', newline='') as f:
    tsvreader = csv.reader(f, delimiter='\t')
    temp = list(tsvreader)
print("文件预览：")
print(temp[:5])
print(temp[-5:])

# Drop the header row.
data = temp[1:]
if ONLY_LONG_SENTENCE:
    print("config: 只保留完整句子")
    # Keep a row only when its SentenceId differs from the previous row's,
    # i.e. the first row of every sentence group.
    data = [row for i, row in enumerate(data)
            if (i == 0 or row[1] != data[i-1][1])]
else:
    print("config: 使用全部句子")

# Convert PhraseId, SentenceId and Sentiment from text to int.
# New row lists are built so the raw string rows kept in `temp` stay
# untouched (the previous in-place conversion mutated them as well).
data = [[int(row[0]), int(row[1]), row[2], int(row[3])] for row in data]

print("data预览：")
print(*data[:3],sep='\n')


文件预览：
[['PhraseId', 'SentenceId', 'Phrase', 'Sentiment'], ['1', '1', 'A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .', '1'], ['2', '1', 'A series of escapades demonstrating the adage that what is good for the goose', '2'], ['3', '1', 'A series', '2'], ['4', '1', 'A', '2']]
[['156056', '8544', "Hearst 's", '2'], ['156057', '8544', 'forced avuncular chortles', '1'], ['156058', '8544', 'avuncular chortles', '3'], ['156059', '8544', 'avuncular', '2'], ['156060', '8544', 'chortles', '2']]
config: 使用全部句子
data预览：
[1, 1, 'A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .', 1]
[2, 1, 'A series of escapades demonstrating the adage that what is good for the goose', 2]
[3, 1, 'A series', 2]


In [3]:
## Split the dataset into test / train / validation subsets
testset_rate = 0.2
trainset_rate = 0.7
validset_rate = 0.3

# Fraction handed to the second split: train's share of (train + valid).
train_share = trainset_rate/(trainset_rate+validset_rate)

# The first split separates the test subset; its remainder (`temp`) is then
# split again into the train and validation subsets.
test_data, temp = shuffle_and_split(data, testset_rate)
train_data, valid_data = shuffle_and_split(temp, train_share)

data_set = (train_data, test_data, valid_data)

# Report the size of each subset.
for caption, subset in (("test_data length:", test_data),
                        ("train_data length:", train_data),
                        ("valid_data length:", valid_data)):
    print(caption, len(subset))

test_data length: 300
train_data length: 840
valid_data length: 360


In [4]:
## Build the bag-of-words model (bound to the name `word2vec`)
max_vocab_length = 400


# Index 2 of every row is the phrase text; only training phrases are passed
# to Bag.
sentences_train = [row[2] for row in train_data]
word2vec = Bag(sentences_train, max_vocab_length)

# The vocabulary size reported by the model is reused later as the model's
# input length.
inputlen = word2vec.get_vocab_size()

inputlen

400

In [5]:
## Set up the dataloaders
batch_size = 1000


def _add_middle_axis(x):
    """Reshape x to (x.shape[0], 1, -1): keep axis 0, insert a length-1 axis, flatten the rest."""
    return np.reshape(x,(x.shape[0],1,-1))


def _as_int32_array(x):
    """Convert a sequence of labels to an int32 NumPy array."""
    return np.array(x,dtype=np.int32)


# Transforms are listed in order: text -> tensor -> reshaped tensor.
input_transformer = [word2vec.trans_to_tensor, _add_middle_axis]
label_transformer = [_as_int32_array]

# Row layout is [PhraseId, SentenceId, Phrase, Sentiment].
ID_COL, TEXT_COL, LABEL_COL = 0, 2, 3

input_train = [row[TEXT_COL] for row in train_data]
label_train = [row[LABEL_COL] for row in train_data]
debug_train = [row[ID_COL] for row in train_data]

input_valid = [row[TEXT_COL] for row in valid_data]
label_valid = [row[LABEL_COL] for row in valid_data]
debug_valid = [row[ID_COL] for row in valid_data]

input_test = [row[TEXT_COL] for row in test_data]
label_test = [row[LABEL_COL] for row in test_data]

# Train and valid loaders also carry the PhraseId list as debug data;
# the test loader is built without it, hence the keyword arguments.
train_loader = Dataloader(batch_size,
                          input_train, label_train, debug_train,
                          input_transformer, label_transformer)
valid_loader = Dataloader(batch_size,
                          input_valid, label_valid, debug_valid,
                          input_transformer, label_transformer)
test_loader = Dataloader(batch_size, input_test, label_test,
                         input_transform=input_transformer,
                         label_transform=label_transformer)

# One summary line per loader: batch count, then batch size.
print("      batch数量, batch大小")
for tag, loader in (("train:", train_loader),
                    ("valid:", valid_loader),
                    ("test :", test_loader)):
    print(tag, len(loader), loader._batchsize())

      batch数量, batch大小
train: 1 700
valid: 1 300
test : 0 700


In [6]:
## Model configuration
# Passed as the model's input size; equals the vocabulary size returned by
# word2vec.get_vocab_size().
input_lenght = inputlen
# Passed as the model's output size.
# presumably one unit per sentiment class - TODO confirm.
output_length = 5
# NOTE(review): looks intended as pre-trained parameters for the model's
# `params` argument - confirm; None here.
param = None


class MyLinearModel:
    """A linear layer followed by a combined softmax + cross-entropy layer.

    The methods are tightly coupled and must be called in this order for
    each batch: forward(), then getloss(), then backward().
    """
    def __init__(self,input,output,params=None):
        """Build the two layers and load or initialise the linear parameters.

        Args:
            input: input size handed to layer.Linear.
            output: output size handed to layer.Linear.
            params: optional sequence unpacked into linear.load_param(*params).
                None or an empty sequence means fresh initialisation via
                linear.init_param().
        """
        self.linear = layer.Linear(input,output)
        self.softmax = layer.SoftmaxAndCrossEntropy()
        self.input_len = input

        # Explicit None/empty check: a bare `if params:` raises ValueError
        # when params is a NumPy array with more than one element.
        if params is not None and len(params) > 0:
            self.linear.load_param(*params)
        else:
            self.linear.init_param()

    def forward(self,x):
        """Run x through the linear layer, then softmax; return the softmax output."""
        x = self.linear.forward(x)
        x = self.softmax.forward(x)
        return x
    def getloss(self,label):
        """Return the loss computed by the softmax/cross-entropy layer for `label`."""
        return self.softmax.get_loss(label)
    def backward(self, learning_rate):
        """Back-propagate through both layers, then update the linear parameters.

        Args:
            learning_rate: step size passed to linear.update_param.
        """
        mid_stream = self.softmax.backward()
        mid_stream = self.linear.backward(mid_stream)
        self.linear.update_param(learning_rate)

# Build the model. `param` from the config lines above is forwarded so that
# setting it takes effect; while it is None the model initialises fresh
# parameters exactly as before.
model = MyLinearModel(input_lenght, output_length, param)

linear层init完成
Softmax层init完成


In [23]:
## Training
epoch = 10
lr = 0.01
display_frequence = 50  # print a progress report every this many updates

allnum = 0  # number of parameter updates performed so far, across epochs
for ep in range(epoch):
    batch_num = len(train_loader)
    for i in range(batch_num):
        # The third item is the loader's debug data, unused while training.
        # It is bound to a named variable rather than `_` so IPython's
        # last-output variable is not shadowed.
        input_tensor, lable, _debug_ids = train_loader[i]
        soft_outp = model.forward(input_tensor)
        # getloss() has to run between forward() and backward() (the model
        # documents this ordering). Its value is kept so it can be reported
        # instead of being thrown away.
        loss = model.getloss(lable)
        model.backward(lr)
        allnum+=1
        if allnum%display_frequence == 0:
            # Flatten to 2-D with one row of class probabilities per sample.
            soft_outp = soft_outp.reshape(-1,soft_outp.shape[-1])
            print("epoch:",ep)
            print("batch:",i)
            print("loss:",loss)
            print("softmax的结果:",soft_outp[:5],sep='\n')
            print("softmax预测:",np.argmax(soft_outp[:5],axis=1))
            print("lable:",lable[:5])

            # Accuracy on the current batch, from the pre-update outputs.
            ans = np.argmax(soft_outp,axis=1)
            right = np.sum(ans==lable)
            print("accuracy: ",right/ans.shape[0])

batch: 0
softmax的结果:
[[1.0416577e-02 8.7355994e-02 7.0195788e-01 1.8563959e-01 1.4629932e-02]
 [8.0389192e-04 4.4858340e-02 9.1334939e-01 2.5073273e-02 1.5915178e-02]
 [5.5029998e-03 8.3085075e-02 8.4435928e-01 3.0281398e-02 3.6771260e-02]
 [2.0759597e-03 9.9396449e-01 3.6632768e-03 2.5360971e-05 2.7095727e-04]
 [3.1365906e-03 7.6682523e-02 7.6308370e-01 1.2623909e-01 3.0858070e-02]]
softmax预测: [2 2 2 1 2]
lable: [2 2 2 1 2]
accuracy:  0.81
batch: 0
softmax的结果:
[[1.0374169e-02 8.7234281e-02 7.0214790e-01 1.8562384e-01 1.4619758e-02]
 [7.9968228e-04 4.4748135e-02 9.1355997e-01 2.5007430e-02 1.5884832e-02]
 [5.4722871e-03 8.2783580e-02 8.4474409e-01 3.0128408e-02 3.6871579e-02]
 [2.0474251e-03 9.9402726e-01 3.6337420e-03 2.4969529e-05 2.6666696e-04]
 [3.1207225e-03 7.6482087e-02 7.6357192e-01 1.2602222e-01 3.0803056e-02]]
softmax预测: [2 2 2 1 2]
lable: [2 2 2 1 2]
accuracy:  0.8114285714285714
batch: 0
softmax的结果:
[[1.0332100e-02 8.7113313e-02 7.0233661e-01 1.8560840e-01 1.4609595e-02]
 [

In [24]:
## Validation
valid_batch_num = len(valid_loader)
# Running count of correct predictions (the name is kept from the original
# cell even though it holds a count, not a ratio).
valid_accuracy = 0
# Number of samples actually evaluated. Counting them avoids assuming every
# batch holds exactly valid_loader._batchsize() samples.
valid_total = 0
for i in range(valid_batch_num):
    input_tensor, lable, debug = valid_loader[i]
    soft_outp = model.forward(input_tensor)
    # Flatten to one row of class probabilities per sample, pick the argmax.
    ans = np.argmax(soft_outp.reshape(-1,soft_outp.shape[-1]), axis=1)
    valid_accuracy += np.sum(ans == lable)
    valid_total += ans.shape[0]

# A loader with zero batches used to end in a bare ZeroDivisionError here.
if valid_total == 0:
    raise ValueError(
        "valid_loader yielded no samples; check batch_size against the "
        "size of the validation split")
print("验证Accuracy:",
      valid_accuracy/valid_total)

验证Accuracy: 0.6666666666666666
