# 网络测试

In [1]:
import numpy as np
import torch
import torch.nn as nn
import re
import jieba
import pandas as pd
from gensim.models import KeyedVectors
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from torch.utils.data import random_split,DataLoader
import warnings
import torch.optim as optim
import time
warnings.filterwarnings("ignore")
# 使用gensim加载预训练中文分词embedding, 有可能需要等待1-2分钟
cn_model = KeyedVectors.load_word2vec_format('../models/embeddings/sgns.zhihu.bigram', 
                                             binary=False, unicode_errors="ignore")

In [2]:
def getdata(filename,num_words = 50000,max_tokens = 90):
    data = pd.read_csv(filename).sample(10000)
    data = data.to_numpy()
    
    for item in data:
        text = re.sub("[\s+\/_$%^*(+\"\']+|[+——？、~@#￥%……&*（）]+", "", item[0])
        cut = jieba.cut(text)
        cut_list = [i for i in cut]
        for i, word in enumerate(cut_list):
            try:
                cut_list[i] = cn_model.vocab[word].index
            except:
                cut_list[i] = 0
        item[0] = np.array(cut_list)
        
    train_pad = pad_sequences(data[:,0], maxlen=max_tokens,padding='pre', truncating='pre')
    train_pad[ train_pad>=num_words] = 0
    data_set = [(train_pad[i],data[i][1]) for i in range(len(train_pad))]

    return data_set

In [3]:
def embedding_matrix(num_words = 50000,embedding_dim = 300):

    # 初始化embedding_matrix
    embedding_matrix = np.zeros((num_words, embedding_dim))
    for i in range(num_words):
        embedding_matrix[i,:] = cn_model[ cn_model.index2word[i] ]
    embedding_matrix = embedding_matrix.astype('float32')
    return embedding_matrix

In [26]:
def train(train_db, net, batch_size=20):
    train_loss = 0
    train_acc = 0

    data = DataLoader(train_db, batch_size=batch_size, shuffle=True,drop_last=True)

    for i, (text, label) in enumerate(data):
        optimizer.zero_grad()

        text = text.long()
        label = label.long()

        h = net.initHidden()
        output = net(text, h)
        loss = criterion(output, label)

        train_acc += (label.view(-1, 1) == output.topk(1)[1]).sum().item()
        train_loss += loss.item()

        loss.backward()
        optimizer.step()

    return train_loss / len(train_db), train_acc / len(train_db)

In [82]:
class Net(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, batch_size, num_layers=1,num_words = 50000):
        super().__init__()

        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.batch_size = batch_size

        self.embedding = nn.Embedding.from_pretrained(torch.Tensor(embedding_matrix(num_words=num_words)))
        self.embedding.requires_grad = False

        self.attention = nn.MultiheadAttention(hidden_size * num_layers, 1)
        self.rnn = nn.GRU(input_size, hidden_size, num_layers)
        self.linear = nn.Linear(hidden_size*sequence_len, output_size)
        self.softmax = nn.LogSoftmax(dim=-1)

    def forward(self, x, h):
        x = self.embedding(x)
        x = x.transpose(0, 1)
        x, h = self.rnn(x, h)

        output, output_Weight = self.attention(x, x, x)
        output = output.transpose(0, 1)
        output = output.reshape(-1,output.shape[1]*output.shape[2])
        output = self.linear(output)
        output = self.softmax(output)
        return output

    def initHidden(self):
        h_0 = torch.zeros(self.num_layers, self.batch_size, self.hidden_size)
        return h_0

In [49]:
train_db = getdata('../data/testData_10w.csv')

1

In [83]:
Batch_size = 32
N_EPOCHS = 100
sequence_len = 90
data = DataLoader(train_db, batch_size=Batch_size, shuffle=True,drop_last=True)
net = Net(input_size=300, hidden_size=128, output_size=2, batch_size=Batch_size)
criterion = nn.NLLLoss()
optimizer = optim.SGD(net.parameters(), lr=0.01)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=50, gamma=0.1)
print(net)

Net(
  (embedding): Embedding(50000, 300)
  (attention): MultiheadAttention(
    (out_proj): _LinearWithBias(in_features=128, out_features=128, bias=True)
  )
  (rnn): GRU(300, 128)
  (linear): Linear(in_features=11520, out_features=2, bias=True)
  (softmax): LogSoftmax(dim=-1)
)


In [84]:
for i in data:
    result = net(i[0].long(),net.initHidden())
    print(result.shape)
    break

torch.Size([32, 11520])
torch.Size([32, 2])


In [29]:
train_loss, train_acc = train(train_db, net, Batch_size)
print(train_loss)
print(train_acc)

0.005407605475187302
0.4979


0.005410297530889511
0.4949


# 模块测试

In [56]:
x = torch.randn(10,100,300)
multihead_attn = nn.MultiheadAttention(300, 2)
attn_output, attn_output_weights = multihead_attn(x, x, x)

In [58]:
attn_output.shape[2]

300

# 数据测试

In [4]:
import pandas as pd
data = pd.read_csv('../data/testData_10w.csv').sample(10)
data

Unnamed: 0,comment,rating
15627,不是无脑特效突突突，有点内涵,1
21885,夏洛，特烦恼。幸福在身边,1
11107,老公我爱你,1
98282,看完什么都不会剩下的电影。,0
23902,太棒了，夏雨真是颠覆，演得真好，真是转粉了剧情也甩其他电影几条街。当然身为berger的真...,1
19930,从国产商业片的角度来说算良心佳片，从国产探险片来说，是巅峰,1
39183,有一种神秘的微笑叫闪电。全程精彩，同行伙伴还被一个镜头吓到撒了爆米花哈哈哈～,1
63942,之前看那么多人说这部终于把青春疼痛故事拍好了，一看发现真不该信这个邪。“青春疼痛”就是个定...,0
63698,左边的阿姨笑的昏天黑地 右边的妹子自己咔咔啃苹果 我真后悔我没带包子去吃 观众的笑声更好笑...,0
86486,2.5/10.很多人欠星爷电影票，但这部《美人鱼》之后，星爷就欠很多人电影票了,0


# 模块测试

In [1]:
import numpy as np
import torch
import torch.nn as nn

In [16]:
liner = nn.Linear(100*300,1)

In [26]:
x = torch.randn(100,32,300)

In [30]:
x = x.transpose(0, 1)
x = x.view(-1,100*300)
x.shape

torch.Size([32, 30000])

In [32]:
liner(x).shape

torch.Size([32, 1])

In [77]:
import torch
a=torch.Tensor([[[1,2,3],[4,5,6]],[[1,2,3],[4,5,6]]])
a.shape

torch.Size([2, 2, 3])

In [78]:
a.reshape(-1,6)

tensor([[1., 2., 3., 4., 5., 6.],
        [1., 2., 3., 4., 5., 6.]])