In [1]:
import numpy as np
import torch
import torch.nn as nn
import re
import jieba
import pandas as pd
from gensim.models import KeyedVectors
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from torch.utils.data import random_split,DataLoader
import warnings
import torch.optim as optim
import time
warnings.filterwarnings("ignore")
# 使用gensim加载预训练中文分词embedding, 有可能需要等待1-2分钟
cn_model = KeyedVectors.load_word2vec_format('../models/embeddings/sgns.zhihu.bigram', 
                                             binary=False, unicode_errors="ignore")

# 处理数据

In [2]:
def getdata(filename,num_words = 50000,max_tokens = 90):
    data = pd.read_csv(filename).sample(3000)
    data = data.to_numpy()
    
    for item in data:
        text = re.sub("[\s+\/_$%^*(+\"\']+|[+——？、~@#￥%……&*（）]+", "", item[0])
        cut = jieba.cut(text)
        cut_list = [i for i in cut]
        for i, word in enumerate(cut_list):
            try:
                cut_list[i] = cn_model.vocab[word].index
            except:
                cut_list[i] = 0
        item[0] = np.array(cut_list)
        
    train_pad = pad_sequences(data[:,0], maxlen=max_tokens,padding='pre', truncating='pre')
    train_pad[ train_pad>=num_words] = 0
    data_set = [(train_pad[i],data[i][1]) for i in range(len(train_pad))]

    return data_set

# 初始化embedding_matrix

In [3]:
def embedding_matrix(num_words = 50000,embedding_dim = 300):

    # 初始化embedding_matrix
    embedding_matrix = np.zeros((num_words, embedding_dim))
    for i in range(num_words):
        embedding_matrix[i,:] = cn_model[ cn_model.index2word[i] ]
    embedding_matrix = embedding_matrix.astype('float32')
    return embedding_matrix

# 网络

In [4]:
class Net(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, batch_size, num_layers=1):
        super().__init__()

        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.batch_size = batch_size

        self.embedding = nn.Embedding.from_pretrained(torch.Tensor(embedding_matrix()))
        self.embedding.requires_grad = False

        self.attention = nn.MultiheadAttention(hidden_size*num_layers,2)
        self.rnn = nn.GRU(input_size, hidden_size, num_layers)
        self.linear = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=-1)
        

    def forward(self, x, h):
        x = self.embedding(x)
        x = x.transpose(0, 1)
        x,h = self.rnn(x,h)

        output,output_Weight = self.attention(x,x,x)
        output = self.linear(output[-1])
        output = self.softmax(output)
        return output

    def initHidden(self):
        h_0 = torch.zeros(self.num_layers, self.batch_size, self.hidden_size)
        return h_0

In [10]:
# val_db = getdata(filename = '../data/testData_10w.csv',num_words = 50000,max_tokens = 100)


# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Batch_size = 2

# net = Net(input_size=300, hidden_size=128, output_size=2, batch_size=Batch_size)
# net = net
# criterion = nn.NLLLoss()
# input = None
# data = DataLoader(val_db, batch_size=Batch_size, shuffle=True,drop_last=True)
# for i in data:
#     input = i

In [11]:
# output = net(input[0].long(),net.initHidden()).transpose(0,1)
# output.shape


In [12]:
# output

# 超参数

In [5]:
train_db = getdata(filename = '../data/shopping/train.csv',num_words = 50000,max_tokens = 100)
val_db = getdata(filename = '../data/shopping/val.csv',num_words = 50000,max_tokens = 100)
test_db = getdata(filename = '../data/shopping/test.csv',num_words = 50000,max_tokens = 100)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\ADMINI~1\AppData\Local\Temp\jieba.cache
Loading model cost 0.751 seconds.
Prefix dict has been built successfully.


In [6]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
Batch_size = 128
N_EPOCHS = 100

net = Net(input_size=300, hidden_size=128, output_size=2, batch_size=Batch_size)
net = net.to(device)
criterion = nn.NLLLoss().to(device)
optimizer = optim.SGD(net.parameters(), lr=0.001)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=50, gamma=0.5)

print(net)

Net(
  (embedding): Embedding(50000, 300)
  (attention): MultiheadAttention(
    (out_proj): _LinearWithBias(in_features=128, out_features=128, bias=True)
  )
  (rnn): GRU(300, 128)
  (linear): Linear(in_features=128, out_features=2, bias=True)
  (softmax): LogSoftmax(dim=-1)
)


In [13]:
# nn.init.orthogonal_(net.rnn.weights,1)


# 训练

In [7]:
def train(train_db, net, batch_size=20):
    train_loss = 0
    train_acc = 0

    data = DataLoader(train_db, batch_size=batch_size, shuffle=True,drop_last=True)
    epoch = 0
    for i, (text, label) in enumerate(data):
        optimizer.zero_grad()

        text = text.long().to(device)
        label = label.long().to(device)

        h = net.initHidden()
        h = h.to(device)
        output = net(text, h)
        loss = criterion(output, label)

        train_acc += (label.view(-1, 1) == output.topk(1)[1]).sum().item()
        train_loss += loss.item()

        loss.backward()
        optimizer.step()
        epoch = epoch + 1

    return train_loss / (epoch*batch_size), train_acc / (epoch*batch_size)


def valid(val_db, net, batch_size=20):
    val_loss = 0
    val_acc = 0

    data = DataLoader(val_db, batch_size=batch_size, shuffle=True,drop_last=True)
    epoch = 0
    for text, label in data:
        with torch.no_grad():
            text = text.long().to(device)
            label = label.long().to(device)

            h = net.initHidden()
            h = h.to(device)
            output = net(text, h)
            loss = criterion(output, label)

            val_acc += (label.view(-1, 1) == output.topk(1)[1]).sum().item()
            val_loss += loss.item()
            epoch = epoch+1

    return val_loss / (epoch*batch_size), val_acc / (epoch*batch_size)

def test(test_db, net, batch_size=20):
    data = DataLoader(test_db, batch_size=batch_size, shuffle=True,drop_last=True)
    TP = 0
    TN = 0
    FP = 0
    FN = 0
    for text, label in data:
        with torch.no_grad():
            text = text.long().to(device)
            label = label.long().to(device)

            h = net.initHidden()
            h = h.to(device)
            output = net(text, h)
            prediction = output.topk(1)[1]
            for p, t in zip(prediction.view(-1), label.view(-1)):
                if((p==1) & (t==1)):
                    TP = TP + 1
                elif((p==0) & (t==0)):
                    TN = TN + 1
                elif((p==1) & (t==0)):
                    FP = FP +1
                elif((p==0) & (t==1)):
                    FN = FN +1
    return TP,TN,FP,FN
print(train(train_db, net, batch_size=Batch_size))
print(valid(val_db, net, batch_size=Batch_size))
print(test(test_db, net, batch_size=Batch_size))

(0.005437492433449496, 0.477241847826087)
(0.005437221094641996, 0.4813179347826087)
(1248, 142, 1351, 203)


In [20]:
# from torch.utils.tensorboard import SummaryWriter


# writer = SummaryWriter('../log/test3')
# writer.add_graph(net,(torch.zeros(32,90).long().to(device),net.initHidden().to(device)))
N_EPOCHS = 10
start_time = time.time()
for epoch in range(N_EPOCHS):

    train_loss, train_acc = train(train_db, net, Batch_size)
    valid_loss, valid_acc = valid(val_db, net, Batch_size)
    scheduler.step()

    secs = int(time.time() - start_time)

    mins = secs / 60
    secs = secs % 60
#     writer.add_scalars('Loss', {'train':train_loss,
#                                 'test':valid_loss}, epoch)
#     writer.add_scalars('Acc', {'train':train_acc,
#                                 'test':valid_acc}, epoch)

    print('Epoch: %d' % (epoch + 1), " | time in %d minites, %d seconds" % (mins, secs))
    print(f'\tLoss: {train_loss:.4f}(train)\t|\tAcc: {train_acc * 100:.1f}%(train)')
    print(f'\tLoss: {valid_loss:.4f}(valid)\t|\tAcc: {valid_acc * 100:.1f}%(valid)')
    
# writer.close()
# torch.save(net.state_dict(), '../models/LSTM.pkl')

Epoch: 1  | time in 0 minites, 0 seconds
	Loss: 0.0055(train)	|	Acc: 49.5%(train)
	Loss: 0.0055(valid)	|	Acc: 47.8%(valid)
Epoch: 2  | time in 0 minites, 1 seconds
	Loss: 0.0055(train)	|	Acc: 49.5%(train)
	Loss: 0.0055(valid)	|	Acc: 47.8%(valid)
Epoch: 3  | time in 0 minites, 2 seconds
	Loss: 0.0055(train)	|	Acc: 49.3%(train)
	Loss: 0.0055(valid)	|	Acc: 47.4%(valid)
Epoch: 4  | time in 0 minites, 2 seconds
	Loss: 0.0055(train)	|	Acc: 49.4%(train)
	Loss: 0.0055(valid)	|	Acc: 46.9%(valid)
Epoch: 5  | time in 0 minites, 3 seconds
	Loss: 0.0055(train)	|	Acc: 48.7%(train)
	Loss: 0.0055(valid)	|	Acc: 46.9%(valid)
Epoch: 6  | time in 0 minites, 4 seconds
	Loss: 0.0055(train)	|	Acc: 48.7%(train)
	Loss: 0.0055(valid)	|	Acc: 46.5%(valid)
Epoch: 7  | time in 0 minites, 4 seconds
	Loss: 0.0055(train)	|	Acc: 47.6%(train)
	Loss: 0.0055(valid)	|	Acc: 46.2%(valid)
Epoch: 8  | time in 0 minites, 5 seconds
	Loss: 0.0054(train)	|	Acc: 47.3%(train)
	Loss: 0.0055(valid)	|	Acc: 45.8%(valid)
Epoch: 9  | time

In [21]:
data = DataLoader(val_db, batch_size=128, shuffle=True,drop_last=True)
TP = 0
TN = 0
FP = 0
FN = 0
for text, label in data:
    with torch.no_grad():
        text = text.long().to(device)
        label = label.long().to(device)
        h = net.initHidden()
        h = h.to(device)
        output = net(text, h)
        prediction = output.topk(1)[1]
        for p, t in zip(prediction.view(-1), label.view(-1)):
            if((p==1) & (t==1)):
                TP = TP + 1
            elif((p==0) & (t==0)):
                TN = TN + 1
            elif((p==1) & (t==0)):
                FP = FP +1
            elif((p==0) & (t==1)):
                FN = FN +1
                
print('TP:'+str(TP)+' TN:' + str(TN) +' FP:'+ str(FP) +' FN:' + str(FN))
accuracy = (TP + TN) / (TP + TN + FP + FN)
precision = TP / (TP + FP)
recall = TP / (TP + FN)
print('accuracy:' + str(accuracy) + '\nprecision:' + str(precision) + '\nrecall:'+str(recall))


TP:168 TN:1172 FP:261 FN:1343
accuracy:0.45516304347826086
precision:0.3916083916083916
recall:0.11118464592984778
