In [1]:
import torch
import torch.nn as nn

In [22]:
x = torch.rand([3,1,2])
x

tensor([[[0.6458, 0.3192]],

        [[0.3522, 0.7835]],

        [[0.3556, 0.8833]]])

In [38]:
a = x[-1].unsqueeze(0)

In [32]:
att = nn.MultiheadAttention(2,1)
att(x,x,x)

(tensor([[[-0.1129,  0.2140]],
 
         [[-0.1108,  0.2099]],
 
         [[-0.1105,  0.2094]]], grad_fn=<AddBackward0>),
 tensor([[[0.3198, 0.3378, 0.3425],
          [0.3386, 0.3309, 0.3305],
          [0.3406, 0.3301, 0.3293]]], grad_fn=<DivBackward0>))

In [39]:
att(a,x,x)

(tensor([[[-0.1105,  0.2094]]], grad_fn=<AddBackward0>),
 tensor([[[0.3406, 0.3301, 0.3293]]], grad_fn=<DivBackward0>))

In [3]:
liner = nn.Linear(300,128)
x = torch.rand(100,128,300)

In [5]:
liner(x).shape

torch.Size([100, 128, 128])

In [9]:
x = torch.rand([2,2,2])
x2 = torch.rand([2,2,2])
x

tensor([[[0.0238, 0.8404],
         [0.4998, 0.7822]],

        [[0.3402, 0.9867],
         [0.6407, 0.2561]]])

In [10]:
x2

tensor([[[0.9050, 0.4375],
         [0.3060, 0.2059]],

        [[0.9984, 0.4515],
         [0.3483, 0.8708]]])

In [11]:
x+x2

tensor([[[0.9287, 1.2779],
         [0.8058, 0.9881]],

        [[1.3386, 1.4382],
         [0.9890, 1.1268]]])

In [30]:
multihead_attn = nn.MultiheadAttention(300, 1)
x  = torch.rand(100,128,300)
liner = nn.Linear(300,1)
output = liner(x)
output.shape

torch.Size([100, 128, 1])

In [31]:
output = output.transpose(0,1)

In [28]:
torch.Tensor([[[1,2,3],[4,5,6]],[[7,8,9],[10,11,12]]])

tensor([[[ 1.,  2.,  3.],
         [ 4.,  5.,  6.]],

        [[ 7.,  8.,  9.],
         [10., 11., 12.]]])

In [29]:
torch.Tensor([[[1,2,3],[4,5,6]],[[7,8,9],[10,11,12]]]).transpose(0,1)

tensor([[[ 1.,  2.,  3.],
         [ 7.,  8.,  9.]],

        [[ 4.,  5.,  6.],
         [10., 11., 12.]]])

In [35]:
output.view([-1,100]).shape

torch.Size([128, 100])

In [17]:
x  = torch.rand(10,128,300)
q = torch.rand(1,128,300)

In [18]:
attn_output, attn_output_weights = multihead_attn(q, x, x)

In [19]:
attn_output.shape

torch.Size([1, 128, 300])

In [20]:
attn_output_weights.shape

torch.Size([128, 1, 10])

In [6]:
import numpy as np
import torch
import torch.nn as nn
import re
import jieba
import pandas as pd
from gensim.models import KeyedVectors
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from torch.utils.data import random_split,DataLoader
import warnings
import torch.optim as optim
import time
warnings.filterwarnings("ignore")
# 使用gensim加载预训练中文分词embedding, 有可能需要等待1-2分钟
cn_model = KeyedVectors.load_word2vec_format('../models/embeddings/sgns.zhihu.bigram', 
                                             binary=False, unicode_errors="ignore")

In [7]:
def getdata(filename,num_words = 50000,max_tokens = 90):
    data = pd.read_csv(filename).sample(3000)
    data = data.to_numpy()
    
    for item in data:
        text = re.sub("[\s+\/_$%^*(+\"\']+|[+——？、~@#￥%……&*（）]+", "", item[0])
        cut = jieba.cut(text)
        cut_list = [i for i in cut]
        for i, word in enumerate(cut_list):
            try:
                cut_list[i] = cn_model.vocab[word].index
            except:
                cut_list[i] = 0
        item[0] = np.array(cut_list)
        
    train_pad = pad_sequences(data[:,0], maxlen=max_tokens,padding='pre', truncating='pre')
    train_pad[ train_pad>=num_words] = 0
    data_set = [(train_pad[i],data[i][1]) for i in range(len(train_pad))]

    return data_set

In [8]:
def embedding_matrix(num_words = 50000,embedding_dim = 300):

    # 初始化embedding_matrix
    embedding_matrix = np.zeros((num_words, embedding_dim))
    for i in range(num_words):
        embedding_matrix[i,:] = cn_model[ cn_model.index2word[i] ]
    embedding_matrix = embedding_matrix.astype('float32')
    return embedding_matrix

In [9]:
class Net(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, batch_size, num_layers=1):
        super().__init__()

        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.batch_size = batch_size

        self.embedding = nn.Embedding.from_pretrained(torch.Tensor(embedding_matrix()))
        self.embedding.requires_grad = False

        self.attention = nn.MultiheadAttention(hidden_size*num_layers,2)
        self.rnn = nn.GRU(input_size, hidden_size, num_layers)
        self.linear = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=-1)
        

    def forward(self, x, h):
        x = self.embedding(x)
        x = x.transpose(0, 1)
        x,h = self.rnn(x,h)

        output,output_Weight = self.attention(h,x,x)
        print(h.shape)
        print(x.shape)
        print(output.shape)
        output = self.linear(output[-1])
        output = self.softmax(output)
        return output

    def initHidden(self):
        h_0 = torch.zeros(self.num_layers, self.batch_size, self.hidden_size)
        return h_0

In [5]:
train_db = getdata(filename = '../data/dianping/train.csv',num_words = 50000,max_tokens = 100)
val_db = getdata(filename = '../data/dianping/val.csv',num_words = 50000,max_tokens = 100)
test_db = getdata(filename = '../data/dianping/test.csv',num_words = 50000,max_tokens = 100)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\ADMINI~1\AppData\Local\Temp\jieba.cache
Loading model cost 0.746 seconds.
Prefix dict has been built successfully.


In [15]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
Batch_size = 64
N_EPOCHS = 100

net = Net(input_size=300, hidden_size=128, output_size=2, batch_size=Batch_size)
net = net.to(device)
criterion = nn.NLLLoss().to(device)
optimizer = optim.SGD(net.parameters(), lr=0.001)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=50, gamma=0.5)

print(net)

Net(
  (embedding): Embedding(50000, 300)
  (attention): MultiheadAttention(
    (out_proj): _LinearWithBias(in_features=128, out_features=128, bias=True)
  )
  (rnn): GRU(300, 128)
  (linear): Linear(in_features=128, out_features=2, bias=True)
  (softmax): LogSoftmax(dim=-1)
)


In [16]:
x = torch.zeros(64,100).long().to(device)
h = net.initHidden().to(device)
output = net(x,h)
output.shape

torch.Size([1, 64, 128])
torch.Size([100, 64, 128])
torch.Size([1, 64, 128])


torch.Size([64, 2])

In [53]:
def train(train_db, net, batch_size=20):
    train_loss = 0
    train_acc = 0

    data = DataLoader(train_db, batch_size=batch_size, shuffle=True,drop_last=True)
    epoch = 0
    for i, (text, label) in enumerate(data):
        optimizer.zero_grad()

        text = text.long().to(device)
        label = label.long().to(device)

        h = net.initHidden()
        h = h.to(device)
        output = net(text, h)
        loss = criterion(output, label)

        train_acc += (label.view(-1, 1) == output.topk(1)[1]).sum().item()
        train_loss += loss.item()

        loss.backward()
        optimizer.step()
        epoch = epoch + 1

    return train_loss / (epoch*batch_size), train_acc / (epoch*batch_size)


def valid(val_db, net, batch_size=20):
    val_loss = 0
    val_acc = 0

    data = DataLoader(val_db, batch_size=batch_size, shuffle=True,drop_last=True)
    epoch = 0
    for text, label in data:
        with torch.no_grad():
            text = text.long().to(device)
            label = label.long().to(device)

            h = net.initHidden()
            h = h.to(device)
            output = net(text, h)
            loss = criterion(output, label)

            val_acc += (label.view(-1, 1) == output.topk(1)[1]).sum().item()
            val_loss += loss.item()
            epoch = epoch+1

    return val_loss / (epoch*batch_size), val_acc / (epoch*batch_size)

def test(test_db, net, batch_size=20):
    data = DataLoader(test_db, batch_size=batch_size, shuffle=True,drop_last=True)
    TP = 0
    TN = 0
    FP = 0
    FN = 0
    for text, label in data:
        with torch.no_grad():
            text = text.long().to(device)
            label = label.long().to(device)

            h = net.initHidden()
            h = h.to(device)
            output = net(text, h)
            prediction = output.topk(1)[1]
            for p, t in zip(prediction.view(-1), label.view(-1)):
                if((p==1) & (t==1)):
                    TP = TP + 1
                elif((p==0) & (t==0)):
                    TN = TN + 1
                elif((p==1) & (t==0)):
                    FP = FP +1
                elif((p==0) & (t==1)):
                    FN = FN +1
    return TP,TN,FP,FN
print(train(train_db, net, batch_size=Batch_size))
print(valid(val_db, net, batch_size=Batch_size))
print(test(test_db, net, batch_size=Batch_size))

RuntimeError: CUDA out of memory. Tried to allocate 2.00 MiB (GPU 0; 6.00 GiB total capacity; 317.29 MiB already allocated; 0 bytes free; 4.50 GiB reserved in total by PyTorch)

In [None]:
# from torch.utils.tensorboard import SummaryWriter


# writer = SummaryWriter('../log/test3')
# writer.add_graph(net,(torch.zeros(32,90).long().to(device),net.initHidden().to(device)))
N_EPOCHS = 10
start_time = time.time()
for epoch in range(N_EPOCHS):

    train_loss, train_acc = train(train_db, net, Batch_size)
    valid_loss, valid_acc = valid(val_db, net, Batch_size)
    scheduler.step()

    secs = int(time.time() - start_time)

    mins = secs / 60
    secs = secs % 60
#     writer.add_scalars('Loss', {'train':train_loss,
#                                 'test':valid_loss}, epoch)
#     writer.add_scalars('Acc', {'train':train_acc,
#                                 'test':valid_acc}, epoch)

    print('Epoch: %d' % (epoch + 1), " | time in %d minites, %d seconds" % (mins, secs))
    print(f'\tLoss: {train_loss:.4f}(train)\t|\tAcc: {train_acc * 100:.1f}%(train)')
    print(f'\tLoss: {valid_loss:.4f}(valid)\t|\tAcc: {valid_acc * 100:.1f}%(valid)')
    
# writer.close()
# torch.save(net.state_dict(), '../models/LSTM.pkl')