<a href="https://colab.research.google.com/github/Heiseweiye/-huffman/blob/master/LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install torch
!pip install torchtext
!python -m spacy download en


# K80 GPU for 12 hours (author's runtime note).
# NOTE(review): the install commands in this cell are unpinned and
# `torchtext.legacy` may not exist in every torchtext release -- confirm the
# working versions and pin them.
import torch
from torch import nn, optim
from torchtext.legacy import datasets
from torchtext.legacy import data

# Report whether PyTorch can see a CUDA device.
print('GPU:', torch.cuda.is_available())

# Seed PyTorch's random number generator. As the last expression of the cell
# this also displays the returned Generator object.
torch.manual_seed(123)

Collecting en_core_web_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz (12.0 MB)
[K     |████████████████████████████████| 12.0 MB 5.3 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.7/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.7/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')
GPU: True


<torch._C.Generator at 0x7f84058ebc70>

In [None]:
# Field describing the review text; tokenization is delegated to the 'spacy' option.
TEXT = data.Field(tokenize='spacy')
# Field describing the label; values are stored as float tensors
# (the loss used later in this notebook, BCEWithLogitsLoss, takes float targets).
LABEL = data.LabelField(dtype=torch.float)
# Load the IMDB train/test splits, processed with the two fields above.
# The recorded run downloaded aclImdb_v1.tar.gz at this step.
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

downloading aclImdb_v1.tar.gz


aclImdb_v1.tar.gz: 100%|██████████| 84.1M/84.1M [00:01<00:00, 43.6MB/s]


In [None]:
# Report how many examples each split contains.
print(f'len of train data: {len(train_data)}')
print(f'len of test data: {len(test_data)}')

len of train data: 25000
len of test data: 25000


In [None]:
# Peek at one training example: its token list on the first line, its label on the second.
print(train_data.examples[15].text, train_data.examples[15].label, sep='\n')

['I', 'liked', 'this', 'movie', 'a', 'lot', '.', 'It', 'really', 'intrigued', 'me', 'how', 'Deanna', 'and', 'Alicia', 'became', 'friends', 'over', 'such', 'a', 'tragedy', '.', 'Alicia', 'was', 'just', 'a', 'troubled', 'soul', 'and', 'Deanna', 'was', 'so', 'happy', 'just', 'to', 'see', 'someone', 'after', 'being', 'shot', '.', 'My', 'only', 'complaint', 'was', 'that', 'in', 'the', 'beginning', 'it', 'was', 'kind', 'of', 'slow', 'and', 'it', 'took', 'awhile', 'to', 'get', 'to', 'the', 'basis', 'of', 'things', '.', 'Other', 'than', 'that', 'it', 'was', 'great', '.']
pos


In [None]:
# Build the vocabularies from the training split only.
# TEXT is capped at max_size=10000 tokens and gets the pretrained
# 'glove.6B.100d' vectors attached (the recorded run ended up with 10002
# rows, i.e. the cap plus two extra entries).
TEXT.build_vocab(train_data, max_size=10000, vectors='glove.6B.100d')
LABEL.build_vocab(train_data)


batchsz = 30
# Use the GPU when one is available, otherwise fall back to the CPU so the
# notebook still runs (slowly) on a CPU-only runtime instead of crashing.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# BucketIterator yields batches whose tensors are already placed on `device`.
train_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, test_data),
    batch_size = batchsz,
    device=device
)

.vector_cache/glove.6B.zip: 862MB [02:40, 5.36MB/s]                           
100%|█████████▉| 399999/400000 [00:21<00:00, 18851.67it/s]


In [None]:
class RNN(nn.Module):
    """Bidirectional stacked-LSTM classifier that emits one raw logit per sequence.

    Pipeline: token ids -> embedding -> dropout -> bidirectional LSTM ->
    concatenation of the top layer's final forward and backward hidden
    states -> dropout -> linear layer.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim,
                 num_layers=2, dropout=0.5):
        """
        Args:
            vocab_size: number of rows in the embedding table.
            embedding_dim: size of each token embedding.
            hidden_dim: LSTM hidden size per direction.
            num_layers: number of stacked LSTM layers (default 2, the
                value that used to be hard-coded).
            dropout: dropout probability applied to the embeddings, to the
                final hidden state and between LSTM layers (default 0.5,
                the value that used to be hard-coded).
        """
        super().__init__()

        # token id => [embedding_dim]
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # [embedding_dim] => [hidden_dim] per direction.
        # nn.LSTM only applies dropout between stacked layers; with a single
        # layer it has no effect and PyTorch warns, so pass 0 in that case.
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers,
                           bidirectional=True,
                           dropout=dropout if num_layers > 1 else 0.0)
        # [hidden_dim*2] => [1]
        self.fc = nn.Linear(hidden_dim*2, 1)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """
        Args:
            x: integer tensor of token ids with shape [seq_len, b]
               (sequence-first, since the LSTM is built without batch_first).

        Returns:
            Tensor of shape [b, 1] holding raw (pre-sigmoid) logits.
        """
        # [seq, b] => [seq, b, embedding_dim]
        embedding = self.dropout(self.embedding(x))

        # output: [seq, b, hid_dim*2]      (unused)
        # hidden/h: [num_layers*2, b, hid_dim]
        # cell/c: [num_layers*2, b, hid_dim] (unused)
        _, (hidden, _) = self.rnn(embedding)

        # The last two entries of `hidden` are the top layer's final forward
        # and backward states: 2 of [b, hid_dim] => [b, hid_dim*2]
        hidden = torch.cat([hidden[-2], hidden[-1]], dim=1)

        # [b, hid_dim*2] => [b, 1]
        hidden = self.dropout(hidden)
        return self.fc(hidden)

In [None]:
# Instantiate the classifier: one embedding row per vocab entry,
# 100-d embeddings, 256 hidden units per LSTM direction.
rnn = RNN(vocab_size=len(TEXT.vocab), embedding_dim=100, hidden_dim=256)

# Overwrite the randomly initialised embedding table with the pretrained
# vectors attached to the vocabulary.
pretrained_embedding = TEXT.vocab.vectors
print(f'pretrained_embedding: {pretrained_embedding.shape}')
with torch.no_grad():
    rnn.embedding.weight.copy_(pretrained_embedding)
print('embedding layer inited.')

# Loss works on raw logits; the optimizer updates every model parameter.
criteon = nn.BCEWithLogitsLoss().to(device)
optimizer = optim.Adam(rnn.parameters(), lr=1e-3)
# Move the model to the target device; as the last expression of the cell
# this also displays the module summary.
rnn.to(device)


pretrained_embedding: torch.Size([10002, 100])
embedding layer inited.


RNN(
  (embedding): Embedding(10002, 100)
  (rnn): LSTM(100, 256, num_layers=2, dropout=0.5, bidirectional=True)
  (fc): Linear(in_features=512, out_features=1, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [None]:
import numpy as np

def binary_acc(preds, y):
    """
    Compute binary classification accuracy from raw logits.

    Args:
        preds: tensor of raw (pre-sigmoid) logits.
        y: tensor of 0./1. targets with the same shape as `preds`.

    Returns:
        0-dim float tensor: the fraction of elements whose thresholded
        prediction equals the target.
    """
    # sigmoid turns logits into probabilities; rounding thresholds them to 0./1.
    preds = torch.round(torch.sigmoid(preds))
    correct = torch.eq(preds, y).float()
    # Divide by the total number of elements. len() would only count the
    # first dimension and report accuracies above 1 for non-1-D inputs.
    acc = correct.sum() / correct.numel()
    return acc

def train(rnn, iterator, optimizer, criteon):
    """
    Run one training pass over `iterator` and report the training accuracy.

    Args:
        rnn: the model; called on `batch.text` and expected to return [b, 1] logits.
        iterator: iterable of batches exposing `.text` and `.label`.
        optimizer: optimizer stepping the model's parameters.
        criteon: loss function taking (logits, labels).

    Returns:
        Accuracy over all examples seen in this pass (float), or NaN if the
        iterator produced no batches.
    """
    rnn.train()

    # Accumulate correct-example counts rather than per-batch accuracies so
    # a smaller final batch does not get the same weight as a full one.
    correct_total = 0.0
    seen = 0

    for i, batch in enumerate(iterator):

        # [seq, b] => [b, 1] => [b]
        pred = rnn(batch.text).squeeze(1)
        loss = criteon(pred, batch.label)
        acc = binary_acc(pred, batch.label).item()

        batch_size = batch.label.numel()
        correct_total += acc * batch_size
        seen += batch_size

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Lightweight progress report: batch index and that batch's accuracy.
        if i%10 == 0:
            print(i, acc)

    avg_acc = correct_total / seen if seen else float('nan')
    print('avg acc:', avg_acc)
    return avg_acc
    
    
def eval(rnn, iterator, criteon):
    """
    Evaluate the model on `iterator` without updating it and report accuracy.

    Note: this function shadows Python's built-in `eval`; the name is kept
    because other cells call it.

    Args:
        rnn: the model; called on `batch.text` and expected to return [b, 1] logits.
        iterator: iterable of batches exposing `.text` and `.label`.
        criteon: loss function; accepted for signature compatibility but not
            used, since only accuracy is reported.

    Returns:
        Accuracy over all evaluated examples (float), or NaN if the iterator
        produced no batches.
    """
    rnn.eval()

    # Accumulate correct-example counts rather than per-batch accuracies so
    # a smaller final batch does not get the same weight as a full one.
    correct_total = 0.0
    seen = 0

    with torch.no_grad():
        for batch in iterator:

            # [seq, b] => [b, 1] => [b]
            pred = rnn(batch.text).squeeze(1)

            acc = binary_acc(pred, batch.label).item()
            batch_size = batch.label.numel()
            correct_total += acc * batch_size
            seen += batch_size

    avg_acc = correct_total / seen if seen else float('nan')

    print('>>test:', avg_acc)
    return avg_acc
        
    
    

In [None]:
NUM_EPOCHS = 10

for epoch in range(NUM_EPOCHS):

    # Evaluate before training, so the first report is the untrained baseline
    # and each later report reflects the previous epoch's training.
    eval(rnn, test_iterator, criteon)
    train(rnn, train_iterator, optimizer, criteon)

# Evaluate once more so the model produced by the last epoch is measured too.
# The trailing semicolon keeps the notebook from echoing a return value.
eval(rnn, test_iterator, criteon);

[1;30;43m流式输出内容被截断，只能显示最后 5000 行内容。[0m
        [1487,  248,   86,  ...,    4,    4,    4],
        [   4,   39,    4,  ...,    1,    1,    1]], device='cuda:0')
tensor([[ 318,   11,  555,  ...,   66,  772,   11],
        [  11, 1055,   12,  ...,  141,    3,   56],
        [ 106,   21,   15,  ...,   53,   95, 1994],
        ...,
        [   2,  238, 2813,  ...,  956,   14,  777],
        [  38, 1258,   29,  ...,   58,  123,   22],
        [   4,    4,    4,  ...,   58,    4,   39]], device='cuda:0')
tensor([[  11,   66,   11,  ...,   11,   66,   11],
        [  34,    9,  143,  ...,  237,    0,  237],
        [ 124,   38,    2,  ...,   16,  736,   16],
        ...,
        [ 176,  758, 1428,  ...,  343,   91, 1749],
        [  12,    4,    0,  ...,   12, 2150,   39],
        [   4, 2430,    4,  ...,    4,    4,   39]], device='cuda:0')
tensor([[  66,   11,   25,  ...,   11,  318,  640],
        [   9,  130,    0,  ...,  204,   11,  113],
        [  65,    6,    7,  ...,   16, 7835,   

KeyboardInterrupt: ignored