In [1]:
import torch
import random
import numpy as np 

# Seed every random number generator used in this notebook with the same value.
SEED = 1234

random.seed(SEED)        # Python's built-in generator
np.random.seed(SEED)     # NumPy's global generator
torch.manual_seed(SEED)  # PyTorch generator
# Ask cuDNN to stick to deterministic algorithms.
torch.backends.cudnn.deterministic = True

In [2]:
from transformers import BertTokenizer

# Load the pre-trained tokenizer published under the name 'bert-base-uncased'.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [3]:
# `tokenizer.vocab` holds the tokenizer's vocabulary; its length is the vocabulary size.
len(tokenizer.vocab)

30522

In [4]:
# Split a raw string into tokens (the recorded output shows them lower-cased).
tokens = tokenizer.tokenize('Fuck You')
print(tokens)

['fuck', 'you']


In [5]:
# Map each token to its integer id in the tokenizer's vocabulary.
indexes = tokenizer.convert_tokens_to_ids(tokens)
print(indexes)

[6616, 2017]


In [6]:
# Special tokens provided by the tokenizer; they are used further down as the
# start-of-sequence, end-of-sequence, padding and unknown-word markers.
init_token = tokenizer.cls_token
eos_token = tokenizer.sep_token
pad_token = tokenizer.pad_token
unk_token = tokenizer.unk_token

print(init_token, eos_token, pad_token, unk_token)

[CLS] [SEP] [PAD] [UNK]


In [7]:
# Obtain the vocabulary ids of the special tokens by converting the token strings.
init_token_idx = tokenizer.convert_tokens_to_ids(init_token)
eos_token_idx = tokenizer.convert_tokens_to_ids(eos_token)
pad_token_idx = tokenizer.convert_tokens_to_ids(pad_token)
unk_token_idx = tokenizer.convert_tokens_to_ids(unk_token)

print(init_token_idx, eos_token_idx, pad_token_idx, unk_token_idx)

101 102 0 100


In [8]:
# The ids of the special tokens are also exposed directly as tokenizer
# attributes; the recorded output matches the ids obtained by conversion.
init_token_idx = tokenizer.cls_token_id
eos_token_idx = tokenizer.sep_token_id
pad_token_idx = tokenizer.pad_token_id
unk_token_idx = tokenizer.unk_token_id

print(init_token_idx, eos_token_idx, pad_token_idx, unk_token_idx)

101 102 0 100


In [9]:
# Maximum input size registered for 'bert-base-uncased'
# (the recorded output is 512 tokens).
max_input_length = tokenizer.max_model_input_sizes['bert-base-uncased']

print(max_input_length)

512


In [10]:
def tokenize_and_cut(sentence):
    """Tokenize ``sentence`` and keep at most ``max_input_length - 2`` tokens.

    Two positions are held back because one token is added to the start and
    one to the end of every sequence.

    Relies on the notebook-level ``tokenizer`` and ``max_input_length``.
    """
    limit = max_input_length - 2
    return tokenizer.tokenize(sentence)[:limit]

In [11]:
from torchtext import data

# The transformer expects the batch dimension first, hence batch_first=True.
# `preprocessing` is a function applied to each example after it has been
# tokenized; here it turns the tokens into ids, which is why no vocabulary is
# built for this field (use_vocab=False) and the special tokens are given as ids.
TEXT = data.Field(
    batch_first=True,
    use_vocab=False,
    tokenize=tokenize_and_cut,
    preprocessing=tokenizer.convert_tokens_to_ids,
    init_token=init_token_idx,
    eos_token=eos_token_idx,
    pad_token=pad_token_idx,
    unk_token=unk_token_idx,
)

# Labels are stored as floats.
LABEL = data.LabelField(dtype=torch.float)

In [12]:
from torchtext import datasets

# Load the IMDB train/test splits, processing the text with TEXT and the labels with LABEL.
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

# `random_state` expects an RNG state tuple as returned by `random.getstate()`.
# `random.seed()` itself returns None, so seed first and then pass the
# resulting state explicitly to get a reproducible train/validation split.
random.seed(SEED)
train_data, valid_data = train_data.split(random_state=random.getstate())

In [13]:
# Report how many examples ended up in each of the three splits.
print(f"Number of training examples: {len(train_data)}")
print(f"Number of validation examples: {len(valid_data)}")
print(f"Number of testing examples: {len(test_data)}")

Number of training examples: 17500
Number of validation examples: 7500
Number of testing examples: 25000


In [14]:
# vars() returns a dict of the object's attributes and their values.
# Show the type of one training example, its contents, and its token count.
print(type(train_data.examples[6]))
print(vars(train_data.examples[6])) 
print(len(train_data.examples[6].text))

<class 'torchtext.data.example.Example'>
{'text': [1996, 18458, 1997, 6644, 9016, 4627, 2066, 2009, 2453, 2031, 2242, 2000, 3749, 1012, 1037, 2177, 1997, 2267, 13496, 2044, 4399, 1006, 1999, 1996, 2991, 1029, 1007, 3632, 2000, 1037, 7001, 6644, 1999, 1996, 5249, 2073, 2028, 2011, 2028, 2027, 2024, 4457, 2011, 2019, 16100, 5771, 5983, 7865, 1012, 1026, 7987, 1013, 1028, 1026, 7987, 1013, 1028, 6854, 1010, 1996, 2034, 20423, 2003, 2073, 2151, 6556, 3787, 1997, 2143, 3737, 2644, 1012, 6644, 9016, 2003, 2210, 2062, 2084, 2267, 4268, 2559, 2005, 3348, 1010, 22017, 4371, 1010, 3331, 2512, 1011, 2644, 2055, 2498, 1010, 1998, 3773, 2129, 2116, 1042, 1011, 9767, 2027, 2064, 2131, 2046, 1015, 1024, 2871, 2781, 2030, 2174, 2146, 2023, 6752, 2003, 1012, 1026, 7987, 1013, 1028, 1026, 7987, 1013, 1028, 1996, 4268, 2552, 1998, 10509, 5236, 2135, 2000, 2673, 2105, 2068, 1012, 2028, 1997, 2068, 2005, 6013, 9418, 2008, 1996, 3096, 7865, 2038, 10372, 2014, 3456, 1010, 2061, 2054, 2515, 2016, 2079, 1029, 

In [15]:
# Turn the stored ids of one training example back into tokens so it can be read.
tokens = tokenizer.convert_ids_to_tokens(train_data.examples[6].text)
print(tokens)

['the', 'premise', 'of', 'cabin', 'fever', 'starts', 'like', 'it', 'might', 'have', 'something', 'to', 'offer', '.', 'a', 'group', 'of', 'college', 'teens', 'after', 'finals', '(', 'in', 'the', 'fall', '?', ')', 'goes', 'to', 'a', 'resort', 'cabin', 'in', 'the', 'woods', 'where', 'one', 'by', 'one', 'they', 'are', 'attacked', 'by', 'an', 'unseen', 'flesh', 'eating', 'virus', '.', '<', 'br', '/', '>', '<', 'br', '/', '>', 'unfortunately', ',', 'the', 'first', 'paragraph', 'is', 'where', 'any', 'remote', 'elements', 'of', 'film', 'quality', 'stop', '.', 'cabin', 'fever', 'is', 'little', 'more', 'than', 'college', 'kids', 'looking', 'for', 'sex', ',', 'boo', '##ze', ',', 'talking', 'non', '-', 'stop', 'about', 'nothing', ',', 'and', 'seeing', 'how', 'many', 'f', '-', 'bombs', 'they', 'can', 'get', 'into', '1', ':', '40', 'minutes', 'or', 'however', 'long', 'this', 'mess', 'is', '.', '<', 'br', '/', '>', '<', 'br', '/', '>', 'the', 'kids', 'act', 'and', 'react', 'stupid', '##ly', 'to', 'ev

In [16]:
# Build the label vocabulary from the training split.
LABEL.build_vocab(train_data)

In [17]:
# Inspect the mapping from label string to integer index.
print(type(LABEL.vocab.stoi))
print(LABEL.vocab.stoi)

<class 'collections.defaultdict'>
defaultdict(None, {'neg': 0, 'pos': 1})


In [18]:
# Use the largest batch size that fits; the author found this gives the best
# results for transformers.
# NOTE(review): the training run recorded at the end of this notebook stopped
# with "CUDA out of memory" on a 4 GiB GPU — this value may need lowering there.
BATCH_SIZE = 128

# Prefer the GPU when one is available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    device=device,
)

In [51]:
# With BATCH_SIZE=128 and a maximum length of 512, one batch holds 128
# sentences of at most 512 tokens each.
batch = next(iter(train_iterator))
print(batch.text.shape)
print(batch.label.shape)

torch.Size([128, 512])
torch.Size([128])


In [19]:
# Display which device was selected.
device

device(type='cuda')

In [21]:
# Load the pre-trained model, using the same checkpoint name as for the tokenizer.
from transformers import BertTokenizer, BertModel

bert = BertModel.from_pretrained('bert-base-uncased')

In [28]:
import torch.nn as nn

class BERTGRUSA(nn.Module):
    """Sentiment classifier: BERT embeddings fed into a GRU with a linear head.

    Args:
        bert: Pre-trained BERT model. It must expose ``config.to_dict()`` with a
            ``'hidden_size'`` entry and, when called as
            ``bert(ids, attention_mask=mask)``, return a sequence whose first
            element holds the per-token embeddings.
        hidden_dim: Hidden size of the GRU.
        output_dim: Number of units of the final linear layer.
        n_layers: Number of stacked GRU layers.
        bidirectional: Whether the GRU runs in both directions.
        dropout: Dropout probability, used between GRU layers (only when
            ``n_layers >= 2``) and on the final hidden state.
    """

    def __init__(self, bert, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        super().__init__()
        self.bert = bert

        embedding_dim = bert.config.to_dict()['hidden_size']

        # Id that marks padding in the input. Taken from the BERT config when
        # it defines one, otherwise 0 (the [PAD] id of this notebook's tokenizer).
        pad_idx = getattr(bert.config, 'pad_token_id', None)
        self.pad_idx = 0 if pad_idx is None else pad_idx

        # nn.GRU applies dropout to the outputs of each layer except the last,
        # so it is only meaningful (and only enabled) with two or more layers.
        self.rnn = nn.GRU(embedding_dim,
                          hidden_dim,
                          num_layers=n_layers,
                          bidirectional=bidirectional,
                          batch_first=True,
                          dropout=0 if n_layers < 2 else dropout)
        self.out = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        """Return one row of ``output_dim`` raw scores per input sequence.

        Args:
            text: Token ids, shape ``[batch size, sent len]``.

        Returns:
            Tensor of shape ``[batch size, out dim]``.
        """
        # Tell BERT which positions are padding so that real tokens do not
        # attend to them (1 = real token, 0 = padding).
        attention_mask = (text != self.pad_idx).long()

        # BERT is only used as a feature extractor here: no gradients are
        # tracked through it.
        with torch.no_grad():
            embedded = self.bert(text, attention_mask=attention_mask)[0]
        # embedded = [batch size, sent len, emb dim]

        # The GRU returns (output, hidden): `output` holds the last layer's
        # hidden states for every time step, `hidden` holds the final time
        # step of every layer. Only `hidden` is needed.
        # NOTE(review): the GRU still steps over padded positions.
        _, hidden = self.rnn(embedded)
        # hidden = [num_layers * num_directions, batch, hidden_size]

        if self.rnn.bidirectional:
            # Last layer's forward (hidden[-2]) and backward (hidden[-1]) final
            # states, concatenated along the feature dimension (dim=1).
            hidden = self.dropout(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1))
        else:
            hidden = self.dropout(hidden[-1, :, :])
        # hidden = [batch size, hid dim * num directions]

        output = self.out(hidden)
        # output = [batch size, out dim]
        return output

In [29]:
# Hyper-parameters of the GRU head that sits on top of BERT.
HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.25

model = BERTGRUSA(
    bert=bert,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
    n_layers=N_LAYERS,
    bidirectional=BIDIRECTIONAL,
    dropout=DROPOUT,
)

In [33]:
def count_parameters(model):
    """Return the total number of elements in ``model``'s trainable parameters.

    Only parameters with ``requires_grad`` set are counted.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total
# Report the number of trainable parameters of the full model.
print(f'{count_parameters(model)} trainable parameters')

112241409 trainable parameters


In [35]:
# Switch off gradients for every parameter whose name starts with 'bert',
# leaving the remaining parameters trainable.
for name, param in model.named_parameters():
    if not name.startswith('bert'):
        continue
    param.requires_grad = False

In [36]:
# Report the number of parameters that still have requires_grad set.
print(f'{count_parameters(model)} trainable parameters')

2759169 trainable parameters


In [38]:
# List the names of the parameters that are still trainable.
for name, param in model.named_parameters():
    if not param.requires_grad:
        continue
    print(name)

rnn.weight_ih_l0
rnn.weight_hh_l0
rnn.bias_ih_l0
rnn.bias_hh_l0
rnn.weight_ih_l0_reverse
rnn.weight_hh_l0_reverse
rnn.bias_ih_l0_reverse
rnn.bias_hh_l0_reverse
rnn.weight_ih_l1
rnn.weight_hh_l1
rnn.bias_ih_l1
rnn.bias_hh_l1
rnn.weight_ih_l1_reverse
rnn.weight_hh_l1_reverse
rnn.bias_ih_l1_reverse
rnn.bias_hh_l1_reverse
out.weight
out.bias


In [39]:
import torch.optim as optim
# Adam with its default settings; it is handed every model parameter.
optimizer = optim.Adam(model.parameters())

In [42]:
# BCEWithLogitsLoss combines the sigmoid and the binary cross-entropy loss in one step.
criterion = nn.BCEWithLogitsLoss()

In [43]:
# Move the model and the loss onto the selected device.
model = model.to(device)
criterion = criterion.to(device)

In [44]:
def binary_accuracy(preds, y):
    """Return the fraction of correct predictions in a batch.

    ``preds`` are raw scores: they are passed through a sigmoid and rounded to
    the closest integer before being compared with ``y``. The result is a
    ratio (8 out of 10 correct gives 0.8, not 8).
    """
    # Cast the matches to float so they can be summed and divided.
    hits = torch.sigmoid(preds).round().eq(y).float()
    return hits.sum() / len(hits)

In [52]:
def train(model, iterator, optimizer, criterion):
    """Run one training pass over ``iterator``.

    Returns:
        ``(loss, accuracy)``, each averaged over the number of batches.
    """
    model.train()
    total_loss, total_acc = 0, 0
    for batch in iterator:
        optimizer.zero_grad()
        # batch.label has shape [batch size]; squeeze(1) drops the second
        # dimension of the predictions when it has size 1 so the shapes match.
        logits = model(batch.text).squeeze(1)
        labels = batch.label
        loss = criterion(logits, labels)
        acc = binary_accuracy(logits, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        total_acc += acc.item()
    n_batches = len(iterator)
    return total_loss / n_batches, total_acc / n_batches

In [53]:
def evaluate(model, iterator, criterion):
    """Score ``model`` on ``iterator`` without updating it.

    The model is put in eval mode and no gradients are tracked.

    Returns:
        ``(loss, accuracy)``, each averaged over the number of batches.
    """
    model.eval()
    total_loss, total_acc = 0, 0
    with torch.no_grad():
        for batch in iterator:
            logits = model(batch.text).squeeze(1)
            labels = batch.label
            total_loss += criterion(logits, labels).item()
            total_acc += binary_accuracy(logits, labels).item()
    n_batches = len(iterator)
    return total_loss / n_batches, total_acc / n_batches

In [54]:
import time
def epoch_time(start_time, end_time):
    """Split the span between two timestamps (in seconds) into whole minutes and seconds.

    Returns:
        ``(minutes, seconds)`` as integers; fractions are truncated.
    """
    duration = end_time - start_time
    minutes = int(duration / 60)
    seconds = int(duration - 60 * minutes)
    return minutes, seconds

In [55]:
# Train for N_EPOCHS epochs, evaluating on the validation set after each one.
# NOTE(review): the recorded run of this cell stopped with "CUDA out of memory".
N_EPOCHS = 5
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Remember the lowest validation loss seen so far.
    # NOTE(review): nothing is saved here when the loss improves.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss

    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

RuntimeError: CUDA out of memory. Tried to allocate 1.50 GiB (GPU 0; 4.00 GiB total capacity; 1.55 GiB already allocated; 1.40 GiB free; 1.60 GiB reserved in total by PyTorch)