In [2]:
import torch
from torchtext import data
from torchtext import datasets

In [4]:
# Seed value reused wherever this notebook seeds a random number generator.
SEED = 1234

In [5]:
# Seed PyTorch's random number generator with the notebook-wide SEED.
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True # intended to make experiments repeatable, so that re-runs give the same results

In [6]:
# Field describing how the raw review text should be processed (default settings).
TEXT = data.Field()
# LabelField is a Field subclass dedicated to labels. Labels are stored as floats
# because they are later compared against logits by BCEWithLogitsLoss.
# `torch.float` is the documented dtype name; `torch.torch.float` only resolved
# through an accidental self-reference attribute on the torch package.
LABEL = data.LabelField(dtype=torch.float)

In [7]:
# Automatically download the IMDb dataset and load its canonical train/test
# split as torchtext dataset objects.
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

downloading aclImdb_v1.tar.gz


.data\imdb\aclImdb_v1.tar.gz: 100%|███████████████████████████████████████████████| 84.1M/84.1M [00:30<00:00, 2.80MB/s]


In [8]:
# An f-string evaluates the expression inside {...} and interpolates the result.
print(f'number of train examples:{len(train_data)}')
# This line reports the test split; it was previously mislabelled as "train".
print(f'number of test examples:{len(test_data)}')

number of train examples:25000
number of train examples:25000


In [9]:
# Show the attributes of the first training example as a dict.
print(vars(train_data.examples[0]))

{'text': ['Bromwell', 'High', 'is', 'a', 'cartoon', 'comedy.', 'It', 'ran', 'at', 'the', 'same', 'time', 'as', 'some', 'other', 'programs', 'about', 'school', 'life,', 'such', 'as', '"Teachers".', 'My', '35', 'years', 'in', 'the', 'teaching', 'profession', 'lead', 'me', 'to', 'believe', 'that', 'Bromwell', "High's", 'satire', 'is', 'much', 'closer', 'to', 'reality', 'than', 'is', '"Teachers".', 'The', 'scramble', 'to', 'survive', 'financially,', 'the', 'insightful', 'students', 'who', 'can', 'see', 'right', 'through', 'their', 'pathetic', "teachers'", 'pomp,', 'the', 'pettiness', 'of', 'the', 'whole', 'situation,', 'all', 'remind', 'me', 'of', 'the', 'schools', 'I', 'knew', 'and', 'their', 'students.', 'When', 'I', 'saw', 'the', 'episode', 'in', 'which', 'a', 'student', 'repeatedly', 'tried', 'to', 'burn', 'down', 'the', 'school,', 'I', 'immediately', 'recalled', '.........', 'at', '..........', 'High.', 'A', 'classic', 'line:', 'INSPECTOR:', "I'm", 'here', 'to', 'sack', 'one', 'of', '

In [11]:
import random
# Carve a validation set out of the training data (split() defaults to 70/30).
# `random.seed()` returns None, so writing `random_state=random.seed(SEED)`
# actually passed `random_state=None`. Seed the global generator first, then
# pass its state explicitly so the same train/validation split is produced on
# every run.
# NOTE(review): this cell rebinds `train_data`, so running it twice splits the
# already-reduced training set again.
random.seed(SEED)
train_data, valid_data = train_data.split(random_state=random.getstate())

In [14]:
# Report the size of each split after the train/validation split above.
print(f'Number of train_data:{len(train_data)}')
print(f'Number of valid_data:{len(valid_data)}')
print(f'Number of test_data:{len(test_data)}')

Number of train_data:17500
Number of valid_data:7500
Number of test_data:25000


In [16]:
# Build both vocabularies from the training split only. For TEXT, keep at most
# the MAX_VOCAB_SIZE most common tokens.
MAX_VOCAB_SIZE = 25000
TEXT.build_vocab(train_data, max_size=MAX_VOCAB_SIZE)
LABEL.build_vocab(train_data)

In [17]:
# The TEXT vocabulary ends up with 25002 entries: the 25000 kept tokens plus
# two additional tokens, <unk> and <pad>.
print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")

Unique tokens in TEXT vocabulary: 25002
Unique tokens in LABEL vocabulary: 2


In [18]:
# List the 50 most frequent tokens in the training data with their counts.
print(TEXT.vocab.freqs.most_common(50))

[('the', 201637), ('a', 108267), ('and', 106782), ('of', 99982), ('to', 92808), ('is', 72305), ('in', 59801), ('I', 45880), ('that', 45107), ('this', 40094), ('it', 38059), ('/><br', 35752), ('was', 32706), ('as', 29668), ('with', 29022), ('for', 28896), ('The', 23668), ('but', 23604), ('on', 21550), ('movie', 21368), ('are', 20095), ('his', 19270), ('film', 19179), ('have', 19018), ('not', 18433), ('be', 17802), ('you', 17724), ('he', 15115), ('by', 15037), ('at', 14936), ('one', 14485), ('an', 14322), ('from', 13365), ('who', 13143), ('like', 12805), ('all', 12603), ('they', 12476), ('so', 11459), ('has', 11440), ('just', 11439), ('or', 11404), ('about', 11370), ('her', 11077), ('out', 10008), ('some', 9944), ('very', 9237), ('more', 9011), ('This', 8606), ('would', 8247), ('what', 8221)]


In [19]:
# Inspect the container type of vocab.itos (presumably the index -> token lookup).
type(TEXT.vocab.itos)

list

In [20]:
# Number of examples per batch, used by all three iterators.
BATCH_SIZE = 64

In [21]:
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [22]:
# Display which device was selected.
device

device(type='cuda')

In [26]:
# Create one BucketIterator per split; all three share the batch size and
# place their batches on the selected device.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    device=device,
)

In [27]:
import torch.nn as nn

In [29]:
class RNN(nn.Module):
    """Recurrent classifier: embedding layer -> nn.RNN -> linear output layer."""

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        """Create the three layers.

        Args:
            input_dim: number of rows in the embedding table (vocabulary size).
            embedding_dim: size of each token embedding vector.
            hidden_dim: size of the recurrent hidden state.
            output_dim: number of values produced per example.
        """
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, text):
        """Map a batch of token indices to one output row per example.

        Args:
            text: index tensor laid out as [sent len, batch size].

        Returns:
            Tensor laid out as [batch size, output dim].
        """
        # [sent len, batch size] -> [sent len, batch size, emb dim]
        token_vectors = self.embedding(text)
        # The per-step outputs ([sent len, batch size, hid dim]) are not needed;
        # only the final hidden state ([1, batch size, hid dim]) is classified.
        _, final_hidden = self.rnn(token_vectors)
        # Drop the leading size-1 dimension so the linear layer receives
        # [batch size, hid dim].
        sentence_state = final_hidden.squeeze(0)
        return self.fc(sentence_state)

In [30]:
# Model hyperparameters. The embedding table needs one row per vocabulary entry.
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1

In [31]:
# Instantiate the classifier with the hyperparameters defined above.
model = RNN(
    input_dim=INPUT_DIM,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
)

In [33]:
def count_parameters(model):
    """Return the total element count of the model's trainable parameters.

    Parameters whose `requires_grad` flag is False are not counted.
    """
    trainable_total = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable_total += param.numel()
    return trainable_total
# Report the trainable-parameter count, formatted with thousands separators.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 2,592,105 trainable parameters


In [36]:
# model.parameters() returns the learnable parameters; model.named_parameters()
# returns each learnable parameter together with its name.
for param_name, param in model.named_parameters():
    print(param_name, ' ', param)

embedding.weight   Parameter containing:
tensor([[-0.1117, -0.4966,  0.1631,  ...,  1.5903, -0.1947, -0.2415],
        [ 1.3204,  1.5997, -1.0792,  ...,  0.6060,  0.2209, -0.8245],
        [ 0.7289, -0.7336,  1.5624,  ..., -0.5592, -0.4480, -0.6476],
        ...,
        [ 0.0914,  1.5196,  0.4670,  ...,  0.6393, -0.0332,  0.0185],
        [-0.6290,  0.4650, -0.7165,  ..., -1.3171,  2.0381, -2.0497],
        [-1.1222, -0.0240, -1.0878,  ..., -0.4948, -0.3874,  0.0339]],
       requires_grad=True)
rnn.weight_ih_l0   Parameter containing:
tensor([[ 0.0484, -0.0203,  0.0480,  ..., -0.0512, -0.0010, -0.0363],
        [ 0.0486,  0.0025, -0.0124,  ...,  0.0535,  0.0616,  0.0293],
        [ 0.0418, -0.0329,  0.0084,  ...,  0.0476, -0.0291,  0.0144],
        ...,
        [-0.0199,  0.0285, -0.0489,  ..., -0.0426,  0.0226, -0.0279],
        [ 0.0444,  0.0087,  0.0575,  ..., -0.0464, -0.0326,  0.0446],
        [ 0.0614, -0.0383,  0.0144,  ..., -0.0026, -0.0577, -0.0197]],
       requires_grad=Tr

In [37]:
import torch.optim as optim
# Plain stochastic gradient descent over all model parameters.
optimizer = optim.SGD(model.parameters(), lr=1e-3)

In [38]:
# BCEWithLogitsLoss carries out both the sigmoid and the binary cross entropy steps.
criterion=nn.BCEWithLogitsLoss()

In [39]:
# Move the model and the loss module onto the selected device.
model = model.to(device)
criterion = criterion.to(device)

In [46]:
def accuracy(preds, y):
    """Return the fraction of predictions in the batch that match the labels.

    `preds` are treated as raw logits: they are passed through a sigmoid and
    rounded before being compared element-wise with `y`. The result is a
    tensor holding the share of matching elements.
    """
    predicted_labels = torch.sigmoid(preds).round()
    hits = (predicted_labels == y).float()
    return hits.sum() / len(hits)

In [42]:
def train(model,iterator,optimizer,critetion):
    """Run one training epoch over `iterator`.

    Args:
        model: module called as `model(batch.text)`; its output is squeezed on
            dimension 1 before being passed to the loss.
        iterator: sized iterable of batches exposing `.text` and `.label`.
        optimizer: optimizer stepped once per batch.
        critetion: loss callable invoked as `critetion(predictions, labels)`.
            The misspelled name is kept so existing callers keep working.
            Previously the body ignored this argument and silently used the
            global `criterion` instead.

    Returns:
        Tuple `(mean batch loss, mean batch accuracy)` as Python floats.
    """
    epoch_loss = 0
    epoch_acc = 0
    model.train()
    for batch in iterator:
        # Clear gradients left over from the previous batch.
        optimizer.zero_grad()
        predictions = model(batch.text).squeeze(1)
        # Use the loss that was passed in, not whatever global happens to exist.
        loss = critetion(predictions, batch.label)
        acc = accuracy(predictions, batch.label)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        epoch_acc += acc.item()
    # Averages are taken over batches, not over individual examples.
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [43]:
def evaluate(model,iterator,critetion):
    """Compute loss and accuracy over `iterator` without updating the model.

    Args:
        model: module called as `model(batch.text)`; its output is squeezed on
            dimension 1 before being passed to the loss.
        iterator: sized iterable of batches exposing `.text` and `.label`.
        critetion: loss callable invoked as `critetion(predictions, labels)`.
            The misspelled name is kept so existing callers keep working.
            Previously the body ignored this argument and silently used the
            global `criterion` instead.

    Returns:
        Tuple `(mean batch loss, mean batch accuracy)` as Python floats.
    """
    epoch_loss = 0
    epoch_acc = 0
    model.eval()
    # No gradients are needed for evaluation; disabling autograd avoids
    # building a graph for every forward pass.
    with torch.no_grad():
        for batch in iterator:
            predictions = model(batch.text).squeeze(1)
            loss = critetion(predictions, batch.label)
            acc = accuracy(predictions, batch.label)
            epoch_loss += loss.item()
            epoch_acc += acc.item()
    # Averages are taken over batches, not over individual examples.
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [47]:
import time
N_EPOCHS = 5
# Track the lowest validation loss seen so far; start at +inf so the first
# epoch always counts as an improvement.
best_valid_loss = float('inf')
for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    end_time = time.time()
    # Checkpoint the weights whenever the validation loss improves.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut1-model.pt')
    # The timestamps were previously computed but never reported; print them
    # with the epoch number so each pair of metric lines can be told apart.
    print(f'Epoch: {epoch + 1:02} | Epoch Time: {end_time - start_time:.1f}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

	Train Loss: 0.694 | Train Acc: 49.67%
	 Val. Loss: 0.697 |  Val. Acc: 48.95%
	Train Loss: 0.693 | Train Acc: 50.63%
	 Val. Loss: 0.698 |  Val. Acc: 51.03%
	Train Loss: 0.693 | Train Acc: 50.26%
	 Val. Loss: 0.697 |  Val. Acc: 48.61%
	Train Loss: 0.693 | Train Acc: 50.02%
	 Val. Loss: 0.697 |  Val. Acc: 48.78%
	Train Loss: 0.693 | Train Acc: 49.90%
	 Val. Loss: 0.697 |  Val. Acc: 48.83%
