In [12]:
import torch
import torchvision
from torch.utils import data
from torchvision import transforms
import torch.nn as nn
from torch.nn import functional as F
import collections
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
from tqdm import tqdm
import argparse
from PIL import Image
import re
import os
import math

In [13]:
def read_time_machine():
    with open('./time_machine.txt', 'r') as f:
        lines = f.readlines()
    format_lines = [re.sub('[^A-Za-z]', ' ', line).strip().lower() for line in lines]
    lines = [line for line in format_lines if len(line)>0]
    return lines


def tokenizer(data,method='word'):
    ret = []
    if method=='word':
        for line in data:
            ret.extend(line.split())
        return ret
    elif method=='char':
        for line in data:
            ret.extend([i for i in line if i.isalnum()])
        ret = sorted(list(set(ret)))
        return ret


def count_corpus(tokens):
    if len(tokens) == 0 or isinstance(tokens[0], list):
        tokens = [token for line in tokens for token in line]
    return collections.Counter(tokens)

time_machine_token = tokenizer(read_time_machine(),'word')
print(len(time_machine_token))

36019


In [14]:
class Vocab():
    def __init__(self,tokens,min_frequence=0,reserved_tokens=None):
        if tokens is None:
            tokens = []
        if reserved_tokens is None:
            reserved_tokens = []
        counter = count_corpus(tokens)
        self.__token_freq = sorted(counter.items(),key=lambda x:x[1],reverse=True)
        self.index_to_token = ['<unk>'] + reserved_tokens
        self.unk = 0
        self.token_to_index = {token:idx for idx,token in enumerate(self.index_to_token)}
        for token,freq in self.token_freq:
            if freq <min_frequence:
                break
            if token not in self.index_to_token:
                self.index_to_token.append(token)
                self.token_to_index[token] = len(self.index_to_token) - 1
    
    def __len__(self):
        return len(self.index_to_token)
    
    def to_index(self,tokens):
        if not isinstance(tokens,(list,tuple)):
            return self.token_to_index.get(tokens,self.unk)
        return [self.to_index(token) for token in tokens]

    def to_token(self,indices):
        if not isinstance(indices,(tuple,list)):
            return self.index_to_token[indices]
        return [self.to_token(index) for index in indices]

    @property
    def token_freq(self):
        return self.__token_freq

time_machine_corpus = Vocab(time_machine_token)

In [15]:
# paired_token = [pair for pair in zip(token[:-1],token[1:])]
# paired_corpus = Vocab(paired_token)
# for i in range(10):
#     print(paired_corpus.token_freq[i])

# tri_token = [pair for pair in zip(token[:-2],token[1:-1],token[2:])]
# tri_corpus = Vocab(tri_token)
# for i in range(10):
#     print(tri_corpus.token_freq[i])

In [16]:
# fig,axes = plt.subplots(2,2)
# x = [i for i in range(len(time_machine_corpus)-1)]
# y = [time_machine_corpus.token_freq[j][1] for j in x]
# print(y)
# print(time_machine_corpus.token_freq)
# axes[0][0].plot(x,x)
# axes[0][1].plot(x,y)
# axes[1][1].plot(x,y)
# axes[1][1].semilogx()
# axes[1][1].semilogy()
# plt.show()

In [17]:
def seq_data_iter_random(corpus, batch_size, num_steps):
    corpus = corpus[random.randint(0, num_steps - 1):]
    num_subseqs = (len(corpus)-1) // num_steps
    initial_indices = list(range(0, num_subseqs * num_steps, num_steps))
    random.shuffle(initial_indices)

    def data(pos):
        return corpus[pos: pos + num_steps]
    
    num_batches = num_subseqs // batch_size
    for i in range(0, batch_size * num_batches, batch_size):
        initial_indices_per_batch = initial_indices[i: i + batch_size]
        X = [data(j) for j in initial_indices_per_batch]
        Y = [data(j + 1) for j in initial_indices_per_batch]
        yield torch.tensor(X), torch.tensor(Y)


def seq_data_iter_sequential(corpus, batch_size, num_steps):
    offset = random.randint(0, num_steps - 1)
    num_tokens = ((len(corpus) - offset - 1) // batch_size) * batch_size
    Xs = torch.tensor(corpus[offset: offset + num_tokens])
    Ys = torch.tensor(corpus[offset + 1: offset + 1 + num_tokens])
    Xs, Ys = Xs.reshape(batch_size, -1), Ys.reshape(batch_size, -1)
    num_batches = Xs.shape[1] // num_steps
    for i in range(0, num_steps * num_batches, num_steps):
        X = Xs[:, i: i + num_steps]
        Y = Ys[:, i: i + num_steps]
        yield X, Y

In [18]:
class SeqDataLoader:
    def __init__(self, token ,batch_size, num_steps, use_random_iter):
        if use_random_iter:
            self.data_iter_fn = seq_data_iter_random
        else:
            self.data_iter_fn = seq_data_iter_sequential
        self.batch_size, self.num_steps = batch_size, num_steps
        self.token_index = time_machine_corpus.to_index(token)


    def __iter__(self):
        return self.data_iter_fn(self.token_index, self.batch_size, self.num_steps)
    
    def __len__(self):
        return len(self.token_index)
    
data_iter = SeqDataLoader(time_machine_token,batch_size=1,num_steps=10,use_random_iter=True)
for i,j in data_iter:
    print(time_machine_corpus.to_token(i.detach().numpy().tolist()),'\n',time_machine_corpus.to_token(j.detach().numpy().tolist()))
    print()
    print(time_machine_corpus.to_token(i.T.detach().numpy().tolist()),'\n',time_machine_corpus.to_token(j.T.detach().numpy().tolist()))
    print(i.shape,'\n',j.shape)
    break

[['it', 'seemed', 'to', 'me', 'of', 'a', 'very', 'great', 'depth', 'one']] 
 [['seemed', 'to', 'me', 'of', 'a', 'very', 'great', 'depth', 'one', 'lay']]

[['it'], ['seemed'], ['to'], ['me'], ['of'], ['a'], ['very'], ['great'], ['depth'], ['one']] 
 [['seemed'], ['to'], ['me'], ['of'], ['a'], ['very'], ['great'], ['depth'], ['one'], ['lay']]
torch.Size([1, 10]) 
 torch.Size([1, 10])


In [19]:
def grad_clipping(net, theta):
    if isinstance(net, nn.Module):
        params = [p for p in net.parameters() if p.requires_grad]
    else:
        params = net.params
    norm = torch.sqrt(sum(torch.sum((p.grad ** 2)) for p in params))
    if norm > theta:
        for param in params:
            param.grad[:] *= theta / norm

In [20]:
class RNNModel(nn.Module):
    def __init__(self, rnn_layer, vocab_size, **kwargs):
        super(RNNModel, self).__init__(**kwargs)
        self.rnn = rnn_layer
        self.vocab_size = vocab_size
        self.num_hiddens = self.rnn.hidden_size
        if not self.rnn.bidirectional:
            self.num_directions = 1
            self.linear = nn.Linear(self.num_hiddens, self.vocab_size)
        else:
            self.num_directions = 2
            self.linear = nn.Linear(self.num_hiddens * 2, self.vocab_size)

    def forward(self, inputs, state):
        X = F.one_hot(inputs.T.long(), self.vocab_size)
        X = X.to(torch.float32)
        Y, state = self.rnn(X, state)
        output = self.linear(Y.reshape((-1, Y.shape[-1])))
        return output, state

    def begin_state(self, device, batch_size=1):
        if not isinstance(self.rnn, nn.LSTM):
            # nn.GRU以张量作为隐状态
            return  torch.zeros((self.num_directions * self.rnn.num_layers,
                                 batch_size, self.num_hiddens),
                                device=device)
        else:
            # nn.LSTM以元组作为隐状态
            return (torch.zeros((
                self.num_directions * self.rnn.num_layers,
                batch_size, self.num_hiddens), device=device),
                    torch.zeros((
                        self.num_directions * self.rnn.num_layers,
                        batch_size, self.num_hiddens), device=device))
        
num_hiddens = 256
rnn_layer = nn.RNN(len(time_machine_corpus), num_hiddens)
rnn_net = RNNModel(rnn_layer=rnn_layer,vocab_size=len(time_machine_corpus))

In [21]:
def predict(prefix, num_preds, net, vocab, device):
    state = net.begin_state(batch_size=1, device=device)
    outputs = [vocab.to_index(prefix[0])]
    get_input = lambda: torch.tensor([outputs[-1]], device=device).reshape((1, 1))
    for y in prefix[1:]:  # 预热期
        _, state = net(get_input(), state)
        outputs.append(vocab.to_index(y))
    for _ in range(num_preds):  # 预测num_preds步
        y, state = net(get_input(), state)
        outputs.append(int(y.argmax(dim=1).reshape(1)))
    return ' '.join([vocab.to_token(i) for i in outputs])


def train_epoch(net, train_iter, loss, updater, device, use_random_iter):
    state =  None
    for X, Y in train_iter:
        if state is None or use_random_iter:
            state = net.begin_state(batch_size=X.shape[0], device=device)
        else:
            if isinstance(net, nn.Module) and not isinstance(state, tuple):
                # state对于nn.GRU是个张量
                state.detach_()
            else:
                # state对于nn.LSTM或对于我们从零开始实现的模型是个张量
                for s in state:
                    s.detach_()
        y = Y.T.reshape(-1)
        X, y = X.to(device), y.to(device)
        y_hat, state = net(X, state)
        l = loss(y_hat, y.long()).mean()

        updater.zero_grad()
        l.backward()
        grad_clipping(net, 1)
        updater.step()


def trainer(net, train_iter, vocab, lr, num_epochs, device, use_random_iter=False):
    loss = nn.CrossEntropyLoss()
    updater = torch.optim.SGD(net.parameters(), lr)
    predict = lambda prefix: predict(prefix, 50, net, vocab, device)
    # 训练和预测
    for epoch in range(num_epochs):
        bar = tqdm(train_iter,total=len(train_iter))
        train_epoch(net, bar, loss, updater, device, use_random_iter)
        if (epoch + 1) % 10 == 0:
            print(predict(['time','traveller']))
    print(predict(['time','traveller']))
    print(predict(['traveller']))

In [22]:
trainer(rnn_net, data_iter,len(time_machine_corpus),0.01,100,'cpu',True)

  9%|▉         | 3308/36019 [00:34<05:43, 95.19it/s] 


KeyboardInterrupt: 