In [1]:
import string


# Build the character vocabulary from the printable ASCII characters.
# string.printable => '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\x0b\x0c'
all_chars = string.printable
# Map each character to its position in all_chars (the inverse of indexing all_chars).
vocab_dict = {char: index for index, char in enumerate(all_chars)}
vocab_size = len(all_chars)

def str2ints(s, vocab_dict):
    """Convert a string into a list of integer indices.

    Each character of ``s`` is looked up in ``vocab_dict``; a character that
    is not a key of ``vocab_dict`` raises ``KeyError``.
    """
    indices = []
    for char in s:
        indices.append(vocab_dict[char])
    return indices

def ints2str(x, vocab_array):
    """Convert an iterable of integer indices back into a string.

    Each index in ``x`` selects one element of ``vocab_array``; the selected
    elements are concatenated in order.
    """
    chars = (vocab_array[index] for index in x)
    return "".join(chars)

In [2]:
from torch.utils.data import Dataset


# curl https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt > tinyshakespeare.txt
class ShakespeareDataset(Dataset):
    """Text file split into fixed-length chunks of character indices.

    The file at ``path`` is read, stripped of leading/trailing whitespace,
    encoded with ``str2ints``/``vocab_dict`` and cut into consecutive
    LongTensor chunks of ``chunk_size`` elements. A trailing chunk shorter
    than ``chunk_size`` is dropped so every item has the same length.

    Args:
        path: Path of the text file to load.
        chunk_size: Number of characters per chunk.

    Raises:
        KeyError: If the text contains a character missing from ``vocab_dict``.
    """

    def __init__(self, path, chunk_size=200):
        # Read the file inside a context manager so the handle is always closed.
        with open(path) as f:
            text = f.read().strip()
        # Encode the text as a list of integer indices.
        data = str2ints(text, vocab_dict)
        # Convert to a LongTensor and split it into chunks.
        data = torch.LongTensor(data).split(chunk_size)
        # Discard the last chunk when it is shorter than chunk_size.
        if data and len(data[-1]) < chunk_size:
            data = data[:-1]
        self.data = data
        self.n_chunks = len(self.data)

    def __len__(self):
        """Return the number of full-length chunks."""
        return self.n_chunks

    def __getitem__(self, idx):
        """Return the ``idx``-th chunk as a LongTensor."""
        return self.data[idx]

In [3]:
import torch
from torch.utils.data import DataLoader


# Cut the corpus into 200-character chunks and serve them as shuffled
# mini-batches of 32 chunks, loaded by 4 worker processes.
ds = ShakespeareDataset(path='./tinyshakespeare.txt', chunk_size=200)
loader = DataLoader(
    dataset=ds,
    batch_size=32,
    shuffle=True,
    num_workers=4,
)

In [4]:
from torch import nn


class SequenceGenerationNet(nn.Module):
    """Embedding -> LSTM -> Linear network that scores the next symbol.

    Args:
        num_embeddings: Vocabulary size; used both as the embedding table
            size and as the number of output scores per step.
        embedding_dim: Size of each embedding vector.
        hidden_size: Size of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout probability between stacked LSTM layers.
    """

    def __init__(self, num_embeddings, embedding_dim=50, hidden_size=50, num_layers=1, dropout=0.2):
        super().__init__()
        self.emb = nn.Embedding(num_embeddings, embedding_dim)
        # nn.LSTM applies dropout only between stacked layers; with a single
        # layer a non-zero value does nothing and triggers a UserWarning, so
        # only forward it when there is more than one layer.
        lstm_dropout = dropout if num_layers > 1 else 0.0
        self.lstm = nn.LSTM(embedding_dim, hidden_size, num_layers, batch_first=True, dropout=lstm_dropout)
        # The Linear output size equals the Embedding input size (num_embeddings).
        self.linear = nn.Linear(hidden_size, num_embeddings)

    def forward(self, x, h0=None):
        """Score every position of a batch of index sequences.

        Args:
            x: Integer index tensor laid out batch-first, i.e. (batch, seq).
            h0: Optional initial LSTM state; ``None`` lets the LSTM start
                from its default state.

        Returns:
            Tuple ``(scores, h)``: unnormalized scores with the vocabulary on
            the last axis, and the LSTM state after the last step.
        """
        x = self.emb(x)
        x, h = self.lstm(x, h0)
        x = self.linear(x)
        return x, h

In [5]:
def generate_seq(net, start_phrase='The King said', length=200, temperature=0.8):
    """Sample text from ``net``, seeded with ``start_phrase``.

    The start phrase is run through the network to prime its recurrent
    state; characters are then sampled one at a time, each sample being fed
    back as the next input.

    Args:
        net: Module called as ``net(x)`` / ``net(x, h)`` and returning
            ``(scores, state)``, with ``x`` a (1, seq) LongTensor and the
            vocabulary on the last axis of ``scores``.
        start_phrase: Non-empty seed text; every character must be a key of
            ``vocab_dict``.
        length: Number of characters to generate.
        temperature: Positive value the scores are divided by before the
            softmax. Values below 1 sharpen the distribution, values above 1
            flatten it.

    Returns:
        ``start_phrase`` followed by ``length`` sampled characters.

    Raises:
        ValueError: If ``start_phrase`` is empty or ``temperature`` is not
            positive.
        KeyError: If ``start_phrase`` contains a character missing from
            ``vocab_dict``.

    Note:
        The network is left in evaluation mode; call ``net.train()`` before
        resuming training.
    """
    if not start_phrase:
        raise ValueError("start_phrase must not be empty")
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    # Switch to evaluation mode (disables dropout).
    net.eval()
    # Indices of the sampled characters.
    result = []
    # Encode the start phrase and add a leading batch dimension.
    x0 = torch.LongTensor(str2ints(start_phrase, vocab_dict)).unsqueeze(0)

    # Sampling needs no gradients; this replaces the removed
    # Variable(..., volatile=True) mechanism.
    with torch.no_grad():
        # Prime the recurrent state with the start phrase.
        o, h = net(x0)
        scores = o[0, -1]
        for _ in range(length):
            # Temperature-scaled softmax; also avoids the overflow a raw
            # exp() of the scores could produce.
            probs = torch.softmax(scores / temperature, dim=-1)
            # Draw the next character index as a plain Python int.
            top_i = torch.multinomial(probs, 1).item()
            result.append(top_i)
            # Feed the sampled character back in, carrying the state along.
            o, h = net(torch.LongTensor([[top_i]]), h)
            scores = o[0, -1]

    # Return the start phrase followed by the generated text.
    return start_phrase + ints2str(result, all_chars)

In [6]:
from torch.autograd import Variable as V
from statistics import mean
from torch import optim


net = SequenceGenerationNet(vocab_size, 20, 50, num_layers=2, dropout=0.1)
opt = optim.Adam(net.parameters())
# Next-character prediction is multi-class classification, so the loss is
# softmax cross-entropy.
loss_f = nn.CrossEntropyLoss()
for epoch in range(50):
    # generate_seq() switches the net to eval mode, so re-enable training
    # mode at the start of every epoch.
    net.train()
    losses = []
    for data in loader:
        # Tensors take part in autograd directly; no Variable wrapper needed.
        # x is every character except the last one.
        x = data[:, :-1]
        # y is the same sequence shifted by one: second character to the last.
        y = data[:, 1:]
        y_pred, _ = net(x)
        # Merge the batch and step axes before CrossEntropyLoss; reshape()
        # copes with the non-contiguous slice y.
        loss = loss_f(y_pred.reshape(-1, vocab_size), y.reshape(-1))
        net.zero_grad()
        loss.backward()
        opt.step()
        # loss is a 0-dim tensor: .item() extracts the Python float
        # (indexing it with [0] raises on current PyTorch).
        losses.append(loss.item())
    # Show the mean loss of the epoch and a sample of generated text.
    print('===================================================================================')
    print(epoch, mean(losses))
    print(generate_seq(net))

0 3.4555294377463204
The King said tesidmD mh aror a asheepkfdahbWahsti  eguoos rirhn oil
bH cown ao
e,hi :eowsimohno a,i ehrmon  tisis.a talea swhhh ,oalu hhtOsYb ealr uee eactefas iedahea htk tag
Tsoe.IlIsmIlahsslnd
sGe.g

AEGcrTg
mt
1 2.9101853397914343
The King said khs tulos

Wefenmv rrasrurh ;orl sou dtd tell bae lnseeE taesr r othn'eefloaou tfaoks hlcd'hehy ou saeeriay aes;l nhlg s,sd  Bgat,:
Hhege
Edug svhhs aer winu yn ite,
S ehidr atrian; anrtsy ?ik hse, si
2 2.6435624408721923
The King saide wecny nooc lhgut modw;
The srevegaaddd thire byd eese had upir tc anet karertsltliul thsn fmclyglf wfhitsiy ir foshad sss on?

DEEEUGANO:
woid tram'ey muith tar ofe or'd evgfin tethR
or he yihe tr so
3 2.446467430932181
The King said luen:
Fr yraupbegr
Nh he'e thlal'e
Whoud
Hy be whaulwrddgrong coutem ser'le nuoums.

OEUIY-NS:
Vferepy thit sereun soun syout ordes bheund aunge thou to arsmerad, moy by sout yer ape thwle, I
Whentinl
4 2.332917310169765
The King said thepef ltore, hoe I der