In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import torch
PATH = "/content/drive/MyDrive/dataset/ch10/kftt-data-1.0/data/tok/"
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using {} device".format(device))

Using cuda device


以下のファイルを利用します。

*kftt-data-1.0/data/tok/kyoto-train.cln.ja
*kftt-data-1.0/data/tok/kyoto-train.cln.en
*kftt-data-1.0/data/tok/kyoto-dev.ja
* kftt-data-1.0/data/tok/kyoto-dev.en
* kftt-data-1.0/data/tok/kyoto-test.ja
* kftt-data-1.0/data/tok/kyoto-test.en

In [None]:
import torch
import torch.nn as nn
import math
import re
import random
import time
import unicodedata
from torch import optim
import torch.nn.functional as F
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np
from io import open

In [None]:
class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2
        # SOS と EOS　を含めている

    def addSentence(self, sentence):
        for word in sentence.split(" "):
            self.addWord(word)
    # spaceで文を分ける

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1
    # 辞書にない単語を入れて、ある単語をcount+1

In [None]:
def normalizeString(s):
    t = s
    s = s.lower().strip()
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    if len(s.replace(' ','')): # ascii文字以外
      return s
    return t
# 文を処理する

In [None]:
def readLangs(lang1, lang2):
    print("Reading lines...")
    with open(lang1, encoding='utf-8') as f:
        lines1 = f.readlines()
    with open(lang2, encoding='utf-8') as f:
        lines2 = f.readlines()
    # dataファイルを開ける
    pairs = []
    for l1, l2 in zip(lines1, lines2):
        l1 = normalizeString(l1.rstrip("\n"))
        l2 = normalizeString(l2.rstrip("\n"))
        pairs.append([l1, l2])
        # 二つの文を処理して、保存する

    input_lang = Lang(lang1)
    output_lang = Lang(lang2)

    return input_lang, output_lang, pairs

In [None]:
MAX_LENGTH = 10
def filterPair(p):
    return len(p[0].split(" ")) < MAX_LENGTH and len(p[1].split(" ")) < MAX_LENGTH

def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]

In [None]:
train_ja = PATH + "kyoto-train.cln.ja"
train_en = PATH + "kyoto-train.cln.en"

In [None]:
def prepareData(lang1, lang2):
    input_lang, output_lang, pairs = readLangs(lang1, lang2)
    print("Read %s sentence pairs" % len(pairs))
    pairs = filterPairs(pairs)
    print("Trimmed to %s sentence pairs" % len(pairs))
    print("Counting words...")
    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])
    print("Counted words:")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)
    return input_lang, output_lang, pairs
    # output と　input　と　pairs　をもらう



input_lang, output_lang, pairs = prepareData(train_ja, train_en)
print(random.choice(pairs))

Reading lines...
Read 329882 sentence pairs
Trimmed to 67831 sentence pairs
Counting words...
Counted words:
/content/drive/MyDrive/dataset/ch10/kftt-data-1.0/data/tok/kyoto-train.cln.ja 34123
/content/drive/MyDrive/dataset/ch10/kftt-data-1.0/data/tok/kyoto-train.cln.en 32109
[' ic ic ', 'kumiyama minami interchange yawata higashi interchange ']


In [None]:
SOS_token = 0
EOS_token = 1
def indexesFromSentence(lang, sentence):
    return [lang.word2index[word] for word in sentence.split(" ")]
    # 単語をindexへ変更する

def tensorFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1)
    # indexをtensorへ変更する

def tensorsFromPair(pair):
    input_tensor = tensorFromSentence(input_lang, pair[0])
    target_tensor = tensorFromSentence(output_lang, pair[1])
    return (input_tensor, target_tensor)
    # pairにある文をtensorへ変更する


In [None]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=0.1)
        # 過学習をふせぐ

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        # sinで　ぐうすうをencode
        # cosで きすうをencode
        self.register_buffer('pe', pe)
        # modelの定数として保存する

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

In [None]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        assert d_model % num_heads == 0
        # 次元がheadsで割り切れるようにして、各headsの次元が整数になるようにする

        self.num_heads = num_heads
        self.head_dim = d_model // num_heads

        self.q_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)
        self.fc_out = nn.Linear(d_model, d_model)

        self.scale = torch.sqrt(torch.FloatTensor([self.head_dim])).to(device)
        # attentionを調整する

    def forward(self, q, k, v, mask=None):
        N = q.shape[0]

        q_len = q.shape[1]
        k_len = k.shape[1]
        v_len = v.shape[1]

        Q = self.q_linear(q).view(N, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        K = self.k_linear(k).view(N, k_len, self.num_heads, self.head_dim).transpose(1, 2)
        V = self.v_linear(v).view(N, v_len, self.num_heads, self.head_dim).transpose(1, 2)

        energy = torch.matmul(Q, K.transpose(-2, -1)) / self.scale

        if mask is not None:
            energy = energy.masked_fill(mask == 0, -1e10)

        attention = torch.softmax(energy, dim=-1)
        # attentionの重みを調整する
        x = torch.matmul(attention, V)
        # 重みを応用する
        x = x.transpose(1, 2).contiguous().view(N, q_len, self.num_heads * self.head_dim)
        x = self.fc_out(x)
        return x


In [None]:
class FeedForward(nn.Module):
    def __init__(self, d_model, d_ff, dropout=0.1):
        super(FeedForward, self).__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        x = self.dropout(F.relu(self.linear1(x)))
        x = self.linear2(x)
        return x


In [None]:
class EncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super(EncoderLayer, self).__init__()

        self.attention = MultiHeadAttention(d_model, num_heads)
        # attention layer

        self.feed_forward = FeedForward(d_model, d_ff, dropout)
        # feedforward layer

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        # Normalization
        # 訓練を加速するため

        self.dropout = nn.Dropout(dropout)
        # dropout
        # 過学習をふせぐ


    def forward(self, x, mask):
        attn_output = self.attention(x, x, x, mask)
        x = self.norm1(x + self.dropout(attn_output))
        ff_output = self.feed_forward(x)
        x = self.norm2(x + self.dropout(ff_output))
        return x

class DecoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super(DecoderLayer, self).__init__()

        self.self_attention = MultiHeadAttention(d_model, num_heads)
        # self attention layer

        self.encoder_attention = MultiHeadAttention(d_model, num_heads)
        # encoder attention layer

        self.feed_forward = FeedForward(d_model, d_ff, dropout)
        # feed forward layer

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        # Normalization

        self.dropout = nn.Dropout(dropout)
        # dropout

    def forward(self, x, enc_output, src_mask, tgt_mask):
        self_attn_output = self.self_attention(x, x, x, tgt_mask)
        x = self.norm1(x + self.dropout(self_attn_output))
        enc_attn_output = self.encoder_attention(x, enc_output, enc_output, src_mask)
        x = self.norm2(x + self.dropout(enc_attn_output))
        ff_output = self.feed_forward(x)
        x = self.norm3(x + self.dropout(ff_output))
        return x


In [None]:
class Encoder(nn.Module):
    def __init__(self, input_dim, d_model, num_heads, d_ff, num_layers, dropout, max_length):
        super(Encoder, self).__init__()

        self.embedding = nn.Embedding(input_dim, d_model)
        # indexをembeddingする

        self.positional_encoding = PositionalEncoding(d_model, max_length)
        # positionの情報を入力する

        self.layers = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        # encoderlayerを設置する

        self.dropout = nn.Dropout(dropout)
        # dropout

    def forward(self, src, mask):
        x = self.embedding(src) * math.sqrt(self.embedding.embedding_dim)
        x = self.positional_encoding(x)
        x = self.dropout(x)
        for layer in self.layers:
            x = layer(x, mask)
        return x

class Decoder(nn.Module):
    def __init__(self, output_dim, d_model, num_heads, d_ff, num_layers, dropout, max_length):
        super(Decoder, self).__init__()

        self.embedding = nn.Embedding(output_dim, d_model)
        # 出力したindexをembeddingする

        self.positional_encoding = PositionalEncoding(d_model, max_length)
        # positionの情報を入力する

        self.layers = nn.ModuleList([DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        # decoderlayerを設置する

        self.dropout = nn.Dropout(dropout)
        # dropout

    def forward(self, tgt, enc_output, src_mask, tgt_mask):
        x = self.embedding(tgt) * math.sqrt(self.embedding.embedding_dim)
        x = self.positional_encoding(x)
        x = self.dropout(x)
        for layer in self.layers:
            x = layer(x, enc_output, src_mask, tgt_mask)
        return x

class Transformer(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, d_model=512, num_heads=8, d_ff=2048, num_layers=6, dropout=0.1, max_length=MAX_LENGTH):
        super(Transformer, self).__init__()

        self.encoder = Encoder(src_vocab_size, d_model, num_heads, d_ff, num_layers, dropout, max_length)
        self.decoder = Decoder(tgt_vocab_size, d_model, num_heads, d_ff, num_layers, dropout, max_length)
        # encoder と decoderを作成

        self.fc_out = nn.Linear(d_model, tgt_vocab_size)

    def make_src_mask(self, src):
        src_mask = (src != 0).unsqueeze(1).unsqueeze(1)
        return src_mask

    def make_tgt_mask(self, tgt):
        tgt_len = tgt.size(1)
        tgt_mask = torch.tril(torch.ones((tgt_len, tgt_len), device=tgt.device)).bool()
        return tgt_mask
    # 生成中にモデルが未来の単語を見るのを防ぐため、ターゲット配列のマスクを作成する


    def forward(self, src, tgt):
        src_mask = self.make_src_mask(src)
        tgt_mask = self.make_tgt_mask(tgt)
        enc_output = self.encoder(src, src_mask)
        dec_output = self.decoder(tgt, enc_output, src_mask, tgt_mask)
        output = self.fc_out(dec_output)
        return output



In [None]:
import time
import math

def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

# プログラムの実行の時間を表示するため


In [None]:
# modelを訓練する
def train(input_tensor, target_tensor, model, optimizer, criterion, max_length=MAX_LENGTH):

    model.train()
    optimizer.zero_grad()

    src = input_tensor.unsqueeze(0)
    tgt = target_tensor.unsqueeze(0)[:, :-1]

    output = model(src, tgt)
    output_dim = output.shape[-1]

    output = output.contiguous().view(-1, output_dim)
    target = target_tensor.unsqueeze(0)[:, 1:].contiguous().view(-1)

    loss = criterion(output, target)
    loss.backward()

    optimizer.step()

    return loss.item()

# epochによって訓練する
def trainIters(model, n_iters, print_every=1000, learning_rate=0.01):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every

    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss(ignore_index=0)

    training_pairs = [tensorsFromPair(random.choice(pairs)) for _ in range(n_iters)]

    for iter in range(1, n_iters + 1):
        training_pair = training_pairs[iter - 1]
        input_tensor = training_pair[0]
        target_tensor = training_pair[1]

        loss = train(input_tensor, target_tensor, model, optimizer, criterion)
        print_loss_total += loss

        if iter % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print(
                "%s (%d %d%%) %.4f"
                % (
                    timeSince(start, iter / n_iters),
                    iter,
                    iter / n_iters * 100,
                    print_loss_avg,
                )
            )
# modelを評価する
def evaluate(model, sentence, max_length=MAX_LENGTH):
    model.eval()
    with torch.no_grad():
      input_tensor = tensorFromSentence(input_lang, sentence).to(device)
      src = input_tensor.unsqueeze(0).to(device)
      tgt = torch.tensor([[SOS_token]], dtype=torch.long, device=device)

      decoded_words = []
      for i in range(max_length):
        output = model(src, tgt).to(device)
        topv, topi = output[0, -1, :].topk(1)
        topi = topi.to(device)

        if topi.item() == EOS_token:
            decoded_words.append("<EOS>")
            break
        else:
            decoded_words.append(output_lang.index2word[topi.item()])

        tgt = torch.cat((tgt, topi.unsqueeze(0).to(device)), dim=1).to(device)

      return decoded_words

# ランダムに文を生成する
def evaluateRandomly(model, n=10):
    model.to(device)
    for i in range(n):
      pair = random.choice(pairs)
      print(">", pair[0])
      print("=", pair[1])
      output_words = evaluate(model, pair[0])
      output_sentence = " ".join(output_words)
      print("<", output_sentence)
      print("")

d_model = 256
num_heads = 8
d_ff = 512
num_layers = 3

model = Transformer(input_lang.n_words, output_lang.n_words, d_model, num_heads, d_ff, num_layers).to(device)

In [None]:
trainIters(model, 75000, print_every=5000)

49m 53s (- 698m 25s) (5000 6%) 6.2507
104m 14s (- 677m 34s) (10000 13%) 6.3202
163m 58s (- 655m 55s) (15000 20%) 6.3563
228m 54s (- 629m 30s) (20000 26%) 6.4587
301m 11s (- 602m 22s) (25000 33%) 6.5397
383m 31s (- 575m 17s) (30000 40%) 6.6172
478m 2s (- 546m 19s) (35000 46%) 6.6118
577m 11s (- 505m 2s) (40000 53%) 6.7034
679m 54s (- 453m 16s) (45000 60%) 6.7838
786m 44s (- 393m 22s) (50000 66%) 6.8261
901m 40s (- 327m 52s) (55000 73%) 6.8410
1018m 49s (- 254m 42s) (60000 80%) 6.8720
1139m 4s (- 175m 14s) (65000 86%) 6.8516
1262m 9s (- 90m 9s) (70000 93%) 6.9290
1389m 47s (- 0m 0s) (75000 100%) 6.9521


In [None]:
torch.save(model, PATH + 'model.pth')

In [None]:
model = torch.load(PATH + 'model.pth')

In [None]:
evaluateRandomly(model, 10)


> ( その ) こと を 最初 と する 。
= this is the first stage .
< ikagu dotaku kumiawasedate divergent trap persona histories spared trains toda

> 盆地 ： 近江 盆地
= basin omi basin
< category kiccho 1098 koshiore fujimaru shukugawa yazu 1972 exile kichibe

> 情報 メディア 教育 研究 部門
= research division for information and media study design
< ikagu dotaku kumiawasedate divergent trap retort my seifu junkei keina

> 法然 寺 ( 京都 市 )
= honen ji temple kyoto city 
< ikagu dotaku kumiawasedate morohide cymbals animated asashiro combining signalling armor

> トロ ： マグロ の 腹身 。
= toro fatty portion of tuna belly
< ikagu dotaku kumiawasedate morohide cymbals animated kotetsukan kokuon inquiries 616-8511

> たまり （ 溜り ）
= tamari soy sauce
< ikagu gansen utaimono hano gorman sei shakuhachi passions gijo dining

> 骨 法術
= koppo jutsu a martial art with bare hands 
< motomitsu jochu jochu jochu jochu jochu jochu jochu jochu jochu

> 2 代 目 河原崎 長十郎
= chojuro kawarasaki the second
< ikagu dotaku oshiraishimochi katsushika isshi pas

In [None]:
!sudo apt-get install mecab libmecab-dev mecab-ipadic-utf8
!pip install mecab-python3

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
libmecab-dev is already the newest version (0.996-14build9).
mecab-ipadic-utf8 is already the newest version (2.7.0-20070801+main-3).
mecab is already the newest version (0.996-14build9).
0 upgraded, 0 newly installed, 0 to remove and 45 not upgraded.


In [None]:
!mecab -v

mecab of 0.996



In [None]:
!sudo find / -name mecabrc

/etc/mecabrc
find: ‘/proc/73/task/73/net’: Invalid argument
find: ‘/proc/73/net’: Invalid argument


In [None]:
import MeCab

mecabrc_path = "/etc/mecabrc"


def translate(s):
    mecab = MeCab.Tagger(f"-r {mecabrc_path}")
    seq = mecab.parse(s).rstrip()
    seq = seq.split()
    print(seq)
    seq = [word for word in seq if word in input_lang.word2index]
    print(seq)
    seq = ' '.join(seq)
    print(seq)
    output_words = evaluate(model, seq)
    print(output_words)
    return ' '.join(output_words)

s = '隣の客はよく柿食う客だ'
print (translate(s))

['隣', '名詞,一般,*,*,*,*,隣,トナリ,トナリ', 'の', '助詞,連体化,*,*,*,*,の,ノ,ノ', '客', '名詞,一般,*,*,*,*,客,キャク,キャク', 'は', '助詞,係助詞,*,*,*,*,は,ハ,ワ', 'よく', '副詞,一般,*,*,*,*,よく,ヨク,ヨク', '柿', '名詞,一般,*,*,*,*,柿,カキ,カキ', '食う', '動詞,自立,*,*,五段・ワ行促音便,基本形,食う,クウ,クウ', '客', '名詞,一般,*,*,*,*,客,キャク,キャク', 'だ', '助動詞,*,*,*,特殊・ダ,基本形,だ,ダ,ダ', 'EOS']
['隣', 'の', '客', 'は', 'よく', '柿', '客', 'だ']
隣 の 客 は よく 柿 客 だ
['koretada', 'toteki', 'hiromogi', 'aspect', 'nakaraibon', 'daitenpaku', 'sanemitsu', 'kuroki', 'indo', 'shaped']
koretada toteki hiromogi aspect nakaraibon daitenpaku sanemitsu kuroki indo shaped


In [None]:
!pip install nltk



In [None]:
import torch
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# 确保NLTK的punkt数据包已下载
nltk.download('punkt')

# 评估函数
def evaluate_bleu(model, pairs, n=10):
    bleu_scores = []
    model.to(device)
    for i in range(n):
        pair = random.choice(pairs)
        print(pair)
        print(">", pair[0])
        print("=", pair[1])
        output_words = evaluate(model, pair[0])
        output_sentence = " ".join(output_words)
        print("<", output_sentence)
        print("")

        reference = [pair[1].split()]  # 参考翻译
        # print(reference)
        candidate = output_sentence.split()  # 模型生成的翻译
        # print(candidate)
        smoothie = SmoothingFunction().method4
        bleu_score = sentence_bleu(reference, candidate, smoothing_function=smoothie)
        bleu_scores.append(bleu_score)

    average_bleu = sum(bleu_scores) / len(bleu_scores)
    return average_bleu

# 运行评估
average_bleu_score = evaluate_bleu(model, pairs, n=100)
print("Average BLEU score:", average_bleu_score)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


['経歴', 'biography']
> 経歴
= biography
< motomitsu jochu jochu jochu jochu inside eiyosho konya cessation evolved

['神智 学 の 影響 が かなり 強 い 。', 'it is heavily affected by theosophy .']
> 神智 学 の 影響 が かなり 強 い 。
= it is heavily affected by theosophy .
< ikagu dotaku kumiawasedate divergent trap retort tekijuku sukenao ikiryo jamisen

['明治 2 年 （ 1869 年 ）', '1869']
> 明治 2 年 （ 1869 年 ）
= 1869
< ikagu dotaku oshiraishimochi katsushika fukasu ripened kosaibari aoyama namegata mitsunaga

['栃木 県', 'tochigi prefecture']
> 栃木 県
= tochigi prefecture
< koyomaro binding takino dogugura seijozan yoraku sedentary hokkesojiin seminary yamate

['さらに 、 8 月 に 北越 戦争 に 従軍', 'he also served in hokuetsu war in august .']
> さらに 、 8 月 に 北越 戦争 に 従軍
= he also served in hokuetsu war in august .
< ikagu dotaku kumiawasedate divergent trap retort my imitates kamiza yasuomaru

['承平 六 年 （ 936 年 ）', 'in ']
> 承平 六 年 （ 936 年 ）
= in 
< ikagu dotaku oshiraishimochi katsushika fukasu imukashiki gulf defined tebamoto ware

['野宮 神社