In [1]:
print('hello world')

hello world


In [2]:
from google.colab import drive
drive.mount('/content/dirve')

Drive already mounted at /content/dirve; to attempt to forcibly remount, call drive.mount("/content/dirve", force_remount=True).


In [3]:
!pip install torchtext==0.6.0



In [4]:
import torch
import torch.nn as nn
import torch.optim as optim

from torchtext.data import Field, BucketIterator

import numpy as np

import random
import math
import time
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [5]:
SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [6]:
!pip install janome



In [7]:
import janome
from janome.tokenizer import Tokenizer
j_t = Tokenizer()

def tokenizer(text):
  return [tok for tok in j_t.tokenize(text, wakati=True)]

In [8]:
tokenizer("今日は曇りです")

['今日', 'は', '曇り', 'です']

In [9]:
import torchtext
import torch
from torchtext import data
from torchtext import datasets

In [10]:
SRC = data.Field(sequential=True, tokenize = tokenizer, init_token='<sos>', eos_token='<eos>', lower=True)
TRG = data.Field(sequential=True, tokenize = tokenizer, init_token='<sos>', eos_token='<eos>', lower=True)

In [None]:
def choose_dataset(flag = False):
  if flag:
    return data.TabularDataset.splits(
        path="/content/dirve/My Drive/Colab Notebooks/data/", train='one_train.tsv',
        validation='one_val.tsv', test='one_test.tsv', format='tsv',
        fields=[('SRC', SRC), ('TRG', SRC)])
  else:
    return data.TabularDataset.splits(
        path="/content/dirve/My Drive/Colab Notebooks/data/", train='train.tsv',
        validation='val.tsv', test='test.tsv', format='tsv',
        fields=[('SRC', SRC), ('TRG', SRC)])

In [11]:
train, val, test = choose_dataset(False)

In [12]:
SRC.build_vocab(train)
#TRG.build_vocab(train)

In [13]:
train_batch_size = 50
test_batch_size = 32
eval_batch_size = 2
train_iter, val_iter, test_iter = data.BucketIterator.splits((train, val, test), sort = False,  batch_sizes = (train_batch_size,eval_batch_size, test_batch_size), device= device)

In [14]:
class Encoder(nn.Module):
  def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
    super().__init__()
    self.hid_dim = hid_dim
    self.n_layers = n_layers
    self.embedding = nn.Embedding(input_dim, emb_dim)
    self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout = dropout)
    self.dropout = nn.Dropout(dropout)

  def forward(self, src):
    embedded = self.dropout(self.embedding(src))
    outputs, (hidden, cell) = self.rnn(embedded)
    return hidden, cell

In [15]:
class Decoder(nn.Module):
  def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
    super().__init__()
    self.output_dim = output_dim
    self.hid_dim = hid_dim
    self.n_layers = n_layers

    self.embedding = nn.Embedding(output_dim, emb_dim)

    self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout = dropout)

    self.fc_out = nn.Linear(hid_dim, output_dim)
    self.dropout = nn.Dropout(dropout)
  
  def forward(self, input, hidden, cell):
    input = input.unsqueeze(0)
    embedded = self.dropout(self.embedding(input))
    output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
    prediction = self.fc_out(output.squeeze(0))

    return prediction, hidden, cell

In [16]:
class Seq2Seq(nn.Module):
  def __init__(self, encoder, decoder, device):
    super().__init__()
    self.encoder = encoder
    self.decoder = decoder
    self.device = device

  def forward(self, src, trg, teacher_forcing_ratio = 0.5):
    batch_size = trg.shape[1]
    trg_len = trg.shape[0]
    trg_vocab_size = self.decoder.output_dim

    outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)

    hidden, cell = self.encoder(src)

    output = trg[0,:]

    for t in range(1, trg_len):
      output, hidden, cell = self.decoder(output, hidden, cell)

      outputs[t] = output
      teacher_force = random.random() < teacher_forcing_ratio
      top1 = output.argmax(1)
      output = (trg[t] if teacher_force else top1)

    return outputs

In [17]:
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(SRC.vocab)
ENC_EMB_DIM = 768
DEC_EMB_DIM = 768
ENC_HID_DIM = 768
DEC_HID_DIM = 768
N_LAYERS = 2
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

enc = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, N_LAYERS, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, DEC_HID_DIM, N_LAYERS, DEC_DROPOUT)

model = Seq2Seq(enc, dec, device).to(device)

In [18]:
def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.uniform_(param.data, -0.08, 0.08)

model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(4336, 768)
    (rnn): LSTM(768, 768, num_layers=2, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(4336, 768)
    (rnn): LSTM(768, 768, num_layers=2, dropout=0.5)
    (fc_out): Linear(in_features=768, out_features=4336, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [19]:
optimizer = optim.Adam(model.parameters())

In [20]:
SRC_PAD_IDX = SRC.vocab.stoi[SRC.pad_token]

criterion = nn.CrossEntropyLoss(ignore_index = SRC_PAD_IDX)

In [21]:
def train(model, iterator, optimizer, criterion, clip):
  model.train()

  epoch_loss = 0

  for i, batch in enumerate(iterator):

    src = batch.SRC
    trg = batch.TRG
    optimizer.zero_grad()

    output = model(src, trg)

    #print("output size:", output.size())
    #print("target size:", trg.size())
    output_dim = output.shape[-1]
    output = output[:].view(-1, output_dim)
    trg = trg[:].view(-1)
    loss = criterion(output, trg)
    loss.backward()
    torch.nn.utils.clip_grad_norm(model.parameters(), clip)
    optimizer.step()

    epoch_loss += loss.item()

  return epoch_loss / len(iterator)

In [22]:
def evaluate(model, iterator, criterion):
  model.eval()

  epoch_loss = 0

  with torch.no_grad():

    for i, batch in enumerate(iterator):

      src = batch.SRC
      trg = batch.TRG

      output = model(src, trg)

      output_dim = output.shape[-1]

      output = output[:].view(-1, output_dim)
      trg = trg[:].view(-1)

      loss = criterion(output, trg)
      epoch_loss += loss.item()

    return epoch_loss / len(iterator)

In [23]:
def epoch_time(start_time, end_time):
  elapsed_time = end_time - start_time
  elapsed_mins = int(elapsed_time / 60)
  elapsed_secs = int(elapsed_time - (elapsed_mins*60))
  return elapsed_mins, elapsed_secs

In [24]:
N_EPOCHS = 100
CLIP = 1
best_model = None

def  model_train(epochs, clip):
  best_valid_loss = float('inf')

  for epoch in range(epochs):
      
      start_time = time.time()
      
      train_loss = train(model, train_iter, optimizer, criterion, clip)
      valid_loss = evaluate(model, val_iter, criterion)
      
      end_time = time.time()
      
      epoch_mins, epoch_secs = epoch_time(start_time, end_time)
      
      if valid_loss < best_valid_loss:
          best_valid_loss = valid_loss
          best_model = model
          #torch.save(model.state_dict(), 'tut1-model.pt')
      
      print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
      print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
      print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

In [None]:
model.apply(init_weights)
model_train(100, 1)



Epoch: 01 | Time: 0m 38s
	Train Loss: 4.402 | Train PPL:  81.646
	 Val. Loss: 4.218 |  Val. PPL:  67.922
Epoch: 02 | Time: 0m 38s
	Train Loss: 4.108 | Train PPL:  60.819
	 Val. Loss: 4.160 |  Val. PPL:  64.046
Epoch: 03 | Time: 0m 39s
	Train Loss: 4.038 | Train PPL:  56.699
	 Val. Loss: 4.161 |  Val. PPL:  64.120
Epoch: 04 | Time: 0m 39s
	Train Loss: 3.983 | Train PPL:  53.654
	 Val. Loss: 4.154 |  Val. PPL:  63.669
Epoch: 05 | Time: 0m 39s
	Train Loss: 3.946 | Train PPL:  51.741
	 Val. Loss: 4.140 |  Val. PPL:  62.776
Epoch: 06 | Time: 0m 39s
	Train Loss: 3.901 | Train PPL:  49.444
	 Val. Loss: 4.164 |  Val. PPL:  64.302
Epoch: 07 | Time: 0m 39s
	Train Loss: 3.921 | Train PPL:  50.454
	 Val. Loss: 4.109 |  Val. PPL:  60.869
Epoch: 08 | Time: 0m 39s
	Train Loss: 3.879 | Train PPL:  48.374
	 Val. Loss: 4.193 |  Val. PPL:  66.246
Epoch: 09 | Time: 0m 39s
	Train Loss: 3.869 | Train PPL:  47.893
	 Val. Loss: 4.143 |  Val. PPL:  62.964
Epoch: 10 | Time: 0m 39s
	Train Loss: 3.831 | Train PPL

In [None]:
torch.save(best_model.state_dict(), '/content/dirve/My Drive/Colab Notebooks/model/seq2seq.pth')

In [None]:
model.state_dict(torch.load("/content/dirve/My Drive/Colab Notebooks/model/seq2seq.pth"))

In [None]:
def gen_sentence(sentence, src_field, trg_field, model, batch_size):
  model.eval()
  in_str, out_str, pred, tmp = [], [], [], []
  length = len(sentence)

  with torch.no_grad():
    for _, batch in enumerate(sentence):
      src = batch.SRC
      trg = batch.TRG

      output = model(src, trg)
          
      for j in range(min(length, batch_size)):
        topv, topi = output.data.topk(1)
        _, topi_s = output.data.topk(2) 
        for k in range(topi.size()[1]):
          if topi[1:, k][0] == trg_field.vocab.stoi["<eos>"]:
            for i in range(topi_s.size()[0]):
              for l in range(topi_s.size()[1]):
                topi[i][l][0] = topi_s[i][l][1]
          for i in range(topi.size()[0] - 1):
            if src_field.vocab.itos[topi[1:, k][i]] == "<eos>":
              break
            tmp.append(src_field.vocab.itos[topi[1:, k][i]])
          pred.append(tmp)
          tmp = []
        in_str.append([src_field.vocab.itos[i.item()] for i in src[:,j] if src_field.vocab.itos[i.item()] != "<eos>"])
        out_str.append([trg_field.vocab.itos[i.item()] for i in trg[:,j] if trg_field.vocab.itos[i.item()] != "<eos>"])
      
  return in_str, out_str, pred

In [None]:
#中間発表時にはテストデータは用いない
test_in, test_out, test_pred = [],[],[]
test_in, test_out, test_pred = gen_sentence(test_iter, SRC, SRC, model, test_batch_size)
val_in, val_out, val_pred = [],[],[]
val_in, val_out, val_pred = gen_sentence(val_iter, SRC, SRC, model, eval_batch_size)
train_in, train_out, train_pred = [],[],[]
train_in, train_out, train_pred = gen_sentence(train_iter, SRC, SRC, model, train_batch_size)

In [None]:
import pandas as pd

In [None]:
def convert_list_to_df(in_list, out_list, pred_list):
  row = []
  for i in range(len(in_list)):
    batch_input = in_list[i]
    batch_output = out_list[i]
    batch_pred = pred_list[i]
    input = [j for j in batch_input if j != "<pad>" and j != "<sos>" and j != "<eos>" and j != "<unk>"]
    output = [j for j in batch_output if j != "<pad>" and j != "<sos>" and j != "<eos>" and j != "<unk>"]
    predict = [j for j in batch_pred if j != "<pad>" and j != "<sos>" and j != "<eos>" and j != "<unk>"]
    input_str = "".join(input)
    output_str ="".join(output)
    predict_str = "".join(predict)
    row.append([input_str, output_str, predict_str])

  df = pd.DataFrame(row, columns=["input","answer","predict"])
  df = df.sort_values('input')
  return df

In [None]:
train_df = convert_list_to_df(train_in, train_out, train_pred)
val_df = convert_list_to_df(val_in, val_out, val_pred)
test_df = convert_list_to_df(test_in, test_out, test_pred)

In [None]:
train_df

In [None]:
df_s = pd.concat([train_df, test_df]).sort_values('input')

In [None]:
df_s

In [None]:
df_s.to_csv("/content/dirve/My Drive/Colab Notebooks/result_seq2seq.csv")