In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import json
import string
import time
import math
import random
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu

In [2]:
# Seq2Seq model hyperparameter setting
SOS_token = 0
EOS_token = 1
vocab_num = 28
MAX_LENGTH = 21
plot_steps = 5

# Mode Setting
train_mode = True
display_mode = False
show_predict_text = False

data_root = Path('data')
model_root = Path('model')
image_root = Path('image')
log_root = Path('log')
model_name = "lstm_model-sgd"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout = dropout)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, src):
        embedded = self.dropout(self.embedding(src))
        outputs, (hidden, cell) = self.rnn(embedded)
        return hidden, cell

In [4]:
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        self.output_dim = output_dim
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout = dropout)
        self.fc_out = nn.Linear(hid_dim, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, input, hidden, cell):
        input = input.unsqueeze(0)
        embedded = self.dropout(self.embedding(input))
        output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        prediction = self.fc_out(output.squeeze(0))
        return prediction, hidden, cell

In [5]:
class Seq2Seq(nn.Module):
	def __init__(self, encoder, decoder, device):
		super().__init__()
		self.encoder = encoder
		self.decoder = decoder
		self.device = device
		
		assert encoder.hid_dim == decoder.hid_dim, \
			"Hidden dimensions of encoder and decoder must be equal!"
		assert encoder.n_layers == decoder.n_layers, \
			"Encoder and decoder must have equal number of layers!"
		
	def forward(self, src, trg, teacher_forcing_ratio = 0.5):
		batch_size = trg.shape[1]
		trg_len = trg.shape[0]
		trg_vocab_size = self.decoder.output_dim
		outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)
		hidden, cell = self.encoder(src)
		input = trg[0,:]
		for t in range(1, trg_len):
			output, hidden, cell = self.decoder(input, hidden, cell)
			outputs[t] = output
			teacher_force = random.random() < teacher_forcing_ratio
			top1 = output.argmax(1) 
			if torch.equal(top1, torch.ones_like(top1)):
				break
			input = trg[t] if teacher_force else top1

		return outputs

In [6]:
# context switch process
def ids2text(index_list):
	embed = ["<SOS>", "<EOS>"] + list("abcdefghijklmnopqrstuvwxyz")
	text = ""
	for id in index_list:
		if id == EOS_token:
			break
		if id == SOS_token:
			continue
		text += embed[id]
	return text

def tensor2text(output, target, batch_size):
	pred = torch.swapaxes(output, 0, 1)
	trg = torch.swapaxes(target, 0, 1)
	pred_texts = []
	trg_texts = []

	for j in range(batch_size):
		_, topi = pred[j].topk(1)
		pred_text = ids2text(topi.squeeze())
		trg_text = ids2text(trg[j])
		pred_texts.append(pred_text)
		trg_texts.append(trg_text)

	return pred_texts, trg_texts

# My data loader function
def my_data_loader(data_file, batch_size=4):
	# character embedding
	embed = dict(zip(string.ascii_lowercase, range(2, 28)))

	# data read from json file
	with open(data_file) as f:
		data = json.load(f)

	# mapping text from a-z to 1-27, 0 for SOS, 28 for EOS, 28 for PAD shared with EOS
	input_tensors = []
	target_tensors = []
	batch_pairs = []
	for item in data:
		input_ids = []

		# {'train': [A, B], 'target': [a]} => [A, a], [B, a]
		target_ids = [SOS_token] + [embed[c] for c in item["target"]] + [EOS_token]*(MAX_LENGTH-len(item["target"])-1)
		target = torch.tensor(target_ids).view(21, 1)

		for text in item['input']: # multiple words
			input_ids= [SOS_token] + [embed[c] for c in text] + [EOS_token]*(MAX_LENGTH-len(text)-1)
			input_tensors.append(torch.tensor(input_ids).view(21, 1))
			target_tensors.append(target)

	for i in range(0, len(input_tensors)-batch_size, batch_size):
		if i+batch_size > len(input_tensors):
			break
		input_tensor = torch.cat(input_tensors[i:i+batch_size], dim=1)
		target_tensor = torch.cat(target_tensors[i:i+batch_size], dim=1)
		batch_pairs.append((input_tensor, target_tensor))

	return batch_pairs

# Visualize of training progress
def train_visualize(train, valid, datatype="Loss"):
	plt.plot(train, label='Train')
	plt.plot(valid, label='Valid')

	plt.xlabel('Epochs')
	plt.ylabel(datatype)
	plt.title(f'Seq2Seq training {datatype}')
	plt.legend()
	plt.savefig(image_root / f'seq2seq_{datatype}-{model_name}.jpg')
	plt.clf()
	if display_mode:
		plt.show()

#compute BLEU-4 score
def compute_bleu(output, reference):
    cc = SmoothingFunction()
    if len(reference) == 3:
        weights = (0.33,0.33,0.33)
    else:
        weights = (0.25,0.25,0.25,0.25)
    return sentence_bleu([reference], output,weights=weights,smoothing_function=cc.method1)

In [7]:
# Model configuration
INPUT_DIM = vocab_num
OUTPUT_DIM = vocab_num
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HID_DIM = 512
N_LAYERS = 2
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5
BATCH_SIZE = 64
TEST_BATCH_SIZE = 1

# Model Training parameters
N_EPOCHS = 500
CLIP = 1

# Initialize model, encoder, and decoder
hidden_size = 256
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)
model = Seq2Seq(enc, dec, device).to(device)

# Define optimizer and criterion
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
criterion = nn.CrossEntropyLoss()

In [8]:
# Prepare data
train_iterator = my_data_loader(data_root / 'train.json', batch_size=BATCH_SIZE)
valid_iterator = my_data_loader(data_root / 'test.json', batch_size=TEST_BATCH_SIZE)
test_iterator = my_data_loader(data_root / 'new_test.json', batch_size=TEST_BATCH_SIZE)

In [9]:
# Train & Validate function
def train(model, iterator, optimizer, criterion, clip, batch_size):
    
	model.train()

	epoch_loss = 0
	epoch_bleus = 0

	for i, batch in enumerate(iterator):
		
		src = batch[0].to(device)
		trg = batch[1].to(device)
		
		optimizer.zero_grad()
		
		output = model(src, trg)
		
		#trg = [trg len, batch size]
		#output = [trg len, batch size, output dim]
		pred_text, trg_text = tensor2text(output, trg, batch_size)
		
		output_dim = output.shape[-1]
		
		output = output[1:].view(-1, output_dim)
		trg = trg[1:].view(-1)
		
		#trg = [(trg len - 1) * batch size]
		#output = [(trg len - 1) * batch size, output dim]
		

		loss = criterion(output, trg)
		
		loss.backward()
		
		torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
		
		optimizer.step()
		
		epoch_loss += loss.item()
		for p, t in zip(pred_text, trg_text):
			epoch_bleus += compute_bleu(p, t)
			
	return epoch_loss / len(iterator), epoch_bleus / (len(iterator) * batch_size)

def evaluate(model, iterator, criterion, batch_size, split='test', islog=False):

	model.eval()

	epoch_loss = 0
	epoch_bleus = 0
	acc = 0

	if islog:
		f = open(log_root / f'{split}-{model_name}_log.txt', 'w')

	with torch.no_grad():

		for i, batch in enumerate(iterator):

			src = batch[0].to(device)
			trg = batch[1].to(device)

			output = model(src, trg, 0) #turn off teacher forcing

			#trg = [trg len, batch size]
			#output = [trg len, batch size, output dim]
			pred_text, trg_text = tensor2text(output, trg, batch_size)
			pred_text, src_text = tensor2text(output, src, TEST_BATCH_SIZE)

			output_dim = output.shape[-1]
			
			output = output[1:].view(-1, output_dim)
			trg = trg[1:].view(-1)

			#trg = [(trg len - 1) * batch size]
			#output = [(trg len - 1) * batch size, output dim]

			loss = criterion(output, trg)
			
			epoch_loss += loss.item()
			for p, t, s in zip(pred_text, trg_text, src_text):
				if islog:
					f.write("="*20+"\n")
					f.write(f"input:  {s}\n")
					f.write(f"target: {t}\n")
					f.write(f"pred:   {p}\n")
				acc += compute_bleu(p, t) == 1
				epoch_bleus += compute_bleu(p, t)
	
	if islog:
		f.write(f"Bleu-4 score: {epoch_bleus / (len(iterator) * batch_size):4f}, Accuracy: {acc/(len(iterator) * batch_size):.4f}")
		f.close()
	return epoch_loss / len(iterator), epoch_bleus / (len(iterator) * batch_size), acc/(len(iterator) * batch_size)

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [10]:
# Training
best_valid_loss = float('inf')

if train_mode:
	train_losses = []
	valid_losses = []
	train_bleus = []
	valid_bleus = []

	for epoch in range(N_EPOCHS):
		
		start_time = time.time()
		
		train_loss, train_bleu = train(model, train_iterator, optimizer, criterion, CLIP, BATCH_SIZE)
		valid_loss, valid_bleu, valid_acc = evaluate(model, valid_iterator, criterion, TEST_BATCH_SIZE)
		
		end_time = time.time()
		
		epoch_mins, epoch_secs = epoch_time(start_time, end_time)
		
		if valid_loss < best_valid_loss:
			best_valid_loss = valid_loss
			torch.save(model.state_dict(), model_root / f'{model_name}.pt')
		
		print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
		print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f} | Train Bleu: {train_bleu:.3f}')
		print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f} | Val. Bleu: {valid_bleu:.3f} | Accuracy: {valid_acc:.4f}')
		train_losses.append(train_loss)
		valid_losses.append(valid_loss)
		train_bleus.append(train_bleu)
		valid_bleus.append(valid_bleu)
    
		if epoch % plot_steps == 0:
			train_visualize(train_losses, valid_losses, datatype="Loss")
			train_visualize(train_bleus, valid_bleus, datatype="Bleu")
	
	train_visualize(train_losses, valid_losses, datatype="Loss")
	train_visualize(train_bleus, valid_bleus, datatype="Bleu")

Epoch: 01 | Time: 0m 4s
	Train Loss: 3.329 | Train PPL:  27.908 | Train Bleu: 0.002
	 Val. Loss: 3.313 |  Val. PPL:  27.461 | Val. Bleu: 0.031 | Accuracy: 0.0000
Epoch: 02 | Time: 0m 5s
	Train Loss: 3.261 | Train PPL:  26.087 | Train Bleu: 0.023
	 Val. Loss: 3.225 |  Val. PPL:  25.144 | Val. Bleu: 0.024 | Accuracy: 0.0000
Epoch: 03 | Time: 0m 5s
	Train Loss: 3.213 | Train PPL:  24.845 | Train Bleu: 0.026
	 Val. Loss: 3.176 |  Val. PPL:  23.948 | Val. Bleu: 0.023 | Accuracy: 0.0000
Epoch: 04 | Time: 0m 5s
	Train Loss: 3.197 | Train PPL:  24.457 | Train Bleu: 0.029
	 Val. Loss: 3.205 |  Val. PPL:  24.660 | Val. Bleu: 0.027 | Accuracy: 0.0000
Epoch: 05 | Time: 0m 5s
	Train Loss: 3.174 | Train PPL:  23.912 | Train Bleu: 0.031
	 Val. Loss: 3.174 |  Val. PPL:  23.894 | Val. Bleu: 0.026 | Accuracy: 0.0000
Epoch: 06 | Time: 0m 5s
	Train Loss: 3.160 | Train PPL:  23.577 | Train Bleu: 0.031
	 Val. Loss: 3.171 |  Val. PPL:  23.841 | Val. Bleu: 0.026 | Accuracy: 0.0000
Epoch: 07 | Time: 0m 5s
	Tra

<Figure size 640x480 with 0 Axes>

In [11]:
# Evaluate
# Load checkpoint
model.load_state_dict(torch.load(model_root / f'{model_name}.pt'))

test_loss, test_bleu, test_acc = evaluate(model, valid_iterator, criterion, TEST_BATCH_SIZE, split='test', islog=True)

print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} | Test Bleu: {test_bleu:.3f} | Accuracy: {test_acc:.4f}')

new_test_loss, new_test_bleu, new_test_acc = evaluate(model, test_iterator, criterion, TEST_BATCH_SIZE, split='new_test', islog=True)

print(f'| New Test Loss: {new_test_loss:.3f} | New Test PPL: {math.exp(new_test_loss):7.3f} | New Test Bleu: {new_test_bleu:.3f} | Accuracy: {new_test_acc:.4f}')

| Test Loss: 1.900 | Test PPL:   6.686 | Test Bleu: 0.919 | Accuracy: 0.8776
| New Test Loss: 2.240 | New Test PPL:   9.394 | New Test Bleu: 0.642 | Accuracy: 0.4490


: 