# Parameter Settings
- Copy Language
- Attention = True
- Teacher Forcing Ratio = 0.5
- Layer = 1
- Batch size = 128
- Learning rate = 0.001
- Hidden units = 100
- Embedding size = 20
- Epochs = 100
- N = 50
- Data Length = 100K
- Alphabet size = 4
- Deduplication
- Position embedding: 'length', combined by concatenation
- Use stack memory
- LSTM

In [None]:
import os
import argparse
import logging
import sys
import json
import numpy as np

import torch
from torch.optim.lr_scheduler import StepLR
import torchtext

# Move the working directory two levels up so the project-relative imports
# and the "data/", "models/" and "log/" paths below resolve.
# '__file__' is a quoted string here, so its dirname is '' and abspath('')
# yields the current working directory.
_start_dir = os.path.abspath(os.path.dirname('__file__'))
_parent_dir = os.path.abspath(os.path.dirname(_start_dir))
os.chdir(os.path.dirname(_parent_dir))

from models.seq2seq import Seq2seq
from loss.loss import Perplexity
from evaluator.evaluator import Evaluator
from dataset import fields

import matplotlib.pyplot as plt

In [None]:
# Root logger setup: timestamp, level name padded to six characters, message.
LOG_FORMAT = '%(asctime)s %(levelname)-6s %(message)s'
log_level = 'info'
logging.basicConfig(
    level=getattr(logging, log_level.upper()),  # 'info' -> logging.INFO
    format=LOG_FORMAT,
)

In [None]:
# Experiment selection: which dataset was trained on, which one is evaluated,
# which RNN cell was used and which run indices to evaluate.
data_name = "copy"
dir_name = "Ctype4_N50"        # training data directory
dev_name = "Ctype4_N100"       # evaluation data directory
pretrained_dir_name = None     # set to a directory name to load pretrained position weights
rnn = "lstm"
iterator = list(range(1, 11))  # run indices 1..10

# One list of per-length F1 scores is appended here for every run.
f1_score_lists = []

# Paths are relative to the working directory.
data_path = "/".join(["data", data_name, dir_name])
dev_data_path = "/".join(["data", data_name, dev_name])
train_path = "%s/data_train.txt" % data_path
config_path = "models/config.json"

In [None]:
def _config_suffix(config):
    """Return the file-name fragment that encodes the model options in ``config``.

    The same fragment is part of both the checkpoint name and the
    pretrained-weights directory name, so it is built in one place.
    """
    return (("_att" if config["use_attention"] else "")
            + ("_with_pos_" + config["position_embedding"]
               if config["position_embedding"] is not None else "")
            + ("_cat" if config["pos_add"] == "cat" else "")
            + ("_use_stack" if config["use_memory"] == "stack" else "")
            + ("_use_queue" if config["use_memory"] == "queue" else "")
            + "_emb" + str(config["embedding_size"])
            + "_hidden" + str(config["hidden_size"]))


# For every run index in `iterator`, rebuild the model, load that run's saved
# checkpoint and evaluate it on the per-length dev files. Each run contributes
# one list of F1 scores (one entry per length) to `f1_score_lists`.
for i in iterator:
    print("RNN is %s" % rnn)
    print("data path: %s" % data_path)
    f1_score_list = []

    # Prepare dataset
    max_len = 105
    src = fields.SourceField()
    tgt = fields.TargetField()

    def len_filter(example):
        """Keep only examples whose source and target both fit in max_len."""
        return len(example.src) <= max_len and len(example.tgt) <= max_len

    train = torchtext.data.TabularDataset(
        path=train_path, format='tsv',
        fields=[('src', src), ('tgt', tgt)],
        filter_pred=len_filter
    )
    # The vocabularies are built from the training file; the dev sets below
    # reuse the same field objects.
    src.build_vocab(train)
    tgt.build_vocab(train)

    print("src vocab size = %d" % (len(src.vocab)))
    print("tgt vocab size = %d" % (len(tgt.vocab)))

    # Prepare loss
    weight = torch.ones(len(tgt.vocab))
    pad = tgt.vocab.stoi[tgt.pad_token]
    loss = Perplexity(weight, pad)
    if torch.cuda.is_available():
        loss.cuda()

    # Model
    evaluator = Evaluator(loss=loss, batch_size=32)

    # Read the base config with a context manager so the file handle is
    # closed, then override the options this experiment fixes.
    with open(config_path) as config_file:
        config = json.load(config_file)
    config["max_len"] = max_len
    config["hidden_size"] = 100
    config["rnn_cell"] = rnn
    config["embedding_size"] = 20
    config["use_attention"] = True
    config["position_embedding"] = "length"
    config["use_memory"] = "stack"
    config["pos_add"] = "cat"

    suffix = _config_suffix(config)
    save_path = (data_name + dir_name + suffix
                 + ("_pretrained" if pretrained_dir_name is not None else ""))

    print(json.dumps(config, indent=4))

    if pretrained_dir_name is not None:
        pretrained_path = ("pretrained_weights/" + data_name + pretrained_dir_name
                           + suffix + "_" + rnn + "_" + str(i))
        pretrained_pos_weight = np.load(pretrained_path + "/decoder_pos_weight.npy")
        seq2seq = Seq2seq(config, len(src.vocab), len(tgt.vocab), tgt.sos_id, tgt.eos_id,
                          pretrained_pos_weight)
    else:
        seq2seq = Seq2seq(config, len(src.vocab), len(tgt.vocab), tgt.sos_id, tgt.eos_id)

    if torch.cuda.is_available():
        seq2seq.cuda()

    # NOTE(review): this random init is presumably overwritten by
    # load_state_dict() below (assuming the checkpoint covers every
    # parameter); it is kept so the sequence of RNG draws is unchanged.
    for param in seq2seq.parameters():
        param.data.uniform_(-0.08, 0.08)

    lengths = list(range(2, 101, 2))
    print(lengths)

    # The checkpoint path does not depend on `length`, so read it from disk
    # once per run. map_location keeps the tensors on CPU so a checkpoint
    # saved on a GPU machine can still be read without CUDA;
    # load_state_dict() then copies the values into the model's parameters.
    log_path = "log/pth/" + save_path + "_" + rnn + "_" + str(i) + "_model_save.pth"
    state_dict = torch.load(log_path, map_location=lambda storage, loc: storage)

    for length in lengths:
        # Restore the saved weights before every evaluation, as the original
        # per-length reload did.
        seq2seq.load_state_dict(state_dict)
        seq2seq.eval()
        dev_path = dev_data_path + "/dev_length/data_test_length_" + str(length) + ".txt"
        dev = torchtext.data.TabularDataset(
            path=dev_path, format='tsv',
            fields=[('src', src), ('tgt', tgt)],
            filter_pred=len_filter
        )

        dev_loss, _, _, f1_score = evaluator.evaluate(seq2seq, dev)
        print("Length:%d, Dev Loss:%0.4f, F1 Score:%0.4f\n"
              % (length, dev_loss, f1_score))

        f1_score_list.append(f1_score)

    f1_score_lists.append(f1_score_list)

In [None]:
# Aggregate the F1 scores across runs, per evaluated length.
# f1_score_lists holds one list per run; zip(*...) regroups the scores so
# each tuple contains the scores of all runs at one length. With no runs
# recorded, the three result lists simply stay empty.
f1_score_max = []
f1_score_min = []
f1_score_avg = []
for scores_at_length in zip(*f1_score_lists):
    f1_score_max.append(max(scores_at_length))
    f1_score_min.append(min(scores_at_length))
    f1_score_avg.append(sum(scores_at_length, 0.0) / len(scores_at_length))

# Directory for the figure: log/eval/<save_path>/<rnn>.
# makedirs creates any missing parent directories (including "log/eval")
# and exist_ok avoids the check-then-create race of isdir() + mkdir().
fig_path = "log/eval/" + save_path + "/" + rnn
os.makedirs(fig_path, exist_ok=True)

In [None]:
# Plot, per evaluated length, the max and min F1 across runs as overlaid
# translucent bars and the average as a green line, then save the figure.
# The keyword must be lowercase `linewidth`: Matplotlib deprecated
# case-insensitive property names in 3.3 and later removed them, so
# `LineWidth` is rejected on current releases.
plt.figure(figsize=(15, 7))
plt.bar(lengths, f1_score_max, linewidth=2, label="Max", alpha=0.4)
plt.bar(lengths, f1_score_min, linewidth=2, label="Min", alpha=0.4)
plt.plot(lengths, f1_score_avg, '-g', linewidth=2, label="Avg")

plt.title(save_path + "_" + rnn, fontsize=24)
plt.legend(loc="best", fontsize=12)
plt.xlabel('Length', fontsize=24)
plt.ylabel('F1 Score', fontsize=24)
plt.ylim([0, 1.02])  # small headroom above the 1.0 upper limit
plt.grid()
plt.savefig(fig_path + '/length_to_f1_score.png', format='png', bbox_inches='tight', dpi=300)

In [None]:
# Show the per-length F1 averages (one value per evaluated length).
print(f1_score_avg)

In [None]:
# Mean of the per-length averages: a single summary F1 over all lengths.
overall_f1_total = sum(f1_score_avg, 0.0)
print(overall_f1_total / len(f1_score_avg))