# Download and Install Required Dependencies

In [1]:
!pip install --upgrade matplotlib rouge spacy
!python3 -m spacy download en_core_web_sm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Collecting spacy
  Downloading spacy-3.5.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.6/6.6 MB[0m [31m91.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: rouge, spacy
  Attempting uninstall: spacy
    Found existing installation: spacy 3.5.2
    Uninstalling spacy-3.5.2:
      Successfully uninstalled spacy-3.5.2
Successfully installed rouge-1.0.1 spacy-3.5.3
2023-06-03 06:53:20.693491: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-06-03 06:53:20.746626: I tensorflow/core/platform/cpu_feature_gua

# Import Required Dependencies

In [2]:
import math
import time
import spacy
import sys
import os
import random
import pandas as pd
from tqdm import tqdm
from collections import Counter

import matplotlib.pyplot as plt

if sys.version_info[0] < 3:
    from StringIO import StringIO
else:
    from io import StringIO

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

# Mount Google Drive and Set Working Directory

In [3]:
from google.colab import drive
drive.mount('/content/drive')

project_folder = 'hw3'
drive_path = '/content/drive/MyDrive'

os.chdir(os.path.join(drive_path, project_folder))

Mounted at /content/drive


# Set Random Seed for Reproducible Results

In [4]:
SEED = 32
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Set GPU

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"using {device}")

using cuda


# Set Hyper-Parameters and Config

In [6]:
START = "<START>"
STOP = "<STOP>"
PADDING = "<PAD>"
UNK_TOKEN = "<UNK>"
DEVICE = device
TRUNCATE_SIZE = 512
MAX_VOCAB_SIZE = 512
LEAST_FREQ = 0
BATCH_SIZE = 8
ENC_EMB_DIM = 64
DEC_EMB_DIM = 64
ENC_HID_DIM = 128
DEC_HID_DIM = 128
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5
IMPROVEMENT = False
if IMPROVEMENT:
  NAME = "improvement"
else:
  NAME = "baseline"
LINE_CONSTRAINT = 50000
spacy_en = spacy.load("en_core_web_sm")

# Helper Functions

In [7]:
def display_stage(stage_title: str):
    """Display stage title with padding."""
    total_length = 100
    title_length = len(stage_title)
    pad_count = int((total_length - title_length) / 2)
    padding = "*" * pad_count
    formatted_title = f"{padding}{stage_title}{padding}"
    print(formatted_title)


def epoch_time(start_time: float, 
               end_time: float
            ):
    """Calculate the time in minutes and seconds for each epoch."""
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs


def read_files(file_name, lines_constraint=None):
    results = list()
    with open(file_name, encoding="utf-8") as f:
        count = 0
        for line in f:
            results.append(line.replace("\n", ""))
            if lines_constraint:
                count += 1
                if count >= lines_constraint:
                    break
    return results


def write_predictions(preds, split, name):
    with open(f"./{split}_summaries.txt", "w") as f:
        f.write("\n".join(preds))

# Set FilePaths and Load Data

In [8]:
SRC_FILENAME = {
    "train": "./data/train.txt.src",
    "train_truncate": "./data/truncated_train.txt.src",
    "dev": "./data/val.txt.src",
    "test": "./data/test.txt.src",
}
TGT_FILENAME = {
    "train": "./data/train.txt.tgt",
    "dev": "./data/val.txt.tgt",
    "test": "./data/test.txt.tgt",
}

def add_special_tokens(tokens):
    return [START] + tokens.split() + [STOP]


def build_data_points(data_df):
    return data_df.apply(lambda row: (add_special_tokens(row["text"]), add_special_tokens(row["summary"])), axis=1)

def load_data(split, line_constraint=None, truncate=False):
    inputs = read_files(
        SRC_FILENAME[f"{split}_truncate"]
        if truncate and split == "train"
        else SRC_FILENAME[split],
        line_constraint,
    )
    outputs = read_files(TGT_FILENAME[split], line_constraint)
    df = pd.DataFrame({"text": inputs, "summary": outputs})
    max_len_summary = max(
        outputs, key=lambda summary: len(summary.split())
    ).split()
    max_len = len(max_len_summary)
    data_point_df = build_data_points(df)
    return data_point_df, max_len

# CNN Dataset

In [9]:
class CNNDataset(Dataset):
    def __init__(self, data):
        self.X = list()
        self.y = list()
        self.raw_y = list()
        for text, summary in data:
            if len(text) > 0:
                self.X.append(text)
                self.y.append(summary)

    def __len__(self):
        return len(self.y)

    def __getitem__(self, index):
        return self.X[index], self.y[index], index


def reverse_map(_map):
    reversed_map = {}
    for key, val in _map.items():
        reversed_map[val] = key
    return reversed_map

# Create Vocabulary

In [10]:
class Vocab:
    def __init__(self, tokens, base_map={}, max_size=None, least_freq=0):
        self.token2idx = base_map
        self.freq = Counter(
            [token for sequence in tokens for token in sequence]
        )

        vocab_size = 0
        # store the token start from higher frequency
        for word, count in sorted(
            self.freq.items(), key=lambda item: item[1], reverse=True
        ):
            if count < least_freq:
                break
            if max_size is not None and vocab_size > max_size:
                break
            self.insert(word)
            vocab_size += 1

        self.idx2token = reverse_map(self.token2idx)

    def insert(self, token):
        if token in self.token2idx.keys():
            return
        self.token2idx[token] = len(self.token2idx)

    def lookup_index(self, word):
        if word not in self.token2idx.keys():
            word = UNK_TOKEN
        return self.token2idx[word]

    def lookup_token(self, idx):
        return self.idx2token[idx]

    def __len__(self):
        return len(self.token2idx)

    def __repr__(self):
        return str(self.token2idx)

# Load Data and Create DataLoaders

In [11]:
display_stage("Load Training Data")
train_data, _ = load_data("train", LINE_CONSTRAINT)[:]
display_stage("Load Dev Data")
dev_data, dev_summary_max_len = load_data("dev")[:]
display_stage("Load Test Data")
test_data, test_summary_max_len = load_data("test")[:]

summary_max_len = max(dev_summary_max_len, test_summary_max_len)

train_set = CNNDataset(train_data)
dev_set = CNNDataset(dev_data)
test_set = CNNDataset(test_data)

display_stage("Building Vocab")
word_vocab = Vocab(
    train_set.X + train_set.y,
    base_map={PADDING: 0, UNK_TOKEN: 1},
    max_size=MAX_VOCAB_SIZE,
    least_freq=LEAST_FREQ,
)

print("Vocab Length=", len(word_vocab))


def text_pipeline(sentence, truncate_size=TRUNCATE_SIZE):
    if len(sentence) > truncate_size:
        sentence = sentence[:truncate_size]
    return [word_vocab.lookup_index(token) for token in sentence]


def collate_batch(batch):
    summary_list, text_list, index_list = list(), list(), list()
    for _text, _summary, _index in batch:
        text_list.append(torch.tensor(text_pipeline(_text), dtype=torch.long,))
        summary_list.append(
            torch.tensor(
                text_pipeline(_summary, truncate_size=len(_summary)),
                dtype=torch.long,
            )
        )
        index_list.append(torch.tensor(_index, dtype=torch.long))

    len_list = torch.tensor(list(map(len, text_list)), dtype=torch.long)
    text_list = pad_sequence(text_list, batch_first=True, padding_value=0)
    summary_list = pad_sequence(
        summary_list, batch_first=True, padding_value=0
    )
    index_list = torch.tensor(index_list, dtype=torch.long)

    # sort the batch according to the sequence length in the descending order
    len_list, perm_idx = len_list.sort(0, descending=True)
    text_list = text_list[perm_idx]
    summary_list = summary_list[perm_idx]
    index_list = index_list[perm_idx]

    return text_list.to(DEVICE), summary_list.to(DEVICE), len_list, index_list


def get_data_loader(batch_size: int = 1, set_name="train"):
    assert set_name in ["train", "dev", "test"]
    if set_name == "train":
        return DataLoader(dataset=train_set, batch_size=batch_size, collate_fn=collate_batch, shuffle=False)
    elif set_name == "dev":
        return DataLoader(dataset=dev_set, batch_size=batch_size, collate_fn=collate_batch, shuffle=False)
    else:
        return DataLoader(dataset=test_set, batch_size=batch_size, collate_fn=collate_batch, shuffle=False,)

*****************************************Load Training Data*****************************************
*******************************************Load Dev Data*******************************************
*******************************************Load Test Data*******************************************
*******************************************Building Vocab*******************************************
Vocab Length= 515


# Encoder Class

In [12]:
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.GRU(emb_dim, enc_hid_dim, bidirectional=True)
        self.fc = nn.Linear(enc_hid_dim * 2, dec_hid_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src, src_len):
        embedded = self.dropout(self.embedding(src))
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, src_len.to("cpu"))
        packed_outputs, hidden = self.rnn(packed_embedded)
        outputs, _ = nn.utils.rnn.pad_packed_sequence(packed_outputs)
        hidden = torch.tanh(self.fc(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)))
        return outputs, hidden

# Attention Mechanism and Decoder Class

In [13]:
class Attention(nn.Module):
    def __init__(self, enc_hid_dim, dec_hid_dim):
        super().__init__()
        self.attn = nn.Linear((enc_hid_dim * 2) + dec_hid_dim, dec_hid_dim)
        self.v = nn.Linear(dec_hid_dim, 1, bias=False)

    def forward(self, hidden, encoder_outputs, mask):
        batch_size = encoder_outputs.shape[1]
        src_len = encoder_outputs.shape[0]
        hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim=2)))
        attention = self.v(energy).squeeze(2)
        attention = attention.masked_fill(mask == 0, -1e10)
        return F.softmax(attention, dim=1)


class Decoder(nn.Module):
    def __init__(
        self, output_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout, attention
    ):
        super().__init__()
        self.output_dim = output_dim
        self.attention = attention
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.GRU((enc_hid_dim * 2) + emb_dim, dec_hid_dim)
        self.fc_out = nn.Linear((enc_hid_dim * 2) + dec_hid_dim + emb_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, encoder_outputs, mask):
        input = input.unsqueeze(0)
        embedded = self.dropout(self.embedding(input))
        a = self.attention(hidden, encoder_outputs, mask)
        a = a.unsqueeze(1)
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        weighted = torch.bmm(a, encoder_outputs)
        weighted = weighted.permute(1, 0, 2)
        rnn_input = torch.cat((embedded, weighted), dim=2)
        output, hidden = self.rnn(rnn_input, hidden.unsqueeze(0))
        assert (output == hidden).all()
        embedded = embedded.squeeze(0)
        output = output.squeeze(0)
        weighted = weighted.squeeze(0)
        prediction = self.fc_out(torch.cat((output, weighted, embedded), dim=1))
        return prediction, hidden.squeeze(0), a.squeeze(1)

# Seq2Seq Model

In [14]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, src_pad_idx, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_pad_idx = src_pad_idx
        self.device = device

    def create_mask(self, src):
        mask = (src != self.src_pad_idx).permute(1, 0)
        return mask

    def forward(self, src, src_len, trg, teacher_forcing_ratio=0.5):
        batch_size = src.shape[1]
        trg_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)
        encoder_outputs, hidden = self.encoder(src, src_len)
        input = trg[0, :]
        mask = self.create_mask(src)
        for t in range(1, trg_len):
            output, hidden, _ = self.decoder(input, hidden, encoder_outputs, mask)
            outputs[t] = output
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.argmax(1)
            input = trg[t] if teacher_force else top1
        return outputs

# Truncate

In [16]:
def truncate(tokens, size):
    if size <= 0:
        return list()
    if len(tokens) <= size:
        return tokens
    to_remove_count = int((len(tokens) - size) / 2)
    return tokens[to_remove_count:-to_remove_count]


def apply_truncate(row, truncate_size):
    text = list(row["text"])
    if row["text_len"] <= truncate_size:
        return text
    clean_text = row["clean_text"]
    clean_text_len = row["clean_text_len"]
    if clean_text_len <= truncate_size:
        return clean_text
    sents = row["sents"]
    truncated_text = list()
    for sent in sents:
        tokens = [token for token in list(sent) if is_clean(token)]
        size = round(truncate_size * len(tokens) / clean_text_len)
        truncated_text += truncate(tokens, size)
    return truncated_text


def is_clean(spacy_token):
    return (
        not spacy_token.is_stop
        and not spacy_token.is_punct
        and len(str(spacy_token).strip()) > 0
    )


def apply_spacy(df):
    results = list()
    for text in tqdm(df["text"]):
        spacy_text = spacy_en(text)
        results += [spacy_text]
    return results


def get_truncated_df(df):
    display_stage("applying spacy")
    df["text"] = apply_spacy(df)
    df["text_len"] = df["text"].apply(len)
    display_stage("cleaning text")
    df["clean_text"] = df["text"].apply(
        lambda doc: [token for token in list(doc) if is_clean(token)]
    )
    df["clean_text_len"] = df["clean_text"].apply(len)

    display_stage("sentencizing")
    df["sents"] = df["text"].apply(lambda doc: list(doc.sents))

    display_stage("truncating")
    average_text_len = int(df["clean_text_len"].median())
    df["truncated_text"] = df.apply(
        lambda row: apply_truncate(row, average_text_len), axis=1
    )
    df["truncated_text_len"] = df["truncated_text"].apply(len)
    df["truncated_full_text"] = df["truncated_text"].apply(
        lambda tokens: " ".join([str(token) for token in list(tokens)])
    )
    return df


def write_truncated_train_file(df):
    display_stage("writing to file")
    file_name = "./truncated_train.txt.src"
    with open(file_name, "w") as f:
        for text in tqdm(df):
            f.write(f"{text}\n")


def generate_truncated_dataset():
    inputs = read_files(SRC_FILENAME["train"])
    df = pd.DataFrame({"text": inputs})
    df = get_truncated_df(df)
    write_truncated_train_file(df["truncated_full_text"])

generate_truncated_dataset()

# Inference

In [17]:
METRICS = ["epoch", "loss", "perplexity"] + [
    f"rouge-{x}-{m}"
    for x in ["1", "2", "l"]
    for m in ["precision", "recall", "f1"]
]


def init_report():
    return {metric: list() for metric in METRICS}

def inference(model, loader, device):
    model.eval()
    preds = list()
    for batch in tqdm(loader):
        text, _, text_len, _ = batch
        text = text = text.view(-1, 1)
        pred = summarize(text, text_len, model, device)
        preds.append(" ".join(pred))
    return preds


def summarize(text_tensor, lens, model, device, max_len=50):
    model.eval()
    with torch.no_grad():
        encoder_outputs, hidden = model.encoder(text_tensor, lens)
    mask = model.create_mask(text_tensor)
    summary_idxes = [word_vocab.lookup_index(START)]
    attentions = torch.zeros(max_len, 1, len(text_tensor)).to(device)
    for i in range(max_len):
        last_output = torch.LongTensor([summary_idxes[-1]]).to(device)
        with torch.no_grad():
            output, hidden, attention = model.decoder(last_output, hidden, encoder_outputs, mask)
        attentions[i] = attention
        pred_token = output.argmax(1).item()
        summary_idxes.append(pred_token)
        if pred_token == word_vocab.lookup_index(STOP):
            break
    summary_tokens = [word_vocab.lookup_token(idx) for idx in summary_idxes]
    return summary_tokens

# Evaluate

In [18]:
from rouge import Rouge
rouge = Rouge()

sys.setrecursionlimit(summary_max_len * summary_max_len + 10)

def calculate_rouges(preds, golds):

    result = {}
    scores = rouge.get_scores(preds, golds)
    score_df = pd.DataFrame(scores)
    for k in ["1", "2", "l"]:
        for m in ["p", "r", "f"]:
            key = f"rouge-{k}-{m}"
            value = (score_df[f"rouge-{k}"].apply(lambda score_dict: score_dict[m]).mean())
            result[key] = value
    return result


def evaluate_test(name):
    display_stage("Modeling")
    INPUT_DIM = len(word_vocab)
    OUTPUT_DIM = len(word_vocab)
    SRC_PAD_IDX = 0
    TRG_PAD_IDX = 0
    test_loader = get_data_loader(1, "test")
    test_golds = test_data.apply(lambda data: " ".join(data[1][1:])).tolist()
    attn = Attention(ENC_HID_DIM, DEC_HID_DIM)
    enc = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, ENC_DROPOUT)
    dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, DEC_DROPOUT, attn)
    model = Seq2Seq(enc, dec, SRC_PAD_IDX, DEVICE).to(DEVICE)
    display_stage("Evaluate Test Set")
    model.load_state_dict(torch.load(f"{name}-best-model.pt"))
    test_predictions = inference(model, test_loader, DEVICE)
    write_predictions(test_predictions, "test", name)

# Train

In [None]:
def init_weights(m):
    for name, param in m.named_parameters():
        if "weight" in name:
            nn.init.normal_(param.data, mean=0, std=0.01)
        else:
            nn.init.constant_(param.data, 0)


def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


def training_pipeline(name="seq2seq"):
    display_stage("Modeling")
    INPUT_DIM = len(word_vocab)
    OUTPUT_DIM = len(word_vocab)
    SRC_PAD_IDX = 0
    TRG_PAD_IDX = 0

    attn = Attention(ENC_HID_DIM, DEC_HID_DIM)
    enc = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, ENC_DROPOUT)
    dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, DEC_DROPOUT, attn)

    model = Seq2Seq(enc, dec, SRC_PAD_IDX, DEVICE).to(DEVICE)
    model.apply(init_weights)
    print(model)
    print(f"The model has {count_parameters(model):,} trainable parameters")
    # Adam optimizer
    optimizer = optim.Adam(model.parameters())
    # Cross entropy loss
    criterion = nn.CrossEntropyLoss(ignore_index=TRG_PAD_IDX)

    display_stage(f"Training {name}")
    
    # Number of epochs
    N_EPOCHS = 3
    CLIP = 1

    # Load the data loader
    train_loader = get_data_loader(BATCH_SIZE, "train")
    val_loader = get_data_loader(BATCH_SIZE, "dev")
    test_loader = get_data_loader(1, "test")

    dev_golds = dev_data.apply(lambda data: " ".join(data[1][1:])).tolist()
    test_golds = test_data.apply(lambda data: " ".join(data[1][1:])).tolist()

    best_valid_loss = float("inf")
    best_dev_predictions = list()
    best_dev_scores = None
    train_report = init_report()
    valid_report = init_report()

    for epoch in range(N_EPOCHS):
        start_time = time.time()
        train_loss = train(model, train_loader, optimizer, criterion, CLIP)
        valid_loss, dev_output = evaluate(model, val_loader, criterion)
        end_time = time.time()
        epoch_mins, epoch_secs = epoch_time(start_time, end_time)
        dev_predictions = list()
        for dev_pred in dev_output:
            dev_pred_indexes = dev_pred.argmax(1)
            dev_pred = [word_vocab.lookup_token(idx.item()) for idx in dev_pred_indexes]
            dev_pred_str = " ".join(dev_pred[1:])
            dev_predictions.append(dev_pred_str)
        rouge_scores = calculate_rouges(dev_predictions, dev_golds)

        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            best_dev_predictions = dev_predictions
            write_predictions(best_dev_predictions, "dev", name)
            torch.save(model.state_dict(), f"{name}-best-model.pt")

        train_ppl = math.exp(train_loss)
        train_report["epoch"].append(epoch)
        train_report["loss"].append(train_loss)
        train_report["perplexity"].append(train_ppl)
        valid_ppl = math.exp(valid_loss)
        valid_report["epoch"].append(epoch)
        valid_report["loss"].append(valid_loss)
        valid_report["perplexity"].append(valid_ppl)
        for k in ["1", "2", "l"]:
            for m in ["precision", "recall", "f1"]:
                valid_report[f"rouge-{k}-{m}"].append(
                    rouge_scores[f"rouge-{k}-{m[0]}"]
                )
        print(f"Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s")
        print(f"\tTrain Loss: {train_loss:.3f} | Train PPL: {train_ppl:7.3f}")
        print(f"\t Dev Loss: {valid_loss:.3f} |  Dev PPL: {valid_ppl:7.3f}")

    display_stage("Evaluate Test Set")
    model.load_state_dict(torch.load(f"{name}-best-model.pt"))
    test_predictions = inference(model, test_loader, DEVICE)
    write_predictions(test_predictions, "test", name)


# train the model
def train(model, loader, optimizer, criterion, clip):
    model.train()
    epoch_loss = 0
    for batch in tqdm(loader):
        text, summary, text_len, _ = batch
        batch_size = text.shape[0]
        text = text.view(-1, batch_size)
        summary = summary.view(-1, batch_size)
        optimizer.zero_grad()
        output = model(text, text_len, summary)
        output_dim = output.shape[-1]
        output = output[1:].view(-1, output_dim)
        summary = summary[1:].view(-1)
        # calculate the loss
        loss = criterion(output, summary)
        # back prop
        loss.backward()
        # clip gradient for avoiding gradient explosion
        nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / len(loader)


def tensor_to_sentences(batch_token_tensor):
    return [
        " ".join(
            [
                word_vocab.lookup_token(token.item())
                for token in batch_token_tensor[:, batch_idx]
            ]
        )
        for batch_idx in range(batch_token_tensor.shape[1])
    ]


def evaluate(model, loader, criterion):
    model.eval()
    epoch_loss = 0
    outputs = list()
    with torch.no_grad():
        for batch in tqdm(loader):
            text, summary, text_len, _ = batch
            batch_size = text.shape[0]
            text = text.view(-1, batch_size)
            summary = summary.view(-1, batch_size)
            output = model(text, text_len, summary, 0)
            output_dim = output.shape[-1]
            for batch_idx in range(batch_size):
                outputs.append(output[:, batch_idx, :])
            output = output[1:].view(-1, output_dim)
            summary = summary[1:].view(-1)
            loss = criterion(output, summary)
            epoch_loss += loss.item()
    return epoch_loss / len(loader), outputs

display_stage(NAME)
training_pipeline(NAME)

**********************************************baseline**********************************************
**********************************************Modeling**********************************************
Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(515, 64)
    (rnn): GRU(64, 128, bidirectional=True)
    (fc): Linear(in_features=256, out_features=128, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (attention): Attention(
      (attn): Linear(in_features=384, out_features=128, bias=True)
      (v): Linear(in_features=128, out_features=1, bias=False)
    )
    (embedding): Embedding(515, 64)
    (rnn): GRU(320, 128)
    (fc_out): Linear(in_features=448, out_features=515, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)
The model has 701,251 trainable parameters
*****************************************Training baseline*****************************************


100%|██████████| 6250/6250 [18:03<00:00,  5.77it/s]
100%|██████████| 1671/1671 [02:48<00:00,  9.93it/s]


Epoch: 01 | Time: 20m 51s
	Train Loss: 3.532 | Train PPL:  34.185
	 Val. Loss: 3.585 |  Val. PPL:  36.071


 11%|█         | 681/6250 [01:55<15:43,  5.90it/s]


KeyboardInterrupt: ignored

# Generate Rouge Scores on Test Dataset from Predictions

In [None]:
!python3 ./evaluate.py test_summaries.txt

Val {'rouge-1': {'r': 0.5053851792910492, 'p': 0.12835899534670617, 'f': 0.19760514884490066}, 'rouge-2': {'r': 0.06536187460399981, 'p': 0.023953552731043302, 'f': 0.03344606138963046}, 'rouge-l': {'r': 0.4815303212127281, 'p': 0.12205303180572001, 'f': 0.1879470677062643}}
