In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import os
import json
import csv
import html
from transformers import BertTokenizer, BertModel
from tqdm.notebook import tqdm
from time import sleep

In [2]:
# Prefer the GPU when CUDA is usable; otherwise everything runs on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
class Scorer(nn.Module):
    """Scores (article, summary) text pairs with BERT plus a linear head.

    The pair is tokenized as a two-segment input for ``bert-base-uncased``;
    the pooled BERT output is projected to a single value by ``self.fc``.
    """

    def __init__(self):
        super(Scorer, self).__init__()
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.model = BertModel.from_pretrained('bert-base-uncased')
        # Linear head: pooled BERT representation -> one scalar score.
        self.fc = nn.Linear(self.model.config.hidden_size, 1)

    def forward(self, article, summary):
        """Score each article/summary pair.

        Args:
            article: list of article strings.
            summary: list of summary strings, paired with ``article`` by position.

        Returns:
            The output of the linear head; its last dimension has size 1.
        """
        inputs = self.tokenizer(
            article,
            summary,
            padding='longest',
            truncation="longest_first",
            return_tensors='pt',
        )
        # Follow the module's own parameters instead of a notebook-level global,
        # so inputs and weights always live on the same device.
        inputs = inputs.to(self.fc.weight.device)
        outputs = self.model(**inputs)
        return self.fc(outputs.pooler_output)

In [4]:
# testm = Scorer()
# outputs = testm.forward(['Today is a great day', 'Today is not a good day'], ['Good day', 'Bad day'])
# print(outputs)



In [5]:
class Siamese(nn.Module):
    """Scores two candidate summaries of the same articles with one shared Scorer."""

    def __init__(self):
        super().__init__()
        # One Scorer instance handles both candidates, so the weights are shared.
        self.base_model = Scorer()

    def forward(self, article, summary1, summary2):
        """Return the scores of ``summary1`` and ``summary2`` side by side.

        The base model is called once per candidate (first ``summary1``, then
        ``summary2``) and the two results are concatenated on the last dimension.
        """
        scores = [
            self.base_model(article, candidate)
            for candidate in (summary1, summary2)
        ]
        return torch.cat(scores, dim=-1)

In [6]:
# testm = Siamese()
# outputs = testm.forward(['Today is a great day', 'Today is not a good day'], ['Good day', 'Bad day'], ['Bad day', 'Good day'])
# print(outputs)

In [7]:
# DATASET='billsum'
# DATASET_ROOT= '../exp/data/'
# METHOD = 'ordered_siam'

In [8]:
class CustomDataset(Dataset):
    """Triples of (article, summary, next summary) read from a TSV file.

    Each line holds an article in the first column followed by summary
    columns. Every pair of adjacent summary columns on a line yields one
    ``[article, summary_i, summary_i+1]`` entry.
    """

    def __init__(self, datapath, nums=None):
        """Load the triples.

        Args:
            datapath: path of the UTF-8 encoded TSV file.
            nums: optional cap on the number of lines read. ``None`` reads the
                whole file; so does any value below 1, because the cap only
                triggers once exactly ``nums`` lines have been consumed.
        """
        self.data = []
        with open(datapath, "r", encoding="utf-8") as f:
            for line_no, line in enumerate(f, start=1):
                # Strip the line terminator so it does not stay attached to
                # the last summary column.
                elements = line.rstrip('\n').split('\t')
                article = elements[0]
                for first, second in zip(elements[1:], elements[2:]):
                    self.data.append([article, first, second])
                # Limit the number of lines used
                if nums is not None and line_no == nums:
                    break

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        article, first, second = self.data[idx]
        return article, first, second

In [9]:
# train_set = CustomDataset(os.path.join(DATASET_ROOT, DATASET, METHOD, 'train.tsv'))
# print(len(train_set))
# train_dataloader = DataLoader(train_set, batch_size=8, shuffle=True)

In [10]:
# model = Siamese()
# model.to(device)

In [11]:
def train_model(model, train_set, max_iter=10000, tune=False,
                batch_size=12, lr=1e-5, target_acc=0.9):
    """Train ``model`` so the first summary of each triple outscores the second.

    One pass over ``train_set`` is always run (capped at ``max_iter`` batches).
    With ``tune=True`` further passes follow: after each pass the fraction of
    training triples whose first score is greater than the second is printed,
    and training stops once that fraction exceeds ``target_acc``. There is no
    upper bound on the number of tuning passes.

    Args:
        model: module called as ``model(articles, summaries1, summaries2)`` and
            returning two scores per triple.
        train_set: dataset yielding ``(article, summary1, summary2)`` triples.
        max_iter: maximum number of batches per pass.
        tune: keep training until the training-set accuracy exceeds ``target_acc``.
        batch_size: batch size of the training loader.
        lr: Adam learning rate.
        target_acc: accuracy threshold that ends tuning.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    loss_fn = nn.CrossEntropyLoss()
    train_dataloader = DataLoader(train_set, batch_size=batch_size, shuffle=True)

    while True:
        model.train()
        with tqdm(total=min(len(train_dataloader), max_iter)) as pbar:
            for j, (article, sum1, sum2) in enumerate(train_dataloader):
                if j >= max_iter:
                    break
                output = model(article, sum1, sum2)
                # Target class 0 for every row: cross-entropy over the two
                # scores pushes the first summary's score above the second's.
                # Built on the output's device so no global is needed.
                labels = torch.zeros(len(article), dtype=torch.long, device=output.device)
                loss = loss_fn(output, labels)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                pbar.update(1)

        if not tune:
            break

        # Overfit on train set: measure how often the first summary wins.
        model.eval()
        correct = 0
        # Pure inference, so skip autograd bookkeeping.
        with torch.no_grad():
            for i in tqdm(range(len(train_set))):
                article, sum1, sum2 = train_set[i]
                output = model([article], [sum1], [sum2])
                if output[0][0] > output[0][1]:
                    correct += 1

        acc = correct / len(train_set)
        print(acc)
        if acc > target_acc:
            break
        

In [12]:
# CKPT_PATH = os.path.join("../exp/result_bert_base_uncased", DATASET, METHOD, "model.pth")
# if not os.path.exists(os.path.dirname(CKPT_PATH)):
#     os.makedirs(os.path.dirname(CKPT_PATH))

# scorer = model.base_model
# torch.save(scorer.state_dict(), CKPT_PATH)
# scorer.eval()

In [13]:
# Evaluate for TAC2010
def evaluate_tac(json_file, output_path, scorer):
    """Score every (article, summary) pair of a TAC JSON file.

    The JSON maps each docset to ``"articles"`` (each article a list of
    strings) and ``"summaries"`` (per summarizer, a ``"sentences"`` list).
    One score per line is written to ``output_path``, iterating docsets, then
    articles, then summarizers.

    Args:
        json_file: path of the UTF-8 encoded JSON input.
        output_path: file that receives one score per line.
        scorer: callable taking ``([article], [summary])`` and returning a
            tensor whose ``[0][0]`` element is the score.
    """

    def _flatten(parts):
        # Join the pieces, turn newlines/tabs into spaces, never return "".
        text = " ".join(parts).replace("\n", " ").replace("\t", " ")
        return text if len(text) > 0 else " ."

    # Read the input inside a context manager so the handle is closed.
    with open(json_file, 'r', encoding="utf-8") as source:
        tac = json.load(source)

    # NOTE(review): the previous code also built 400-word / 200-word truncated
    # copies of the article and summary but never passed them to the scorer.
    # Full texts are still scored here so existing results stay reproducible;
    # confirm whether truncation was intended.
    with open(output_path, "w") as f:
        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            for docset in tac.keys():
                # Summaries do not depend on the article, so prepare them once.
                summaries = [
                    _flatten(entry['sentences'])
                    for entry in tac[docset]["summaries"].values()
                ]
                for article_parts in tac[docset]["articles"]:
                    article = _flatten(article_parts)
                    for summary in summaries:
                        label = scorer([article], [summary]).detach().cpu().numpy()[0][0]
                        f.write(str(label) + "\n")

In [14]:
# evaluate_tac("TAC2010_all.json", os.path.join("../exp/result_bert_base_uncased", DATASET, METHOD, "test_results_tac.tsv"))

In [15]:
def evaluate_newsroom(csv_file, output_path, scorer):
    """Score the (document, summary) pairs of a Newsroom-style CSV file.

    The first row is treated as a header and skipped. For every other
    non-empty row, columns 2 and 3 are taken as document and summary,
    ``"</p><p>"`` markers are removed and HTML entities are unescaped before
    scoring. One score per line is written to ``output_path``.

    Args:
        csv_file: path of the UTF-8 encoded CSV input.
        output_path: file that receives one score per line.
        scorer: callable taking ``([doc], [summary])`` and returning a tensor
            whose ``[0][0]`` element is the score.
    """
    with open(output_path, "w") as f:
        # newline="" is what the csv module expects, so line breaks inside
        # quoted fields are parsed by the reader itself.
        with open(csv_file, "r", encoding="utf-8", newline="") as csvfile:
            reader = csv.reader(csvfile, delimiter=",", quotechar="\"")
            next(reader, None)  # skip the header row
            # Inference only: no autograd graph is needed.
            with torch.no_grad():
                for row in reader:
                    if not row:
                        # csv.reader yields [] for blank lines; nothing to score.
                        continue
                    _doc, _sum = (
                        html.unescape(cell.replace("</p><p>", ""))
                        for cell in row[2:4]
                    )

                    label = scorer([_doc], [_sum]).detach().cpu().numpy()[0][0]
                    f.write(str(label) + "\n")

In [16]:
# evaluate_newsroom("newsroom-human-eval.csv", os.path.join("../exp/result_bert_base_uncased", DATASET, METHOD, "test_results_newsroom.tsv"))

In [17]:
def evaluate_realsumm(tsv_file, output_path, scorer):
    """Score every summary column of a TSV file against its document.

    Each line holds a document in the first column followed by any number of
    summary columns. Whitespace runs are collapsed to single spaces before
    scoring. One score per (document, summary) pair is written per line to
    ``output_path``, in file order.

    Args:
        tsv_file: path of the UTF-8 encoded TSV input.
        output_path: file that receives one score per line.
        scorer: callable taking ``([doc], [summary])`` and returning a tensor
            whose ``[0][0]`` element is the score.
    """
    with open(output_path, "w") as f:
        with open(tsv_file, "r", encoding="utf-8") as tsv:
            # Inference only: no autograd graph is needed.
            with torch.no_grad():
                for line in tsv:
                    fields = line.split('\t')
                    _doc = ' '.join(fields[0].split())
                    for candidate in fields[1:]:
                        _sum = ' '.join(candidate.split())

                        label = scorer([_doc], [_sum]).detach().cpu().numpy()[0][0]
                        f.write(str(label) + "\n")

In [18]:
# evaluate_realsumm("realsumm_100.tsv", os.path.join("../exp/result_bert_base_uncased", DATASET, METHOD, "test_results_realsumm.tsv"))

In [19]:
def evaluate_summeval(tsv_file, output_path, scorer):
    """Score one (document, summary) pair per TSV line.

    The first column is the document and the second the summary; any further
    columns are ignored. Whitespace runs are collapsed to single spaces before
    scoring. Blank lines are skipped. One score per scored line is written to
    ``output_path``.

    Args:
        tsv_file: path of the UTF-8 encoded TSV input.
        output_path: file that receives one score per line.
        scorer: callable taking ``([doc], [summary])`` and returning a tensor
            whose ``[0][0]`` element is the score.
    """
    with open(output_path, "w") as f:
        with open(tsv_file, "r", encoding="utf-8") as tsv:
            # Inference only: no autograd graph is needed.
            with torch.no_grad():
                for line in tsv:
                    if not line.strip():
                        # A blank (e.g. trailing) line has no summary column.
                        continue
                    fields = line.split('\t')
                    _doc = ' '.join(fields[0].split())
                    _sum = ' '.join(fields[1].split())

                    label = scorer([_doc], [_sum]).detach().cpu().numpy()[0][0]
                    f.write(str(label) + "\n")

In [20]:
# Names of the datasets to process; each one is used as a path component.
DATASET = [
    'billsum',
    'scientific_papers',
    'cnn_dailymail',
    'big_patent',
]
# Root directory of the input data.
DATASET_ROOT = '../exp/data/'
# Root directory for checkpoints and result files.
RESULT_ROOT = "../exp/result_bert_base_uncased"
# Sub-directory name identifying the method variant.
METHOD = 'ordered_siam_2'

In [21]:
# for dataset in DATASET:
#     print("Loading {}".format(dataset))
#     train_set = CustomDataset(os.path.join(DATASET_ROOT, dataset, METHOD, 'train.tsv'))
#     print(len(train_set))
    
#     model = Siamese()
#     model.to(device)
    
#     print("Training...")
#     train_model(model, train_set)
    
#     CKPT_PATH = os.path.join(RESULT_ROOT, dataset, METHOD, "model.pth")
#     if not os.path.exists(os.path.dirname(CKPT_PATH)):
#         os.makedirs(os.path.dirname(CKPT_PATH))

#     scorer = model.base_model
#     torch.save(scorer.state_dict(), CKPT_PATH)
#     scorer.eval()
    
#     print("Evaluating...")
#     evaluate_tac("TAC2010_all.json", os.path.join(RESULT_ROOT, dataset, METHOD, "test_results_tac.tsv"), scorer)
#     evaluate_newsroom("newsroom-human-eval.csv", os.path.join(RESULT_ROOT, dataset, METHOD, "test_results_newsroom.tsv"), scorer)
#     evaluate_realsumm("realsumm_100.tsv", os.path.join(RESULT_ROOT, dataset, METHOD, "test_results_realsumm.tsv"), scorer)
    
#     del scorer
#     del model
#     torch.cuda.empty_cache()
    
#     # break

In [25]:
# Score SummEval with the saved Scorer checkpoint of every dataset.
for dataset in DATASET:
    result_dir = os.path.join(RESULT_ROOT, dataset, METHOD)
    CKPT_PATH = os.path.join(result_dir, "model.pth")

    model = Siamese()
    # map_location remaps the stored tensors onto the current device, so a
    # checkpoint saved on another device (e.g. a GPU) also loads on a
    # CPU-only machine.
    # NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
    model.base_model.load_state_dict(torch.load(CKPT_PATH, map_location=device))
    model.to(device)

    scorer = model.base_model
    scorer.eval()

    evaluate_summeval("summeval_100.tsv", os.path.join(result_dir, "test_results_summeval.tsv"), scorer)

    # Release the model before the next checkpoint is loaded.
    del scorer
    del model
    torch.cuda.empty_cache()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_rela

In [None]:
# # Tune & test on REALSUMM
# samples = 2737

# tune_path = os.path.join(RESULT_ROOT, 'cnn_dailymail', METHOD)

# for nums in range(0, samples+1, samples):
#     train_set = CustomDataset('realsumm_tune.tsv', nums)
#     print(len(train_set))
    
#     model = Siamese()
#     model.base_model.load_state_dict(torch.load(os.path.join(tune_path, 'model.pth')))
#     model.to(device)
    
#     print("Tuning...")
#     if nums > 0:
#         train_model(model, train_set, tune=True)
    
#     scorer = model.base_model
#     scorer.eval()
    
#     evaluate_realsumm("realsumm_100.tsv", os.path.join(tune_path, "test_results_realsumm_{}.tsv").format(nums), scorer)
    
#     del scorer
#     del model
#     torch.cuda.empty_cache()