# W266 Final Project - Testing Out Longformer Model (Baselines Re-evaluated)

**Description:** 

- This notebook tests out the performance of the longformer model, presented in "Longformer: The Long-Document Transformer" by Iz Beltagy, Matthew E. Peters, Arman Cohan (https://arxiv.org/abs/2004.05150).
- The model was pre-trained for MDS (multi-document summarization) in the PRIMERA model (vide "PRIMERA: Pyramid-based Masked Sentence Pre-training for Multi-document Summarization" at https://arxiv.org/abs/2110.08499).
- This is an updated notebook for adding no repeat ngram size

## Setup

In [1]:
import evaluate
from pprint import pprint

## For printing out model summary in PyTorch
from torchvision import models
from torchsummary import summary

## General plotting
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

## Managing memory
import gc
import pickle

## Text processing
import re
import numpy as np
from scipy import stats as st

In [2]:
from datasets import load_dataset, load_metric

In [3]:
## Checking if GPU is available when running locally
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
print()

Using device: cuda



## 1. Importing Longformer Model

### 1.1 Preliminaries

- Uses the HuggingFace model (https://huggingface.co/docs/transformers/model_doc/led).  This is the model with both encoder and decoder, and trained on summarization task using the arxiv dataset.
- The encoder only version is at https://huggingface.co/docs/transformers/model_doc/longformer).

In [4]:
## Importing model
from transformers import LEDModel, LEDConfig, LEDTokenizer, AutoTokenizer, LEDForConditionalGeneration, EncoderDecoderModel

In [5]:
## Loading check point

LEDmodel = LEDForConditionalGeneration.from_pretrained("allenai/led-large-16384-arxiv").to("cuda").half()
LEDtokenizer = AutoTokenizer.from_pretrained("allenai/led-large-16384-arxiv")

# LEDtokenizer2 = LEDTokenizer.from_pretrained("allenai/led-large-16384-arxiv")

# LEDmodel = LEDModel.from_pretrained("allenai/led-base-16384")
# LEDtokenizer = LEDTokenizer.from_pretrained("allenai/led-base-16384")

In [6]:
## Loading rouge
rouge = load_metric("rouge")

  rouge = load_metric("rouge")


### 1.2 Model summary and config

In [7]:
## Summary of model
print(summary(LEDmodel))

Layer (type:depth-idx)                             Param #
├─LEDModel: 1-1                                    --
|    └─Embedding: 2-1                              51,471,360
|    └─LEDEncoder: 2-2                             --
|    |    └─Embedding: 3-1                         (recursive)
|    |    └─LEDLearnedPositionalEmbedding: 3-2     16,777,216
|    |    └─ModuleList: 3-3                        188,940,288
|    |    └─LayerNorm: 3-4                         2,048
|    └─LEDDecoder: 2-3                             --
|    |    └─Embedding: 3-5                         (recursive)
|    |    └─LEDLearnedPositionalEmbedding: 3-6     1,048,576
|    |    └─ModuleList: 3-7                        201,560,064
|    |    └─LayerNorm: 3-8                         2,048
├─Linear: 1-2                                      51,471,360
Total params: 511,272,960
Trainable params: 511,272,960
Non-trainable params: 0
Layer (type:depth-idx)                             Param #
├─LEDModel: 1-1            

In [8]:
## Seeing the configuration options.

config = LEDConfig.from_pretrained("allenai/led-large-16384-arxiv")
# config = LEDConfig.from_pretrained("allenai/led-base-16384")
config

LEDConfig {
  "_name_or_path": "./",
  "_num_labels": 3,
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "architectures": [
    "LEDForConditionalGeneration"
  ],
  "attention_dropout": 0.0,
  "attention_window": [
    1024,
    1024,
    1024,
    1024,
    1024,
    1024,
    1024,
    1024,
    1024,
    1024,
    1024,
    1024
  ],
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 12,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 12,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "init_std": 0.02,
  "is_encoder_decoder": true,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "max_decoder_po

# 2. X-Science Dataset

## 2.1 Loading the dataset

In [9]:
## Loading the dataset
# xsci_train = load_dataset('multi_x_science_sum', split='train')
# xsci_val = load_dataset('multi_x_science_sum', split='validation')
xsci_test = load_dataset('multi_x_science_sum', split='test')

## For text processing as X-Science have not concatenated the source articles
DOC_SEP = "|||||"

Found cached dataset multi_x_science_sum (C:/Users/JustinTo/.cache/huggingface/datasets/multi_x_science_sum/default/1.1.0/2876ec0401f8f5c5acf7f4857dbc8d6229a390ab428321ab848f03f14b7f9729)


## 2.2 Preprocessing before tokenization

In [10]:
pat = re.compile("@cite_[0-9]+")

In [11]:
def preprocess_dataset(example):
    output = {}
    output["abstracts"] = (
        example["abstract"].split("| Abstract: ")[-1]
        + DOC_SEP
        + DOC_SEP.join([x for x in example["ref_abstract"]["abstract"] if x])
    )
    output["related_work"] = pat.sub("@cite", example["related_work"])
    
    return output

In [12]:
def preprocess_dataset_batched(example):
    output = {}
    output["abstracts"] = []
    output["related_work"] = []
    
    for abstract, ref_abstract in zip(
        example["abstract"], example["ref_abstract"]
    ):
        output["abstracts"].append(
            abstract.split("| Abstract: ")[-1]
            + DOC_SEP
            + DOC_SEP.join([x for x in ref_abstract["abstract"] if x])
        )
    for related_work in example["related_work"]:
        output["related_work"].append(pat.sub("@cite", related_work))
    
    return output

In [13]:
xsci_test_processed = xsci_test.map(
    # preprocess_dataset,
    preprocess_dataset_batched,
    remove_columns=xsci_test.column_names,
    batched=True,
    batch_size=1,
    )



  0%|          | 0/5093 [00:00<?, ?ba/s]

## 3.1 LED Large Model Experiments

### 3.1.1 Testing LED Large Model with Long Seq Len (X-Sci Dataset; Seq Len @16384)

In [14]:
test_inputs = LEDtokenizer(xsci_test_processed['abstracts'],
                           padding="max_length",
                           max_length=16384,  # maximum sequence length is 16384
                           return_tensors="pt",
                           truncation=True)

In [15]:
def generate_abstract_batched2(batch_size=1, start=0):
    
    try:
        del test_input_ids, attention_mask, global_attention_mask, predicted_abstract_ids
    except:
        None
        
    gc.collect()

    test_input_ids = test_inputs['input_ids'][start:start+batch_size].to("cuda")
    attention_mask = test_inputs['attention_mask'][start:start+batch_size].to("cuda")

    global_attention_mask = torch.zeros_like(attention_mask)
    global_attention_mask[:, 0] = 1

    predicted_abstract_ids = LEDmodel.generate(test_input_ids,
                                               attention_mask=attention_mask, 
                                               global_attention_mask=global_attention_mask, 
                                               max_length=200,
                                               no_repeat_ngram_size=4,
                                               num_beams=4)

    predicted_abstract = LEDtokenizer.batch_decode(predicted_abstract_ids, skip_special_tokens=True)
    
    return predicted_abstract


In [17]:
## Generating answers
answers = []

for i in range(0, xsci_test.num_rows, 1):
    if i%1000 == 0:
        print(f"Handling sample {i} now..")
        
    answers.append(generate_abstract_batched2(start=i, batch_size=1))
    
formatted_answers = []
for answer in answers:
    formatted_answers += answer

with open("answers_revised/baselines/LED_large_16384tokens.pkl", "wb") as f:
    pickle.dump(formatted_answers, f)    
    
print(f"Completed")

Handling sample 0 now..
Handling sample 1000 now..
Handling sample 2000 now..
Handling sample 3000 now..
Handling sample 4000 now..
Handling sample 5000 now..
Completed


In [18]:
## Calculating the rouge score
rouge.compute(predictions=formatted_answers,
              references=[ref for ref in xsci_test_processed['related_work']],
              use_stemmer = True)

{'rouge1': AggregateScore(low=Score(precision=0.2913197140082046, recall=0.3511121085201865, fmeasure=0.30110728224839295), mid=Score(precision=0.2942777071641034, recall=0.353834180157716, fmeasure=0.3032110068802397), high=Score(precision=0.2970647819946683, recall=0.3565583519299773, fmeasure=0.30524418165654393)),
 'rouge2': AggregateScore(low=Score(precision=0.05116787571005442, recall=0.061544979250941684, fmeasure=0.05260665715454113), mid=Score(precision=0.05252865774759717, recall=0.0631368651203196, fmeasure=0.0539651823943207), high=Score(precision=0.05385063201779755, recall=0.06475016032114235, fmeasure=0.05528884725009268)),
 'rougeL': AggregateScore(low=Score(precision=0.1495177920033742, recall=0.18378613036803162, fmeasure=0.15535651411340642), mid=Score(precision=0.15107773796530463, recall=0.18575032182437612, fmeasure=0.15654755771443807), high=Score(precision=0.15256222870642305, recall=0.18762581376883025, fmeasure=0.15763486871502416)),
 'rougeLsum': AggregateSco

In [19]:
formatted_answers[0]

" in multi-agent environments, an intelligent agent often needs to interact with other individuals or groups of agents to achieve its goals. agent tracking is one key capability required for intelligent interaction. \n it involves monitoring the observable actions of other agents and inferring their unobserved actions, plans, goals and behaviors. \n this article examines the implications of such an agent tracking capability for agent architectures. it specifically focuses on real-time and dynamic environments, where an intelligent agent is faced with the challenge of tracking the highly flexible mix of goal-driven and reactive behaviors of other agents, in real-time. \n the key implication is that an agent architecture needs to provide direct support for flexible and efficient reasoning about other agents' models. in this article, such support takes the form of an architectural capability to execute the other agent s models, enabling mental simulation of their behaviors. \n other archi

### 3.1.2 Testing LED Large Model but with Smaller Seq Len (X-Sci Dataset; Seq Len @1024)

In [30]:
test_inputs_small = LEDtokenizer(xsci_test_processed['abstracts'],
                                 padding="max_length",
                                 max_length=1024,  # maximum sequence length is 16384
                                 return_tensors="pt",
                                 truncation=True)

In [88]:
# Modified version of the code from section 3.1.1

def generate_abstract_batched3(batch_size=1, start=0):
    
    try:
        del test_input_ids, attention_mask, global_attention_mask, predicted_abstract_ids
    except:
        None
        
    gc.collect()

    test_input_ids = test_inputs_small['input_ids'][start:start+batch_size].to("cuda")
    attention_mask = test_inputs_small['attention_mask'][start:start+batch_size].to("cuda")

    global_attention_mask = torch.zeros_like(attention_mask)
    global_attention_mask[:, 0] = 1

    predicted_abstract_ids = LEDmodel.generate(test_input_ids,
                                               attention_mask=attention_mask, 
                                               global_attention_mask=global_attention_mask, 
                                               max_length=200,
                                               no_repeat_ngram_size=4,
                                               num_beams=4)

    predicted_abstract = LEDtokenizer.batch_decode(predicted_abstract_ids, skip_special_tokens=True)
    
    return predicted_abstract


In [89]:
## Generating answers
answers = []

for i in range(0, xsci_test.num_rows, 1):
    if i%1000 == 0:
        print(f"Handling sample {i} now..")
        
    answers.append(generate_abstract_batched3(start=i, batch_size=1))
    
formatted_answers = []
for answer in answers:
    formatted_answers += answer

with open("answers_revised/baselines/LED_large_1024tokens.pkl", "wb") as f:
    pickle.dump(formatted_answers, f)    
    
print(f"Completed")

Handling sample 0 now..
Handling sample 1000 now..
Handling sample 2000 now..
Handling sample 3000 now..
Handling sample 4000 now..
Handling sample 5000 now..
Completed


In [90]:
## Calculating the rouge score
rouge.compute(predictions=formatted_answers,
              references=[ref for ref in xsci_test_processed['related_work']],
              use_stemmer = True)

{'rouge1': AggregateScore(low=Score(precision=0.2921984516569064, recall=0.3473069074575013, fmeasure=0.2995858887567325), mid=Score(precision=0.29518793308554825, recall=0.3503271002951284, fmeasure=0.30173618077455855), high=Score(precision=0.2982954963768402, recall=0.3531732678321001, fmeasure=0.3039246314894796)),
 'rouge2': AggregateScore(low=Score(precision=0.0512297624679386, recall=0.06067040547048744, fmeasure=0.05228381801241776), mid=Score(precision=0.05252041705239702, recall=0.062290851744163864, fmeasure=0.05350697834303751), high=Score(precision=0.053863328378545794, recall=0.06394228423028221, fmeasure=0.05475935109034538)),
 'rougeL': AggregateScore(low=Score(precision=0.1503951973185883, recall=0.18257988404732395, fmeasure=0.15496096960282138), mid=Score(precision=0.1519573189590942, recall=0.1843591024259243, fmeasure=0.15622269193782673), high=Score(precision=0.1535220341305819, recall=0.18625186067560215, fmeasure=0.15734707670340822)),
 'rougeLsum': AggregateSco

In [91]:
formatted_answers[0]

' we present our approach to the problem of how an agent, within an economic Multi-Agent System, can determine when it should behave strategically (i.e. learn and use models of other agents ), and when it should act as a simple price-taker. we provide a framework for the incremental implementation of modeling capabilities in agents, and a description of the forms of knowledge required. \n we have implemented an agent architecture, an experimental variant of the soar integrated architecture, that conforms to all of these requirements. \n agents based on this architecture have been implemented to execute two different tasks in a real-time, dynamic, multi-agent domain. \n the agents were implemented and different populations simulated in order to learn more about their behavior and the merits of using and learning agent models. \n our results show, among other lessons, how savvy buyers can avoid being cheated by sellers, how price volatility can be used to quantitatively predict the benef

## 3.2 LED Base Model Experiments

### 3.2.1 Loading Base Model Checkpoint and Tokenizer

In [38]:
## Loading check point

LEDmodel_small = LEDForConditionalGeneration.from_pretrained("allenai/led-base-16384").to("cuda").half()
LEDtokenizer_small = AutoTokenizer.from_pretrained("allenai/led-base-16384")

In [39]:
summary(LEDmodel_small)

Layer (type:depth-idx)                             Param #
├─LEDModel: 1-1                                    --
|    └─Embedding: 2-1                              38,603,520
|    └─LEDEncoder: 2-2                             --
|    |    └─Embedding: 3-1                         (recursive)
|    |    └─LEDLearnedPositionalEmbedding: 3-2     12,582,912
|    |    └─ModuleList: 3-3                        53,157,888
|    |    └─LayerNorm: 3-4                         1,536
|    └─LEDDecoder: 2-3                             --
|    |    └─Embedding: 3-5                         (recursive)
|    |    └─LEDLearnedPositionalEmbedding: 3-6     786,432
|    |    └─ModuleList: 3-7                        56,710,656
|    |    └─LayerNorm: 3-8                         1,536
├─Linear: 1-2                                      38,603,520
Total params: 200,448,000
Trainable params: 200,448,000
Non-trainable params: 0


Layer (type:depth-idx)                             Param #
├─LEDModel: 1-1                                    --
|    └─Embedding: 2-1                              38,603,520
|    └─LEDEncoder: 2-2                             --
|    |    └─Embedding: 3-1                         (recursive)
|    |    └─LEDLearnedPositionalEmbedding: 3-2     12,582,912
|    |    └─ModuleList: 3-3                        53,157,888
|    |    └─LayerNorm: 3-4                         1,536
|    └─LEDDecoder: 2-3                             --
|    |    └─Embedding: 3-5                         (recursive)
|    |    └─LEDLearnedPositionalEmbedding: 3-6     786,432
|    |    └─ModuleList: 3-7                        56,710,656
|    |    └─LayerNorm: 3-8                         1,536
├─Linear: 1-2                                      38,603,520
Total params: 200,448,000
Trainable params: 200,448,000
Non-trainable params: 0

### 3.2.2 Testing LED Base Model with Longer Seq Len (X-Sci Dataset; Seq Len @16384)

In [40]:
test_inputs_base = LEDtokenizer_small(xsci_test_processed['abstracts'],
                                      padding="max_length",
                                      max_length=16384,  # maximum sequence length is 16384
                                      return_tensors="pt",
                                      truncation=True)

In [56]:
def generate_abstract_batched4(batch_size=1, start=0):
    
    try:
        del test_input_ids, attention_mask, global_attention_mask, predicted_abstract_ids
    except:
        None
        
    gc.collect()

    test_input_ids = test_inputs_base['input_ids'][start:start+batch_size].to("cuda")
    attention_mask = test_inputs_base['attention_mask'][start:start+batch_size].to("cuda")

    global_attention_mask = torch.zeros_like(attention_mask)
    global_attention_mask[:, 0] = 1

    predicted_abstract_ids = LEDmodel_small.generate(test_input_ids,
                                                     attention_mask=attention_mask, 
                                                     global_attention_mask=global_attention_mask, 
                                                     max_length=200,
                                                     no_repeat_ngram_size=4, 
                                                     num_beams=4)

    predicted_abstract = LEDtokenizer_small.batch_decode(predicted_abstract_ids, skip_special_tokens=True)
    
    return predicted_abstract

In [57]:
## Generating answers
answers = []

for i in range(0, xsci_test.num_rows, 1):
    if i%1000 == 0:
        print(f"Handling sample {i} now..")
        
    answers.append(generate_abstract_batched4(start=i, batch_size=1))
    
formatted_answers = []
for answer in answers:
    formatted_answers += answer

with open("answers_revised/baselines/LED_base_16384tokens.pkl", "wb") as f:
    pickle.dump(formatted_answers, f)    
    
print(f"Completed")

Handling sample 0 now..
Handling sample 1000 now..
Handling sample 2000 now..
Handling sample 3000 now..
Handling sample 4000 now..
Handling sample 5000 now..
Completed


In [58]:
## Calculating the rouge score
rouge.compute(predictions=formatted_answers,
              references=[ref for ref in xsci_test_processed['related_work']],
              use_stemmer = True)

{'rouge1': AggregateScore(low=Score(precision=0.25848294611538875, recall=0.39694077557804125, fmeasure=0.2971504191769784), mid=Score(precision=0.2612696670437389, recall=0.3993607737351673, fmeasure=0.2993982629284738), high=Score(precision=0.2640676837094545, recall=0.4017511229335403, fmeasure=0.3017288907002012)),
 'rouge2': AggregateScore(low=Score(precision=0.04424987524222377, recall=0.06855382662173488, fmeasure=0.051023846425017214), mid=Score(precision=0.04521699624109907, recall=0.06983147198255521, fmeasure=0.052046383544997285), high=Score(precision=0.04613457649451515, recall=0.07123889258639861, fmeasure=0.052999055898971054)),
 'rougeL': AggregateScore(low=Score(precision=0.12803163431948122, recall=0.20301561988672906, fmeasure=0.14820893941565505), mid=Score(precision=0.129230531297316, recall=0.2047038732435292, fmeasure=0.149144492827062), high=Score(precision=0.13047072298556073, recall=0.20618751148429912, fmeasure=0.1501718556227401)),
 'rougeLsum': AggregateSco

In [59]:
formatted_answers[0]

"We present our approach to the problem of how an agent, within an economic Multi-Agent System, can determine when it should behave strategically (i.e. learn and use models of other agents), and when it should act as a simple price-taker. We provide a framework for the incremental implementation of modeling capabilities in agents, and a description of the forms of knowledge required. The agents were implemented and different populations simulated in order to learn more about their behavior and the merits of using and learning agent models. Our results show, among other lessons, how savvy buyers can avoid being cheated'' by sellers, how price volatility can be used to quantitatively predict the benefits of deeper models, and how specific types of agent populations influence system behavior.|||||In multi-agent environments, an intelligent agent often needs to interact with other individuals or groups of agents to achieve its goals. Agent tracking is one key capability required for intell

### 3.2.3 Testing LED Base Model with Shorter Seq Len (X-Sci Dataset; Seq Len @1024)

In [60]:
# Modified version of the code from section 3.1

test_inputs_base_short = LEDtokenizer_small(xsci_test_processed['abstracts'],
                                            padding="max_length",
                                            max_length=1024,  # maximum sequence length is 16384
                                            return_tensors="pt",
                                            truncation=True)

def generate_abstract_batched5(batch_size=1, start=0):
    
    try:
        del test_input_ids, attention_mask, global_attention_mask, predicted_abstract_ids
    except:
        None
        
    gc.collect()

    test_input_ids = test_inputs_base_short['input_ids'][start:start+batch_size].to("cuda")
    attention_mask = test_inputs_base_short['attention_mask'][start:start+batch_size].to("cuda")

    global_attention_mask = torch.zeros_like(attention_mask)
    global_attention_mask[:, 0] = 1

    predicted_abstract_ids = LEDmodel_small.generate(test_input_ids,
                                                     attention_mask=attention_mask, 
                                                     global_attention_mask=global_attention_mask, 
                                                     max_length=200,
                                                     no_repeat_ngram_size=4,
                                                     num_beams=4)

    predicted_abstract = LEDtokenizer_small.batch_decode(predicted_abstract_ids, skip_special_tokens=True)
    
    return predicted_abstract

In [76]:
## Generating answers
answers = []

for i in range(0, xsci_test.num_rows, 1):
    if i%1000 == 0:
        print(f"Handling sample {i} now..")
        
    answers.append(generate_abstract_batched4(start=i, batch_size=1))
    
formatted_answers = []

for answer in answers:
    formatted_answers += answer

with open("answers_revised/baselines/LED_base_1024tokens.pkl", "wb") as f:
    pickle.dump(formatted_answers, f)    
    
print(f"Completed")

Handling sample 0 now..
Handling sample 1000 now..
Handling sample 2000 now..
Handling sample 3000 now..
Handling sample 4000 now..
Handling sample 5000 now..
Completed


In [77]:
## Calculating the rouge score
rouge.compute(predictions=formatted_answers,
              references=[ref for ref in xsci_test_processed['related_work']],
              use_stemmer = True)

{'rouge1': AggregateScore(low=Score(precision=0.25848294611538875, recall=0.39694077557804125, fmeasure=0.2971504191769784), mid=Score(precision=0.2612696670437389, recall=0.3993607737351673, fmeasure=0.2993982629284738), high=Score(precision=0.2640676837094545, recall=0.4017511229335403, fmeasure=0.3017288907002012)),
 'rouge2': AggregateScore(low=Score(precision=0.04424987524222377, recall=0.06855382662173488, fmeasure=0.051023846425017214), mid=Score(precision=0.04521699624109907, recall=0.06983147198255521, fmeasure=0.052046383544997285), high=Score(precision=0.04613457649451515, recall=0.07123889258639861, fmeasure=0.052999055898971054)),
 'rougeL': AggregateScore(low=Score(precision=0.12803163431948122, recall=0.20301561988672906, fmeasure=0.14820893941565505), mid=Score(precision=0.129230531297316, recall=0.2047038732435292, fmeasure=0.149144492827062), high=Score(precision=0.13047072298556073, recall=0.20618751148429912, fmeasure=0.1501718556227401)),
 'rougeLsum': AggregateSco

In [80]:
formatted_answers[0]

"We present our approach to the problem of how an agent, within an economic Multi-Agent System, can determine when it should behave strategically (i.e. learn and use models of other agents), and when it should act as a simple price-taker. We provide a framework for the incremental implementation of modeling capabilities in agents, and a description of the forms of knowledge required. The agents were implemented and different populations simulated in order to learn more about their behavior and the merits of using and learning agent models. Our results show, among other lessons, how savvy buyers can avoid being cheated'' by sellers, how price volatility can be used to quantitatively predict the benefits of deeper models, and how specific types of agent populations influence system behavior.|||||In multi-agent environments, an intelligent agent often needs to interact with other individuals or groups of agents to achieve its goals. Agent tracking is one key capability required for intell

# 4. Comparing the Results from the 4 Experiments
### (i.e. LED Large vs Base; 16384 vs 1024 Seq Len)

In [42]:
## Inputs: abstract of main article plus those of relevant works
xsci_test_processed[0]['abstracts']

"We present our approach to the problem of how an agent, within an economic Multi-Agent System, can determine when it should behave strategically (i.e. learn and use models of other agents), and when it should act as a simple price-taker. We provide a framework for the incremental implementation of modeling capabilities in agents, and a description of the forms of knowledge required. The agents were implemented and different populations simulated in order to learn more about their behavior and the merits of using and learning agent models. Our results show, among other lessons, how savvy buyers can avoid being cheated'' by sellers, how price volatility can be used to quantitatively predict the benefits of deeper models, and how specific types of agent populations influence system behavior.|||||In multi-agent environments, an intelligent agent often needs to interact with other individuals or groups of agents to achieve its goals. Agent tracking is one key capability required for intell

In [40]:
## Label: target summary
xsci_test_processed[0]['related_work']

'Within the MAS community, some work @cite has focused on how artificial AI-based learning agents would fare in communities of similar agents. For example, @cite and @cite show how agents can learn the capabilities of others via repeated interactions, but these agents do not learn to predict what actions other might take. Most of the work in MAS also fails to recognize the possible gains from using explicit agent models to predict agent actions. @cite is an exception and gives another approach for using nested agent models. However, they do not go so far as to try to quantify the advantages of their nested models or show how these could be learned via observations. We believe that our research will bring to the foreground some of the common observations seen in these research areas and help to clarify the implications and utility of learning and using nested agent models.'

In [34]:
## Large LED; 16384 input seq len
answers_run1[0]

[" in multi-agent environments, an intelligent agent often needs to interact with other individuals or groups of agents to achieve its goals. agent tracking is one key capability required for intelligent interaction. \n it involves monitoring the observable actions of other agents and inferring their unobserved actions, plans, goals and behaviors. \n this article examines the implications of such an agent tracking capability for agent architectures. it specifically focuses on real-time and dynamic environments, where an intelligent agent is faced with the challenge of tracking the highly flexible mix of goal-driven and reactive behaviors of other agents, in real-time. \n the key implication is that an agent architecture needs to provide direct support for flexible and efficient reasoning about other agents' models. in this article, such support takes the form of an architectural capability to execute the other agent s models, enabling mental simulation of their behaviors. \n other arch

In [35]:
## Large LED; 1024 input seq len
answers_run2[0]

' we present our approach to the problem of how an agent, within an economic Multi-Agent System, can determine when it should behave strategically (i.e. learn and use models of other agents ), and when it should act as a simple price-taker. we provide a framework for the incremental implementation of modeling capabilities in agents, and a description of the forms of knowledge required. \n we have implemented an agent architecture, an experimental variant of the soar integrated architecture, that conforms to all of these requirements. \n agents based on this architecture have been implemented to execute two different tasks in a real-time, dynamic, multi-agent domain. \n the agents were implemented and different populations simulated in order to learn more about their behavior and the merits of using and learning agent models. \n our results show, among other lessons, how savvy buyers can avoid being cheated by sellers, how price volatility can be used to quantitatively predict the benef

In [36]:
## Base LED; 16384 input seq len
answers_run3[0]

["We present our approach to the problem of how an agent, within an economic Multi-Agent System, can determine when it should behave strategically (i.e. learn and use models of other agents), and when it should act as a simple price-taker. We provide a framework for the incremental implementation of modeling capabilities in agents, and a description of the forms of knowledge required. The agents were implemented and different populations simulated in order to learn more about their behavior and the merits of using and learning agent models. Our results show, among other lessons, how savvy buyers can avoid being cheated'' by sellers, how price volatility can be used to quantitatively predict the benefits of deeper models, and how specific types of agent populations influence system behavior.|||||In multi-agent environments, an intelligent agent often needs to interact with other individuals or groups of agents to achieve its goals. Agent tracking is one key capability required for intel

In [52]:
## Base LED; 1024 input seq len
answers_run4[0]

"We present our approach to the problem of how an agent, within an economic Multi-Agent System, can determine when it should behave strategically (i.e. learn and use models of other agents), and when it should act as a simple price-taker. We provide a framework for the incremental implementation of modeling capabilities in agents, and a description of the forms of knowledge required. The agents were implemented and different populations simulated in order to learn more about their behavior and the merits of using and learning agent models. Our results show, among other lessons, how savvy buyers can avoid being cheated'' by sellers, how price volatility can be used to quantitatively predict the benefits of deeper models, and how specific types of agent populations influence system behavior.|||||In multi-agent environments, an intelligent agent often needs to interact with other individuals or groups of agents to achieve its goals. Agent tracking is one key capability required for intell

## 4.4 Tokenization

In [26]:
test_inputs = LEDtokenizer(xsci_test_processed['abstracts'],
                           padding="max_length",
                           max_length=16384,  # maximum sequence length is 16384
                           return_tensors="pt",
                           truncation=True)

In [27]:
(test_inputs['input_ids'].shape, test_inputs['attention_mask'].shape)

(torch.Size([5093, 16384]), torch.Size([5093, 16384]))

In [28]:
# train_labels = LEDtokenizer(xsci_train_processed['related_work'],
#                             padding=True,
#                             return_tensors="pt")

# val_labels = LEDtokenizer(xsci_val_processed['related_work'],
#                           padding=True,
#                           return_tensors="pt")

test_labels = LEDtokenizer(xsci_test_processed['related_work'],
                           padding=True,
                           return_tensors="pt")

In [29]:
(test_labels['input_ids'].shape, test_labels['attention_mask'].shape)

(torch.Size([5093, 418]), torch.Size([5093, 418]))

# 5 Testing Baseline Performance (X-Sci)

## 5.1 Performance on X-Sci Test Set (without Fine-tuning)

In [76]:
# Modified version of the code from section 3.1

def generate_abstract_batched2(batch_size=1, start=0):
    
    try:
        del test_input_ids, attention_mask, global_attention_mask, predicted_abstract_ids
    except:
        None
        
    gc.collect()

    test_input_ids = test_inputs['input_ids'][start:start+batch_size].to("cuda")
    attention_mask = test_inputs['attention_mask'][start:start+batch_size].to("cuda")

    global_attention_mask = torch.zeros_like(attention_mask)
    global_attention_mask[:, 0] = 1

    predicted_abstract_ids = LEDmodel.generate(test_input_ids,
                                               attention_mask=attention_mask, 
                                               global_attention_mask=global_attention_mask, 
                                               max_length=200, 
                                               num_beams=4)

    predicted_abstract = LEDtokenizer.batch_decode(predicted_abstract_ids, skip_special_tokens=True)
    
    return predicted_abstract


In [102]:
## Generating answers
answers = []

for i in range(xsci_test.num_rows):
    if i%500 == 0:
        print(f"Handling sample {i} now..")
        
    answers.append(generate_abstract_batched2(start=i))
    
print(f"Completed, {i+1} data points from the {xsci_test.num_rows} validation samples handled.")

Handling sample 0 now..
Handling sample 500 now..
Handling sample 1000 now..
Handling sample 1500 now..
Handling sample 2000 now..
Handling sample 2500 now..
Handling sample 3000 now..
Handling sample 3500 now..
Handling sample 4000 now..
Handling sample 4500 now..
Handling sample 5000 now..
Completed, 5093 data points from the 5093 validation samples handled.


In [104]:
answers[5092]

[' in this paper we present a novel approach to global localization using an RGB-D camera in maps of visual features. \n we first estimate a candidate pose using few correspondences between features of the current camera frame and the feature map. \n this initial guess is refined by applying the iterative closest point algorithm on the dense point cloud data. \n the initial pose estimate is used in the second step to guide spatial matching of features in 3d, i.e. searching for associations where the image features are expected to be found in the map. a rigorous error analysis assesses several sets of RGB-D ground truth data via an error accumulation metric. \n extensive quantitative analysis on recently proposed benchmark datasets shows that the proposed solution is faster than a state-of-the-art implementation of the iterative closest point (ICP) algorithm by two orders of magnitude. \n we provide both source code and datasets to the community in order to accelerate further comparison

In [109]:
## Calculating the rouge score
rouge.compute(predictions=answers,
              references=[ref for ref in xsci_test_processed['related_work']],
              rouge_types=["rouge1", "rouge2", "rougeL"])

{'rouge1': AggregateScore(low=Score(precision=0.2479334528085974, recall=0.321662740702613, fmeasure=0.2656359845804365), mid=Score(precision=0.2505538804830633, recall=0.32434961718819, fmeasure=0.2676419755779038), high=Score(precision=0.25311515348932667, recall=0.32674620988455055, fmeasure=0.26968110552642927)),
 'rouge2': AggregateScore(low=Score(precision=0.04208470379749971, recall=0.054505164305182964, fmeasure=0.044889962120228916), mid=Score(precision=0.043237782767000396, recall=0.055880661862500416, fmeasure=0.04602699170565005), high=Score(precision=0.044423785565852324, recall=0.05741837862734922, fmeasure=0.04719075528503129)),
 'rougeL': AggregateScore(low=Score(precision=0.13136900075756824, recall=0.17480547320661932, fmeasure=0.1417307185301011), mid=Score(precision=0.13287612302831447, recall=0.17658333934184312, fmeasure=0.14297957661683836), high=Score(precision=0.13433230218541875, recall=0.17835717890130903, fmeasure=0.1441459907960088))}

In [111]:
## Calculating the rouge score
rouge.compute(predictions=answers,
              references=[ref for ref in xsci_test_processed['related_work']],
              use_stemmer = True)

{'rouge1': AggregateScore(low=Score(precision=0.27453591534623567, recall=0.35673694721845467, fmeasure=0.29427693554999856), mid=Score(precision=0.27731963028174667, recall=0.35956675143370853, fmeasure=0.296440336846706), high=Score(precision=0.28010380522713363, recall=0.3621360269146316, fmeasure=0.2985793362893389)),
 'rouge2': AggregateScore(low=Score(precision=0.048086635024810025, recall=0.062402351771787656, fmeasure=0.0513552124793895), mid=Score(precision=0.04935625535265088, recall=0.06390869056261983, fmeasure=0.05258844734808345), high=Score(precision=0.0506199460005532, recall=0.06554982437977665, fmeasure=0.05385863030724751)),
 'rougeL': AggregateScore(low=Score(precision=0.1412959886760316, recall=0.18832278308329203, fmeasure=0.1526079380903488), mid=Score(precision=0.142863661330691, recall=0.19018608460748399, fmeasure=0.15382853986425082), high=Score(precision=0.14436612555409067, recall=0.19207024297974315, fmeasure=0.15504718726736894)),
 'rougeLsum': AggregateS

## Sandbox

In [132]:
sum(np.array(len_tr_sources[0]))

334

In [133]:
max([1,2,3,4])

4

In [135]:
token_numbers_tr_sources = []
token_numbers_v_sources = []

for tup in len_tr_sources:
    token_numbers_tr_sources.append(sum(np.array(tup)))
    
for tup in len_v_sources:
    token_numbers_v_sources.append(sum(np.array(tup)))
    
print(f"The longest input in training set is {max(token_numbers_tr_sources)} tokens.")
print(f"The longest input in validation set is {max(token_numbers_v_sources)} tokens.")

The longest input in training set is 4694 tokens.
The longest input in validation set is 4183 tokens.


In [141]:
len_tr_labels.sort(reverse=True)
len_tr_labels

[753,
 680,
 609,
 431,
 384,
 375,
 359,
 355,
 355,
 353,
 353,
 351,
 346,
 346,
 344,
 342,
 340,
 339,
 339,
 338,
 338,
 337,
 337,
 336,
 336,
 335,
 335,
 335,
 334,
 333,
 332,
 332,
 331,
 331,
 330,
 330,
 329,
 329,
 328,
 328,
 328,
 328,
 327,
 326,
 326,
 326,
 326,
 326,
 325,
 325,
 324,
 324,
 324,
 324,
 323,
 323,
 323,
 323,
 323,
 322,
 322,
 321,
 321,
 321,
 320,
 320,
 320,
 319,
 319,
 319,
 319,
 319,
 319,
 319,
 318,
 318,
 318,
 317,
 317,
 317,
 317,
 316,
 316,
 316,
 316,
 316,
 316,
 316,
 315,
 315,
 315,
 314,
 314,
 314,
 314,
 314,
 313,
 313,
 313,
 312,
 312,
 312,
 312,
 312,
 312,
 312,
 312,
 312,
 312,
 311,
 311,
 311,
 311,
 311,
 311,
 311,
 310,
 310,
 310,
 310,
 310,
 310,
 310,
 310,
 309,
 309,
 309,
 309,
 309,
 309,
 309,
 308,
 308,
 308,
 308,
 308,
 308,
 308,
 308,
 308,
 308,
 308,
 308,
 308,
 308,
 307,
 307,
 307,
 307,
 307,
 307,
 307,
 307,
 307,
 306,
 306,
 306,
 306,
 306,
 305,
 305,
 305,
 305,
 305,
 305,
 305,
 305

In [138]:
max(len_v_labels)

324