# Multi XScience

In [1]:
import re

from datasets import load_dataset, load_metric
import evaluate
import nltk
import nltk.data
import numpy as np
import torch
import torch.nn.functional as F
from tqdm import tqdm
from transformers import AdamW, AutoTokenizer, AutoModelForSeq2SeqLM

import gc
import pickle

## For printing out model summary in PyTorch
from torchvision import models
from torchsummary import summary

In [2]:
# Dataset identifier passed to `load_dataset`.
DATASET_NAME = "multi_x_science_sum"
# Separator string placed between the source abstract and each reference abstract
# when the model input text is assembled.
DOC_SEP = " ||||| "
# Number of examples handed to the preprocessing function per `Dataset.map` batch.
BATCH_SIZE = 64

## Set up evaluation

In [3]:
# ROUGE scorer reused by every evaluation cell in this notebook.
# NOTE(review): `datasets.load_metric` appears to be deprecated in favour of
# `evaluate.load` (`evaluate` is already imported) — confirm that the result format
# stays the same before switching.
rouge = load_metric("rouge")

  rouge = load_metric("rouge")


## Load dataset

In [4]:
# Load the dataset; the returned object is keyed by split name and is iterated
# split-by-split during preprocessing.
dataset = load_dataset(DATASET_NAME)

Found cached dataset multi_x_science_sum (C:/Users/JustinTo/.cache/huggingface/datasets/multi_x_science_sum/default/1.1.0/2876ec0401f8f5c5acf7f4857dbc8d6229a390ab428321ab848f03f14b7f9729)


  0%|          | 0/3 [00:00<?, ?it/s]

## Format dataset to our needs

In [5]:
# Matches numbered citation markers such as "@cite_12" so they can be collapsed to "@cite".
pat = re.compile(r"@cite_[0-9]+")

In [6]:
def preprocess_dataset(example):
    """Build the input and target text for a single (non-batched) example.

    Args:
        example: Mapping with an "abstract" string, a "ref_abstract" mapping whose
            "abstract" entry is an iterable of strings, and a "related_work" string.

    Returns:
        Dict with two keys:
        "abstracts" — the text after the last "| Abstract: " marker of the source
        abstract, followed by DOC_SEP and the non-empty reference abstracts joined
        with DOC_SEP;
        "related_work" — the related-work text with every numbered citation marker
        replaced by "@cite".
    """
    source_abstract = example["abstract"].split("| Abstract: ")[-1]
    # Empty reference abstracts are dropped so they do not produce doubled separators.
    reference_abstracts = [text for text in example["ref_abstract"]["abstract"] if text]

    return {
        "abstracts": source_abstract + DOC_SEP + DOC_SEP.join(reference_abstracts),
        "related_work": pat.sub("@cite", example["related_work"]),
    }

In [7]:
def preprocess_dataset_batched(example):
    """Batched counterpart of the per-example preprocessing.

    Args:
        example: Batch mapping whose "abstract", "ref_abstract" and "related_work"
            entries are parallel sequences; each "ref_abstract" item is a mapping
            with an "abstract" iterable of strings.

    Returns:
        Dict with "abstracts" (one combined input string per example) and
        "related_work" (one citation-normalised target string per example).
    """
    # Source abstract (text after the last "| Abstract: " marker), then every
    # non-empty reference abstract, all separated by DOC_SEP.
    combined_abstracts = [
        source.split("| Abstract: ")[-1]
        + DOC_SEP
        + DOC_SEP.join([text for text in refs["abstract"] if text])
        for source, refs in zip(example["abstract"], example["ref_abstract"])
    ]
    # Collapse numbered citation markers ("@cite_7") to a plain "@cite".
    normalized_related_work = [pat.sub("@cite", text) for text in example["related_work"]]

    return {"abstracts": combined_abstracts, "related_work": normalized_related_work}

In [8]:
# Run the batched preprocessing over every split, dropping all original columns so
# that only "abstracts" and "related_work" remain.
dataset_processed = {}
for split, split_dataset in dataset.items():
    dataset_processed[split] = split_dataset.map(
        preprocess_dataset_batched,
        batched=True,
        batch_size=BATCH_SIZE,
        remove_columns=split_dataset.column_names,
    )



  0%|          | 0/475 [00:00<?, ?ba/s]

  0%|          | 0/80 [00:00<?, ?ba/s]

  0%|          | 0/80 [00:00<?, ?ba/s]

## Model 1: Default Centrum

* TODO: measure the token-length distribution of `dataset_processed["test"]["abstracts"]` so that we can choose a suitable `max_length`.

In [9]:
# Checkpoint identifier from which both the tokenizer and the seq2seq model are loaded.
CHECKPOINT = "ratishsp/Centrum"

In [29]:
def get_tokenizer(host_tokenizer: str, device: str = "cuda"):
    """Load the tokenizer and the half-precision seq2seq model for a checkpoint.

    Despite its name, this returns both the tokenizer and the model.

    Args:
        host_tokenizer: Checkpoint name or path passed to `from_pretrained` for both
            the tokenizer and the model.
        device: Device the model is moved to. Defaults to "cuda", matching the
            previous hard-coded behaviour.

    Returns:
        A `(tokenizer, model)` tuple. The model has been moved to `device` and cast
        with `.half()`.
    """
    # `use_cache` and `gradient_checkpointing` are model options, so they are no
    # longer forwarded to the tokenizer loader.
    tokenizer = AutoTokenizer.from_pretrained(host_tokenizer)

    # NOTE(review): `use_cache=False` and `gradient_checkpointing=True` are kept from
    # the original. They look training-oriented, and disabling the cache presumably
    # slows down generation — confirm whether they are needed for inference.
    model = AutoModelForSeq2SeqLM.from_pretrained(
        host_tokenizer,
        use_cache=False,
        gradient_checkpointing=True,
    )

    return tokenizer, model.to(device).half()


# Instantiate the tokenizer and model for CHECKPOINT; the generation cells use these globals.
centrum_tokenizer, centrum_model = get_tokenizer(CHECKPOINT)

In [30]:
# Register the document separator as a special token and resize the model's embedding
# matrix to the new tokenizer length.
# NOTE(review): the printed model reports 50267 embedding rows while the tokenizer's base
# vocab_size is 50265, so the checkpoint's tokenizer presumably already ships one added
# token (possibly a dedicated document-separator token). An embedding row created here
# has not been seen in pretraining — confirm whether DOC_SEP should reuse the
# checkpoint's own separator instead.
centrum_tokenizer.add_tokens(DOC_SEP, special_tokens=True)
centrum_model.resize_token_embeddings(len(centrum_tokenizer))
# Id of the separator token; the generation functions use it to build the global attention mask.
docsep_token_id = centrum_tokenizer.convert_tokens_to_ids(DOC_SEP)

In [31]:
centrum_tokenizer

PreTrainedTokenizerFast(name_or_path='ratishsp/Centrum', vocab_size=50265, model_max_len=16384, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'sep_token': AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'pad_token': AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'cls_token': AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=True)})

In [32]:
centrum_model

LEDForConditionalGeneration(
  (led): LEDModel(
    (shared): Embedding(50267, 768)
    (encoder): LEDEncoder(
      (embed_tokens): Embedding(50267, 768)
      (embed_positions): LEDLearnedPositionalEmbedding(4096, 768)
      (layers): ModuleList(
        (0): LEDEncoderLayer(
          (self_attn): LEDEncoderAttention(
            (longformer_self_attn): LEDEncoderSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (query_global): Linear(in_features=768, out_features=768, bias=True)
              (key_global): Linear(in_features=768, out_features=768, bias=True)
              (value_global): Linear(in_features=768, out_features=768, bias=True)
            )
            (output): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNor

## Token Length 1024 (no_repeat_ngram = 4)

In [12]:
# Tokenize all test inputs up front: truncated to at most 1024 tokens, padded to a
# common length, and returned as PyTorch tensors.
dataset_tokenized = {
    "test": centrum_tokenizer(
        dataset_processed["test"]["abstracts"],
        max_length=1024,
        truncation=True,
        padding=True,
        return_tensors="pt",
    )
}

In [15]:
def generate_abstract_batched(batch_size=2, start=0, no_repeat_ngram_size=4, max_length=256, encodings=None):
    """Generate text for one contiguous slice of tokenized examples.

    Args:
        batch_size: Number of consecutive examples decoded together.
        start: Index of the first example of the slice.
        no_repeat_ngram_size: Forwarded unchanged to `centrum_model.generate`.
        max_length: Forwarded unchanged to `centrum_model.generate`.
        encodings: Optional tokenizer output providing "input_ids" and
            "attention_mask" tensors. Defaults to `dataset_tokenized["test"]`,
            which was the only source before this parameter existed.

    Returns:
        List of decoded strings (special tokens stripped) for the slice.
    """
    if encodings is None:
        encodings = dataset_tokenized["test"]

    # The former `try: del ... / except: None` was removed: the deleted names are
    # locals that are not bound yet at that point, so `del` always raised and the
    # bare `except` merely hid it.
    gc.collect()

    stop = start + batch_size
    test_input_ids = encodings["input_ids"][start:stop].to("cuda")
    attention_mask = encodings["attention_mask"][start:stop].to("cuda")

    # Global attention on the cls token and on every document-separator token.
    global_attention_mask = (test_input_ids == centrum_tokenizer.cls_token_id) | (
        test_input_ids == docsep_token_id
    )

    predicted_abstract_ids = centrum_model.generate(
        test_input_ids,
        attention_mask=attention_mask,
        global_attention_mask=global_attention_mask,
        max_length=max_length,
        no_repeat_ngram_size=no_repeat_ngram_size,
        num_beams=4,
    )

    return centrum_tokenizer.batch_decode(predicted_abstract_ids, skip_special_tokens=True)

In [26]:
## Generating answers
test_batch_size = 2
no_repeat_ngram_size = 4
# One list of decoded strings per batch; flattened in a later cell.
answers_fixed = []

for i in range(0, dataset_processed['test'].num_rows, test_batch_size):
    if i % 500 == 0:
        print(f"Handling sample {i} now..")

    answers_fixed.append(generate_abstract_batched(start=i,
                                                   batch_size=test_batch_size,
                                                   no_repeat_ngram_size=no_repeat_ngram_size))

# Count what was actually generated: `i+1` is only right when the last batch holds a
# single example, and `i` is undefined when the split is empty.
num_generated = sum(len(batch) for batch in answers_fixed)
print(f"Completed, {num_generated} data points handled.")

Handling sample 0 now..
Handling sample 500 now..
Handling sample 1000 now..
Handling sample 1500 now..
Handling sample 2000 now..
Handling sample 2500 now..
Handling sample 3000 now..
Handling sample 3500 now..
Handling sample 4000 now..
Handling sample 4500 now..
Handling sample 5000 now..
Completed, 5093 data points handled.


In [33]:
# Flatten the per-batch lists into a single list of generated texts, in generation order.
formatted_answers_fixed = [text for batch in answers_fixed for text in batch]

## Pickling results
with open("answers_revised/baselines/Centrum_1024tokens_norepeat4.pkl", "wb") as pickle_file:
    pickle.dump(formatted_answers_fixed, pickle_file)

In [35]:
## Calculating the rouge score
# Generated texts are scored against the citation-normalised related-work sections.
metric_norepeat4 = rouge.compute(
    predictions=formatted_answers_fixed,
    references=list(dataset_processed["test"]["related_work"]),
    use_stemmer=True,
)

In [36]:
metric_norepeat4

{'rouge1': AggregateScore(low=Score(precision=0.2265826960587613, recall=0.43772636218005384, fmeasure=0.2864612831459573), mid=Score(precision=0.22894222834918954, recall=0.44021451741739237, fmeasure=0.2886998899529029), high=Score(precision=0.23152466853877887, recall=0.4427925426116824, fmeasure=0.2909696381737882)),
 'rouge2': AggregateScore(low=Score(precision=0.040909721576971134, recall=0.08008098039670647, fmeasure=0.05185027455132552), mid=Score(precision=0.04179481992509422, recall=0.08173496474357636, fmeasure=0.05290083284921959), high=Score(precision=0.04269202893554369, recall=0.08341225044733727, fmeasure=0.053954731610736106)),
 'rougeL': AggregateScore(low=Score(precision=0.11227537986637294, recall=0.2250679160619483, fmeasure=0.14342061182623692), mid=Score(precision=0.11336789988753487, recall=0.22708980488324895, fmeasure=0.14446791950329646), high=Score(precision=0.1144758750227722, recall=0.22907433802060162, fmeasure=0.14553273116036605)),
 'rougeLsum': Aggrega

## Token Length 1024 (no_repeat_ngram = 3)

In [16]:
## Generating answers
test_batch_size = 2
no_repeat_ngram_size = 3
# One list of decoded strings per batch; flattened in a later cell.
answers_fixed = []

for i in range(0, dataset_processed['test'].num_rows, test_batch_size):
    if i % 500 == 0:
        print(f"Handling sample {i} now..")

    answers_fixed.append(generate_abstract_batched(start=i,
                                                   batch_size=test_batch_size,
                                                   no_repeat_ngram_size=no_repeat_ngram_size))

# Count what was actually generated: `i+1` is only right when the last batch holds a
# single example, and `i` is undefined when the split is empty.
num_generated = sum(len(batch) for batch in answers_fixed)
print(f"Completed, {num_generated} data points handled.")

Handling sample 0 now..
Handling sample 500 now..
Handling sample 1000 now..
Handling sample 1500 now..
Handling sample 2000 now..
Handling sample 2500 now..
Handling sample 3000 now..
Handling sample 3500 now..
Handling sample 4000 now..
Handling sample 4500 now..
Handling sample 5000 now..
Completed, 5093 data points handled.


In [17]:
# Flatten the per-batch lists into a single list of generated texts, in generation order.
formatted_answers_fixed = [text for batch in answers_fixed for text in batch]

## Pickling results
with open("answers_revised/baselines/Centrum_1024tokens_norepeat3.pkl", "wb") as pickle_file:
    pickle.dump(formatted_answers_fixed, pickle_file)

In [18]:
## Calculating the rouge score
# Generated texts are scored against the citation-normalised related-work sections.
metric_norepeat3 = rouge.compute(
    predictions=formatted_answers_fixed,
    references=list(dataset_processed["test"]["related_work"]),
    use_stemmer=True,
)

In [19]:
metric_norepeat3

{'rouge1': AggregateScore(low=Score(precision=0.22772499301159588, recall=0.4390357876855743, fmeasure=0.2875707196492456), mid=Score(precision=0.2300670823131827, recall=0.4417820787237462, fmeasure=0.2897487979471861), high=Score(precision=0.23261393435265582, recall=0.4442250579937219, fmeasure=0.29214149159364144)),
 'rouge2': AggregateScore(low=Score(precision=0.04077113757375007, recall=0.07959749558397478, fmeasure=0.05154900659003259), mid=Score(precision=0.041657608261410364, recall=0.08130459685209915, fmeasure=0.05265059264335756), high=Score(precision=0.04254403465352925, recall=0.08299856187724529, fmeasure=0.05373046044217472)),
 'rougeL': AggregateScore(low=Score(precision=0.11234791749286922, recall=0.22510530992302918, fmeasure=0.14332531130561366), mid=Score(precision=0.11339646233716678, recall=0.22706235943254338, fmeasure=0.14431975307778994), high=Score(precision=0.11441829800428202, recall=0.22891465102910727, fmeasure=0.14535622627126812)),
 'rougeLsum': Aggrega

## Token Length 4096 (no_repeat_ngram = 4; Centrum's encoder has only 4096 position embeddings, so 16384 is not possible)

In [33]:
# Tokenize all test inputs up front: truncated to at most 4096 tokens, padded to a
# common length, and returned as PyTorch tensors.
dataset_tokenized_large = {
    "test": centrum_tokenizer(
        dataset_processed["test"]["abstracts"],
        max_length=4096,
        truncation=True,
        padding=True,
        return_tensors="pt",
    )
}

In [34]:
def generate_abstract_batched2(batch_size=2, start=0, no_repeat_ngram_size=4, max_length=256, encodings=None):
    """Generate text for one contiguous slice of the long-input tokenized examples.

    Args:
        batch_size: Number of consecutive examples decoded together.
        start: Index of the first example of the slice.
        no_repeat_ngram_size: Forwarded unchanged to `centrum_model.generate`.
        max_length: Forwarded unchanged to `centrum_model.generate`.
        encodings: Optional tokenizer output providing "input_ids" and
            "attention_mask" tensors. Defaults to `dataset_tokenized_large["test"]`,
            which was the only source before this parameter existed.

    Returns:
        List of decoded strings (special tokens stripped) for the slice.
    """
    if encodings is None:
        encodings = dataset_tokenized_large["test"]

    # The former `try: del ... / except: None` was removed: the deleted names are
    # locals that are not bound yet at that point, so `del` always raised and the
    # bare `except` merely hid it.
    gc.collect()

    stop = start + batch_size
    test_input_ids = encodings["input_ids"][start:stop].to("cuda")
    attention_mask = encodings["attention_mask"][start:stop].to("cuda")

    # Global attention on the cls token and on every document-separator token.
    global_attention_mask = (test_input_ids == centrum_tokenizer.cls_token_id) | (
        test_input_ids == docsep_token_id
    )

    predicted_abstract_ids = centrum_model.generate(
        test_input_ids,
        attention_mask=attention_mask,
        global_attention_mask=global_attention_mask,
        max_length=max_length,
        no_repeat_ngram_size=no_repeat_ngram_size,
        num_beams=4,
    )

    return centrum_tokenizer.batch_decode(predicted_abstract_ids, skip_special_tokens=True)

In [35]:
## Generating answers
test_batch_size = 1
no_repeat_ngram_size = 4
# One list of decoded strings per batch; flattened in a later cell.
answers_fixed = []

for i in range(0, dataset_processed['test'].num_rows, test_batch_size):
    if i % 500 == 0:
        print(f"Handling sample {i} now..")

    answers_fixed.append(generate_abstract_batched2(start=i,
                                                    batch_size=test_batch_size,
                                                    no_repeat_ngram_size=no_repeat_ngram_size))

# Count what was actually generated instead of deriving it from the loop index: `i+1`
# stops being correct once the batch size is raised, and `i` is undefined when the
# split is empty.
num_generated = sum(len(batch) for batch in answers_fixed)
print(f"Completed, {num_generated} data points handled.")

Handling sample 0 now..
Handling sample 500 now..
Handling sample 1000 now..
Handling sample 1500 now..
Handling sample 2000 now..
Handling sample 2500 now..
Handling sample 3000 now..
Handling sample 3500 now..
Handling sample 4000 now..
Handling sample 4500 now..
Handling sample 5000 now..
Completed, 5093 data points handled.


In [36]:
# Flatten the per-batch lists into a single list of generated texts, in generation order.
formatted_answers_fixed = [text for batch in answers_fixed for text in batch]

## Pickling results
with open("answers_revised/baselines/Centrum_4096tokens.pkl", "wb") as pickle_file:
    pickle.dump(formatted_answers_fixed, pickle_file)

In [41]:
## Calculating the rouge score
# Generated texts are scored against the citation-normalised related-work sections.
metric_4096tokens = rouge.compute(
    predictions=formatted_answers_fixed,
    references=list(dataset_processed["test"]["related_work"]),
    use_stemmer=True,
)

In [42]:
metric_4096tokens

{'rouge1': AggregateScore(low=Score(precision=0.22612544658583159, recall=0.4361118833312366, fmeasure=0.2856490787565765), mid=Score(precision=0.22838993642640357, recall=0.4386777508413613, fmeasure=0.28776083468712477), high=Score(precision=0.2308064347985721, recall=0.44139529892575774, fmeasure=0.289934362041135)),
 'rouge2': AggregateScore(low=Score(precision=0.040538178391298994, recall=0.07943614423251627, fmeasure=0.051375506410180134), mid=Score(precision=0.04139440297278668, recall=0.08112594717471461, fmeasure=0.05244024798225367), high=Score(precision=0.04230090098514175, recall=0.08282356336360785, fmeasure=0.05353833157856666)),
 'rougeL': AggregateScore(low=Score(precision=0.1124225601329699, recall=0.22517898089951363, fmeasure=0.14339040973864686), mid=Score(precision=0.11341719273858765, recall=0.22713309798623907, fmeasure=0.14441935599279224), high=Score(precision=0.11456579383649804, recall=0.2289604324736872, fmeasure=0.14545339988984682)),
 'rougeLsum': Aggregat

In [37]:
formatted_answers_fixed[0]

"We present our approach to the problem of how an agent, within an economic Multi-Agent System, can determine when it should behave strategically (i.e. learn and use models of other agents), and when it should act as a simple price-taker. We provide a framework for the incremental implementation of modeling capabilities in agents, and a description of the forms of knowledge required. The agents were implemented and different populations simulated in order to learn more about their behavior and the merits of using and learning agent models. Our results show, among other lessons, how savvy buyers can avoid being cheated'' by sellers, how price volatility can be used to quantitatively predict the benefits of deeper models, and how specific types of agent populations influence system behavior. This article examines the implications of such an agent tracking capability for agent architectures. It specifically focuses on real-time and dynamic environments, where an intelligent agent is faced

In [39]:
dataset_processed['test']['related_work'][0]

'Within the MAS community, some work @cite has focused on how artificial AI-based learning agents would fare in communities of similar agents. For example, @cite and @cite show how agents can learn the capabilities of others via repeated interactions, but these agents do not learn to predict what actions other might take. Most of the work in MAS also fails to recognize the possible gains from using explicit agent models to predict agent actions. @cite is an exception and gives another approach for using nested agent models. However, they do not go so far as to try to quantify the advantages of their nested models or show how these could be learned via observations. We believe that our research will bring to the foreground some of the common observations seen in these research areas and help to clarify the implications and utility of learning and using nested agent models.'

In [40]:
dataset_processed['test']['abstracts'][0]

"We present our approach to the problem of how an agent, within an economic Multi-Agent System, can determine when it should behave strategically (i.e. learn and use models of other agents), and when it should act as a simple price-taker. We provide a framework for the incremental implementation of modeling capabilities in agents, and a description of the forms of knowledge required. The agents were implemented and different populations simulated in order to learn more about their behavior and the merits of using and learning agent models. Our results show, among other lessons, how savvy buyers can avoid being cheated'' by sellers, how price volatility can be used to quantitatively predict the benefits of deeper models, and how specific types of agent populations influence system behavior. ||||| In multi-agent environments, an intelligent agent often needs to interact with other individuals or groups of agents to achieve its goals. Agent tracking is one key capability required for inte