In [1]:
import transformers
import datasets
import torch
import logging

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Preprocessing
sep_token = '<sep>'
dataset_name = "squad"
models_dir = "saved_models/bart_base-squad"
checkpoint = "facebook/bart-base"
max_input_length = 512
max_target_length = 64

## Training
learning_rate = 1e-4
num_epochs = 5

In [3]:
dataset = datasets.load_dataset(dataset_name)

Found cached dataset squad (C:/Users/manuv/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)
100%|██████████| 2/2 [00:00<00:00,  5.93it/s]


In [4]:
print(len(dataset["train"]))

87599


In [5]:
dataset["train"][0]

{'id': '5733be284776f41900661182',
 'title': 'University_of_Notre_Dame',
 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.',
 'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?',
 'answers': {'text': ['Saint Bernadette Soubirous'], 'answer_start': [515]}}

In [6]:
model = transformers.BartForConditionalGeneration.from_pretrained("facebook/bart-base")
tokenizer = transformers.AutoTokenizer.from_pretrained("facebook/bart-base")

In [7]:
tokenizer.sep_token = sep_token
tokenizer.add_tokens([sep_token])
model.resize_token_embeddings(len(tokenizer))

Embedding(50266, 768)

In [8]:
import pandas as pd
# Group questions and answer with same context together
train_df = pd.DataFrame(dataset["train"])
val_df = pd.DataFrame(dataset["validation"])

train_df = train_df.groupby("context").agg({"question": list, "answers": list})
val_df = val_df.groupby("context").agg({"question": list, "answers": list})
train_dataset = datasets.Dataset.from_pandas(train_df.reset_index())
val_dataset = datasets.Dataset.from_pandas(val_df.reset_index())

dataset["train"] = train_dataset
dataset["validation"] = val_dataset

In [9]:
# Tokenize examples
def convert_to_features(example_batch):

    input_encodings = tokenizer.batch_encode_plus(example_batch['context'], 
                                                  max_length=max_input_length, 
                                                  add_special_tokens=True,
                                                  truncation=True, 
                                                  pad_to_max_length=True)
    
    target_encodings = tokenizer.batch_encode_plus(example_batch['question'], 
                                                   max_length=max_target_length, 
                                                   add_special_tokens=True,
                                                   truncation=True, pad_to_max_length=True)
                                                   
    encodings = {
        'input_ids': input_encodings['input_ids'], 
        'attention_mask': input_encodings['attention_mask'],
        'decoder_input_ids': target_encodings['input_ids'],
        'decoder_attention_mask': target_encodings['attention_mask']
    }

    return encodings

def add_eos_examples(example):
    example['context'] = example['context'] + " " + sep_token
    example['question'] = " <sep> ".join(example['question'])
    return example


def add_special_tokens(example):
  example['question'] = example['question'].replace("{sep_token}", sep_token)
  return example

In [10]:
tokenized_dataset  = dataset.map(add_eos_examples)
tokenized_dataset = tokenized_dataset.map(add_special_tokens)
tokenized_dataset  = tokenized_dataset.map(convert_to_features,  batched=True)

                                                                   

In [11]:
tokenized_dataset["train"][0]["question"]

'When did the torch arrive in Canberra? <sep> Who received the flame from Chinese officials in Canberra? <sep> What did Agnes Shea give to the Chinese in return? <sep> Who publicly argued at a press conference? <sep> Where was the relay held in Australia? <sep> How many kilometers was the route in Australia? <sep> What was the location of the beginning of the Olympic torch route in Australia? <sep> Who kept the demonstrators separated? <sep> What is the name of the Aboriginal elder who received the torch from Chinese officials?'

In [12]:
tokenized_dataset["train"][0].keys()

dict_keys(['context', 'question', 'answers', 'input_ids', 'attention_mask', 'decoder_input_ids', 'decoder_attention_mask'])

In [13]:
tokenized_dataset = tokenized_dataset.remove_columns(
    ["context","question","answers"]
)

train_dataset = tokenized_dataset["train"]
valid_dataset = tokenized_dataset["validation"]

columns = ['input_ids', 'decoder_input_ids', 'attention_mask', 'decoder_attention_mask']
train_dataset.set_format(type='torch', columns=columns)
valid_dataset.set_format(type='torch', columns=columns)

In [14]:
torch.save(train_dataset, 'train_data.pt')
torch.save(valid_dataset, 'valid_data.pt')

In [15]:
from typing import Dict, List

class T2TDataCollator():
    def __call__(self, batch: List) -> Dict[str, torch.Tensor]:
        """
        Take a list of samples from a Dataset and collate them into a batch.
        Returns:
        A dictionary of tensors
        """

        input_ids = torch.stack([example['input_ids'] for example in batch])
        lm_labels = torch.stack([example['decoder_input_ids'] for example in batch])
        lm_labels[lm_labels[:, :] == 0] = -100 
        attention_mask = torch.stack([example['attention_mask'] for example in batch])
        decoder_attention_mask = torch.stack([example['decoder_attention_mask'] for example in batch])

        return {
            'input_ids': input_ids, 
            'attention_mask': attention_mask,
            'labels': lm_labels, 
            'decoder_attention_mask': decoder_attention_mask
        }

In [16]:
training_args = transformers.TrainingArguments(output_dir=models_dir, 
                                  per_device_train_batch_size=4, 
                                  per_device_eval_batch_size=4,
                                  gradient_accumulation_steps=16,
                                  learning_rate=learning_rate, 
                                  num_train_epochs=num_epochs,
                                  logging_steps=100,
                                  run_name="bart_qg_squad",
                                  evaluation_strategy="steps",
                                  save_steps=500)

In [17]:
logger = logging.getLogger(__name__)

# Initialize our Trainer
trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    data_collator=T2TDataCollator()
)


In [18]:
if model.device.type == 'cuda':
    print('Model is on GPU')
else:
    print('Model is on CPU')

Model is on GPU


In [19]:
trainer.train() # 80min

  7%|▋         | 100/1475 [04:54<1:07:42,  2.95s/it]

{'loss': 2.3547, 'learning_rate': 9.322033898305085e-05, 'epoch': 0.34}


                                                    
  7%|▋         | 100/1475 [05:23<1:07:42,  2.95s/it]

{'eval_loss': 1.6700265407562256, 'eval_runtime': 29.8951, 'eval_samples_per_second': 69.142, 'eval_steps_per_second': 17.294, 'epoch': 0.34}


 14%|█▎        | 200/1475 [10:15<1:01:58,  2.92s/it]

{'loss': 1.7229, 'learning_rate': 8.644067796610171e-05, 'epoch': 0.68}


                                                    
 14%|█▎        | 200/1475 [10:45<1:01:58,  2.92s/it]

{'eval_loss': 1.60895574092865, 'eval_runtime': 29.9233, 'eval_samples_per_second': 69.077, 'eval_steps_per_second': 17.277, 'epoch': 0.68}


 20%|██        | 300/1475 [15:37<57:02,  2.91s/it]  

{'loss': 1.658, 'learning_rate': 7.966101694915254e-05, 'epoch': 1.02}


                                                  
 20%|██        | 300/1475 [16:07<57:02,  2.91s/it]

{'eval_loss': 1.573846459388733, 'eval_runtime': 29.9704, 'eval_samples_per_second': 68.968, 'eval_steps_per_second': 17.25, 'epoch': 1.02}


 27%|██▋       | 400/1475 [20:58<52:18,  2.92s/it]  

{'loss': 1.4825, 'learning_rate': 7.288135593220338e-05, 'epoch': 1.36}


                                                  
 27%|██▋       | 400/1475 [21:28<52:18,  2.92s/it]

{'eval_loss': 1.5615453720092773, 'eval_runtime': 29.9529, 'eval_samples_per_second': 69.008, 'eval_steps_per_second': 17.26, 'epoch': 1.36}


 34%|███▍      | 500/1475 [26:20<47:22,  2.92s/it]  

{'loss': 1.4861, 'learning_rate': 6.610169491525424e-05, 'epoch': 1.69}


                                                  
 34%|███▍      | 500/1475 [26:50<47:22,  2.92s/it]

{'eval_loss': 1.551958441734314, 'eval_runtime': 29.9257, 'eval_samples_per_second': 69.071, 'eval_steps_per_second': 17.276, 'epoch': 1.69}


 41%|████      | 600/1475 [31:47<42:30,  2.91s/it]  

{'loss': 1.4583, 'learning_rate': 5.932203389830509e-05, 'epoch': 2.03}


                                                  
 41%|████      | 600/1475 [32:17<42:30,  2.91s/it]

{'eval_loss': 1.563263177871704, 'eval_runtime': 29.9603, 'eval_samples_per_second': 68.991, 'eval_steps_per_second': 17.256, 'epoch': 2.03}


 47%|████▋     | 700/1475 [37:09<37:24,  2.90s/it]  

{'loss': 1.3296, 'learning_rate': 5.254237288135594e-05, 'epoch': 2.37}


                                                  
 47%|████▋     | 700/1475 [37:39<37:24,  2.90s/it]

{'eval_loss': 1.5508701801300049, 'eval_runtime': 29.9364, 'eval_samples_per_second': 69.046, 'eval_steps_per_second': 17.27, 'epoch': 2.37}


 54%|█████▍    | 800/1475 [42:27<32:23,  2.88s/it]  

{'loss': 1.3326, 'learning_rate': 4.5762711864406784e-05, 'epoch': 2.71}


                                                  
 54%|█████▍    | 800/1475 [42:57<32:23,  2.88s/it]

{'eval_loss': 1.5446569919586182, 'eval_runtime': 29.7818, 'eval_samples_per_second': 69.405, 'eval_steps_per_second': 17.36, 'epoch': 2.71}


 61%|██████    | 900/1475 [47:45<27:33,  2.88s/it]  

{'loss': 1.312, 'learning_rate': 3.898305084745763e-05, 'epoch': 3.05}


                                                  
 61%|██████    | 900/1475 [48:15<27:33,  2.88s/it]

{'eval_loss': 1.5582462549209595, 'eval_runtime': 29.7545, 'eval_samples_per_second': 69.468, 'eval_steps_per_second': 17.376, 'epoch': 3.05}


 68%|██████▊   | 1000/1475 [53:03<22:46,  2.88s/it] 

{'loss': 1.2359, 'learning_rate': 3.2203389830508473e-05, 'epoch': 3.39}


                                                   
 68%|██████▊   | 1000/1475 [53:32<22:46,  2.88s/it]

{'eval_loss': 1.5484000444412231, 'eval_runtime': 29.7745, 'eval_samples_per_second': 69.422, 'eval_steps_per_second': 17.364, 'epoch': 3.39}


 75%|███████▍  | 1100/1475 [58:59<17:59,  2.88s/it]  

{'loss': 1.2267, 'learning_rate': 2.5423728813559322e-05, 'epoch': 3.73}


                                                   
 75%|███████▍  | 1100/1475 [59:28<17:59,  2.88s/it]

{'eval_loss': 1.5419151782989502, 'eval_runtime': 29.7948, 'eval_samples_per_second': 69.375, 'eval_steps_per_second': 17.352, 'epoch': 3.73}


 81%|████████▏ | 1200/1475 [1:04:16<13:11,  2.88s/it]

{'loss': 1.2128, 'learning_rate': 1.864406779661017e-05, 'epoch': 4.07}


                                                     
 81%|████████▏ | 1200/1475 [1:04:46<13:11,  2.88s/it]

{'eval_loss': 1.5518202781677246, 'eval_runtime': 29.9318, 'eval_samples_per_second': 69.057, 'eval_steps_per_second': 17.273, 'epoch': 4.07}


 88%|████████▊ | 1300/1475 [1:09:34<08:23,  2.88s/it]

{'loss': 1.1624, 'learning_rate': 1.1864406779661018e-05, 'epoch': 4.4}


                                                     
 88%|████████▊ | 1300/1475 [1:10:04<08:23,  2.88s/it]

{'eval_loss': 1.5454031229019165, 'eval_runtime': 29.7898, 'eval_samples_per_second': 69.386, 'eval_steps_per_second': 17.355, 'epoch': 4.4}


 95%|█████████▍| 1400/1475 [1:14:52<03:35,  2.88s/it]

{'loss': 1.163, 'learning_rate': 5.084745762711865e-06, 'epoch': 4.74}


                                                     
 95%|█████████▍| 1400/1475 [1:15:22<03:35,  2.88s/it]

{'eval_loss': 1.5468823909759521, 'eval_runtime': 29.7908, 'eval_samples_per_second': 69.384, 'eval_steps_per_second': 17.354, 'epoch': 4.74}


100%|██████████| 1475/1475 [1:18:58<00:00,  3.21s/it]

{'train_runtime': 4738.3341, 'train_samples_per_second': 19.934, 'train_steps_per_second': 0.311, 'train_loss': 1.4243852311473781, 'epoch': 5.0}





TrainOutput(global_step=1475, training_loss=1.4243852311473781, metrics={'train_runtime': 4738.3341, 'train_samples_per_second': 19.934, 'train_steps_per_second': 0.311, 'train_loss': 1.4243852311473781, 'epoch': 5.0})

In [20]:
# save the model
trainer.save_model(models_dir)

In [26]:
from transformers import BartForConditionalGeneration, AutoTokenizer
# load the saved model
loaded_model = BartForConditionalGeneration.from_pretrained(models_dir)

In [27]:
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [34]:
def run_model(input_string, **generator_args):
    generator_args = {
    "max_length": 512,
    "num_beams": 4,# bij grotere num_beams is trager maar complexere vragen(niet per se betere vragen)
    "length_penalty": 1.5,
    "no_repeat_ngram_size": 3,
    "early_stopping": True,
    }
    input_string = input_string + " </s>"
    input_ids = tokenizer.encode(input_string, return_tensors="pt")
    res = loaded_model.generate(input_ids, **generator_args)
    output = tokenizer.batch_decode(res, skip_special_tokens=True)
    output = [item.split("<sep>") for item in output]
    return output

In [41]:
context = """
Cheese is an ancient food whose origins predate recorded history. There is no conclusive evidence indicating where cheesemaking originated, whether in Europe, Central Asia or the Middle East. Earliest proposed dates for the origin of cheesemaking range from around 8000 BCE, when sheep were first domesticated. Since animal skins and inflated internal organs have, since ancient times, provided storage vessels for a range of foodstuffs, it is probable that the process of cheese making was discovered accidentally by storing milk in a container made from the stomach of an animal, resulting in the milk being turned to curd and whey by the rennet from the stomach.[7] There is a legend—with variations—about the discovery of cheese by an Arab trader who used this method of storing milk.[8]

The earliest evidence of cheesemaking in the archaeological record dates back to 5500 BCE and is found in what is now Kuyavia, Poland, where strainers coated with milk-fat molecules have been found.[9]

Cheesemaking may have begun independently of this by the pressing and salting of curdled milk to preserve it. Observation that the effect of making cheese in an animal stomach gave more solid and better-textured curds may have led to the deliberate addition of rennet. Early archeological evidence of Egyptian cheese has been found in Egyptian tomb murals, dating to about 2000 BCE.[10] A 2018 scientific paper stated that the world's oldest cheese, dating to approximately 1200 BCE (3200 years before present), was found in ancient Egyptian tombs.[11][12]

The earliest cheeses were likely quite sour and salty, similar in texture to rustic cottage cheese or feta, a crumbly, flavorful Greek cheese. Cheese produced in Europe, where climates are cooler than the Middle East, required less salt for preservation. With less salt and acidity, the cheese became a suitable environment for useful microbes and molds, giving aged cheeses their respective flavors. The earliest ever discovered preserved cheese was found in the Taklamakan Desert in Xinjiang, China, dating back as early as 1615 BCE (3600 years before present).
"""

context_gf = """
#### Types of Bias

**Selection bias** is the tendency to skew your choice of data sources to 
those that are easily available, convenient, and/or cost-effective. As a 
result of this a bias is introduced by the selection of individuals, groups 
or data for analysis in such a way that proper randomization is not achieved, 
thereby ensuring that the sample obtained is not representative of the 
population intended to be analyzed. 
[Learn more about selection bias](https://en.wikipedia.org/wiki/Selection_bias).

**Self-selection bias** is a form of selection bias where you get the data 
from sources that “volunteered” to provide it. Most poll data has this type  
of bias. [Learn more about self-selection bias](https://en.wikipedia.org/wiki/Self-selection_bias)

**Omitted-variable bias** happens when your featurized data doesn't have a 
feature necessary for accurate prediction. For example, let's assume that you 
are working on a churn prediction model and you want to predict whether a 
customer cancels their subscription within six months. You train a model, and 
it's accurate enough; however, several weeks after deployment you see many 
unexpected false negatives. You investigate the decreased model performance 
and discover a new competitor now offers a very similar service for a lower 
price. This feature wasn't initially available to your model, therefore 
important information for accurate prediction was missing. 
[Learn more about omitted variable Bias](https://en.wikipedia.org/wiki/Omitted-variable_bias).
"""

run_model(context_gf)

[['What is the tendency to skew your choice of data sources to �those that are easily available, convenient, and/or cost-effective"?  What happens when your featurized data doesn\'t have a feature necessary for accurate prediction?']]