In [3]:
!pip install transformers datasets
import transformers
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import Dataset
import json
import torch
# from lfqa_utils import *

Collecting transformers
  Downloading transformers-4.33.1-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m63.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.14.5-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m47.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m29.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m114.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloa

In [4]:
def qa_s2s_generate(
    question_doc,
    qa_s2s_model,
    qa_s2s_tokenizer,
    num_answers=1,
    num_beams=None,
    min_len=64,
    max_len=256,
    do_sample=False,
    temp=1.0,
    top_p=None,
    top_k=None,
    max_input_length=512,
    device="cuda:0",
):
    """Generate answer string(s) for one question+context input.

    Args:
        question_doc: The input text handed to the encoder.
        qa_s2s_model: Model object exposing ``generate(...)``.
        qa_s2s_tokenizer: Tokenizer used to build the batch and to decode the
            generated ids; its ``eos_token_id`` and ``bos_token_id`` are read.
        num_answers: Number of sequences to return.
        num_beams: Requested beam count. ``None`` means "use ``num_answers``";
            otherwise the larger of ``num_beams`` and ``num_answers`` is used.
            Ignored (forced to 1) when ``do_sample`` is true.
        min_len: Passed to ``generate`` as ``min_length``.
        max_len: Passed to ``generate`` as ``max_length``.
        do_sample: Passed through to ``generate``.
        temp: Passed to ``generate`` as ``temperature``.
        top_p: Passed through to ``generate``.
        top_k: Passed through to ``generate``.
        max_input_length: Maximum length used when tokenizing ``question_doc``.
        device: Device on which the input tensors are created.

    Returns:
        A list of decoded, whitespace-stripped answer strings, one per
        generated sequence.
    """
    # The batch builder expects (question, answer) pairs; "A" is only a
    # placeholder answer, since just the encoder-side tensors are used here.
    batch = make_qa_s2s_batch([(question_doc, "A")], qa_s2s_tokenizer, max_input_length, device=device)

    # Resolve the beam width: at least as many beams as answers requested,
    # and a single beam whenever sampling is enabled.
    if num_beams is None:
        beam_width = num_answers
    else:
        beam_width = max(num_beams, num_answers)
    if do_sample:
        beam_width = 1

    generation_kwargs = dict(
        input_ids=batch["input_ids"],
        attention_mask=batch["attention_mask"],
        min_length=min_len,
        max_length=max_len,
        do_sample=do_sample,
        early_stopping=True,
        num_beams=beam_width,
        temperature=temp,
        top_k=top_k,
        top_p=top_p,
        eos_token_id=qa_s2s_tokenizer.eos_token_id,
        no_repeat_ngram_size=3,
        num_return_sequences=num_answers,
        decoder_start_token_id=qa_s2s_tokenizer.bos_token_id,
    )
    output_ids = qa_s2s_model.generate(**generation_kwargs)

    answers = []
    for sequence in output_ids:
        text = qa_s2s_tokenizer.decode(sequence, skip_special_tokens=True)
        answers.append(text.strip())
    return answers

In [5]:
def make_qa_s2s_batch(qa_list, tokenizer, max_len=64, max_a_len=360, device="cuda:0"):
    """Tokenize (question, answer) pairs into seq2seq model inputs.

    Args:
        qa_list: Iterable of ``(question, answer)`` string pairs.
        tokenizer: Object exposing ``batch_encode_plus`` that returns a mapping
            with ``"input_ids"`` and ``"attention_mask"`` entries.
        max_len: Length the questions are padded/truncated to.
        max_a_len: Upper bound for the answer length; answers are
            padded/truncated to ``min(max_len, max_a_len)``.
        device: Device on which the returned tensors are created.

    Returns:
        dict with:
            ``"input_ids"`` / ``"attention_mask"``: question ids and mask.
            ``"decoder_input_ids"``: answer ids without the last position.
            ``"lm_labels"``: answer ids shifted left by one, with padded
            positions set to -100 (the default ``ignore_index`` of PyTorch's
            cross-entropy loss).
    """

    def _encode(texts, length):
        # Padding and truncation are requested explicitly: the deprecated
        # `pad_to_max_length=True` plus an implicit truncation strategy made
        # the tokenizer emit warnings on every call.
        toks = tokenizer.batch_encode_plus(
            texts, max_length=length, padding="max_length", truncation=True
        )
        ids = torch.tensor(toks["input_ids"], dtype=torch.long, device=device)
        mask = torch.tensor(toks["attention_mask"], dtype=torch.long, device=device)
        return ids, mask

    q_ls = [q for q, _ in qa_list]
    a_ls = [a for _, a in qa_list]

    q_ids, q_mask = _encode(q_ls, max_len)
    a_ids, a_mask = _encode(a_ls, min(max_len, max_a_len))

    # Labels are the answer shifted by one token; padded positions are masked.
    lm_labels = a_ids[:, 1:].contiguous().clone()
    lm_labels[a_mask[:, 1:].contiguous() == 0] = -100

    return {
        "input_ids": q_ids,
        "attention_mask": q_mask,
        "decoder_input_ids": a_ids[:, :-1].contiguous(),
        "lm_labels": lm_labels,
    }

In [13]:
# Checkpoint to load; "facebook/bart-base" is the alternative tried previously.
model_checkpoint = 'yjernite/bart_eli5'

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Load the weights, move them to the first GPU and switch to inference mode
# (`eval()` returns the module itself, so the chain keeps `model` bound to it).
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint).to('cuda:0').eval()

# Fail early if a slow (pure-Python) tokenizer was returned instead of a fast one.
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.32k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

In [7]:
# Sanity check: encode a pair of text segments with the loaded tokenizer and
# display the resulting `input_ids` / `attention_mask`.
tokenizer("What is your name?", "My name is Sylvain.")

{'input_ids': [0, 2264, 16, 110, 766, 116, 2, 2, 2387, 766, 16, 28856, 1851, 4, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [12]:
# This cell uses facebook/bart-base.
# NOTE(review): `tokenizer` is not defined in this cell; it is whatever an
# earlier cell left in the kernel - confirm it matches this checkpoint.
model = AutoModelForSeq2SeqLM.from_pretrained('facebook/bart-base').to('cuda:0')
model.eval()

path = "/content/drive/MyDrive/Dataset/Q1/ELI5.jsonl"

# The context manager closes the file even though the loop exits early.
with open(path, "r", encoding="utf-8") as f:
    # `idx` instead of `id` so the builtin `id()` is not shadowed.
    for idx, line in enumerate(f):
        data = json.loads(line)
        # print(data)

        question = data['question']
        # Context passages are concatenated with no separator between them.
        doc = "".join(data['ctxs'])

        question_doc = "question: {} context: {}".format(question, doc)
        answer = qa_s2s_generate(
            question_doc, model, tokenizer,
            num_answers=1,
            num_beams=8,
            min_len=64,
            max_len=256,
            max_input_length=1024,
        )[0]

        print(idx + 1)
        print("---Q----" + question)
        # print("---C----" + doc)
        print("---A----" + answer)
        print()

        # Stop after the 11th record (indices 0..10).
        if idx == 10:
            break




1
---Q----in football whats the point of wasting the first two plays with a rush - up the middle - not regular rush plays i get those

2
---Q----Why are different tiers (regular < mid < premium) of gas' prices almost always 10 cents different?
---A----question: Why are different tiers (regular < mid < premium) of gas' prices almost always 10 cents different? context: have traditionally been three to four times the price in the United States, with prices during 2000–2005 of €1.42 per litre ($5.4/US gal) while the US had prices around $1.50 per US gallon ($0.40/l). After a large increase until the summer of 2008, the price of fuel in the U.S. peaked at $2.00/litre (about $0.50/l) (29 December 2008). The price of gasoline in the US peaked in the spring of 2008 at $3.53/l (27 December 2008) (27 January 2009). The average price of gas in Europe, however, was up 19% to $103.24 (29 January 2009) during the peak of the 2008 energy crisis, with the average price in Europe rising to $4.85 per li

In [14]:
# This cell uses yjernite/bart_eli5.
# NOTE(review): `tokenizer` is not defined in this cell; it is whatever an
# earlier cell left in the kernel - confirm it matches this checkpoint.
model = AutoModelForSeq2SeqLM.from_pretrained('yjernite/bart_eli5').to('cuda:0')
model.eval()

path = "/content/drive/MyDrive/Dataset/Q1/ELI5.jsonl"

# The context manager closes the file even though the loop exits early.
with open(path, "r", encoding="utf-8") as f:
    # `idx` instead of `id` so the builtin `id()` is not shadowed.
    for idx, line in enumerate(f):
        data = json.loads(line)
        # print(data)

        question = data['question']
        # Context passages are concatenated with no separator between them.
        doc = "".join(data['ctxs'])

        question_doc = "question: {} context: {}".format(question, doc)
        answer = qa_s2s_generate(
            question_doc, model, tokenizer,
            num_answers=1,
            num_beams=8,
            min_len=64,
            max_len=256,
            max_input_length=1024,
        )[0]

        print(idx + 1)
        print("---Q----" + question)
        # print("---C----" + doc)
        print("---A----" + answer)
        print()

        # Stop after the 11th record (indices 0..10).
        if idx == 10:
            break


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


1
---Q----in football whats the point of wasting the first two plays with a rush - up the middle - not regular rush plays i get those
---A----The point of a rush is to force the offense to run the ball. If the offense doesn't have the ball, they don't have to run it. If they do, it's a waste of time and energy to try and force the ball down the middle of the field instead of running it up the middle.

2
---Q----Why are different tiers (regular < mid < premium) of gas' prices almost always 10 cents different?
---A----It's called [price discrimination]( URL_0 ) and it's a very real thing. It's not just gas stations, it's all kinds of things. For example, if you're buying a gallon of gas, you're more likely to be willing to pay more for premium gas than regular gas, so the price of premium gas is going to be higher.

3
---Q----Stars and Visibility
---A----Stars are not visible to the naked eye. They are visible to a telescope, but not to the human eye. You can see stars with your naked ey

In [9]:
# user_to_repos = {}

# max_length = 384 # The maximum length of a feature (question and context)
# doc_stride = 128 # The authorized overlap between two part of the context when splitting it is needed.

# dataset = Dataset.from_json("/content/drive/MyDrive/Dataset/Q1/ELI5.jsonl")
# dataset

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

KeyboardInterrupt: ignored