In [1]:
import transformers
from transformers import AdamW, AutoModel, AutoModelForSeq2SeqLM, AutoTokenizer, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
import json
import torch
from time import time
import pandas as pd
from peft import PeftModel
from nltk import PorterStemmer
from rouge import Rouge
import spacy
from spacy.tokenizer import Tokenizer
from time import time

In [2]:
class ArgumentsS2S():
    """Plain container for the seq2seq run settings used by this notebook.

    Only ``device`` is read by the cells visible here; the remaining fields
    look like training hyper-parameters -- their exact use is not shown in
    this notebook, so treat the per-field notes below as assumptions.
    """

    def __init__(self):
        self.batch_size = 4
        self.backward_freq = 16  # presumably batches per optimizer step -- TODO confirm
        self.max_length = 768
        self.print_freq = 10000
        # Raw string: the Windows path contains "\G", "\s" and "\l", which are
        # invalid escape sequences in a normal string literal (SyntaxWarning on
        # recent Python versions). The resulting value is unchanged.
        self.model_save_name = r"D:\Gradients\seq2seq_models\led"
        self.learning_rate = 3e-4
        self.num_epochs = 1
        self.device = 'cuda:0'

s2s_args = ArgumentsS2S()

In [3]:
class ELI5DatasetS2S(Dataset):
    """In-memory dataset of ``[question_doc, answer]`` records.

    ``data_array`` is kept by reference (it is not copied), so ``append``
    also grows the list object the caller passed in.
    """

    def __init__(self, data_array):
        self.data = data_array

    def __len__(self):
        """Number of records currently stored."""
        return len(self.data)

    def append(self, question_doc, answer):
        """Store one record as the two-element list ``[question_doc, answer]``."""
        record = [question_doc, answer]
        self.data.append(record)

    def __getitem__(self, idx):
        """Return the first two fields of record ``idx`` as a tuple."""
        record = self.data[idx]
        return (record[0], record[1])

In [4]:
def make_qa_s2s_batch(qa_list, tokenizer, max_len=64, max_a_len=360, device="cuda:0"):
    """Tokenize (question, answer) pairs into seq2seq model inputs.

    Args:
        qa_list: iterable of ``(question, answer)`` string pairs.
        tokenizer: callable tokenizer returning ``input_ids`` and
            ``attention_mask`` (a Hugging Face tokenizer in this notebook).
        max_len: fixed length every question is padded/truncated to.
        max_a_len: upper bound on the answer length; the effective answer
            length is ``min(max_len, max_a_len)``.
        device: torch device the resulting tensors are moved to.

    Returns:
        dict with ``input_ids``, ``attention_mask``, ``decoder_input_ids``
        (answer ids without the last position) and ``labels`` (answer ids
        without the first position, padded positions set to -100).
    """
    questions = [q for q, a in qa_list]
    answers = [a for q, a in qa_list]

    # `padding="max_length"` replaces the deprecated `pad_to_max_length=True`,
    # and `truncation=True` makes explicit the truncation the tokenizer was
    # otherwise falling back to with a warning on every call.
    q_toks = tokenizer(questions, max_length=max_len, padding="max_length", truncation=True)
    q_ids, q_mask = (
        torch.LongTensor(q_toks["input_ids"]).to(device),
        torch.LongTensor(q_toks["attention_mask"]).to(device),
    )

    a_toks = tokenizer(
        answers,
        max_length=min(max_len, max_a_len),
        padding="max_length",
        truncation=True,
    )
    a_ids, a_mask = (
        torch.LongTensor(a_toks["input_ids"]).to(device),
        torch.LongTensor(a_toks["attention_mask"]).to(device),
    )

    # Labels are the answer ids shifted left by one position; padded positions
    # are set to -100 (the default ignore_index of torch's cross-entropy loss).
    lm_labels = a_ids[:, 1:].contiguous().clone()
    lm_labels[a_mask[:, 1:].contiguous() == 0] = -100

    model_inputs = {
        "input_ids": q_ids,
        "attention_mask": q_mask,
        "decoder_input_ids": a_ids[:, :-1].contiguous(),
        "labels": lm_labels,
    }
    return model_inputs

In [5]:
def qa_s2s_generate(
    question_doc,
    qa_s2s_model,
    qa_s2s_tokenizer,
    num_answers=1,
    num_beams=None,
    min_len=64,
    max_len=256,
    do_sample=False,
    temp=1.0,
    top_p=None,
    top_k=None,
    max_input_length=512,
    device="cuda:0",
):
    """Generate ``num_answers`` decoded answers for one question+context string.

    The input is encoded via ``make_qa_s2s_batch`` with the placeholder answer
    ``"A"``; only the encoder ids and attention mask of that batch are used.
    When sampling, a single beam is used; otherwise the beam count is
    ``num_answers`` or ``max(num_beams, num_answers)`` if ``num_beams`` is set.

    Returns:
        list of stripped answer strings decoded with special tokens skipped.
    """
    batch = make_qa_s2s_batch(
        [(question_doc, "A")],
        qa_s2s_tokenizer,
        max_input_length,
        device=device,
    )

    # The beam count must be at least the number of sequences requested back.
    if num_beams is None:
        n_beams = num_answers
    else:
        n_beams = max(num_beams, num_answers)

    generation_kwargs = dict(
        input_ids=batch["input_ids"],
        attention_mask=batch["attention_mask"],
        min_length=min_len,
        max_length=max_len,
        do_sample=do_sample,
        early_stopping=True,
        num_beams=1 if do_sample else n_beams,
        temperature=temp,
        top_k=top_k,
        top_p=top_p,
        eos_token_id=qa_s2s_tokenizer.eos_token_id,
        no_repeat_ngram_size=3,
        num_return_sequences=num_answers,
        decoder_start_token_id=qa_s2s_tokenizer.bos_token_id,
    )
    generated_ids = qa_s2s_model.generate(**generation_kwargs)

    answers = []
    for ans_ids in generated_ids:
        text = qa_s2s_tokenizer.decode(ans_ids, skip_special_tokens=True)
        answers.append(text.strip())
    return answers

In [6]:
from peft import PeftConfig, PeftModel

# Adapter location and the base checkpoint it is applied on top of.
model_id = "flan-t5-small-lora"
base_checkpoint = "google/flan-t5-small"

# Adapter configuration; loaded here but not referenced again in the cells
# visible in this notebook.
config = PeftConfig.from_pretrained(model_id)

# Tokenizer and base seq2seq model both come from the same base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(base_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(base_checkpoint)
model = model.to(s2s_args.device)

# Wrap the base model with the saved adapter weights and report parameter counts.
inference_model = PeftModel.from_pretrained(model=model, model_id=model_id)
inference_model.print_trainable_parameters()

trainable params: 0 || all params: 82,466,176 || trainable%: 0.0


In [7]:
# Generate an answer for each of the first 50 validation examples and collect
# (prediction, reference) strings for ROUGE scoring.
predicted = []
reference = []

path = "Bản sao của ELI5_val.jsonl"

val_data = ELI5DatasetS2S([])

# The context manager closes the file even if generation fails part-way, which
# replaces the previous bare-except cleanup of a handle left open by an earlier
# run. The encoding is explicit so reading does not depend on the OS locale.
with open(path, "r", encoding="utf-8") as f:
    for line_idx, line in enumerate(f):
        if line_idx == 50:
            break
        data = json.loads(line)

        question = data['question']
        # Contexts and gold answers are each flattened into a single string.
        doc = '. '.join(map(str, data['ctxs']))
        answer_true = '. '.join(map(str, data['answers']))

        question_doc = "question: {} context: {}".format(question, doc)
        answer_pred = qa_s2s_generate(
            question_doc, inference_model, tokenizer,
            num_answers=1,
            num_beams=8,
            min_len=96,
            max_len=256,
            max_input_length=512,
            # Same device the model was moved to, instead of a second hard-coded copy.
            device=s2s_args.device,
        )[0]
        predicted.append(answer_pred)
        reference.append(answer_true)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


No file to close




In [9]:
stemmer = PorterStemmer()
rouge = Rouge()
nlp = spacy.load("en_core_web_sm")
# Deliberately not named `tokenizer`: that global already holds the Hugging Face
# tokenizer used for generation, and rebinding it here would break the
# generation cell if it is re-run after this one.
word_tokenizer = Tokenizer(nlp.vocab)


def _stem_text(text):
    """Tokenize ``text`` with the spaCy tokenizer and join the Porter-stemmed tokens with spaces."""
    return " ".join(stemmer.stem(str(w)) for w in word_tokenizer(text))


def compute_rouge_eli5(compare_list):
    """Compute averaged ROUGE scores over ``(gold, pred)`` string pairs.

    Args:
        compare_list: iterable of ``(gold, pred)`` pairs; it is materialised
            once so a one-shot iterable (e.g. ``zip``) is also accepted.

    Returns:
        The result of ``rouge.get_scores(preds, golds, avg=True)`` on the
        stemmed texts, with predictions passed as hypotheses and golds as
        references.
    """
    pairs = list(compare_list)
    preds = [_stem_text(pred) for gold, pred in pairs]
    golds = [_stem_text(gold) for gold, pred in pairs]
    scores = rouge.get_scores(preds, golds, avg=True)
    return scores


# Pair every reference with its prediction in (gold, pred) order.
compare_list = list(zip(reference, predicted))
scores = compute_rouge_eli5(compare_list)

# One column per ROUGE variant; rows are precision, recall and F-score.
df = pd.DataFrame(
    {
        column: [scores[metric][stat] for stat in ('p', 'r', 'f')]
        for column, metric in (
            ('rouge1', 'rouge-1'),
            ('rouge2', 'rouge-2'),
            ('rougeL', 'rouge-l'),
        )
    },
    index=['P', 'R', 'F'],
)
df.style.format({column: "{:.4f}" for column in ('rouge1', 'rouge2', 'rougeL')})

Unnamed: 0,rouge1,rouge2,rougeL
P,0.6292,0.2003,0.5882
R,0.0948,0.0206,0.0874
F,0.158,0.0355,0.1461
