In [1]:
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments
)
from evaluate import evaluator
import evaluate
from peft import LoraConfig, get_peft_model, PeftConfig, PeftModel
from datasets import DatasetDict, load_dataset
import numpy as np
from init_parameters import init_parameters
from data import split_data, set_seed, k_split
import torch
import random

In [2]:
data_name = 'bigbench'
task = 'elementary_math_qa'
seed = 42
num_clients = 10
num_error_clients = 2
number = 0
model_name_or_path = 'google/flan-t5-base'
metric = evaluate.load("sacrebleu")
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
set_seed(seed)
if data_name == 'bigbench':
    dataset = load_dataset("tasksource/bigbench", task).shuffle(seed=seed)
    dataset = dataset.rename_columns({'inputs':'source','targets':'target'})
else:
    dataset = load_dataset("JsSparkYyx/NLP524", task).shuffle(seed=seed)

In [3]:
train_ds = k_split(num_clients,num_error_clients,dataset['train'])
if data_name == 'glue':
    test_ds = k_split(num_clients,num_error_clients,dataset['test'])
    valid_ds = k_split(num_clients,num_error_clients,dataset['valid'])
    dataset = DatasetDict({'train':train_ds[number],'test':test_ds[number],'valid':valid_ds[number]})
else:
    test_ds = None
    valid_ds = k_split(num_clients,num_error_clients,dataset['validation'])
    dataset = DatasetDict({'train':train_ds[number],'valid':valid_ds[number]})

In [4]:
# a = dataset['train']['target']
# b = dataset['train']['source']
# c = Dataset.from_dict({'source':b,'target':a})
# print(c['target'])
# random.shuffle(a)
# from datasets import Dataset
# c = Dataset.from_dict({'source':b,'target':a})
# print(c['target'])
print(dataset['train']['target'])
print(dataset['train']['source'])

[['14'], ['51'], ['800'], ['9'], ['14'], ['132'], ['75'], ['7'], ['$ 110'], ['90'], ['2'], ['120'], ['4'], ['14'], ['12'], ['4'], ['450'], ['140'], ['8'], ['398'], ['120000'], ['40 %'], ['265'], ["' 24'"], ['80'], ['480'], ['63'], ['10 .'], ['50 seconds'], ['4'], ['8'], ['3 : 2'], ['14.5 minutes'], ['3 : 5'], ['$ 9.00'], ['1250'], ['520'], ['5 kmph'], ['$ 960'], ['56'], ['50 m'], ['57'], ['$ 2570'], ['470'], ['69 kmph'], ['184'], ['60'], ['2300'], ['251'], ['30'], ['2'], ['32'], ['30 kmph'], ['9'], ['42'], ['$ 3.85'], ['5 / 32'], ['3'], ['3400'], ['1400'], ['50 %'], ['90'], ['7'], ['9 hours'], ['20'], ['4 hrs'], ['70'], ['92'], ['26'], ['36'], ['16'], ['3'], ['25'], ['80 kg'], ['700'], ['10 %'], ['855'], ['50'], ['1.4'], ['191'], ['484'], ["' 36']"], ['12 metres'], ['370'], ['$ 252'], ['1'], ['33'], ['140 m'], ['16'], ['24'], ['5'], ['90'], ['240'], ['72.5 kg'], ['8'], ['15.5 minutes'], ['18'], ['2'], ['400'], ['35 days'], ['20'], ['20'], ['37'], ['675'], ['19'], ['$ 4'], ['5'], ['800'

In [15]:
def tokenize_function(examples):
    # max_length=None => use the model max length (it's actually the default)
    model_inputs = tokenizer(examples['source'], truncation=True, max_length=None,padding=True,return_tensors='pt')
    if data_name == 'glue':
        model_inputs['labels'] = tokenizer(examples['target'], truncation=True, max_length=None,padding=True,return_tensors='pt')["input_ids"]
    else:
        model_inputs['labels'] = tokenizer([_[0] for _ in examples['target']], truncation=True, max_length=None,padding=True,return_tensors='pt')["input_ids"]
    return model_inputs
tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=dataset['train'].column_names)

In [22]:
model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path, return_dict=True)
model_name = model_name_or_path.split("/")[-1]
# lora_model = PeftModel.from_pretrained(model,f'JsSparkYyx/flan-t5-base-finetuned-lora-{task}-{number}')
lora_model = PeftModel.from_pretrained(model,f'JsSparkYyx/flan-t5-base-finetuned-lora-{task}-1')
base_model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path, return_dict=True)

Downloading adapter_config.json:   0%|          | 0.00/497 [00:00<?, ?B/s]

Downloading (…)er_model.safetensors:   0%|          | 0.00/14.2M [00:00<?, ?B/s]

In [23]:
def accuracy_score(outputs, ground_truths):
    correct = 0
    total = 0
    for output, truth in zip(outputs, ground_truths):
        if data_name == "bigbench":
            truth = truth[0]
        if output.strip().lower().replace(".", "") == truth.strip().lower().replace(".", ""):
            correct += 1
        total += 1
    return correct / total * 100

In [24]:
from tqdm import trange
example_predictions = []
example_predictions_lora = []
# load model

# use gpu if available
eval_set = "test" if data_name == 'glue' else "valid"
batch_size = 64
device = "cuda" if torch.cuda.is_available() else "cpu"
lora_model.to(device)
base_model.to(device)
with torch.no_grad():
    for i in trange(0, len(dataset[eval_set]["source"]), batch_size):
        inputs = tokenizer(
                dataset[eval_set]["source"][i : i + batch_size],
                max_length=2048,
                return_tensors="pt",
                padding=True,
            ).to(device)
        outputs = base_model.generate(
            input_ids=inputs["input_ids"], max_new_tokens=256
        )
        outputs = tokenizer.batch_decode(
            outputs.to("cpu"), skip_special_tokens=True
        )
        example_predictions.extend(outputs)
        outputs = lora_model.generate(
            input_ids=inputs["input_ids"], max_new_tokens=256
        )
        outputs = tokenizer.batch_decode(
            outputs.to("cpu"), skip_special_tokens=True
        )
        example_predictions_lora.extend(outputs)

task_perf = accuracy_score(example_predictions, dataset[eval_set]["target"])
task_perf_lora = accuracy_score(example_predictions_lora, dataset[eval_set]["target"])

100%|██████████| 12/12 [00:41<00:00,  3.46s/it]


In [25]:
print(task_perf)
print(task_perf_lora)

5.5045871559633035
31.061598951507207


In [21]:
print(task_perf)
print(task_perf_lora)

5.5045871559633035
29.882044560943644


In [9]:

device = "cuda" if torch.cuda.is_available() else "cpu"
lora_model.to(device)
base_model.to(device)
inputs = tokenizer(
        dataset['valid']['source'][:5],
        max_length=2048,
        return_tensors="pt",
        padding=True,
    ).to(device)
outputs = base_model.generate(
        input_ids=inputs["input_ids"], max_new_tokens=256
    )
outputs = tokenizer.batch_decode(
        outputs.to("cpu"), skip_special_tokens=True
    )
print(dataset['valid']['source'][:5])
print(outputs)
print(dataset['valid']['target'][:5])

['What is the result of the following arithmetic operations?:subtract(26, 2)\n choice:21\n choice:24\n choice:22\n choice:20\n choice:23\nA:', 'What is the answer to the following math word problem?:there are 1200 jelly beans divided between two jars , jar x and jar y . if there are 400 fewer jelly beans in jar x than 3 times the number of beans in jar y , how many beans are in jar x ?\n choice:700\n choice:850\n choice:750\n choice:650\n choice:800\nA:', 'What is the answer to the following math word problem, with the given hint?:6 x – 5 y + 3 z = 22 4 x + 8 y – 11 z = 7 5 x – 6 y + 2 z = 12 given the equations above , x + y + z = ?\nmultiply(5, 2)\n choice:12\n choice:15\n choice:13\n choice:10\n choice:14\nA:', 'What is the result of the following arithmetic operations?:subtract 15.5 from 15.8,  subtract 15.8 from 16.4,  and then divide both.\n choice:2 : 1\n choice:2 : 6\n choice:2 : 3\n choice:1 : 2\n choice:2 : 5\nA:', 'What is the result of the following arithmetic operations?:d

In [10]:
print(tokenized_datasets['train']['input_ids'][5])

[363, 19, 8, 741, 13, 8, 826, 3, 9, 30922, 51, 7578, 2673, 58, 10, 23829, 102, 120, 1902, 1714, 12, 11558, 14514, 741, 57, 3, 17225, 6, 1160, 10, 11434, 1160, 10, 16975, 1160, 10, 5426, 1160, 10, 15239, 1160, 10, 17225, 71, 10, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [11]:
outputs = model.generate(
        input_ids=dataset['train']['source'][:5], max_new_tokens=256
    )

AttributeError: 'list' object has no attribute 'shape'

In [None]:
labels = tokenized_datasets['train']['labels'][:5]
labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
tokenizer.batch_decode(labels, skip_special_tokens=True)

['yes', 'yes', 'no', 'no', 'no']