In [2]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from src.tokenizer import GPT2NumeralTokenizer
import datasets
from datasets import load_dataset
from src.data import build_tokenize_function
from torch.utils.data import DataLoader
from transformers import (
    CONFIG_MAPPING,
    MODEL_MAPPING,
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    SchedulerType,
    default_data_collator,
    get_scheduler,
    DataCollatorWithPadding,
)

In [3]:
# Checkpoint directory passed to `AutoModelForCausalLM.from_pretrained` below.
# The path is relative, so it resolves against the notebook's working directory.
model_path = "outputs/star_graph/dpo_gpt2_12x6x384/checkpoint-1500"

In [4]:
# Load the causal LM weights from the checkpoint directory.
# Default dtype and device placement are used; the model is moved to a GPU later.
model = AutoModelForCausalLM.from_pretrained(model_path)

In [5]:
# Project-specific numeral tokenizer. The first argument (50) presumably sizes the
# numeral vocabulary -- confirm against GPT2NumeralTokenizer's signature.
# Prompts are padded on the left so the prompt's last token stays at the end of each row.
tokenizer = GPT2NumeralTokenizer(50, padding_side='left')

# Reuse the EOS id for padding.
tokenizer.pad_token_id = tokenizer.eos_token_id



In [12]:
# Fetch the dataset (served from the local cache when already downloaded).
raw_datasets = load_dataset("nnheui/star-d2_p5_n50")
def tokenize_function(examples, text_column_name="text"):
    """Split each ``prompt=answer`` line into tokenized prompt and target ids.

    Args:
        examples: Batch mapping (as passed by ``Dataset.map(batched=True)``)
            whose ``text_column_name`` entry is a list of strings, each
            containing exactly one ``=``.
        text_column_name: Key of the text column inside ``examples``.

    Returns:
        dict with three parallel lists:
            ``input_ids``      -- encoded prompt, including the trailing ``=``;
            ``attention_mask`` -- a list of ones as long as the encoded prompt;
            ``targets``        -- encoded text that follows the ``=``.

    Raises:
        ValueError: if a line does not contain exactly one ``=``.
    """
    prompt_ids, prompt_masks, target_ids = [], [], []
    for raw_line in examples[text_column_name]:
        # Unpacking into two names enforces "exactly one '=' per line".
        question, answer = raw_line.strip().split('=')
        encoded_question = tokenizer.encode(question + "=")
        encoded_answer = tokenizer.encode(answer)

        prompt_ids.append(encoded_question)
        target_ids.append(encoded_answer)
        prompt_masks.append([1] * len(encoded_question))

    return dict(
        input_ids=prompt_ids,
        attention_mask=prompt_masks,
        targets=target_ids,
    )

# Drop every original column so only the tokenizer outputs
# (input_ids / attention_mask / targets) remain in the mapped splits.
column_names = raw_datasets["train"].column_names

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True, remove_columns=column_names)

Map: 100%|██████████| 200000/200000 [00:32<00:00, 6109.18 examples/s]
Map: 100%|██████████| 20000/20000 [00:03<00:00, 6112.80 examples/s]


In [13]:
# Sanity check: decode one tokenized prompt back to text and show its raw target ids.
sample = tokenized_datasets["train"][1]
print(tokenizer.decode(sample["input_ids"]))
print(sample["targets"])

47,46|31,29|29,23|43,5|23,47|41,43|44,41|31,44/31,46=
[31, 29, 23, 47, 46]


In [14]:
# One padding collator is shared by both loaders. Shuffling is off so that row i of the
# generation output lines up with row i of the underlying dataset.
collator = DataCollatorWithPadding(tokenizer)

# Only the first 10000 training examples are evaluated.
train_dataloader = DataLoader(
    tokenized_datasets["train"].select(list(range(10000))),
    batch_size=256,
    shuffle=False,
    collate_fn=collator,
)
val_dataloader = DataLoader(
    tokenized_datasets["validation"],
    batch_size=256,
    shuffle=False,
    collate_fn=collator,
)

In [15]:
# Peek at the first collated batch to check the tensor layout.
first_batch = next(iter(train_dataloader))
first_batch

{'input_ids': tensor([[49, 13, 50,  ..., 46, 22, 51],
        [47, 46, 50,  ..., 31, 46, 51],
        [ 7,  6, 50,  ...,  7,  1, 51],
        ...,
        [15, 20, 50,  ..., 15, 32, 51],
        [ 4, 25, 50,  ..., 32, 36, 51],
        [ 1, 46, 50,  ..., 18, 46, 51]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]]), 'targets': tensor([[46, 35, 32, 42, 22],
        [31, 29, 23, 47, 46],
        [ 7,  6, 31, 14,  1],
        ...,
        [15, 20, 47, 12, 32],
        [32, 23, 22, 45, 36],
        [18, 32, 16,  1, 46]])}

In [16]:
def generate(eval_dataloader, model, top_k=None):
    """Greedy-decode every batch and collect per-token statistics.

    Args:
        eval_dataloader: Iterable of batches. Each batch is a mapping holding
            ``"input_ids"``, ``"attention_mask"`` and ``"targets"`` tensors; the
            width of ``"targets"`` sets how many new tokens are decoded.
        model: Causal LM exposing ``eval()``, ``parameters()`` and a Hugging Face
            style ``generate()``. Inputs are moved to the device of the model's
            first parameter, so the model may live on any device (or the CPU).
        top_k: How many highest-probability candidates to record for the first
            generated position. ``None`` keeps the previous fixed value of 3.

    Returns:
        dict of CPU tensors, concatenated over all batches along dim 0:
            ``generated``   -- generated token ids, one column per target position;
            ``labels``      -- the batches' ``"targets"``;
            ``token_probs`` -- softmax probability of each generated token;
            ``topk_idx`` / ``topk_value`` -- top-k token ids and probabilities
            at the first generated position.
        If generation stops before ``targets.shape[1]`` tokens were produced,
        ``generated`` is right-padded with the pad id and ``token_probs`` with 0.

    Note:
        Reads the module-level ``tokenizer`` for the pad/EOS token id.
    """
    model.eval()
    # Follow the model rather than hard-coding a device index.
    device = next(model.parameters()).device
    pad_token_id = tokenizer.eos_token_id
    num_candidates = 3 if top_k is None else top_k

    all_generated = []
    all_labels = []
    all_token_probs = []
    all_topk_idx = []
    all_topk_value = []
    for batch in eval_dataloader:
        labels = batch['targets']
        num_target_tokens = labels.shape[1]
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        with torch.no_grad():
            outputs = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_new_tokens=num_target_tokens,
                do_sample=False,
                output_logits=True,
                return_dict_in_generate=True,
                pad_token_id=pad_token_id,
            )
            # Slice by prompt length (not by the expected count) so that prompt tokens
            # never leak into the result when generation ends early.
            generated_ids = outputs.sequences[:, input_ids.shape[1]:]
            # outputs.logits is a tuple with one (batch, vocab) tensor per generated step.
            logits = torch.stack(outputs.logits, dim=1)
            probs = torch.softmax(logits, dim=-1)
            token_probs = probs.gather(-1, generated_ids.unsqueeze(-1)).squeeze(-1)
            # Candidate distribution at the first generated position only.
            topk = probs[:, 0].topk(num_candidates)

        # Keep a fixed width so batches can be concatenated and compared with the labels.
        missing = num_target_tokens - generated_ids.shape[1]
        if missing > 0:
            generated_ids = torch.nn.functional.pad(generated_ids, (0, missing), value=pad_token_id)
            token_probs = torch.nn.functional.pad(token_probs, (0, missing), value=0.0)

        # Move everything to the CPU: results stay on one device and do not pile up in GPU memory.
        all_generated.append(generated_ids.cpu())
        all_labels.append(labels.cpu())
        all_token_probs.append(token_probs.cpu())
        all_topk_idx.append(topk.indices.cpu())
        all_topk_value.append(topk.values.cpu())
    return {
        "generated": torch.cat(all_generated, dim=0),
        "labels": torch.cat(all_labels, dim=0),
        "token_probs": torch.cat(all_token_probs, dim=0),
        "topk_idx": torch.cat(all_topk_idx, dim=0),
        "topk_value": torch.cat(all_topk_value, dim=0),
    }

In [17]:
# Split to evaluate; assign val_dataloader instead to score the validation split.
eval_loader = train_dataloader

# Place the model on the evaluation GPU, then greedy-decode the chosen split.
model = model.to("cuda:2")
outputs = generate(eval_loader, model)

In [18]:
processed_all_generated = outputs["generated"]
processed_all_labels = outputs["labels"]

# Positions whose label equals the EOS id are excluded from scoring.
mask = processed_all_labels != tokenizer.eos_token_id
correct = (processed_all_generated == processed_all_labels) & mask

# Exact-match rate: a row counts only if every scored position is right.
print((correct | ~mask).all(dim=1).float().mean())
# Per-position accuracy; the epsilon avoids dividing by zero for a fully masked column.
print(correct.sum(dim=0) / (mask.sum(dim=0) + 1e-9))

tensor(0.3577)
tensor([1.0000, 0.4587, 0.4205, 0.3994, 1.0000])


In [19]:
# Inspect one example: prompt text, generated ids vs. labels, then the per-token
# probabilities and the top candidates recorded for the first generated position.
idx = 4
print(tokenizer.decode(tokenized_datasets["train"][idx]["input_ids"]))
print(processed_all_generated[idx], processed_all_labels[idx])

for key in ("token_probs", "topk_idx", "topk_value"):
    print(outputs[key][idx])

38,4|26,29|32,45|45,16|2,32|23,26|29,38|23,2/23,16=
tensor([23, 26, 29, 38, 16]) tensor([23,  2, 32, 45, 16])
tensor([0.9399, 0.4061, 0.6869, 0.6768, 0.9393], device='cuda:2')
tensor([23, 37, 26], device='cuda:2')
tensor([9.3989e-01, 6.3950e-04, 6.1606e-04], device='cuda:2')
