In [1]:
# %%capture
# !pip install transformers torch datasets
# !git clone https://github.com/BerkeleyLearnVerify/Scenic.git

In [2]:
from glob import glob
from typing import Dict, List
import datasets
from torch.utils.data import Dataset
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
base_path = "./Scenic"

scenic_files = glob(base_path + "/**/*.scenic", recursive=True)
source_files = glob(base_path + "/**/*.py", recursive=True)
doc_files = glob(base_path + "/**/*.rst", recursive=True)

model_name = "mistralai/Mistral-7B-Instruct-v0.1"

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained(model_name, device_map = 'auto')
model = AutoModelForCausalLM.from_pretrained(model_name, device_map = 'auto', offload_folder="offload")

Loading checkpoint shards: 100%|██████████| 2/2 [00:32<00:00, 16.05s/it]


In [4]:
def generate_description_from_scenic(scenic : str, verbose : bool = False) -> str:
    """Utility function to generate a description from a scenic file"""
    input_ids = tokenizer.encode(scenic, return_tensors="pt")
    output = model.generate(input_ids, max_new_tokens=2000, do_sample=False, top_k=0, top_p=0.9) # do_sample=False <==> temperature=0.0
    generated = tokenizer.decode(output, skip_special_tokens=True)
    if verbose:
        print(generated)
    return generated


def get_files(files : List[str], debug : bool = False) -> List[Dict[str, str]]:
    """Utility function to read a list of files and return
    a list of dictionaries with the file path and content"""

    out_files = []

    for path in files:
        if debug:
            print(path)

        with open(path, 'rb') as f:
            content = f.read()
            content = content.decode('utf-8')
            path = path.split("/Scenic/")[1]
            file_dict = {}
            file_dict['path'] = path
            file_dict['description'] = generate_description_from_scenic(content)
            file_dict['scenic'] = content
            # let's pretty print the content so that new lines are actually printed
            # and not just \n
            print("--------------------")
            print(content)
            print("--------------------")

            out_files.append(file_dict)

    return out_files

In [5]:
def build_dataset(
        scenic_examples : list,
        source_examples : list | None = None,
        doc_examples : list | None = None,
        train_split : float = 0.8,
        verbose : bool = False,
    ) -> (datasets.Dataset, datasets.Dataset):
    """Builds a dataset from the three dictionaries of files"""

    scenic_dataset = {}
    source_dataset = {}
    doc_dataset = {}
    train_dataset = []
    test_dataset = []

    if verbose:
        print("total number of scenic files: ", len(scenic_examples))

    scenic_split_index = int(len(scenic_examples) * train_split)
    scenic_dataset['train'] = scenic_examples[:scenic_split_index]
    scenic_dataset['test'] = scenic_examples[scenic_split_index:]
    train_dataset.extend(scenic_dataset['train'])
    test_dataset.extend(scenic_dataset['test'])

    # until we know how to bias the logits to do prediction/learning
    # on the source and doc files, we will not include them in the dataset
    if source_examples is not None:
        source_split_index = int(len(source_examples) * train_split)
        source_dataset['train'] = source_examples[:source_split_index]
        source_dataset['test'] = source_examples[source_split_index:]
        train_dataset.extend(source_dataset['train'])
        test_dataset.extend(source_dataset['test'])

    if doc_examples is not None:
        doc_split_index = int(len(doc_examples) * train_split)
        doc_dataset['train'] = doc_examples[:doc_split_index]
        doc_dataset['test'] = doc_examples[doc_split_index:]
        train_dataset.extend(doc_dataset['train'])
        test_dataset.extend(doc_dataset['test'])

    if verbose:
        print("total number of training examples: ", len(train_dataset))
        print("total number of testing examples: ", len(test_dataset))

    return train_dataset, test_dataset

In [6]:
def generate_dataset_from_scratch(
        tokenizer : PreTrainedTokenizerBase,
        include_source : bool = False,
        include_docs : bool = False,
        verbose : bool = False
    ) -> (datasets.Dataset, datasets.Dataset):
    scenic_examples = get_files(scenic_files, debug=verbose)
    source_examples = get_files(source_files, debug=verbose) if include_source else None
    doc_examples = get_files(doc_files, debug=verbose) if include_docs else None
    train, test = build_dataset(scenic_examples, source_examples, doc_examples)

    train_dataset = ScenicFineTuneDataset(train, tokenizer)
    test_dataset = ScenicFineTuneDataset(test, tokenizer)

    return train_dataset, test_dataset


class ScenicFineTuneDataset(Dataset):
    PROMPT = "<s>[INST] Generate a scene from the following description:" + \
        "\n\n Description: {description}\n\n" + \
        "Scenic: [\INST]{scenic}"

    def __init__(
            self, 
            data: List, 
            tokenizer : PreTrainedTokenizerBase,
            verbose : bool = False,
        ):
        self.data = data
        self.tokenizer = tokenizer

        tokenizer.padding_side = "right"

        self.input_ids = []
        self.attention_masks = []
        self.labels = []

        training_texts = []
        for example in self.data:
            training_text = ScenicFineTuneDataset.PROMPT.format(description=example['description'], scenic=example['scenic']) + "<|endoftext|>" # include the end token so model knows when to stop!
            training_texts.append(training_text)

        encodings_dict = tokenizer(training_texts, truncation=True, padding=True)
        for i, (example, training_text) in enumerate(zip(self.data, training_texts)):
            if verbose and i % 10 == 0:
                print("training text =", training_text)
            self.input_ids.append(torch.tensor(encodings_dict['input_ids'][i]))
            self.attention_masks.append(torch.tensor(encodings_dict['attention_mask'][i]))

            # here is the magic where we label the parts to ignore (like the prompt) with -100 so it's ignored as loss
            prompt_len = len(self.tokenizer.encode(ScenicFineTuneDataset.PROMPT.format(description=example['description'], scenic='').strip()))
            whole_input_len = len(self.tokenizer.encode(training_text))
            max_len_incl_padding = len(encodings_dict['input_ids'][i])
            label = [-100] * prompt_len + encodings_dict['input_ids'][i][prompt_len:whole_input_len] + [-100] * (max_len_incl_padding - whole_input_len)
            self.labels.append(torch.tensor(label))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_masks[idx],
            'labels': self.labels[idx]
        }

In [7]:
train_dataset, test_dataset = generate_dataset_from_scratch(AutoTokenizer.from_pretrained("gpt2"), verbose=True) # needs to be openai tokenizer
train_dataset.save_to_disk('train_dataset_ft')
test_dataset.save_to_disk('test_dataset_ft')

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


./Scenic/examples/carla/pedestrian.scenic




OutOfMemoryError: CUDA out of memory. Tried to allocate 500.00 MiB. GPU 0 has a total capacty of 23.64 GiB of which 498.25 MiB is free. Process 686357 has 5.49 GiB memory in use. Process 716078 has 3.21 GiB memory in use. Including non-PyTorch memory, this process has 14.43 GiB memory in use. Of the allocated memory 13.65 GiB is allocated by PyTorch, and 23.15 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF