In [1]:
import h5py
import os
import boto3
import shutil
import fire
import time
import torch
import transformers
import hydra
import numpy as np
import pandas as pd
from hydra.utils import get_original_cwd
from omegaconf import DictConfig
from datasets import load_dataset
import datasets
from torch.utils.data import DataLoader, Dataset, SequentialSampler, RandomSampler

In [2]:
device = 'cuda'
local_path = os.path.normpath(os.getcwd() + os.sep + os.pardir)

# GPT2 and OPT tokenizer test

In [11]:
# GPT2

In [12]:
#tokenizer GPT2
from transformers import GPT2Tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2',do_lower_case=False, truncation_side='left')

special_tokens = {
    "additional_special_tokens": [
        "<TITLE_START>",
        "<TITLE_END>",
        "<INSTR_START>",
        "<NEXT_INSTR>",
        "<INSTR_END>",
        "<INGR_START>",
        "<NEXT_INGR>",
        "<INGR_END>",
        "<RECIPE_START>",
        "<RECIPE_END>",
        "<INPUT_START>",
        "<INPUT_END>",
        "<NEXT_INPUT>"
    ]
}

tokenizer.add_special_tokens(special_tokens)
# tokenizer.pad_token_id = (
#     0  # unk. we want this to be different from the eos token
# )
# tokenizer.padding_side = "right"  # Left: Allows batched inference, we put right for this task.


13

In [13]:
path_gpt2 = local_path + "/data/unsupervised.h5"

In [None]:
with h5py.File(path_gpt2, 'r') as f:
    data_np = f['train'][:]
    train_dataset = torch.tensor(f['train'][:]).to(device)

In [None]:
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=2)

In [None]:
line = train_dataloader.dataset[0]

In [31]:
tokenizer

GPT2Tokenizer(name_or_path='gpt2', vocab_size=50257, model_max_length=1024, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'pad_token': '!', 'additional_special_tokens': ['<TITLE_START>', '<TITLE_END>', '<INSTR_START>', '<NEXT_INSTR>', '<INSTR_END>', '<INGR_START>', '<NEXT_INGR>', '<INGR_END>', '<RECIPE_START>', '<RECIPE_END>', '<INPUT_START>', '<INPUT_END>', '<NEXT_INPUT>']}, clean_up_tokenization_spaces=True)

In [32]:
tokenizer.decode(line, clean_up_tokenization_spaces=True)

'<RECIPE_START> <INPUT_START> balsamic vinegar <NEXT_INPUT> sugar <NEXT_INPUT> water <NEXT_INPUT> watermelon <NEXT_INPUT> mint <INPUT_END> <INGR_START> 1/2 cup good balsamic vinegar <NEXT_INGR> 1/4 cup sugar <NEXT_INGR> 1/4 cup water <NEXT_INGR> 1/2 large watermelon <NEXT_INGR> 6 fresh mint leaves, julienned <INGR_END> <INSTR_START> Combine balsamic, sugar and water in saucepan. <NEXT_INSTR> Heat over medium-high heat and reduce slightly until syrup consistency, about 12 minutes. <NEXT_INSTR> Cool to room temperature. <NEXT_INSTR> Cut up watermelon in wedges. <NEXT_INSTR> Drizzle cooled balsamic syrup over watermelon. <NEXT_INSTR> Garnish with mint. <INSTR_END> <TITLE_START> Watermelon with Sweet Balsamic Syrup and Fresh Mint <TITLE_END> <RECIPE_END> <RECIPE_START> <INPUT_START> brown sugar <NEXT_INPUT> margarine <NEXT_INPUT> butter <NEXT_INPUT> crackers <NEXT_INPUT> chocolate chips <INPUT_END> <INGR_START> 3/4 c. brown sugar <NEXT_INGR> 1 stick margarine <NEXT_INGR> 1 stick butter <NE

In [4]:
#tokenizer OPT
model_path = local_path + "/checkpoints/opt/checkpoint-opt-final"
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False, do_lower_case=False, truncation_side='left')
max_token_len = tokenizer.max_model_input_sizes["gpt2"]

special_tokens = {
    "additional_special_tokens": [
        "<TITLE_START>",
        "<TITLE_END>",
        "<INSTR_START>",
        "<NEXT_INSTR>",
        "<INSTR_END>",
        "<INGR_START>",
        "<NEXT_INGR>",
        "<INGR_END>",
        "<RECIPE_START>",
        "<RECIPE_END>",
        "<INPUT_START>",
        "<INPUT_END>",
        "<NEXT_INPUT>"
    ]
}

tokenizer.add_special_tokens(special_tokens)

0

In [6]:
path_opt = local_path + "/data/unsupervised_opt.h5"

In [7]:
with h5py.File(path_opt, 'r') as f:
    data_np = f['train'][:]
    train_dataset = torch.tensor(f['train'][:]).to(device)

In [8]:
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=2)

In [9]:
line = train_dataloader.dataset[0]

In [10]:
tokenizer.decode(line, clean_up_tokenization_spaces=True)

'<RECIPE_START> <INPUT_START> sweet cherries <NEXT_INPUT> strawberries <NEXT_INPUT> white sugar <NEXT_INPUT> lemon juice <INPUT_END> <INGR_START> 1 1/2 cups sweet cherries <NEXT_INGR> 1 1/2 cups strawberries <NEXT_INGR> 1 cup white sugar <NEXT_INGR> 1 tablespoon lemon juice <INGR_END> <INSTR_START> Wash fruit. <NEXT_INSTR> Remove stems fro strawberries and cut up into large chunks. <NEXT_INSTR> Remove pits from cherries and cut cherries into halves. <NEXT_INSTR> Crush fruit- now you should have about 2 cups of fruit total ( once crushed.). <NEXT_INSTR> Place everything into the breadmaker - JAM CYCLE. <NEXT_INSTR> When cycle is finished, pour immediately into glass container (covered). and place into refridgerator. Note that the jam comes out runny but eith thicken up as it cools. <NEXT_INSTR> Store in refridgerator. <INSTR_END> <TITLE_START> Cherry Strawberry Preserves For The Bread Machine (Abm) <TITLE_END> <RECIPE_END> \n <RECIPE_START> <INPUT_START> sweetbreads <NEXT_INPUT> milk <N

# Test Llama tokenizer and h5 dataset to use huggingface Trainer

In [3]:
#tokenizer Llama
from transformers import LlamaForCausalLM, LlamaTokenizer
tokenizer = LlamaTokenizer.from_pretrained(
    'decapoda-research/llama-7b-hf',
    do_lower_case=False,
    truncation_side='left'
)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'LLaMATokenizer'. 
The class this function is called from is 'LlamaTokenizer'.


In [4]:
special_tokens = {
    "additional_special_tokens": [
        "<TITLE_START>",
        "<TITLE_END>",
        "<INSTR_START>",
        "<NEXT_INSTR>",
        "<INSTR_END>",
        "<INGR_START>",
        "<NEXT_INGR>",
        "<INGR_END>",
        "<RECIPE_START>",
        "<RECIPE_END>",
        "<INPUT_START>",
        "<INPUT_END>",
        "<NEXT_INPUT>"
    ]
}

tokenizer.add_special_tokens(special_tokens)
tokenizer.pad_token_id = (
    0  # unk. we want this to be different from the eos token
)
tokenizer.padding_side = "right"  # Left: Allows batched inference, we put right for this task.

In [5]:
local_path = os.path.normpath(os.getcwd() + os.sep + os.pardir)
path = local_path + "/data/unsupervised_llama.h5"

In [6]:
with h5py.File(path, 'r') as f:
    print(f.keys())
    raw_data = f['train']
    train_dataset = torch.tensor(f['train'][:])

<KeysViewHDF5 ['test', 'train']>


In [7]:
train_dataset.shape

torch.Size([319961, 2048])

In [16]:
raw_data

<Closed HDF5 dataset>

In [25]:
datta = datasets.Dataset({'train':train_dataset})

TypeError: Expected a pyarrow.Table or a datasets.table.Table object, but got {'train': tensor([[32008, 32010,   289,  ...,     0,     0,     0],
        [32008, 32010,  1205,  ...,     0,     0,     0],
        [32008, 32010,   330,  ...,     0,     0,     0],
        ...,
        [32008, 32010, 18423,  ...,     0,     0,     0],
        [32008, 32010,   796,  ...,     0,     0,     0],
        [32008, 32010,   367,  ...,     0,     0,     0]])}.

In [20]:
dset = datasets.DatasetDict({'train':train_dataset}, features=['input', 'label'])

In [21]:
dset.features

AttributeError: 'DatasetDict' object has no attribute 'features'

In [8]:
dset_dict = {
    'train': [train_dataset]
}

In [None]:
px = pd.DataFrame(train_dataset)

In [None]:
px

In [9]:
dset = load_dataset(raw_data)

TypeError: expected str, bytes or os.PathLike object, not Dataset

In [8]:
train_sampler = RandomSampler(train_dataset)

In [9]:
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=2)

In [10]:
train_dataloader.dataset[0]

tensor([32008, 32010,   289,  ...,     0,     0,     0])

In [11]:
line = train_dataloader.dataset[0]

In [19]:
tokenizer.decode(line, clean_up_tokenization_spaces=True)

'<RECIPE_START> <INPUT_START> balsamic vinegar <NEXT_INPUT> sugar <NEXT_INPUT> water <NEXT_INPUT> watermelon <NEXT_INPUT> mint <INPUT_END> <INGR_START> 1/2 cup good balsamic vinegar <NEXT_INGR> 1/4 cup sugar <NEXT_INGR> 1/4 cup water <NEXT_INGR> 1/2 large watermelon <NEXT_INGR> 6 fresh mint leaves, julienned <INGR_END> <INSTR_START> Combine balsamic, sugar and water in saucepan. <NEXT_INSTR> Heat over medium-high heat and reduce slightly until syrup consistency, about 12 minutes. <NEXT_INSTR> Cool to room temperature. <NEXT_INSTR> Cut up watermelon in wedges. <NEXT_INSTR> Drizzle cooled balsamic syrup over watermelon. <NEXT_INSTR> Garnish with mint. <INSTR_END> <TITLE_START> Watermelon with Sweet Balsamic Syrup and Fresh Mint <TITLE_END> <RECIPE_END> <RECIPE_START> <INPUT_START> brown sugar <NEXT_INPUT> margarine <NEXT_INPUT> butter <NEXT_INPUT> crackers <NEXT_INPUT> chocolate chips <INPUT_END> <INGR_START> 3/4 c. brown sugar <NEXT_INGR> 1 stick margarine <NEXT_INGR> 1 stick butter <NE

In [16]:
data_np.shape

(319961, 2048)

In [41]:
dataset = datasets.Dataset.from_pandas(train_dataset)

ValueError: Dataset(s) incompatible with Pandas data types, not table, or no datasets found in HDF5 file.

In [None]:
data_files = {"train": byte}

In [1]:
train_dataloader

NameError: name 'train_dataloader' is not defined

In [25]:
import datasets
from datasets import Dataset
import pandas as pd
df_train = pd.read_hdf(path)
sentences = datasets.DatasetDict(
    {
        "train": Dataset.from_pandas(df_train)
    }
)

ValueError: Dataset(s) incompatible with Pandas data types, not table, or no datasets found in HDF5 file.