In [3]:
import ipywidgets as widgets
import pandas as pd
import json
import torch

from datasets import load_dataset

with open("content/mergedJson.json", "r") as jsonl_file:
    lines = jsonl_file.readlines()

data_list = []
for line in lines:
    data = json.loads(line)
    data_list.append(data)

dataframe = pd.DataFrame(data_list, columns=["GENERATED_DESCRIPTION", "GENERATED_DATA"])

dataframe

JSONDecodeError: Expecting property name enclosed in double quotes: line 2 column 1 (char 2)

In [2]:
# From https://huggingface.co/docs/transformers/model_doc/t5#training
# and
# https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/T5/Fine_tune_CodeT5_for_generating_docstrings_from_Ruby_code.ipynb

In [3]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch

tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small")
model = T5ForConditionalGeneration.from_pretrained("google-t5/t5-small")

max_source_length = 3072
max_target_length = 5176

# task_prefix = "json2dae: "
input_sequences = dataframe["GENERATED_DESCRIPTION"]

input_sequences = input_sequences.astype(str)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
encoding = tokenizer(
    [task_prefix + sequence for sequence in input_sequences],
    padding = "longest",
    max_length = max_source_length,
    truncation = True,
    return_tensors = "pt",
)

input_ids, attention_mask = encoding.input_ids, encoding.attention_mask

input_ids, attention_mask

(tensor([[  3, 354, 739,  ...,   0,   0,   0],
         [  3, 354, 739,  ...,   0,   0,   0],
         [  3, 354, 739,  ...,   0,   0,   0],
         ...,
         [  3, 354, 739,  ...,   0,   0,   0],
         [  3, 354, 739,  ...,   0,   0,   0],
         [  3, 354, 739,  ...,   0,   0,   0]]),
 tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]))

In [7]:
output_sequences = dataframe["GENERATED_DATA"].astype(str).tolist()

target_encoding = tokenizer(
    output_sequences,
    padding="longest",
    max_length = max_target_length,
    truncation = True,
    return_tensors = "pt",
)

labels = target_encoding.input_ids
labels

tensor([[   3,    2,   31,  ...,    2,   31,    1],
        [   3,    2,   31,  ..., 2687,  107,    1],
        [   3,    2,   31,  ...,    2,   31,    1],
        ...,
        [   3,    2,   31,  ...,    2,   31,    1],
        [   3,    2,   31,  ...,    2,   31,    1],
        [   3,    2,   31,  ...,    2,   31,    1]])

In [None]:
# replace padding token id's of the labels by -100 so it's ignored by the loss
labels[labels == tokenizer.pad_token_id] = -100

# forward pass
loss = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels).loss
loss.item()