In [None]:
!pip install datasets ipywidgets
!pip install git+https://github.com/huggingface/transformers.git
!pip install accelerate

import ipywidgets as widgets
import pandas as pd
import json
import torch

from datasets import load_dataset

In [None]:
# Upload the mergedJson.jsonl zip file, to preserve contents
!unzip /content/mergedJson.jsonl.zip
!sha1sum /content/mergedJson.jsonl
# Should be 59df712cdc761401e4a44b3c17aaa48e18c0a214

In [68]:
with open("/content/mergedJson.jsonl", "r") as jsonl_file:
    lines = jsonl_file.readlines()

data_list = []
for line in lines:
    data = json.loads(line)
    data_list.append(data)

dataframe = pd.DataFrame(data_list, columns=["GENERATED_DESCRIPTION", "GENERATED_DATA"])

dataframe

Unnamed: 0,GENERATED_DESCRIPTION,GENERATED_DATA
0,"{'large_cube_location': '<Vector (0.0000, 0.00...",{'COLLADA': {'@xmlns': 'http://www.collada.org...
1,"{'large_cube_location': '<Vector (0.0000, 0.00...",{'COLLADA': {'@xmlns': 'http://www.collada.org...
2,"{'large_cube_location': '<Vector (0.0000, 0.00...",{'COLLADA': {'@xmlns': 'http://www.collada.org...
3,"{'large_cube_location': '<Vector (0.0000, 0.00...",{'COLLADA': {'@xmlns': 'http://www.collada.org...
4,"{'large_cube_location': '<Vector (0.0000, 0.00...",{'COLLADA': {'@xmlns': 'http://www.collada.org...
...,...,...
995,"{'large_cube_location': '<Vector (0.0000, 0.00...",{'COLLADA': {'@xmlns': 'http://www.collada.org...
996,"{'large_cube_location': '<Vector (0.0000, 0.00...",{'COLLADA': {'@xmlns': 'http://www.collada.org...
997,"{'large_cube_location': '<Vector (0.0000, 0.00...",{'COLLADA': {'@xmlns': 'http://www.collada.org...
998,"{'large_cube_location': '<Vector (0.0000, 0.00...",{'COLLADA': {'@xmlns': 'http://www.collada.org...


In [None]:
dataframe[:2]

In [92]:
from datasets import Dataset
from datasets.dataset_dict import DatasetDict
dataframe_train = Dataset.from_pandas(dataframe[:600])
dataframe_validate = Dataset.from_pandas(dataframe[600:800])
dataframe_test = Dataset.from_pandas(dataframe[800:])

dataset3 = DatasetDict({"train": dataframe_train, "validate": dataframe_validate, "test": dataframe_test})
print(dataset3)

DatasetDict({
    train: Dataset({
        features: ['GENERATED_DESCRIPTION', 'GENERATED_DATA'],
        num_rows: 600
    })
    validate: Dataset({
        features: ['GENERATED_DESCRIPTION', 'GENERATED_DATA'],
        num_rows: 200
    })
    test: Dataset({
        features: ['GENERATED_DESCRIPTION', 'GENERATED_DATA'],
        num_rows: 200
    })
})


In [93]:
from transformers import RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained("Salesforce/codet5-small")

max_source_length = 512
max_target_length = 512

# task_prefix = "json2dae: "


In [None]:
output_sequences = dataset3["train"]["GENERATED_DATA"]
output_sequences = [str(seq) for seq in output_sequences]
labels = tokenizer(
    output_sequences,
    max_length = max_target_length,
    padding = "max_length",
    truncation = True
)

In [88]:
type(labels["input_ids"])

list

In [95]:
def preprocess_data(inputData):
    input_sequences = inputData["GENERATED_DESCRIPTION"]
    input_sequences = [str(seq) for seq in input_sequences]

    output_sequences = inputData["GENERATED_DATA"]
    output_sequences = [str(seq) for seq in output_sequences]

    model_inputs = tokenizer(
        input_sequences,
        max_length = max_source_length,
        padding = "max_length",
        truncation = True
    )

    labels = tokenizer(
        output_sequences,
        max_length = max_target_length,
        padding = "max_length",
        truncation = True
    )

    # Replace padding tokens with -100 so they are
    # ignored by CrossEntropyLoss thing
    #labels_with_ignore_index = []
    #for sample in labels:
    #    sample = [label if label != 0 else -100 for label in sample]
    #    labels_with_ignore_index.append(sample)

    labels_with_ignore_index = []
    for label_sample in labels["input_ids"]:
        label_sample = [label if label != 0 else -100 for label in label_sample]
        labels_with_ignore_index.append(label_sample)

    model_inputs["labels"] = labels_with_ignore_index
    return model_inputs



mapped_dataset = dataset3.map(preprocess_data, batched=True, batch_size=2)
mapped_dataset

Map:   0%|          | 0/600 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['GENERATED_DESCRIPTION', 'GENERATED_DATA', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 600
    })
    validate: Dataset({
        features: ['GENERATED_DESCRIPTION', 'GENERATED_DATA', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 200
    })
    test: Dataset({
        features: ['GENERATED_DESCRIPTION', 'GENERATED_DATA', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 200
    })
})

In [None]:
mapped_dataset["test"]["labels"][0]

In [98]:
import pickle

with open('29Apr24_v2_m512_M512_S1000_mapped_dataset.pkl', 'wb') as f:
    pickle.dump(mapped_dataset, f)

!zip 29Apr24_v2_m512_M512_S1000_mapped_dataset.pkl.zip 29Apr24_v2_m512_M512_S1000_mapped_dataset.pkl

updating: 29Apr24_v2_m512_M512_S1000_mapped_dataset.pkl (deflated 91%)


In [99]:
from torch.utils.data import DataLoader
mapped_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
train_dataloader = DataLoader(mapped_dataset["train"], shuffle=True, batch_size=2)
valid_dataloader = DataLoader(mapped_dataset["validate"], batch_size=2)
test_dataloader = DataLoader(mapped_dataset["test"], batch_size=2)

In [None]:
batch = next(iter(train_dataloader))
batch.keys()

In [None]:
from transformers import RobertaTokenizer, T5ForConditionalGeneration

tokenizer = RobertaTokenizer.from_pretrained('Salesforce/codet5-small')
model = T5ForConditionalGeneration.from_pretrained('Salesforce/codet5-small')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)




In [101]:
# Total parameters and trainable parameters.
total_params = sum(p.numel() for p in model.parameters())
print(f"{total_params:,} total parameters.")
total_trainable_params = sum(
    p.numel() for p in model.parameters() if p.requires_grad)
print(f"{total_trainable_params:,} training parameters.")

60,492,288 total parameters.
60,492,288 training parameters.


In [102]:
import torch

from transformers import T5Tokenizer, T5ForConditionalGeneration, TrainingArguments, Trainer, DefaultDataCollator

In [103]:
data_collator = DefaultDataCollator(return_tensors="tf")

mapped_train = mapped_dataset["train"]
mapped_validate = mapped_dataset["validate"]

In [104]:
dataset_training = mapped_train.to_tf_dataset(
    columns=["input_ids", "attention_mask", "labels"],
    shuffle=True,
    batch_size=2,
    collate_fn=data_collator,
)
dataset_validation = mapped_validate.to_tf_dataset(
    columns=["input_ids", "attention_mask", "labels"],
    shuffle=False,
    batch_size=2,
    collate_fn=data_collator,
)

In [105]:
training_args = TrainingArguments(
    output_dir = "/",
    num_train_epochs = 3,
    per_device_train_batch_size = 2,
    per_device_eval_batch_size = 2,
    save_steps = 400,
    save_total_limit = 2,
    eval_steps = 100,
    eval_strategy = "steps",
    fp16 = True
)

trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = mapped_dataset["train"],
    eval_dataset = mapped_dataset["validate"]
)

In [106]:
trainer.train()

Step,Training Loss,Validation Loss
100,No log,0.139681
200,No log,0.066477
300,No log,0.045861
400,No log,0.044614
500,0.321900,0.039111
600,0.321900,0.038932
700,0.321900,0.03844
800,0.321900,0.038738
900,0.321900,0.036999


TrainOutput(global_step=900, training_loss=0.19647268507215712, metrics={'train_runtime': 151.09, 'train_samples_per_second': 11.913, 'train_steps_per_second': 5.957, 'total_flos': 243615242649600.0, 'train_loss': 0.19647268507215712, 'epoch': 3.0})

In [107]:
trainer.save_model("29Apr24_v1")
!zip -r 29Apr24_model_v1.zip 29Apr24_v1

In [114]:
# @title Testing

from transformers import T5ForConditionalGeneration, T5Tokenizer
model_path = "29Apr24_v1"
model = T5ForConditionalGeneration.from_pretrained(model_path)
tokenizer = RobertaTokenizer.from_pretrained('Salesforce/codet5-small')

def translator(model, tokenizer):
  inputs = """
  {
    "large_cube_location": "<Vector (0.0000, 0.0000, 10.5000)>",
    "large_cube_size": "<Vector (11.0000, 24.0000, 21.0000)>",
    "Object0_location": "<Vector (-0.5000, 5.0000, 0.5000)>",
    "Object0_size": "<Vector (1.0000, 1.0000, 1.0000)>",
    "Object1_location": "<Vector (-3.5000, -4.0000, 0.5000)>",
    "Object1_size": "<Vector (1.0000, 1.0000, 1.0000)>",
    "Object2_location": "<Vector (1.5000, -3.0000, 0.5000)>",
    "Object2_size": "<Vector (1.0000, 1.0000, 1.0000)>",
    "Object3_location": "<Vector (-0.5000, 8.0000, 0.5000)>",
    "Object3_size": "<Vector (1.0000, 1.0000, 1.0000)>",
    "Object4_location": "<Vector (4.5000, -4.0000, 0.5000)>",
    "Object4_size": "<Vector (1.0000, 1.0000, 1.0000)>",
    "Object5_location": "<Vector (2.5000, -3.0000, 0.5000)>",
    "Object5_size": "<Vector (1.0000, 1.0000, 1.0000)>",
    "Object6_location": "<Vector (-0.5000, -8.0000, 0.5000)>",
    "Object6_size": "<Vector (1.0000, 1.0000, 1.0000)>",
    "Object7_location": "<Vector (2.5000, -6.0000, 0.5000)>",
    "Object7_size": "<Vector (1.0000, 1.0000, 1.0000)>",
    "Object8_location": "<Vector (2.5000, 3.0000, 0.5000)>",
    "Object8_size": "<Vector (1.0000, 1.0000, 1.0000)>",
    "Object9_location": "<Vector (-3.5000, 3.0000, 0.5000)>",
    "Object9_size": "<Vector (1.0000, 1.0000, 1.0000)>",
    "Object10_location": "<Vector (1.5000, -11.0000, 0.5000)>",
    "Object10_size": "<Vector (1.0000, 1.0000, 1.0000)>",
    "Object11_location": "<Vector (-1.5000, 0.0000, 0.5000)>",
    "Object11_size": "<Vector (1.0000, 1.0000, 1.0000)>",
    "Object12_location": "<Vector (2.5000, 11.0000, 0.5000)>",
    "Object12_size": "<Vector (1.0000, 1.0000, 1.0000)>"
  }
  """
  input_tokens = tokenizer.encode(
      inputs,
      return_tensors = "pt",
      max_length = 512,
      truncation = True
  )

  corrected_ids = model.generate(
      input_tokens,
      max_length = 1024
  )

  corrected_text = tokenizer.decode(corrected_ids[0], skip_special_tokens = True)
  return corrected_text

tokenstuff = translator(model, tokenizer)
print(tokenstuff)

{'COLLADA': {'@version': "1.4.1", '@xmlns': "http://www.collada.org/2005/11/COLLADASchema", '@xmlns:xsi': "http://www.w3.org/2001/XMLSchema-instance", "asset": {'contributor': {'author': "Blender User", "authoring_tool": "Blender 4.0.2 commit date:2023-12-05, commit time:07:41, hash:9be62e85b727"}, "created": "2024-03-25T12:20:29', "modified": "2024-03-25T12:20:29", "unit": {'@meter": "1", '@name': "meter"}, "up_axis": "Z_UP"}, "library_geometries": {"geometry": [{"@id": "Cube_5649-mesh", '@name": "Cube.5649", "mesh": {'source': [{"@id": "Cube_5649-mesh-positions", "float_array": {'#text': "-0.5 -0.5 -0.5 -0.5 0.5 -0.5 0.5 -0.5 -0.5 -0.5 0.5 0.5 -0.5 -0.5 0.5 -0.5 0.5 0.5 0.5 0.5 -0.5 0.5 0.5 0.5", '@count': "24", '@id': "Cube_5649-mesh-positions-array"}, "technique_common": {'accessor': {'@count': "8", '@source': '#Cube_5649-mesh-positions-array", '@stride': "3", "param": [{"@name": "X", '@type": "float"}, {'@name": "Y", '@type": "float"}, {'@name": "Z", '@type": "float"}]}}}, {'@id":