In [None]:
# Install the notebook's dependencies into the current environment.
# NOTE(review): no versions are pinned, and transformers is installed straight
# from its GitHub repository, so results may differ between runs.
!pip install datasets
!pip install git+https://github.com/huggingface/transformers.git
!pip install accelerate

import ipywidgets as widgets
import pandas as pd
import json
import torch

from datasets import load_dataset

In [None]:
# GOOGLE COLAB
# Upload mergedJson.zip (the zipped mergedJson.json) instead of the raw JSON,
# so the file contents are preserved during upload, then extract it here.
# NOTE(review): the paths below are relative (content/...), while the Colab
# loading cell reads the absolute path /content/mergedJson.json — confirm the
# working directory these commands run from.
!unzip content/mergedJson.zip
# Print the SHA-1 of the extracted file so it can be compared with the expected digest.
!sha1sum content/mergedJson.json
# Should be: TODO(review) — the expected SHA-1 digest was never recorded here.

In [None]:
# Colab variant: parse the extracted JSON file into `data_list`.
# NOTE(review): presumably only one of the Colab / LOCAL loading cells is meant
# to run in a given environment — they differ only in the file path.
with open("/content/mergedJson.json", "rb") as jsonl_file:
    data_list = json.load(jsonl_file)

In [21]:
# LOCAL
# Local variant: parse ./content/mergedJson.json (relative to the working
# directory) into `data_list`.
# NOTE(review): presumably only one of the Colab / LOCAL loading cells is meant
# to run in a given environment — they differ only in the file path.
with open("./content/mergedJson.json", "rb") as jsonl_file:
    data_list = json.load(jsonl_file)

In [22]:
# NOTE: the keys() result is discarded — a cell only displays its last expression.
data_list.keys()
# Number of GENERATED_DESCRIPTION entries.
len(data_list["GENERATED_DESCRIPTION"])

1188

In [23]:
# Number of GENERATED_DATA entries; compare with the GENERATED_DESCRIPTION count.
len(data_list["GENERATED_DATA"])

1197

In [24]:
from datasets import Dataset
from datasets.dataset_dict import DatasetDict

# Both columns must have the same length before they can form a DataFrame.
# Trim whichever list is longer down to the common length (the original code
# only handled the case where GENERATED_DATA was the longer one).
# NOTE(review): this assumes entries pair up by position and that any surplus
# entries sit at the tail of the longer list — confirm against the generator.
paired_count = min(
    len(data_list["GENERATED_DESCRIPTION"]),
    len(data_list["GENERATED_DATA"]),
)
del data_list["GENERATED_DESCRIPTION"][paired_count:]
del data_list["GENERATED_DATA"][paired_count:]

len(data_list["GENERATED_DATA"])


1188

In [25]:
# Two-column frame: GENERATED_DESCRIPTION is later tokenized as the model input
# and GENERATED_DATA as the target.
dataframe = pd.DataFrame(data_list, columns=["GENERATED_DESCRIPTION", "GENERATED_DATA"])
dataframe

Unnamed: 0,GENERATED_DESCRIPTION,GENERATED_DATA
0,"{'location': '<Vector (0.0000, 0.0000, 9.0000)...","b'<library_geometries>\n <geometry id=""Obje..."
1,"{'location': '<Vector (-3.0000, 4.5000, 0.5000...","b'<library_geometries>\n <geometry id=""room..."
2,"{'location': '<Vector (-1.0000, 5.5000, 0.5000...","b'<library_geometries>\n <geometry id=""Obje..."
3,"{'location': '<Vector (4.0000, -5.5000, 0.5000...","b'<library_geometries>\n <geometry id=""Obje..."
4,"{'location': '<Vector (0.0000, 4.5000, 0.5000)...","b'<library_geometries>\n <geometry id=""Obje..."
...,...,...
1183,"{'location': '<Vector (-1.5000, -3.5000, 0.500...","b'<library_geometries>\n <geometry id=""room..."
1184,"{'location': '<Vector (3.5000, -2.5000, 0.5000...","b'<library_geometries>\n <geometry id=""Obje..."
1185,"{'location': '<Vector (2.5000, 7.5000, 0.5000)...","b'<library_geometries>\n <geometry id=""Obje..."
1186,"{'location': '<Vector (0.5000, 6.5000, 0.5000)...","b'<library_geometries>\n <geometry id=""Obje..."


In [26]:

# Row ranges for the three splits, taken in file order (no shuffling):
# first 600 rows for training, next 200 for validation, the remainder for test.
train_rows, validate_rows, test_rows = slice(None, 600), slice(600, 800), slice(800, None)

dataframe_train, dataframe_validate, dataframe_test = (
    Dataset.from_pandas(dataframe[rows])
    for rows in (train_rows, validate_rows, test_rows)
)

dataset3 = DatasetDict(
    {
        "train": dataframe_train,
        "validate": dataframe_validate,
        "test": dataframe_test,
    }
)
print(dataset3)

DatasetDict({
    train: Dataset({
        features: ['GENERATED_DESCRIPTION', 'GENERATED_DATA'],
        num_rows: 600
    })
    validate: Dataset({
        features: ['GENERATED_DESCRIPTION', 'GENERATED_DATA'],
        num_rows: 200
    })
    test: Dataset({
        features: ['GENERATED_DESCRIPTION', 'GENERATED_DATA'],
        num_rows: 388
    })
})


In [27]:
from transformers import RobertaTokenizer

# Tokenizer for the Salesforce/codet5-small checkpoint (the same checkpoint name
# is used for the model later in the notebook).
tokenizer = RobertaTokenizer.from_pretrained("Salesforce/codet5-small")

# Token limits used when truncating/padding the inputs and the targets.
max_source_length = 512
max_target_length = 512

# Disabled: no task prefix is prepended to the inputs.
# task_prefix = "json2dae: "


In [28]:
# Tokenize the training targets on their own: every GENERATED_DATA value is
# stringified, then padded/truncated to max_target_length tokens.
output_sequences = list(map(str, dataset3["train"]["GENERATED_DATA"]))
labels = tokenizer(
    output_sequences,
    truncation=True,
    padding="max_length",
    max_length=max_target_length,
)

In [29]:
def preprocess_data(inputData):
    """Tokenize a batch of description/data pairs for seq2seq training.

    Relies on the notebook-level ``tokenizer``, ``max_source_length`` and
    ``max_target_length``.

    Args:
        inputData: Batch mapping with a "GENERATED_DESCRIPTION" column (model
            input) and a "GENERATED_DATA" column (target). Each is a list of
            values; every value is converted with ``str()`` before
            tokenization.

    Returns:
        The tokenizer output for the descriptions, with an added "labels"
        entry holding the target token ids in which every padding position
        has been replaced by -100.
    """
    input_sequences = [str(seq) for seq in inputData["GENERATED_DESCRIPTION"]]
    output_sequences = [str(seq) for seq in inputData["GENERATED_DATA"]]

    model_inputs = tokenizer(
        input_sequences,
        max_length = max_source_length,
        padding = "max_length",
        truncation = True
    )

    labels = tokenizer(
        output_sequences,
        max_length = max_target_length,
        padding = "max_length",
        truncation = True
    )

    # Replace padding tokens with -100 so they are ignored by CrossEntropyLoss.
    # The pad id is read from the tokenizer instead of being hard-coded as 0,
    # so the masking stays correct if the tokenizer is swapped.
    pad_token_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_token_id else -100 for token_id in label_sample]
        for label_sample in labels["input_ids"]
    ]
    return model_inputs



# Tokenize every split. map() hands preprocess_data batches of 2 rows; the
# result keeps the original text columns alongside the new
# input_ids / attention_mask / labels columns.
mapped_dataset = dataset3.map(preprocess_data, batched=True, batch_size=2)
mapped_dataset

Map:   0%|          | 0/600 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/388 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['GENERATED_DESCRIPTION', 'GENERATED_DATA', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 600
    })
    validate: Dataset({
        features: ['GENERATED_DESCRIPTION', 'GENERATED_DATA', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 200
    })
    test: Dataset({
        features: ['GENERATED_DESCRIPTION', 'GENERATED_DATA', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 388
    })
})

In [30]:
# Inspect the label token ids of the first test example.
# NOTE(review): this displays the whole padded sequence; slicing (e.g. [:20])
# would keep the saved output short.
mapped_dataset["test"]["labels"][0]

[1,
 70,
 11,
 32,
 12083,
 67,
 10049,
 9407,
 5333,
 82,
 565,
 411,
 14330,
 612,
 1546,
 921,
 67,
 23,
 17,
 15557,
 6,
 508,
 1546,
 921,
 67,
 23,
 6,
 5333,
 82,
 1377,
 411,
 15557,
 5333,
 82,
 3639,
 411,
 3168,
 612,
 1546,
 921,
 67,
 23,
 17,
 15557,
 17,
 12388,
 6,
 5333,
 82,
 1850,
 411,
 5659,
 67,
 1126,
 612,
 1546,
 921,
 67,
 23,
 17,
 15557,
 17,
 12388,
 17,
 1126,
 6,
 1056,
 1546,
 3247,
 6441,
 17,
 20,
 18,
 25,
 300,
 20,
 18,
 25,
 300,
 20,
 18,
 25,
 300,
 20,
 18,
 25,
 300,
 20,
 18,
 25,
 374,
 18,
 25,
 300,
 20,
 18,
 25,
 374,
 18,
 25,
 300,
 20,
 18,
 25,
 300,
 20,
 18,
 25,
 374,
 18,
 25,
 374,
 18,
 25,
 374,
 18,
 25,
 300,
 20,
 18,
 25,
 300,
 20,
 18,
 25,
 374,
 18,
 25,
 300,
 20,
 18,
 25,
 374,
 18,
 25,
 374,
 18,
 25,
 374,
 18,
 25,
 300,
 20,
 18,
 25,
 374,
 18,
 25,
 374,
 18,
 25,
 374,
 18,
 25,
 1757,
 5659,
 67,
 1126,
 5333,
 82,
 1850,
 411,
 28012,
 82,
 2161,
 67,
 6054,
 5333,
 82,
 5411,
 411,
 3860,
 280,
 1084,
 154

In [31]:
import pickle

# Snapshot the tokenized splits to a pickle file, then compress it with zip.
# NOTE(review): pickle files should only ever be loaded back from trusted sources.
with open('6May24_v1_m512_M512_S1188_mapped_dataset.pkl', 'wb') as f:
    pickle.dump(mapped_dataset, f)

!zip 6May24_v1_m512_M512_S1188_mapped_dataset.pkl.zip 6May24_v1_m512_M512_S1188_mapped_dataset.pkl

  adding: 6May24_v1_m512_M512_S1188_mapped_dataset.pkl (deflated 98%)


In [None]:
from torch.utils.data import DataLoader
# Expose only the three model columns, as torch tensors. set_format() changes
# mapped_dataset itself, so later cells see the torch-formatted splits.
mapped_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
# Batches of 2 per split; only the training loader shuffles.
# NOTE(review): these loaders are not referenced by the Trainer cell visible in
# this notebook, which receives the datasets directly.
train_dataloader = DataLoader(mapped_dataset["train"], shuffle=True, batch_size=2)
valid_dataloader = DataLoader(mapped_dataset["validate"], batch_size=2)
test_dataloader = DataLoader(mapped_dataset["test"], batch_size=2)

In [None]:
from transformers import RobertaTokenizer, T5ForConditionalGeneration

# Load the pretrained Salesforce/codet5-small tokenizer and seq2seq model.
tokenizer = RobertaTokenizer.from_pretrained('Salesforce/codet5-small')
model = T5ForConditionalGeneration.from_pretrained('Salesforce/codet5-small')

# Use the GPU when CUDA is available, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)




In [None]:
# Total parameters and trainable parameters.
# Report the model size: all parameters first, then only those with
# requires_grad set.
parameter_list = list(model.parameters())

total_params = sum(param.numel() for param in parameter_list)
print(f"{total_params:,} total parameters.")

total_trainable_params = sum(
    param.numel() for param in parameter_list if param.requires_grad
)
print(f"{total_trainable_params:,} training parameters.")

In [None]:
import torch

from transformers import T5Tokenizer, T5ForConditionalGeneration, TrainingArguments, Trainer, DefaultDataCollator

In [None]:
# Collator configured to return TensorFlow tensors ("tf").
# NOTE(review): the model and Trainer in this notebook are PyTorch-based and the
# Trainer is not given this collator — confirm the TensorFlow path is still needed.
data_collator = DefaultDataCollator(return_tensors="tf")

# Short handles for the tokenized train / validation splits.
mapped_train = mapped_dataset["train"]
mapped_validate = mapped_dataset["validate"]

In [None]:
# Build batched TensorFlow datasets (batch size 2) from the tokenized splits;
# only the training dataset is shuffled.
# NOTE(review): dataset_training / dataset_validation are not referenced by any
# other cell visible in this notebook — confirm they are still required.
dataset_training = mapped_train.to_tf_dataset(
    columns=["input_ids", "attention_mask", "labels"],
    shuffle=True,
    batch_size=2,
    collate_fn=data_collator,
)
dataset_validation = mapped_validate.to_tf_dataset(
    columns=["input_ids", "attention_mask", "labels"],
    shuffle=False,
    batch_size=2,
    collate_fn=data_collator,
)

In [None]:
# Fine-tuning configuration: 3 epochs, batch size 2, evaluation every 100
# steps, a checkpoint every 400 steps with at most 2 checkpoints kept.
training_args = TrainingArguments(
    # Write checkpoints to a dedicated directory relative to the notebook.
    # The previous value "/" pointed at the filesystem root, which is normally
    # not writable and should not collect checkpoint folders.
    output_dir = "./trainer_output",
    num_train_epochs = 3,
    per_device_train_batch_size = 2,
    per_device_eval_batch_size = 2,
    save_steps = 400,
    save_total_limit = 2,
    eval_steps = 100,
    eval_strategy = "steps",
    # Mixed precision needs a CUDA device; the notebook falls back to CPU when
    # none is available, so only enable fp16 when CUDA is present.
    fp16 = torch.cuda.is_available()
)

# The Trainer receives the tokenized train / validation splits directly.
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = mapped_dataset["train"],
    eval_dataset = mapped_dataset["validate"]
)

In [None]:
# Fine-tune the model using the Trainer configured in the previous cell.
trainer.train()

In [None]:
# Save the fine-tuned model into the 29Apr24_v1 directory and zip that directory.
# NOTE(review): the directory is named 29Apr24_v1 but the archive is
# 29Apr24_model_v1.zip — confirm the naming difference is intended.
trainer.save_model("29Apr24_v1")
!zip -r 29Apr24_model_v1.zip 29Apr24_v1

In [None]:
# @title Testing

from transformers import T5ForConditionalGeneration, T5Tokenizer
# Reload the saved model from disk; the tokenizer is loaded from the original
# Salesforce/codet5-small checkpoint rather than from model_path.
# NOTE(review): RobertaTokenizer is not imported in this cell — it relies on an
# import made by an earlier cell.
model_path = "29Apr24_v1"
model = T5ForConditionalGeneration.from_pretrained(model_path)
tokenizer = RobertaTokenizer.from_pretrained('Salesforce/codet5-small')

# Sample scene description used when translator() is called without explicit input.
_SAMPLE_SCENE = """
  {
    "large_cube_location": "<Vector (0.0000, 0.0000, 10.5000)>",
    "large_cube_size": "<Vector (11.0000, 24.0000, 21.0000)>",
    "Object0_location": "<Vector (-0.5000, 5.0000, 0.5000)>",
    "Object0_size": "<Vector (1.0000, 1.0000, 1.0000)>",
    "Object1_location": "<Vector (-3.5000, -4.0000, 0.5000)>",
    "Object1_size": "<Vector (1.0000, 1.0000, 1.0000)>",
    "Object2_location": "<Vector (1.5000, -3.0000, 0.5000)>",
    "Object2_size": "<Vector (1.0000, 1.0000, 1.0000)>",
    "Object3_location": "<Vector (-0.5000, 8.0000, 0.5000)>",
    "Object3_size": "<Vector (1.0000, 1.0000, 1.0000)>",
    "Object4_location": "<Vector (4.5000, -4.0000, 0.5000)>",
    "Object4_size": "<Vector (1.0000, 1.0000, 1.0000)>",
    "Object5_location": "<Vector (2.5000, -3.0000, 0.5000)>",
    "Object5_size": "<Vector (1.0000, 1.0000, 1.0000)>",
    "Object6_location": "<Vector (-0.5000, -8.0000, 0.5000)>",
    "Object6_size": "<Vector (1.0000, 1.0000, 1.0000)>",
    "Object7_location": "<Vector (2.5000, -6.0000, 0.5000)>",
    "Object7_size": "<Vector (1.0000, 1.0000, 1.0000)>",
    "Object8_location": "<Vector (2.5000, 3.0000, 0.5000)>",
    "Object8_size": "<Vector (1.0000, 1.0000, 1.0000)>",
    "Object9_location": "<Vector (-3.5000, 3.0000, 0.5000)>",
    "Object9_size": "<Vector (1.0000, 1.0000, 1.0000)>",
    "Object10_location": "<Vector (1.5000, -11.0000, 0.5000)>",
    "Object10_size": "<Vector (1.0000, 1.0000, 1.0000)>",
    "Object11_location": "<Vector (-1.5000, 0.0000, 0.5000)>",
    "Object11_size": "<Vector (1.0000, 1.0000, 1.0000)>",
    "Object12_location": "<Vector (2.5000, 11.0000, 0.5000)>",
    "Object12_size": "<Vector (1.0000, 1.0000, 1.0000)>"
  }
  """


def translator(model, tokenizer, inputs=None):
    """Generate text for a scene description and return the decoded output.

    Args:
        model: Seq2seq model exposing ``generate`` and ``device`` (here a
            ``T5ForConditionalGeneration``).
        tokenizer: Tokenizer exposing ``encode`` and ``decode``.
        inputs: Scene description text. When ``None`` (the default) the
            built-in sample scene is used, which matches the previous
            behaviour of this function.

    Returns:
        The decoded text of the first generated sequence, with special
        tokens skipped.
    """
    if inputs is None:
        inputs = _SAMPLE_SCENE

    input_tokens = tokenizer.encode(
        inputs,
        return_tensors = "pt",
        max_length = 512,
        truncation = True
    )
    # generate() needs its input on the same device as the model weights;
    # without this a model living on the GPU fails with a device mismatch.
    input_tokens = input_tokens.to(model.device)

    corrected_ids = model.generate(
        input_tokens,
        max_length = 1024
    )

    corrected_text = tokenizer.decode(corrected_ids[0], skip_special_tokens = True)
    return corrected_text

# Run the built-in sample scene through the reloaded model and show the generated text.
tokenstuff = translator(model, tokenizer)
print(tokenstuff)