In [1]:
!pip install datasets
!pip install git+https://github.com/huggingface/transformers.git
!pip install accelerate

import ipywidgets as widgets
import pandas as pd
import json
import torch

from datasets import load_dataset

Collecting git+https://github.com/huggingface/transformers.git
  Cloning https://github.com/huggingface/transformers.git to /tmp/pip-req-build-8mrwxcm8
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers.git /tmp/pip-req-build-8mrwxcm8
  Resolved https://github.com/huggingface/transformers.git to commit 2b9e252b16396c926dad0e3c31802b4af8004e93
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Installing backend dependencies ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: transformers
  Building wheel for transformers (pyproject.toml) ... [?25ldone
[?25h  Created wheel for transformers: filename=transformers-4.42.0.dev0-py3-none-any.whl size=9131386 sha256=b8dad0630988ab1aad6e924bb6bcb525f9929c11a92014ef7f809c7a0bf08eeb
  Stored in directory: /tmp/pip-ephem-wheel-cache-uy4axv7k/wheels/54/cb/3f/83103de5575c53443

In [None]:
# GOOGLE COLAB
# Upload the mergedJson.json zip file, to preserve contents
!unzip /content/mergedJson.zip
!sha1sum /content/mergedJson.json
# Should be 12d88910c17f2e7592f6cb9005bf86cdb557e031

Archive:  /content/mergedJson.zip
  inflating: mergedJson.json         
12d88910c17f2e7592f6cb9005bf86cdb557e031  /content/mergedJson.json


In [2]:
with open("/content/mergedJson.json", "rb") as jsonl_file:
    data_list = json.load(jsonl_file)

FileNotFoundError: [Errno 2] No such file or directory: '/content/mergedJson.json'

In [3]:
# LOCAL
with open("./content/mergedJson.json", "rb") as jsonl_file:
    data_list = json.load(jsonl_file)

In [4]:
from datasets import Dataset
from datasets.dataset_dict import DatasetDict

del data_list["GENERATED_DATA"][len(data_list["GENERATED_DESCRIPTION"]):]

len(data_list["GENERATED_DATA"])


1248

In [5]:
data_lengths = [len(x) for x in data_list["GENERATED_DATA"]]
descript_lengths = [len(x) for x in data_list["GENERATED_DESCRIPTION"]]
print("Data: " + str(max(data_lengths)) + ", " + "Description: " + str(max(descript_lengths)))

Data: 2769, Description: 98


In [6]:
dataframe = pd.DataFrame(data_list, columns=["GENERATED_DESCRIPTION", "GENERATED_DATA"])
dataframe

Unnamed: 0,GENERATED_DESCRIPTION,GENERATED_DATA
0,"{'location': '<Vector (0.0000, 0.0000, 12.5000...","b'<library_geometries>\n <geometry id=""Obje..."
1,"{'location': '<Vector (-4.0000, -4.0000, 0.500...","b'<library_geometries>\n <geometry id=""Obje..."
2,"{'location': '<Vector (-8.0000, -2.0000, 0.500...","b'<library_geometries>\n <geometry id=""room..."
3,"{'location': '<Vector (9.0000, 0.0000, 0.5000)...","b'<library_geometries>\n <geometry id=""Obje..."
4,"{'location': '<Vector (6.0000, 6.0000, 0.5000)...","b'<library_geometries>\n <geometry id=""Obje..."
...,...,...
1243,"{'location': '<Vector (-1.5000, 5.5000, 0.5000...","b'<library_geometries>\n <geometry id=""Obje..."
1244,"{'location': '<Vector (2.5000, -6.5000, 0.5000...","b'<library_geometries>\n <geometry id=""room..."
1245,"{'location': '<Vector (3.5000, 9.5000, 0.5000)...","b'<library_geometries>\n <geometry id=""Obje..."
1246,"{'location': '<Vector (0.5000, -9.5000, 0.5000...","b'<library_geometries>\n <geometry id=""Obje..."


In [7]:

dataframe_train = Dataset.from_pandas(dataframe[:600])
dataframe_validate = Dataset.from_pandas(dataframe[600:800])
dataframe_test = Dataset.from_pandas(dataframe[800:])

dataset3 = DatasetDict({"train": dataframe_train, "validate": dataframe_validate, "test": dataframe_test})
print(dataset3)

DatasetDict({
    train: Dataset({
        features: ['GENERATED_DESCRIPTION', 'GENERATED_DATA'],
        num_rows: 600
    })
    validate: Dataset({
        features: ['GENERATED_DESCRIPTION', 'GENERATED_DATA'],
        num_rows: 200
    })
    test: Dataset({
        features: ['GENERATED_DESCRIPTION', 'GENERATED_DATA'],
        num_rows: 448
    })
})


In [15]:
len(dataset3["train"]["GENERATED_DATA"][0])

2697

In [8]:
from transformers import RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained("Salesforce/codet5-small")

max_source_length = 512
max_target_length = 4096

# task_prefix = "json2dae: "


In [9]:
def preprocess_data(inputData):
    input_sequences = inputData["GENERATED_DESCRIPTION"]
    input_sequences = [str(seq) for seq in input_sequences]

    output_sequences = inputData["GENERATED_DATA"]
    output_sequences = [str(seq) for seq in output_sequences]

    model_inputs = tokenizer(
        input_sequences,
        max_length = max_source_length,
        padding = "max_length",
        truncation = False
    )

    labels = tokenizer(
        output_sequences,
        max_length = max_target_length,
        padding = "max_length",
        truncation = False
    )

    # Replace padding tokens with -100 so they are
    # ignored by CrossEntropyLoss thing
    #labels_with_ignore_index = []
    #for sample in labels:
    #    sample = [label if label != 0 else -100 for label in sample]
    #    labels_with_ignore_index.append(sample)

    labels_with_ignore_index = []
    for label_sample in labels["input_ids"]:
        label_sample = [label if label != 0 else -100 for label in label_sample]
        labels_with_ignore_index.append(label_sample)

    model_inputs["labels"] = labels_with_ignore_index
    return model_inputs



mapped_dataset = dataset3.map(preprocess_data, batched=True, batch_size=2)
mapped_dataset

Map:   0%|          | 0/600 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/448 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['GENERATED_DESCRIPTION', 'GENERATED_DATA', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 600
    })
    validate: Dataset({
        features: ['GENERATED_DESCRIPTION', 'GENERATED_DATA', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 200
    })
    test: Dataset({
        features: ['GENERATED_DESCRIPTION', 'GENERATED_DATA', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 448
    })
})

In [22]:
len(mapped_dataset["train"]["labels"][0])

4096

In [None]:
import pickle

with open('6May24_v1_m512_M512_S1188_mapped_dataset.pkl', 'wb') as f:
    pickle.dump(mapped_dataset, f)

!zip 6May24_v1_m512_M512_S1188_mapped_dataset.pkl.zip 6May24_v1_m512_M512_S1188_mapped_dataset.pkl

In [None]:
from torch.utils.data import DataLoader
mapped_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
train_dataloader = DataLoader(mapped_dataset["train"], shuffle=True, batch_size=2)
valid_dataloader = DataLoader(mapped_dataset["validate"], batch_size=2)
test_dataloader = DataLoader(mapped_dataset["test"], batch_size=2)

In [None]:
from transformers import RobertaTokenizer, T5ForConditionalGeneration

tokenizer = RobertaTokenizer.from_pretrained('Salesforce/codet5-small')
model = T5ForConditionalGeneration.from_pretrained('Salesforce/codet5-small')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)




config.json:   0%|          | 0.00/1.57k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/242M [00:00<?, ?B/s]

T5ForConditionalGeneration(
  (shared): Embedding(32100, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32100, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [None]:
# Total parameters and trainable parameters.
total_params = sum(p.numel() for p in model.parameters())
print(f"{total_params:,} total parameters.")
total_trainable_params = sum(
    p.numel() for p in model.parameters() if p.requires_grad)
print(f"{total_trainable_params:,} training parameters.")

In [None]:
import torch

from transformers import T5Tokenizer, T5ForConditionalGeneration, TrainingArguments, Trainer, DefaultDataCollator

In [None]:
data_collator = DefaultDataCollator(return_tensors="tf")

mapped_train = mapped_dataset["train"]
mapped_validate = mapped_dataset["validate"]

In [None]:
dataset_training = mapped_train.to_tf_dataset(
    columns=["input_ids", "attention_mask", "labels"],
    shuffle=True,
    batch_size=2,
    collate_fn=data_collator,
)
dataset_validation = mapped_validate.to_tf_dataset(
    columns=["input_ids", "attention_mask", "labels"],
    shuffle=False,
    batch_size=2,
    collate_fn=data_collator,
)

In [None]:
training_args = TrainingArguments(
    output_dir = "/",
    num_train_epochs = 3,
    per_device_train_batch_size = 2,
    per_device_eval_batch_size = 2,
    save_steps = 400,
    save_total_limit = 2,
    eval_steps = 100,
    eval_strategy = "steps",
    fp16 = True
)

trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = mapped_dataset["train"],
    eval_dataset = mapped_dataset["validate"]
)

In [None]:
trainer.train()
#A100 GPU == 21.8/40GB used, 7.5min

Step,Training Loss,Validation Loss
100,No log,0.316607
200,No log,0.113534
300,No log,0.05839
400,No log,0.026888
500,0.445200,0.019753
600,0.445200,0.016668
700,0.445200,0.015266
800,0.445200,0.013299
900,0.445200,0.012917


TrainOutput(global_step=900, training_loss=0.26732671949598524, metrics={'train_runtime': 443.2813, 'train_samples_per_second': 4.061, 'train_steps_per_second': 2.03, 'total_flos': 243615242649600.0, 'train_loss': 0.26732671949598524, 'epoch': 3.0})

In [None]:
trainer.save_model("11May24_v1")
!zip -r 11May24_model_v1.zip 11May24_v1
# 24 mins to save

  adding: 11May24_v1/ (stored 0%)
  adding: 11May24_v1/training_args.bin (deflated 51%)
  adding: 11May24_v1/model.safetensors (deflated 7%)
  adding: 11May24_v1/config.json (deflated 61%)
  adding: 11May24_v1/generation_config.json (deflated 34%)


In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
model_path = "11May24_v1"
model = T5ForConditionalGeneration.from_pretrained(model_path)
tokenizer = RobertaTokenizer.from_pretrained('Salesforce/codet5-small')

In [None]:
# @title Testing

def translator(model, tokenizer):
  inputs = """
        {'location': '<Vector (0.0000, 0.0000, 9.0000)>', 'size': '<Vector (10.0000, 13.0000, 18.0000)>'}

  """
  input_tokens = tokenizer.encode(
      inputs,
      return_tensors = "pt",
      max_length = 512,
      truncation = True
  )

  corrected_ids = model.generate(
      input_tokens,
      max_length=4096,
      early_stopping=False
  )

  corrected_text = tokenizer.decode(corrected_ids[0], skip_special_tokens = True)
  return corrected_text

tokenstuff = translator(model, tokenizer)
#A100 19s
print(tokenstuff)

b'<library_geometries>\n    <geometry id="Object_6-mesh" name="Object_6">\n      <mesh>\n        <source id="Object_6-mesh-positions">\n          <float_array id="Object_6-mesh-positions-array" count="24">-0.5 -0.5 -0.5 -0.5 0.5 -0.5 0.5 -0.5 -0.5 -0.5 0.5 0.5 0.5 -0.5 -0.5 0.5 -0.5 0.5 0.5 0.5 -0.5 0.5 0.5 0.5</float_array>\n          <technique_common>\n            <accessor source="#Object_6-mesh-positions-array" count="8" stride="3">\n              <param name="X" type="float" />\n              <param name="Y" type="float" />\n              <param name="Z" type="float" />\n            </accessor>\n          </technique_common>\n        </source>\n        <source id="Object_6-mesh-normals">\n          <float_array id="Object_6-mesh-normals-array" count="18">-1 0 0 0 1 0 1 0 0 0 -1 0 0 0 -1 0 0 1</float_array>\n          <technique_common>\n            <accessor source="#Object_6-mesh-normals-array" count="6" stride="3">\n              <param name="X" type="float" />\n              <

In [None]:
tokenstuff

'b\'<library_geometries>\\n    <geometry id="Object_6-mesh" name="Object_6">\\n      <mesh>\\n        <source id="Object_6-mesh-positions">\\n          <float_array id="Object_6-mesh-positions-array" count="24">-0.5 -0.5 -0.5 -0.5 0.5 -0.5 0.5 -0.5 -0.5 -0.5 0.5 0.5 0.5 -0.5 -0.5 0.5 -0.5 0.5 0.5 0.5 -0.5 0.5 0.5 0.5</float_array>\\n          <technique_common>\\n            <accessor source="#Object_6-mesh-positions-array" count="8" stride="3">\\n              <param name="X" type="float" />\\n              <param name="Y" type="float" />\\n              <param name="Z" type="float" />\\n            </accessor>\\n          </technique_common>\\n        </source>\\n        <source id="Object_6-mesh-normals">\\n          <float_array id="Object_6-mesh-normals-array" count="18">-1 0 0 0 1 0 1 0 0 0 -1 0 0 0 -1 0 0 1</float_array>\\n          <technique_common>\\n            <accessor source="#Object_6-mesh-normals-array" count="6" stride="3">\\n              <param name="X" type="float" 

In [None]:
tokenstuff