In [1]:
!pip install datasets ipywidgets
!pip install git+https://github.com/huggingface/transformers.git
!pip install accelerate

import ipywidgets as widgets
import pandas as pd
import json
import torch

from datasets import load_dataset

Collecting datasets
  Downloading datasets-2.19.0-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub>=0.21.2 (from datasets)
  Downloading huggingface_hub-0.22.2-py3-none-any.

In [2]:
# Upload the mergedJson.jsonl zip file, to preserve contents
!unzip /content/mergedJson.jsonl.zip
!sha1sum /content/mergedJson.jsonl
# Should be 59df712cdc761401e4a44b3c17aaa48e18c0a214

Archive:  /content/mergedJson.jsonl.zip
  inflating: mergedJson.jsonl        
59df712cdc761401e4a44b3c17aaa48e18c0a214  /content/mergedJson.jsonl


In [3]:
with open("/content/mergedJson.jsonl", "r") as jsonl_file:
    lines = jsonl_file.readlines()

data_list = []
for line in lines:
    data = json.loads(line)
    data_list.append(data)

dataframe = pd.DataFrame(data_list, columns=["GENERATED_DESCRIPTION", "GENERATED_DATA"])

dataframe

Unnamed: 0,GENERATED_DESCRIPTION,GENERATED_DATA
0,"{'large_cube_location': '<Vector (0.0000, 0.00...",{'COLLADA': {'@xmlns': 'http://www.collada.org...
1,"{'large_cube_location': '<Vector (0.0000, 0.00...",{'COLLADA': {'@xmlns': 'http://www.collada.org...
2,"{'large_cube_location': '<Vector (0.0000, 0.00...",{'COLLADA': {'@xmlns': 'http://www.collada.org...
3,"{'large_cube_location': '<Vector (0.0000, 0.00...",{'COLLADA': {'@xmlns': 'http://www.collada.org...
4,"{'large_cube_location': '<Vector (0.0000, 0.00...",{'COLLADA': {'@xmlns': 'http://www.collada.org...
...,...,...
995,"{'large_cube_location': '<Vector (0.0000, 0.00...",{'COLLADA': {'@xmlns': 'http://www.collada.org...
996,"{'large_cube_location': '<Vector (0.0000, 0.00...",{'COLLADA': {'@xmlns': 'http://www.collada.org...
997,"{'large_cube_location': '<Vector (0.0000, 0.00...",{'COLLADA': {'@xmlns': 'http://www.collada.org...
998,"{'large_cube_location': '<Vector (0.0000, 0.00...",{'COLLADA': {'@xmlns': 'http://www.collada.org...


In [4]:
dataframe

# dataframe[0] = Description
# dataframe[1] = Data

Unnamed: 0,GENERATED_DESCRIPTION,GENERATED_DATA
0,"{'large_cube_location': '<Vector (0.0000, 0.00...",{'COLLADA': {'@xmlns': 'http://www.collada.org...
1,"{'large_cube_location': '<Vector (0.0000, 0.00...",{'COLLADA': {'@xmlns': 'http://www.collada.org...
2,"{'large_cube_location': '<Vector (0.0000, 0.00...",{'COLLADA': {'@xmlns': 'http://www.collada.org...
3,"{'large_cube_location': '<Vector (0.0000, 0.00...",{'COLLADA': {'@xmlns': 'http://www.collada.org...
4,"{'large_cube_location': '<Vector (0.0000, 0.00...",{'COLLADA': {'@xmlns': 'http://www.collada.org...
...,...,...
995,"{'large_cube_location': '<Vector (0.0000, 0.00...",{'COLLADA': {'@xmlns': 'http://www.collada.org...
996,"{'large_cube_location': '<Vector (0.0000, 0.00...",{'COLLADA': {'@xmlns': 'http://www.collada.org...
997,"{'large_cube_location': '<Vector (0.0000, 0.00...",{'COLLADA': {'@xmlns': 'http://www.collada.org...
998,"{'large_cube_location': '<Vector (0.0000, 0.00...",{'COLLADA': {'@xmlns': 'http://www.collada.org...


In [None]:
# @title Starcoder
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
# checkpoint = "bigcode/starcoder2-7b" # Took an awfully long time to download (15mb/s)
checkpoint = "bigcode/starcoder2-3b" # Crashing colab, OOM

device = "cuda" # or cuda for gpu
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map="auto", torch_dtype=torch.bfloat16).to("cuda")

training_args = TrainingArguments(
    output_dir="./_results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    save_steps=400,
    save_total_limit=2,
    evaluation_strategy="steps",
    eval_steps=100,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataframe,
    tokenizer=tokenizer,
)

trainer.train()

In [None]:
# @title Phi-2/stability-code
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments

torch.set_default_device("cuda")

model = AutoModelForCausalLM.from_pretrained("stabilityai/stable-code-3b", torch_dtype="auto", trust_remote_code=True).to("cuda")
tokenizer = AutoTokenizer.from_pretrained("stabilityai/stable-code-3b", trust_remote_code=True)

training_args = TrainingArguments(
    output_dir="./_results",
    num_train_epochs=3,
    per_device_train_batch_size=1,
    save_steps=400,
    save_total_limit=2,
    evaluation_strategy="steps",
    eval_steps=100,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataframe,
    tokenizer=tokenizer,
)

trainer.train()

In [None]:
# @title Default title text
model = None
with torch.no_grad():
    torch.cuda.empty_cache()

In [None]:
# @title Default title text
from transformers import RobertaTokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments

from nltk.tokenize import word_tokenize

tokenizer = RobertaTokenizer.from_pretrained('Salesforce/codet5-small')
model = T5ForConditionalGeneration.from_pretrained('Salesforce/codet5-small')

def tokenize_function(examples):
  return tokenizer(examples[1], padding="max_length", truncation=True)

"""
description_tokens = tokenizer(
    dataframe['TOKENIZED_DESCRIPTION'],
    return_tensors="pt",
    padding=True,
    truncation=True,
)

data_tokens = tokenizer(
    dataframe['GENERATED_DATA'].values.tolist(),
    return_tensors="pt",
    padding=True,
    truncation=True,
)
"""

training_args = TrainingArguments(
    output_dir="./_results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    save_steps=500,
    save_total_limit=2,
    evaluation_strategy="steps",
    eval_steps=100,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataframe,
    tokenizer=tokenizer,
)

trainer.train()

'\ndescription_tokens = tokenizer(\n    dataframe[\'TOKENIZED_DESCRIPTION\'],\n    return_tensors="pt",\n    padding=True,\n    truncation=True,\n)\n\ndata_tokens = tokenizer(\n    dataframe[\'GENERATED_DATA\'].values.tolist(),\n    return_tensors="pt",\n    padding=True,\n    truncation=True,\n)\n'

In [20]:
from transformers import RobertaTokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments

tokenizer = RobertaTokenizer.from_pretrained('Salesforce/codet5-small')
model = T5ForConditionalGeneration.from_pretrained('Salesforce/codet5-small')

# Generated Description: approx 5.5k words or sm

def tokenize_function(examples):
  return tokenizer(examples, max_length=5120, truncation=True)


In [7]:
sub_data = dataframe.iloc[:100]
sub_data = sub_data.astype(str)

sub_data

Unnamed: 0,GENERATED_DESCRIPTION,GENERATED_DATA
0,"{'large_cube_location': '<Vector (0.0000, 0.00...",{'COLLADA': {'@xmlns': 'http://www.collada.org...
1,"{'large_cube_location': '<Vector (0.0000, 0.00...",{'COLLADA': {'@xmlns': 'http://www.collada.org...
2,"{'large_cube_location': '<Vector (0.0000, 0.00...",{'COLLADA': {'@xmlns': 'http://www.collada.org...
3,"{'large_cube_location': '<Vector (0.0000, 0.00...",{'COLLADA': {'@xmlns': 'http://www.collada.org...
4,"{'large_cube_location': '<Vector (0.0000, 0.00...",{'COLLADA': {'@xmlns': 'http://www.collada.org...
...,...,...
95,"{'large_cube_location': '<Vector (0.0000, 0.00...",{'COLLADA': {'@xmlns': 'http://www.collada.org...
96,"{'large_cube_location': '<Vector (0.0000, 0.00...",{'COLLADA': {'@xmlns': 'http://www.collada.org...
97,"{'large_cube_location': '<Vector (0.0000, 0.00...",{'COLLADA': {'@xmlns': 'http://www.collada.org...
98,"{'large_cube_location': '<Vector (0.0000, 0.00...",{'COLLADA': {'@xmlns': 'http://www.collada.org...


In [24]:
sub_data_desc = sub_data['GENERATED_DESCRIPTION']
sub_data_data = sub_data['GENERATED_DATA']

sub_data_desc = sub_data_desc.values.tolist()

tokenized_desc = tokenizer(sub_data_desc, return_tensors="pt", padding=True, truncation=True)

tokenized_desc

{'input_ids': tensor([[    1, 27828, 14095,  ...,   921,    29,     2],
        [    1, 27828, 14095,  ...,   296,   921,     2],
        [    1, 27828, 14095,  ...,  1870,    16,     2],
        ...,
        [    1, 27828, 14095,  ...,    13,  1870,     2],
        [    1, 27828, 14095,  ...,    13,  1870,     2],
        [    1, 27828, 14095,  ...,    13,  1870,     2]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]])}

In [26]:
sub_data_data = sub_data_data.values.tolist()
tokenized_data = tokenizer(sub_data_data, return_tensors="pt", padding=True, truncation=True)

tokenized_data

{'input_ids': tensor([[    1, 27828,  4935,  ...,   374,   404,     2],
        [    1, 27828,  4935,  ...,   374,   404,     2],
        [    1, 27828,  4935,  ...,   374,   404,     2],
        ...,
        [    1, 27828,  4935,  ...,   374,   404,     2],
        [    1, 27828,  4935,  ...,  4278,  2400,     2],
        [    1, 27828,  4935,  ...,   374,   404,     2]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]])}

In [27]:
tokenized_data["labels"] = tokenized_desc["input_ids"]
tokenized_data

{'input_ids': tensor([[    1, 27828,  4935,  ...,   374,   404,     2],
        [    1, 27828,  4935,  ...,   374,   404,     2],
        [    1, 27828,  4935,  ...,   374,   404,     2],
        ...,
        [    1, 27828,  4935,  ...,   374,   404,     2],
        [    1, 27828,  4935,  ...,  4278,  2400,     2],
        [    1, 27828,  4935,  ...,   374,   404,     2]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]]), 'labels': tensor([[    1, 27828, 14095,  ...,   921,    29,     2],
        [    1, 27828, 14095,  ...,   296,   921,     2],
        [    1, 27828, 14095,  ...,  1870,    16,     2],
        ...,
        [    1, 27828, 14095,  ...,    13,  1870,     2],
        [    1, 27828, 14095,  ...,    13,  1870,     2],
        [    1, 27828, 14095,  ...,    13,  1870,     2]])}

In [28]:
training_args = TrainingArguments(
    output_dir="./_results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    save_steps=500,
    save_total_limit=2,
    evaluation_strategy="steps",
    eval_steps=100,
    fp16=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data,
    tokenizer=tokenizer,
)

trainer.train()



KeyError: 'Invalid key. Only three types of key are available: (1) string, (2) integers for backend Encoding, and (3) slices for data subsetting.'

In [31]:
tokenized_data.keys()

dict_keys(['input_ids', 'attention_mask', 'labels'])