In [1]:
!pip install datasets
!pip install git+https://github.com/huggingface/transformers.git
!pip install accelerate

import ipywidgets as widgets
import pandas as pd
import json
import torch

from datasets import load_dataset
# LOCAL: read the merged JSON export from the notebook's ./content directory.
# The file is opened in binary mode and the raw bytes are handed to the JSON parser.
with open("./content/mergedJson.json", "rb") as jsonl_file:
    data_list = json.loads(jsonl_file.read())

from datasets import Dataset
from datasets.dataset_dict import DatasetDict

# Both columns must hold the same number of rows before they can share a
# DataFrame. Trim whichever list is longer (in place) down to the shorter one,
# so construction cannot fail on unequal lengths in either direction.
row_count = min(len(data_list["GENERATED_DESCRIPTION"]), len(data_list["GENERATED_DATA"]))
del data_list["GENERATED_DESCRIPTION"][row_count:]
del data_list["GENERATED_DATA"][row_count:]
dataframe = pd.DataFrame(data_list, columns=["GENERATED_DESCRIPTION", "GENERATED_DATA"])

# Row offsets of the sequential (unshuffled) split:
# [0, TRAIN_END) -> train, [TRAIN_END, VALIDATE_END) -> validate, the rest -> test.
TRAIN_END = 600
VALIDATE_END = 800

dataframe_train = Dataset.from_pandas(dataframe[:TRAIN_END])
dataframe_validate = Dataset.from_pandas(dataframe[TRAIN_END:VALIDATE_END])
dataframe_test = Dataset.from_pandas(dataframe[VALIDATE_END:])

dataset3 = DatasetDict({"train": dataframe_train, "validate": dataframe_validate, "test": dataframe_test})
print(dataset3)

Collecting git+https://github.com/huggingface/transformers.git
  Cloning https://github.com/huggingface/transformers.git to /tmp/pip-req-build-z_lcllbw
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers.git /tmp/pip-req-build-z_lcllbw
  Resolved https://github.com/huggingface/transformers.git to commit f26e4073707189c93915227779a4f6ea3c40d43b
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Installing backend dependencies ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
DatasetDict({
    train: Dataset({
        features: ['GENERATED_DESCRIPTION', 'GENERATED_DATA'],
        num_rows: 600
    })
    validate: Dataset({
        features: ['GENERATED_DESCRIPTION', 'GENERATED_DATA'],
        num_rows: 200
    })
    test: Dataset({
        features: ['GENERATED_DESCRIPTION', 'GENERATED_DATA'],
        num_rows: 448
    })
})


In [2]:
import transformers
from transformers import AutoTokenizer

# Load the tokenizer published with the CodeT5-small checkpoint.
tokenizer = AutoTokenizer.from_pretrained("Salesforce/codet5-small")

# The preprocessing function asks the tokenizer for offset mappings and
# per-token sequence ids, so stop here unless a fast tokenizer came back.
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)



In [3]:
def prepare_train_features(inputData):
    """Tokenize description/data pairs and label every feature with a token span.

    Each ``GENERATED_DESCRIPTION`` entry is paired with the ``GENERATED_DATA``
    entry at the same position. Pairs longer than 500 tokens are split into
    several overlapping features (stride 128); only the data side is ever
    truncated. Every feature is padded to 500 tokens.

    For each feature the function records:

    * ``start_positions`` - index of the first token belonging to the description
    * ``end_positions``   - index of the last token belonging to the data

    NOTE(review): the labelled span therefore runs across both segments of the
    feature rather than marking a sub-span inside one of them - confirm this is
    the intended training target.

    Args:
        inputData: a batch mapping with the keys ``"GENERATED_DESCRIPTION"`` and
            ``"GENERATED_DATA"``, each holding a list of equal length. Entries
            are converted with ``str()`` before tokenization.

    Returns:
        The tokenizer output with ``overflow_to_sample_mapping`` and
        ``offset_mapping`` removed and ``start_positions`` / ``end_positions``
        added (one entry per feature).

    Raises:
        IndexError: if a feature contains no description token or no data token.
    """
    descriptions = [str(seq) for seq in inputData["GENERATED_DESCRIPTION"]]
    data_strings = [str(seq) for seq in inputData["GENERATED_DATA"]]

    tokenized_data = tokenizer(
        descriptions,
        data_strings,
        truncation="only_second",
        max_length=500,
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Bookkeeping outputs of the tokenizer; they are not model inputs, so they
    # are removed from the returned features.
    tokenized_data.pop("overflow_to_sample_mapping")
    offset_mapping = tokenized_data.pop("offset_mapping")

    start_positions = []
    end_positions = []
    for feature_index in range(len(offset_mapping)):
        # Sequence ids: 0 for description tokens, 1 for data tokens,
        # None for special/padding tokens.
        sequence_ids = list(tokenized_data.sequence_ids(feature_index))

        try:
            first_description_token = sequence_ids.index(0)
            # Search the reversed list to find the *last* data token.
            last_data_token = len(sequence_ids) - 1 - sequence_ids[::-1].index(1)
        except ValueError:
            raise IndexError(
                f"feature {feature_index} contains no description token or no data token"
            ) from None

        start_positions.append(first_description_token)
        end_positions.append(last_data_token)

    tokenized_data["start_positions"] = start_positions
    tokenized_data["end_positions"] = end_positions
    return tokenized_data
         

In [4]:
# Tokenize and label every split two examples at a time, dropping the original
# text columns so only the tokenizer outputs and span labels remain.
mapped_dataset = dataset3.map(
    prepare_train_features,
    batched=True,
    batch_size=2,
    remove_columns=dataset3["train"].column_names,
)

Map:   0%|          | 0/600 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/448 [00:00<?, ? examples/s]

In [5]:
# Display the tokenized splits with their feature names and row counts.
mapped_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'start_positions', 'end_positions'],
        num_rows: 1800
    })
    validate: Dataset({
        features: ['input_ids', 'attention_mask', 'start_positions', 'end_positions'],
        num_rows: 600
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'start_positions', 'end_positions'],
        num_rows: 1344
    })
})

In [7]:
!pip install tf-keras
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer
model = AutoModelForQuestionAnswering.from_pretrained("Salesforce/codet5-small")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting tf-keras
  Downloading tf_keras-2.16.0-py3-none-any.whl.metadata (1.6 kB)
Downloading tf_keras-2.16.0-py3-none-any.whl (1.7 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m MB/s[0m eta [36m0:00:01[0m01[0m
[?25hInstalling collected packages: tf-keras
Successfully installed tf-keras-2.16.0


Some weights of T5ForQuestionAnswering were not initialized from the model checkpoint at Salesforce/codet5-small and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
from transformers import default_data_collator

# Every feature was already padded to a fixed length during preprocessing, so
# the default collator is used to batch them.
data_collator = default_data_collator

# Three epochs, two examples per device; evaluate every 100 steps, checkpoint
# every 400 steps and keep at most two checkpoints in the working directory.
training_args = TrainingArguments(
    output_dir="./",
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    eval_strategy="steps",
    eval_steps=100,
    save_steps=400,
    save_total_limit=2,
    fp16=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=mapped_dataset["train"],
    eval_dataset=mapped_dataset["validate"],
    tokenizer=tokenizer,
)

In [12]:
# Start fine-tuning with the trainer configured in the previous cell.
trainer.train()

ValueError: not enough values to unpack (expected 2, got 1)