In [7]:
import json
from datasets import load_from_disk
from transformers import RobertaTokenizer
from process_java_model import *


In [2]:
# Tokenizer loaded from the "Salesforce/codet5-small" checkpoint; used further
# down to turn the serialized target sequences into label token ids.
tokenizer = RobertaTokenizer.from_pretrained("Salesforce/codet5-small")


In [3]:
# Load the previously saved dataset from disk and print its splits and columns
# as a quick sanity check.
dataset = load_from_disk("/data/nicolasmaier/dataset/hf_clean_dataset")
print(dataset)


DatasetDict({
    train: Dataset({
        features: ['code', 'contents', 'xmi', 'originalLine', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 425631
    })
    valid: Dataset({
        features: ['code', 'contents', 'xmi', 'originalLine', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 14634
    })
    test: Dataset({
        features: ['code', 'contents', 'xmi', 'originalLine', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 25156
    })
})


In [None]:
# Target length (in tokens) for the label sequences. Shorter targets are padded
# up to this length; longer ones are deliberately NOT truncated so that they can
# be recognised by their length and dropped by the length filter later on.
MAX_OUTPUT_LENGTH = 512


def preprocess_examples(examples):
    """Build the serialized target sequence and its label ids for one batch.

    Args:
        examples: Batch dictionary as passed by a batched ``Dataset.map``.
            Only the "xmi" column is read; each entry is handed to
            ``generate_sequence`` (presumably an XMI document string --
            confirm against ``process_java_model``).

    Returns:
        A dict with two columns, each holding one entry per input row:
            "seq":    the result of ``generate_sequence`` serialized with
                      ``json.dumps`` (a string).
            "labels": token ids of that string, padded to
                      ``MAX_OUTPUT_LENGTH``, with every pad position replaced
                      by -100 (conventionally the index the training loss
                      ignores).
    """
    seqs = [
        json.dumps(generate_sequence(xmi_string)) for xmi_string in examples["xmi"]
    ]

    # Pass the length explicitly instead of relying on the checkpoint's implicit
    # model_max_length, and keep truncation off: cutting a JSON target short
    # would leave an invalid sequence, so over-long rows must stay over-long.
    encoded = tokenizer(
        seqs,
        padding="max_length",
        max_length=MAX_OUTPUT_LENGTH,
        truncation=False,
    )

    pad_id = tokenizer.pad_token_id
    labels = [
        [-100 if token == pad_id else token for token in token_ids]
        for token_ids in encoded.input_ids
    ]

    return {"seq": seqs, "labels": labels}


# Run preprocess_examples over the dataset in batches of 100 rows, spread over
# 64 worker processes. The returned "seq" and "labels" columns end up in the
# resulting dataset.
seq_dataset = dataset.map(
    preprocess_examples, batched=True, batch_size=100, num_proc=64
)


In [27]:
# Print the splits/columns of the mapped dataset, then write it to disk.
print(seq_dataset)
seq_dataset.save_to_disk("/data/nicolasmaier/dataset/hf_seq_dataset")

DatasetDict({
    train: Dataset({
        features: ['code', 'contents', 'xmi', 'originalLine', 'input_ids', 'attention_mask', 'labels', 'seq'],
        num_rows: 425631
    })
    valid: Dataset({
        features: ['code', 'contents', 'xmi', 'originalLine', 'input_ids', 'attention_mask', 'labels', 'seq'],
        num_rows: 14634
    })
    test: Dataset({
        features: ['code', 'contents', 'xmi', 'originalLine', 'input_ids', 'attention_mask', 'labels', 'seq'],
        num_rows: 25156
    })
})


In [None]:
# Keep only rows whose target is non-trivial and fits the label budget.
# Labels were padded (not truncated) during preprocessing, so a label list
# longer than MAX_OUTPUT_LENGTH marks a target that did not fit.
# NOTE(review): "seq" holds a JSON string, so len(example["seq"]) counts
# characters, not sequence elements -- confirm that a 5-character minimum is the
# intended threshold.
clean_seq_dataset = seq_dataset.filter(
    lambda example: len(example["seq"]) >= 5
    and len(example["labels"]) <= MAX_OUTPUT_LENGTH,
    num_proc=64,
)


In [29]:
# Print the splits/row counts remaining after filtering, then write the
# filtered dataset to disk.
print(clean_seq_dataset)
clean_seq_dataset.save_to_disk("/data/nicolasmaier/dataset/hf_clean_seq_dataset")

DatasetDict({
    train: Dataset({
        features: ['code', 'contents', 'xmi', 'originalLine', 'input_ids', 'attention_mask', 'labels', 'seq'],
        num_rows: 385339
    })
    valid: Dataset({
        features: ['code', 'contents', 'xmi', 'originalLine', 'input_ids', 'attention_mask', 'labels', 'seq'],
        num_rows: 13524
    })
    test: Dataset({
        features: ['code', 'contents', 'xmi', 'originalLine', 'input_ids', 'attention_mask', 'labels', 'seq'],
        num_rows: 22607
    })
})


Flattening the indices:   0%|          | 0/386 [00:00<?, ?ba/s]

Flattening the indices:   0%|          | 0/14 [00:00<?, ?ba/s]

Flattening the indices:   0%|          | 0/23 [00:00<?, ?ba/s]