In [2]:
from transformers import RobertaTokenizer
from datasets import load_dataset, load_from_disk
import os

In [3]:
tokenizer = RobertaTokenizer.from_pretrained('Salesforce/codet5-small')

In [3]:
DATASET_PATH = "../dataset/codesearchnet-java-discovered/"

data_files = {"train": [], "valid": [], "test": []}

for file in os.listdir(DATASET_PATH):
    file_path = os.path.join(DATASET_PATH, file)
    if "train" in file:
        data_files["train"].append(file_path)
    elif "valid" in file:
        data_files["valid"].append(file_path)
    elif "test" in file:
        data_files["test"].append(file_path)

print(data_files)

dataset = load_dataset("json", data_files=data_files)
print(dataset)


{'train': ['../dataset/codesearchnet-java-discovered/java_train_0.jsonl', '../dataset/codesearchnet-java-discovered/java_train_1.jsonl', '../dataset/codesearchnet-java-discovered/java_train_6.jsonl', '../dataset/codesearchnet-java-discovered/java_train_8.jsonl', '../dataset/codesearchnet-java-discovered/java_train_5.jsonl', '../dataset/codesearchnet-java-discovered/java_train_11.jsonl', '../dataset/codesearchnet-java-discovered/java_train_13.jsonl', '../dataset/codesearchnet-java-discovered/java_train_15.jsonl', '../dataset/codesearchnet-java-discovered/java_train_7.jsonl', '../dataset/codesearchnet-java-discovered/java_train_3.jsonl', '../dataset/codesearchnet-java-discovered/java_train_2.jsonl', '../dataset/codesearchnet-java-discovered/java_train_12.jsonl', '../dataset/codesearchnet-java-discovered/java_train_4.jsonl', '../dataset/codesearchnet-java-discovered/java_train_9.jsonl', '../dataset/codesearchnet-java-discovered/java_train_14.jsonl', '../dataset/codesearchnet-java-discover

Using custom data configuration default-70a21001498a12cd
Found cached dataset json (/data/nicolasmaier/huggingface_cache/json/default-70a21001498a12cd/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['code', 'contents', 'xmi', 'originalLine'],
        num_rows: 454273
    })
    valid: Dataset({
        features: ['code', 'contents', 'xmi', 'originalLine'],
        num_rows: 15326
    })
    test: Dataset({
        features: ['code', 'contents', 'xmi', 'originalLine'],
        num_rows: 26902
    })
})


In [None]:
MAX_INPUT_LENGTH = 512

def preprocess_examples(examples):
    contents = examples["contents"]
    xmi = examples["xmi"]

    model_inputs = tokenizer(contents, padding="max_length")
    labels = tokenizer(xmi).input_ids

    model_inputs["labels"] = labels
    return model_inputs


tokenized_dataset = dataset.map(preprocess_examples, batched=True, batch_size=100, num_proc=64)


In [None]:
clean_dataset = tokenized_dataset.filter(lambda example: len(example["input_ids"]) <= 512, num_proc=64)

In [7]:
clean_dataset.save_to_disk("/data/nicolasmaier/dataset/hf_clean_dataset")

Flattening the indices:   0%|          | 0/426 [00:00<?, ?ba/s]

Flattening the indices:   0%|          | 0/15 [00:00<?, ?ba/s]

Flattening the indices:   0%|          | 0/26 [00:00<?, ?ba/s]

In [29]:
clean_dataset = load_from_disk("/data/nicolasmaier/dataset/hf_clean_dataset")

In [None]:
def crop_labels(examples):
    labels = tokenizer(examples["xmi"], padding="longest", truncation=True).input_ids
    # -100 is a special value that the loss function will ignore
    labels = [
        [-100 if token == tokenizer.pad_token_id else token for token in label]
        for label in labels
    ]

    examples["labels"] = labels
    return examples

cropped_dataset = clean_dataset.map(crop_labels, batched=True, batch_size=100, num_proc=64)

print(cropped_dataset)

In [5]:
cropped_dataset.save_to_disk("/data/nicolasmaier/dataset/hf_cropped_dataset")

In [30]:
print(clean_dataset)

DatasetDict({
    train: Dataset({
        features: ['code', 'contents', 'xmi', 'originalLine', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 425631
    })
    valid: Dataset({
        features: ['code', 'contents', 'xmi', 'originalLine', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 14634
    })
    test: Dataset({
        features: ['code', 'contents', 'xmi', 'originalLine', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 25156
    })
})


In [42]:
def split_labels(examples):
    window_size = 400
    stride = 200

    res = {
        "idx": [],
        "labels": [],
        "input_ids": [],
        "attention_mask": [],
    }

    for idx in range(len(examples["labels"])):
        label = examples["labels"][idx]
        for i in range(1, len(label), stride):
            res["idx"].append(idx)
            labels = label[i : i + window_size]
            res["labels"].append([1] + labels + [-100] * (window_size - len(labels)))
            res["input_ids"].append(examples["input_ids"][idx])
            res["attention_mask"].append(examples["attention_mask"][idx])

    return res


split_dataset = clean_dataset.map(split_labels, batched=True, batch_size=100, num_proc=32, remove_columns=["xmi", "contents", "originalLine", "code"])

print(split_dataset)

                                  

#0:   0%|          | 0/134 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/134 [00:00<?, ?ba/s]

#2:   0%|          | 0/134 [00:00<?, ?ba/s]

 

#3:   0%|          | 0/134 [00:00<?, ?ba/s]

 

#4:   0%|          | 0/134 [00:00<?, ?ba/s]

  

#5:   0%|          | 0/134 [00:00<?, ?ba/s]

#6:   0%|          | 0/134 [00:00<?, ?ba/s]

 

#7:   0%|          | 0/134 [00:00<?, ?ba/s]

 

#8:   0%|          | 0/134 [00:00<?, ?ba/s]

 

#9:   0%|          | 0/134 [00:00<?, ?ba/s]

  

#10:   0%|          | 0/134 [00:00<?, ?ba/s]

 

#11:   0%|          | 0/134 [00:00<?, ?ba/s]

 

#12:   0%|          | 0/134 [00:00<?, ?ba/s]

 

#13:   0%|          | 0/134 [00:00<?, ?ba/s]

#14:   0%|          | 0/134 [00:00<?, ?ba/s]

  

#15:   0%|          | 0/134 [00:00<?, ?ba/s]

#16:   0%|          | 0/134 [00:00<?, ?ba/s]

  

#17:   0%|          | 0/134 [00:00<?, ?ba/s]

#18:   0%|          | 0/134 [00:00<?, ?ba/s]

 

#19:   0%|          | 0/134 [00:00<?, ?ba/s]

 

#20:   0%|          | 0/134 [00:00<?, ?ba/s]

  

#21:   0%|          | 0/134 [00:00<?, ?ba/s]

 

#22:   0%|          | 0/134 [00:00<?, ?ba/s]

 

#23:   0%|          | 0/134 [00:00<?, ?ba/s]

 

#24:   0%|          | 0/134 [00:00<?, ?ba/s]

 

#25:   0%|          | 0/134 [00:00<?, ?ba/s]

   

#26:   0%|          | 0/134 [00:00<?, ?ba/s]

  

#27:   0%|          | 0/134 [00:00<?, ?ba/s]

#29:   0%|          | 0/134 [00:00<?, ?ba/s]

#28:   0%|          | 0/134 [00:00<?, ?ba/s]

#30:   0%|          | 0/134 [00:00<?, ?ba/s]

#31:   0%|          | 0/133 [00:00<?, ?ba/s]

                                  

#0:   0%|          | 0/5 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/5 [00:00<?, ?ba/s]

#2:   0%|          | 0/5 [00:00<?, ?ba/s]

 

#3:   0%|          | 0/5 [00:00<?, ?ba/s]

  

#4:   0%|          | 0/5 [00:00<?, ?ba/s]

 

#5:   0%|          | 0/5 [00:00<?, ?ba/s]

 

#6:   0%|          | 0/5 [00:00<?, ?ba/s]

#7:   0%|          | 0/5 [00:00<?, ?ba/s]

 

#8:   0%|          | 0/5 [00:00<?, ?ba/s]

 

#9:   0%|          | 0/5 [00:00<?, ?ba/s]

  

#10:   0%|          | 0/5 [00:00<?, ?ba/s]

#11:   0%|          | 0/5 [00:00<?, ?ba/s]

  

#12:   0%|          | 0/5 [00:00<?, ?ba/s]

 

#13:   0%|          | 0/5 [00:00<?, ?ba/s]

#14:   0%|          | 0/5 [00:00<?, ?ba/s]

  

#15:   0%|          | 0/5 [00:00<?, ?ba/s]

 

#16:   0%|          | 0/5 [00:00<?, ?ba/s]

#17:   0%|          | 0/5 [00:00<?, ?ba/s]

  

#18:   0%|          | 0/5 [00:00<?, ?ba/s]

 

#19:   0%|          | 0/5 [00:00<?, ?ba/s]

 

#20:   0%|          | 0/5 [00:00<?, ?ba/s]

 

#21:   0%|          | 0/5 [00:00<?, ?ba/s]

#22:   0%|          | 0/5 [00:00<?, ?ba/s]

  

#23:   0%|          | 0/5 [00:00<?, ?ba/s]

 

#24:   0%|          | 0/5 [00:00<?, ?ba/s]

 

#25:   0%|          | 0/5 [00:00<?, ?ba/s]

 

#26:   0%|          | 0/5 [00:00<?, ?ba/s]

 

#27:   0%|          | 0/5 [00:00<?, ?ba/s]

 

#28:   0%|          | 0/5 [00:00<?, ?ba/s]

 

#29:   0%|          | 0/5 [00:00<?, ?ba/s]

 

#30:   0%|          | 0/5 [00:00<?, ?ba/s]

#31:   0%|          | 0/5 [00:00<?, ?ba/s]

                                 

#0:   0%|          | 0/8 [00:00<?, ?ba/s]

  

#1:   0%|          | 0/8 [00:00<?, ?ba/s]

#2:   0%|          | 0/8 [00:00<?, ?ba/s]

 

#3:   0%|          | 0/8 [00:00<?, ?ba/s]

 

#4:   0%|          | 0/8 [00:00<?, ?ba/s]

 

#5:   0%|          | 0/8 [00:00<?, ?ba/s]

  

#6:   0%|          | 0/8 [00:00<?, ?ba/s]

#7:   0%|          | 0/8 [00:00<?, ?ba/s]

  

#8:   0%|          | 0/8 [00:00<?, ?ba/s]

#9:   0%|          | 0/8 [00:00<?, ?ba/s]

 

#10:   0%|          | 0/8 [00:00<?, ?ba/s]

  

#11:   0%|          | 0/8 [00:00<?, ?ba/s]

 

#12:   0%|          | 0/8 [00:00<?, ?ba/s]

 

#13:   0%|          | 0/8 [00:00<?, ?ba/s]

 

#14:   0%|          | 0/8 [00:00<?, ?ba/s]

#15:   0%|          | 0/8 [00:00<?, ?ba/s]

  

#16:   0%|          | 0/8 [00:00<?, ?ba/s]

 

#17:   0%|          | 0/8 [00:00<?, ?ba/s]

 

#18:   0%|          | 0/8 [00:00<?, ?ba/s]

 

#19:   0%|          | 0/8 [00:00<?, ?ba/s]

 

#20:   0%|          | 0/8 [00:00<?, ?ba/s]

 

#21:   0%|          | 0/8 [00:00<?, ?ba/s]

 

#22:   0%|          | 0/8 [00:00<?, ?ba/s]

 

#23:   0%|          | 0/8 [00:00<?, ?ba/s]

 

#24:   0%|          | 0/8 [00:00<?, ?ba/s]

 

#25:   0%|          | 0/8 [00:00<?, ?ba/s]

 

#26:   0%|          | 0/8 [00:00<?, ?ba/s]

 

#27:   0%|          | 0/8 [00:00<?, ?ba/s]

 

#28:   0%|          | 0/8 [00:00<?, ?ba/s]

 

#29:   0%|          | 0/8 [00:00<?, ?ba/s]

 

#30:   0%|          | 0/8 [00:00<?, ?ba/s]

#31:   0%|          | 0/8 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels', 'idx'],
        num_rows: 7931293
    })
    valid: Dataset({
        features: ['input_ids', 'attention_mask', 'labels', 'idx'],
        num_rows: 255994
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels', 'idx'],
        num_rows: 476050
    })
})


In [43]:
split_dataset.save_to_disk("/data/nicolasmaier/dataset/hf_split_dataset")