In [1]:
import json
from datasets import load_from_disk, load_dataset
from transformers import RobertaTokenizer
from process_java_model import *
import os


In [2]:
tokenizer = RobertaTokenizer.from_pretrained("Salesforce/codet5-small")


In [3]:
DATASET_PATH = "../dataset/codesearchnet-java-discovered/"

data_files = {"train": [], "valid": [], "test": []}

for file in os.listdir(DATASET_PATH):
    file_path = os.path.join(DATASET_PATH, file)
    if "train" in file:
        data_files["train"].append(file_path)
    elif "valid" in file:
        data_files["valid"].append(file_path)
    elif "test" in file:
        data_files["test"].append(file_path)

print(data_files)

dataset = load_dataset("json", data_files=data_files)
print(dataset)


{'train': ['../dataset/codesearchnet-java-discovered/java_train_0.jsonl', '../dataset/codesearchnet-java-discovered/java_train_1.jsonl', '../dataset/codesearchnet-java-discovered/java_train_6.jsonl', '../dataset/codesearchnet-java-discovered/java_train_8.jsonl', '../dataset/codesearchnet-java-discovered/java_train_5.jsonl', '../dataset/codesearchnet-java-discovered/java_train_11.jsonl', '../dataset/codesearchnet-java-discovered/java_train_13.jsonl', '../dataset/codesearchnet-java-discovered/java_train_15.jsonl', '../dataset/codesearchnet-java-discovered/java_train_7.jsonl', '../dataset/codesearchnet-java-discovered/java_train_3.jsonl', '../dataset/codesearchnet-java-discovered/java_train_2.jsonl', '../dataset/codesearchnet-java-discovered/java_train_12.jsonl', '../dataset/codesearchnet-java-discovered/java_train_4.jsonl', '../dataset/codesearchnet-java-discovered/java_train_9.jsonl', '../dataset/codesearchnet-java-discovered/java_train_14.jsonl', '../dataset/codesearchnet-java-discover

Using custom data configuration default-70a21001498a12cd
Found cached dataset json (/data/nicolasmaier/huggingface_cache/json/default-70a21001498a12cd/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['code', 'contents', 'xmi', 'originalLine'],
        num_rows: 454273
    })
    valid: Dataset({
        features: ['code', 'contents', 'xmi', 'originalLine'],
        num_rows: 15326
    })
    test: Dataset({
        features: ['code', 'contents', 'xmi', 'originalLine'],
        num_rows: 26902
    })
})


In [None]:
def preprocess_examples(examples):
    contents = examples["code"]
    model_inputs = tokenizer(contents)
    return model_inputs


dataset_with_input = dataset.map(
    preprocess_examples,
    batched=True,
    batch_size=100,
    num_proc=64,
)


In [None]:
def preprocess_examples(examples):
    xmi = examples["xmi"]

    seqs = [generate_sequence(xmi_string) for xmi_string in xmi]
    seqs = [json.dumps(seq) for seq in seqs]

    labels = tokenizer(seqs).input_ids

    return {"seq": seqs, "labels": labels}


dataset_with_output = dataset_with_input.map(
    preprocess_examples,
    batched=True,
    batch_size=10,
    num_proc=64,
)


In [6]:
print(dataset_with_output)
dataset_with_output.save_to_disk("/data/nicolasmaier/dataset/hf_seq_dataset_3")


DatasetDict({
    train: Dataset({
        features: ['code', 'contents', 'xmi', 'originalLine', 'input_ids', 'attention_mask', 'seq', 'labels'],
        num_rows: 454273
    })
    valid: Dataset({
        features: ['code', 'contents', 'xmi', 'originalLine', 'input_ids', 'attention_mask', 'seq', 'labels'],
        num_rows: 15326
    })
    test: Dataset({
        features: ['code', 'contents', 'xmi', 'originalLine', 'input_ids', 'attention_mask', 'seq', 'labels'],
        num_rows: 26902
    })
})


In [7]:
MAX_LENGTH = 505

dataset_filtered = dataset_with_output.filter(
    lambda example: len(example["input_ids"]) <= MAX_LENGTH, num_proc=64
)
print(dataset_filtered)
dataset_filtered = dataset_filtered.filter(
    lambda example: len(example["labels"]) <= MAX_LENGTH,
    num_proc=64,
)
print(dataset_filtered)
dataset_filtered = dataset_filtered.filter(
    lambda example: len(example["seq"]) > 10,
    num_proc=64,
)
print(dataset_filtered)


                                                                   

#9:   0%|          | 0/8 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/8 [00:00<?, ?ba/s]

 

#4:   0%|          | 0/8 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/8 [00:00<?, ?ba/s]

#0:   0%|          | 0/8 [00:00<?, ?ba/s]

  

#7:   0%|          | 0/8 [00:00<?, ?ba/s]

#3:   0%|          | 0/8 [00:00<?, ?ba/s]

#5:   0%|          | 0/8 [00:00<?, ?ba/s]

 

#11:   0%|          | 0/8 [00:00<?, ?ba/s]

      

#29:   0%|          | 0/8 [00:00<?, ?ba/s]

 

#6:   0%|          | 0/8 [00:00<?, ?ba/s]

      

#8:   0%|          | 0/8 [00:00<?, ?ba/s]

#19:   0%|          | 0/8 [00:00<?, ?ba/s]

#16:   0%|          | 0/8 [00:00<?, ?ba/s]

  

#21:   0%|          | 0/8 [00:00<?, ?ba/s]

#13:   0%|          | 0/8 [00:00<?, ?ba/s]

 

#14:   0%|          | 0/8 [00:00<?, ?ba/s]

#25:   0%|          | 0/8 [00:00<?, ?ba/s]

 

#12:   0%|          | 0/8 [00:00<?, ?ba/s]

#10:   0%|          | 0/8 [00:00<?, ?ba/s]

 

#17:   0%|          | 0/8 [00:00<?, ?ba/s]

#18:   0%|          | 0/8 [00:00<?, ?ba/s]

#22:   0%|          | 0/8 [00:00<?, ?ba/s]

#23:   0%|          | 0/8 [00:00<?, ?ba/s]

#15:   0%|          | 0/8 [00:00<?, ?ba/s]

  

#20:   0%|          | 0/8 [00:00<?, ?ba/s]

 

#40:   0%|          | 0/8 [00:00<?, ?ba/s]

   

#26:   0%|          | 0/8 [00:00<?, ?ba/s]

 

#31:   0%|          | 0/8 [00:00<?, ?ba/s]

#42:   0%|          | 0/8 [00:00<?, ?ba/s]

   

#30:   0%|          | 0/8 [00:00<?, ?ba/s]

 

#32:   0%|          | 0/8 [00:00<?, ?ba/s]

 

#27:   0%|          | 0/8 [00:00<?, ?ba/s]

#24:   0%|          | 0/8 [00:00<?, ?ba/s]

     

#28:   0%|          | 0/8 [00:00<?, ?ba/s]

 

#34:   0%|          | 0/8 [00:00<?, ?ba/s]

#39:   0%|          | 0/8 [00:00<?, ?ba/s]

 

#47:   0%|          | 0/8 [00:00<?, ?ba/s]

  

#41:   0%|          | 0/8 [00:00<?, ?ba/s]

   

#48:   0%|          | 0/8 [00:00<?, ?ba/s]

  

#35:   0%|          | 0/8 [00:00<?, ?ba/s]

#33:   0%|          | 0/8 [00:00<?, ?ba/s]

#45:   0%|          | 0/8 [00:00<?, ?ba/s]

     

#43:   0%|          | 0/8 [00:00<?, ?ba/s]

 

#44:   0%|          | 0/8 [00:00<?, ?ba/s]

#46:   0%|          | 0/8 [00:00<?, ?ba/s]

#52:   0%|          | 0/8 [00:00<?, ?ba/s]

 

#50:   0%|          | 0/8 [00:00<?, ?ba/s]

 

#37:   0%|          | 0/8 [00:00<?, ?ba/s]

#36:   0%|          | 0/8 [00:00<?, ?ba/s]

  

#38:   0%|          | 0/8 [00:00<?, ?ba/s]

#51:   0%|          | 0/8 [00:00<?, ?ba/s]

 

#54:   0%|          | 0/8 [00:00<?, ?ba/s]

#53:   0%|          | 0/8 [00:00<?, ?ba/s]

#61:   0%|          | 0/8 [00:00<?, ?ba/s]

#49:   0%|          | 0/8 [00:00<?, ?ba/s]

#63:   0%|          | 0/8 [00:00<?, ?ba/s]

#55:   0%|          | 0/8 [00:00<?, ?ba/s]

#62:   0%|          | 0/8 [00:00<?, ?ba/s]

#58:   0%|          | 0/8 [00:00<?, ?ba/s]

#56:   0%|          | 0/8 [00:00<?, ?ba/s]

#59:   0%|          | 0/8 [00:00<?, ?ba/s]

#60:   0%|          | 0/8 [00:00<?, ?ba/s]

#57:   0%|          | 0/8 [00:00<?, ?ba/s]

                                                                  

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#4:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#5:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#6:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#7:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#8:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#9:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#10:   0%|          | 0/1 [00:00<?, ?ba/s]

#11:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#12:   0%|          | 0/1 [00:00<?, ?ba/s]

#13:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#14:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#15:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#16:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#17:   0%|          | 0/1 [00:00<?, ?ba/s]

#18:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#19:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#20:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#21:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#22:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#23:   0%|          | 0/1 [00:00<?, ?ba/s]

#24:   0%|          | 0/1 [00:00<?, ?ba/s]

    

#25:   0%|          | 0/1 [00:00<?, ?ba/s]

#27:   0%|          | 0/1 [00:00<?, ?ba/s]

#26:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#28:   0%|          | 0/1 [00:00<?, ?ba/s]

#29:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#30:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#31:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#32:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#33:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#34:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#35:   0%|          | 0/1 [00:00<?, ?ba/s]

#36:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#37:   0%|          | 0/1 [00:00<?, ?ba/s]

#38:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#39:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#40:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#41:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#42:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#43:   0%|          | 0/1 [00:00<?, ?ba/s]

#44:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#45:   0%|          | 0/1 [00:00<?, ?ba/s]

#46:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#47:   0%|          | 0/1 [00:00<?, ?ba/s]

#48:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#49:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#50:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#51:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#52:   0%|          | 0/1 [00:00<?, ?ba/s]

#53:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#54:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#55:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#56:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#57:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#58:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#59:   0%|          | 0/1 [00:00<?, ?ba/s]

#60:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#61:   0%|          | 0/1 [00:00<?, ?ba/s]

#62:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#63:   0%|          | 0/1 [00:00<?, ?ba/s]

                                                                  

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#4:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#5:   0%|          | 0/1 [00:00<?, ?ba/s]

#6:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#7:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#8:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#9:   0%|          | 0/1 [00:00<?, ?ba/s]

#10:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#11:   0%|          | 0/1 [00:00<?, ?ba/s]

#12:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#13:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#14:   0%|          | 0/1 [00:00<?, ?ba/s]

#15:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#16:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#17:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#18:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#19:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#20:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#22:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#21:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#23:   0%|          | 0/1 [00:00<?, ?ba/s]

#24:   0%|          | 0/1 [00:00<?, ?ba/s]

    

#25:   0%|          | 0/1 [00:00<?, ?ba/s]

#27:   0%|          | 0/1 [00:00<?, ?ba/s]

#26:   0%|          | 0/1 [00:00<?, ?ba/s]

#28:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#29:   0%|          | 0/1 [00:00<?, ?ba/s]

#30:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#31:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#32:   0%|          | 0/1 [00:00<?, ?ba/s]

#33:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#34:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#35:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#36:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#37:   0%|          | 0/1 [00:00<?, ?ba/s]

#38:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#39:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#40:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#41:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#42:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#43:   0%|          | 0/1 [00:00<?, ?ba/s]

#45:   0%|          | 0/1 [00:00<?, ?ba/s]

#44:   0%|          | 0/1 [00:00<?, ?ba/s]

   

#47:   0%|          | 0/1 [00:00<?, ?ba/s]

#46:   0%|          | 0/1 [00:00<?, ?ba/s]

#48:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#49:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#50:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#51:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#52:   0%|          | 0/1 [00:00<?, ?ba/s]

#53:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#54:   0%|          | 0/1 [00:00<?, ?ba/s]

#56:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#55:   0%|          | 0/1 [00:00<?, ?ba/s]

#57:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#58:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#59:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#60:   0%|          | 0/1 [00:00<?, ?ba/s]

#61:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#62:   0%|          | 0/1 [00:00<?, ?ba/s]

#63:   0%|          | 0/1 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['code', 'contents', 'xmi', 'originalLine', 'input_ids', 'attention_mask', 'seq', 'labels'],
        num_rows: 426513
    })
    valid: Dataset({
        features: ['code', 'contents', 'xmi', 'originalLine', 'input_ids', 'attention_mask', 'seq', 'labels'],
        num_rows: 14649
    })
    test: Dataset({
        features: ['code', 'contents', 'xmi', 'originalLine', 'input_ids', 'attention_mask', 'seq', 'labels'],
        num_rows: 25214
    })
})
                                                                 

#0:   0%|          | 0/7 [00:00<?, ?ba/s]

 

#3:   0%|          | 0/7 [00:00<?, ?ba/s]

  

#5:   0%|          | 0/7 [00:00<?, ?ba/s]

#4:   0%|          | 0/7 [00:00<?, ?ba/s]

   

#1:   0%|          | 0/7 [00:00<?, ?ba/s]

#2:   0%|          | 0/7 [00:00<?, ?ba/s]

#6:   0%|          | 0/7 [00:00<?, ?ba/s]

   

#11:   0%|          | 0/7 [00:00<?, ?ba/s]

 

#9:   0%|          | 0/7 [00:00<?, ?ba/s]

#8:   0%|          | 0/7 [00:00<?, ?ba/s]

 

#7:   0%|          | 0/7 [00:00<?, ?ba/s]

#10:   0%|          | 0/7 [00:00<?, ?ba/s]

   

#17:   0%|          | 0/7 [00:00<?, ?ba/s]

#12:   0%|          | 0/7 [00:00<?, ?ba/s]

#19:   0%|          | 0/7 [00:00<?, ?ba/s]

    

#16:   0%|          | 0/7 [00:00<?, ?ba/s]

 

#22:   0%|          | 0/7 [00:00<?, ?ba/s]

#15:   0%|          | 0/7 [00:00<?, ?ba/s]

#13:   0%|          | 0/7 [00:00<?, ?ba/s]

 

#14:   0%|          | 0/7 [00:00<?, ?ba/s]

 

#20:   0%|          | 0/7 [00:00<?, ?ba/s]

#18:   0%|          | 0/7 [00:00<?, ?ba/s]

 

#21:   0%|          | 0/7 [00:00<?, ?ba/s]

   

#23:   0%|          | 0/7 [00:00<?, ?ba/s]

#25:   0%|          | 0/7 [00:00<?, ?ba/s]

 

#24:   0%|          | 0/7 [00:00<?, ?ba/s]

       

#27:   0%|          | 0/7 [00:00<?, ?ba/s]

#28:   0%|          | 0/7 [00:00<?, ?ba/s]

#30:   0%|          | 0/7 [00:00<?, ?ba/s]

#29:   0%|          | 0/7 [00:00<?, ?ba/s]

#26:   0%|          | 0/7 [00:00<?, ?ba/s]

 

#31:   0%|          | 0/7 [00:00<?, ?ba/s]

 

#32:   0%|          | 0/7 [00:00<?, ?ba/s]

  

#33:   0%|          | 0/7 [00:00<?, ?ba/s]

#35:   0%|          | 0/7 [00:00<?, ?ba/s]

#34:   0%|          | 0/7 [00:00<?, ?ba/s]

 

#37:   0%|          | 0/7 [00:00<?, ?ba/s]

#36:   0%|          | 0/7 [00:00<?, ?ba/s]

       

#39:   0%|          | 0/7 [00:00<?, ?ba/s]

#40:   0%|          | 0/7 [00:00<?, ?ba/s]

 

#42:   0%|          | 0/7 [00:00<?, ?ba/s]

#41:   0%|          | 0/7 [00:00<?, ?ba/s]

#43:   0%|          | 0/7 [00:00<?, ?ba/s]

  

#44:   0%|          | 0/7 [00:00<?, ?ba/s]

#38:   0%|          | 0/7 [00:00<?, ?ba/s]

 

#47:   0%|          | 0/7 [00:00<?, ?ba/s]

#45:   0%|          | 0/7 [00:00<?, ?ba/s]

#46:   0%|          | 0/7 [00:00<?, ?ba/s]

 

#48:   0%|          | 0/7 [00:00<?, ?ba/s]

 

#51:   0%|          | 0/7 [00:00<?, ?ba/s]

#49:   0%|          | 0/7 [00:00<?, ?ba/s]

   

#50:   0%|          | 0/7 [00:00<?, ?ba/s]

     

#54:   0%|          | 0/7 [00:00<?, ?ba/s]

#53:   0%|          | 0/7 [00:00<?, ?ba/s]

  

#52:   0%|          | 0/7 [00:00<?, ?ba/s]

  

#55:   0%|          | 0/7 [00:00<?, ?ba/s]

#59:   0%|          | 0/7 [00:00<?, ?ba/s]

#56:   0%|          | 0/7 [00:00<?, ?ba/s]

#61:   0%|          | 0/7 [00:00<?, ?ba/s]

#57:   0%|          | 0/7 [00:00<?, ?ba/s]

#60:   0%|          | 0/7 [00:00<?, ?ba/s]

#62:   0%|          | 0/7 [00:00<?, ?ba/s]

#63:   0%|          | 0/7 [00:00<?, ?ba/s]

#58:   0%|          | 0/7 [00:00<?, ?ba/s]

                                                                  

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#4:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#5:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#6:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#7:   0%|          | 0/1 [00:00<?, ?ba/s]

#8:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#9:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#10:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#11:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#12:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#13:   0%|          | 0/1 [00:00<?, ?ba/s]

#14:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#15:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#16:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#17:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#18:   0%|          | 0/1 [00:00<?, ?ba/s]

#19:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#20:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#21:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#22:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#23:   0%|          | 0/1 [00:00<?, ?ba/s]

#24:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#27:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#25:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#26:   0%|          | 0/1 [00:00<?, ?ba/s]

#28:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#29:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#30:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#31:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#32:   0%|          | 0/1 [00:00<?, ?ba/s]

#33:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#34:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#35:   0%|          | 0/1 [00:00<?, ?ba/s]

#36:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#37:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#38:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#39:   0%|          | 0/1 [00:00<?, ?ba/s]

#40:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#41:   0%|          | 0/1 [00:00<?, ?ba/s]

#42:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#43:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#44:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#45:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#46:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#47:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#48:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#49:   0%|          | 0/1 [00:00<?, ?ba/s]

#50:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#51:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#52:   0%|          | 0/1 [00:00<?, ?ba/s]

#53:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#54:   0%|          | 0/1 [00:00<?, ?ba/s]

#55:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#56:   0%|          | 0/1 [00:00<?, ?ba/s]

#57:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#58:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#59:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#60:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#61:   0%|          | 0/1 [00:00<?, ?ba/s]

#62:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#63:   0%|          | 0/1 [00:00<?, ?ba/s]

                                                                 

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#4:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#5:   0%|          | 0/1 [00:00<?, ?ba/s]

#6:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#7:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#8:   0%|          | 0/1 [00:00<?, ?ba/s]

#9:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#10:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#11:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#12:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#13:   0%|          | 0/1 [00:00<?, ?ba/s]

#14:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#15:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#16:   0%|          | 0/1 [00:00<?, ?ba/s]

#17:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#18:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#19:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#20:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#21:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#22:   0%|          | 0/1 [00:00<?, ?ba/s]

#23:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#24:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#25:   0%|          | 0/1 [00:00<?, ?ba/s]

#27:   0%|          | 0/1 [00:00<?, ?ba/s]

   

#26:   0%|          | 0/1 [00:00<?, ?ba/s]

#28:   0%|          | 0/1 [00:00<?, ?ba/s]

#29:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#30:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#31:   0%|          | 0/1 [00:00<?, ?ba/s]

#32:   0%|          | 0/1 [00:00<?, ?ba/s]

   

#33:   0%|          | 0/1 [00:00<?, ?ba/s]

#34:   0%|          | 0/1 [00:00<?, ?ba/s]

#35:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#36:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#37:   0%|          | 0/1 [00:00<?, ?ba/s]

#38:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#39:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#40:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#41:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#42:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#43:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#44:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#45:   0%|          | 0/1 [00:00<?, ?ba/s]

#46:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#47:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#48:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#49:   0%|          | 0/1 [00:00<?, ?ba/s]

#50:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#51:   0%|          | 0/1 [00:00<?, ?ba/s]

#52:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#53:   0%|          | 0/1 [00:00<?, ?ba/s]

#54:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#55:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#56:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#57:   0%|          | 0/1 [00:00<?, ?ba/s]

#58:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#59:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#60:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#61:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#62:   0%|          | 0/1 [00:00<?, ?ba/s]

#63:   0%|          | 0/1 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['code', 'contents', 'xmi', 'originalLine', 'input_ids', 'attention_mask', 'seq', 'labels'],
        num_rows: 366308
    })
    valid: Dataset({
        features: ['code', 'contents', 'xmi', 'originalLine', 'input_ids', 'attention_mask', 'seq', 'labels'],
        num_rows: 13025
    })
    test: Dataset({
        features: ['code', 'contents', 'xmi', 'originalLine', 'input_ids', 'attention_mask', 'seq', 'labels'],
        num_rows: 21574
    })
})
                                                                   

#0:   0%|          | 0/6 [00:00<?, ?ba/s]

#1:   0%|          | 0/6 [00:00<?, ?ba/s]

#2:   0%|          | 0/6 [00:00<?, ?ba/s]

  

#8:   0%|          | 0/6 [00:00<?, ?ba/s]

#7:   0%|          | 0/6 [00:00<?, ?ba/s]

          

#11:   0%|          | 0/6 [00:00<?, ?ba/s]

#9:   0%|          | 0/6 [00:00<?, ?ba/s]

#3:   0%|          | 0/6 [00:00<?, ?ba/s]

#6:   0%|          | 0/6 [00:00<?, ?ba/s]

#4:   0%|          | 0/6 [00:00<?, ?ba/s]

#12:   0%|          | 0/6 [00:00<?, ?ba/s]

#5:   0%|          | 0/6 [00:00<?, ?ba/s]

#18:   0%|          | 0/6 [00:00<?, ?ba/s]

#16:   0%|          | 0/6 [00:00<?, ?ba/s]

 

#10:   0%|          | 0/6 [00:00<?, ?ba/s]

 

#14:   0%|          | 0/6 [00:00<?, ?ba/s]

#13:   0%|          | 0/6 [00:00<?, ?ba/s]

 

#15:   0%|          | 0/6 [00:00<?, ?ba/s]

  

#33:   0%|          | 0/6 [00:00<?, ?ba/s]

#21:   0%|          | 0/6 [00:00<?, ?ba/s]

  

#27:   0%|          | 0/6 [00:00<?, ?ba/s]

     

#31:   0%|          | 0/6 [00:00<?, ?ba/s]

#29:   0%|          | 0/6 [00:00<?, ?ba/s]

#28:   0%|          | 0/6 [00:00<?, ?ba/s]

   

#22:   0%|          | 0/6 [00:00<?, ?ba/s]

 

#23:   0%|          | 0/6 [00:00<?, ?ba/s]

#30:   0%|          | 0/6 [00:00<?, ?ba/s]

#26:   0%|          | 0/6 [00:00<?, ?ba/s]

#20:   0%|          | 0/6 [00:00<?, ?ba/s]

 

#19:   0%|          | 0/6 [00:00<?, ?ba/s]

#17:   0%|          | 0/6 [00:00<?, ?ba/s]

  

#25:   0%|          | 0/6 [00:00<?, ?ba/s]

  

#38:   0%|          | 0/6 [00:00<?, ?ba/s]

 

#37:   0%|          | 0/6 [00:00<?, ?ba/s]

#32:   0%|          | 0/6 [00:00<?, ?ba/s]

#24:   0%|          | 0/6 [00:00<?, ?ba/s]

  

#42:   0%|          | 0/6 [00:00<?, ?ba/s]

 

#35:   0%|          | 0/6 [00:00<?, ?ba/s]

  

#45:   0%|          | 0/6 [00:00<?, ?ba/s]

 

#44:   0%|          | 0/6 [00:00<?, ?ba/s]

  

#40:   0%|          | 0/6 [00:00<?, ?ba/s]

 

#41:   0%|          | 0/6 [00:00<?, ?ba/s]

#34:   0%|          | 0/6 [00:00<?, ?ba/s]

      

#36:   0%|          | 0/6 [00:00<?, ?ba/s]

     

#50:   0%|          | 0/6 [00:00<?, ?ba/s]

 

#48:   0%|          | 0/6 [00:00<?, ?ba/s]

#49:   0%|          | 0/6 [00:00<?, ?ba/s]

#39:   0%|          | 0/6 [00:00<?, ?ba/s]

   

#60:   0%|          | 0/6 [00:00<?, ?ba/s]

 

#47:   0%|          | 0/6 [00:00<?, ?ba/s]

#53:   0%|          | 0/6 [00:00<?, ?ba/s]

#56:   0%|          | 0/6 [00:00<?, ?ba/s]

#46:   0%|          | 0/6 [00:00<?, ?ba/s]

 

#52:   0%|          | 0/6 [00:00<?, ?ba/s]

#54:   0%|          | 0/6 [00:00<?, ?ba/s]

#55:   0%|          | 0/6 [00:00<?, ?ba/s]

#63:   0%|          | 0/6 [00:00<?, ?ba/s]

#61:   0%|          | 0/6 [00:00<?, ?ba/s]

#58:   0%|          | 0/6 [00:00<?, ?ba/s]

#57:   0%|          | 0/6 [00:00<?, ?ba/s]

#62:   0%|          | 0/6 [00:00<?, ?ba/s]

#43:   0%|          | 0/6 [00:00<?, ?ba/s]

#59:   0%|          | 0/6 [00:00<?, ?ba/s]

 

#51:   0%|          | 0/6 [00:00<?, ?ba/s]

                                                                  

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

#4:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#5:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#6:   0%|          | 0/1 [00:00<?, ?ba/s]

#7:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#8:   0%|          | 0/1 [00:00<?, ?ba/s]

#9:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#10:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#11:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#12:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#13:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#14:   0%|          | 0/1 [00:00<?, ?ba/s]

#15:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#16:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#17:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#18:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#19:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#20:   0%|          | 0/1 [00:00<?, ?ba/s]

#21:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#22:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#23:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#24:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#25:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#27:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#26:   0%|          | 0/1 [00:00<?, ?ba/s]

#28:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#29:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#30:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#31:   0%|          | 0/1 [00:00<?, ?ba/s]

#32:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#33:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#34:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#35:   0%|          | 0/1 [00:00<?, ?ba/s]

#36:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#37:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#38:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#39:   0%|          | 0/1 [00:00<?, ?ba/s]

#40:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#41:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#42:   0%|          | 0/1 [00:00<?, ?ba/s]

#43:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#44:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#45:   0%|          | 0/1 [00:00<?, ?ba/s]

#46:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#47:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#48:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#49:   0%|          | 0/1 [00:00<?, ?ba/s]

#50:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#51:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#52:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#53:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#54:   0%|          | 0/1 [00:00<?, ?ba/s]

#55:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#56:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#57:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#58:   0%|          | 0/1 [00:00<?, ?ba/s]

#59:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#60:   0%|          | 0/1 [00:00<?, ?ba/s]

#61:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#62:   0%|          | 0/1 [00:00<?, ?ba/s]

#63:   0%|          | 0/1 [00:00<?, ?ba/s]

                                                                 

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

#4:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#5:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#6:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#7:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#8:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#9:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#10:   0%|          | 0/1 [00:00<?, ?ba/s]

#11:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#12:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#13:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#14:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#15:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#16:   0%|          | 0/1 [00:00<?, ?ba/s]

#17:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#18:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#19:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#20:   0%|          | 0/1 [00:00<?, ?ba/s]

#21:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#22:   0%|          | 0/1 [00:00<?, ?ba/s]

#23:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#24:   0%|          | 0/1 [00:00<?, ?ba/s]

   

#25:   0%|          | 0/1 [00:00<?, ?ba/s]

#27:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#26:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#28:   0%|          | 0/1 [00:00<?, ?ba/s]

#29:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#30:   0%|          | 0/1 [00:00<?, ?ba/s]

#31:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#32:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#33:   0%|          | 0/1 [00:00<?, ?ba/s]

#34:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#35:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#36:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#37:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#38:   0%|          | 0/1 [00:00<?, ?ba/s]

#39:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#40:   0%|          | 0/1 [00:00<?, ?ba/s]

#41:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#42:   0%|          | 0/1 [00:00<?, ?ba/s]

#43:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#44:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#45:   0%|          | 0/1 [00:00<?, ?ba/s]

#46:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#47:   0%|          | 0/1 [00:00<?, ?ba/s]

#48:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#49:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#51:   0%|          | 0/1 [00:00<?, ?ba/s]

#50:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#52:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#53:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#54:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#55:   0%|          | 0/1 [00:00<?, ?ba/s]

#56:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#57:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#58:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#59:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#60:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#61:   0%|          | 0/1 [00:00<?, ?ba/s]

#62:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#63:   0%|          | 0/1 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['code', 'contents', 'xmi', 'originalLine', 'input_ids', 'attention_mask', 'seq', 'labels'],
        num_rows: 366247
    })
    valid: Dataset({
        features: ['code', 'contents', 'xmi', 'originalLine', 'input_ids', 'attention_mask', 'seq', 'labels'],
        num_rows: 13022
    })
    test: Dataset({
        features: ['code', 'contents', 'xmi', 'originalLine', 'input_ids', 'attention_mask', 'seq', 'labels'],
        num_rows: 21563
    })
})


In [8]:
print(dataset_filtered)
dataset_filtered.save_to_disk("/data/nicolasmaier/dataset/hf_clean_seq_dataset_3")


DatasetDict({
    train: Dataset({
        features: ['code', 'contents', 'xmi', 'originalLine', 'input_ids', 'attention_mask', 'seq', 'labels'],
        num_rows: 366247
    })
    valid: Dataset({
        features: ['code', 'contents', 'xmi', 'originalLine', 'input_ids', 'attention_mask', 'seq', 'labels'],
        num_rows: 13022
    })
    test: Dataset({
        features: ['code', 'contents', 'xmi', 'originalLine', 'input_ids', 'attention_mask', 'seq', 'labels'],
        num_rows: 21563
    })
})


Flattening the indices:   0%|          | 0/367 [00:00<?, ?ba/s]

Flattening the indices:   0%|          | 0/14 [00:00<?, ?ba/s]

Flattening the indices:   0%|          | 0/22 [00:00<?, ?ba/s]