In [1]:
import json
from datasets import load_from_disk, load_dataset
from transformers import RobertaTokenizer
import os


In [2]:
# Tokenizer shared by every preprocessing cell below; the checkpoint id is
# named so it is easy to spot and swap.
CHECKPOINT = "Salesforce/codet5-small"
tokenizer = RobertaTokenizer.from_pretrained(CHECKPOINT)


In [3]:
# Load CodeSearchNet with the "all" configuration named explicitly. Without a
# config name the library falls back to "code_search_net/all" and logs a
# "No config specified" notice; spelling it out keeps the choice visible and
# independent of the library's default.
dataset_csn = load_dataset("code_search_net", "all")
print(dataset_csn)


No config specified, defaulting to: code_search_net/all
Found cached dataset code_search_net (/data/nicolasmaier/huggingface_cache/code_search_net/all/1.0.0/80a244ab541c6b2125350b764dc5c2b715f65f00de7a56107a28915fac173a27)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
        num_rows: 1880853
    })
    test: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
        num_rows: 100529
    })
    validation: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
        num_rows: 89154
    })
})


In [None]:
# Restrict CodeSearchNet to Java, Python and JavaScript examples.
KEPT_LANGUAGES = ("java", "python", "javascript")


def _is_kept_language(example):
    """Return True when the example's ``language`` is one of KEPT_LANGUAGES."""
    return example["language"] in KEPT_LANGUAGES


dataset_csn = dataset_csn.filter(_is_kept_language, num_proc=64)

print(dataset_csn)


In [None]:
def preprocess_examples(example):
    """Build the ``code`` field: the function source with its documentation text removed.

    Args:
        example: a single record providing ``func_code_string`` and
            ``func_documentation_string``.

    Returns:
        A dict with one key, ``code``.

    Note:
        ``str.replace`` is used without a count, so every occurrence of the
        documentation text inside the source is removed, not only the first.
    """
    stripped_source = example["func_code_string"].replace(
        example["func_documentation_string"], ""
    )
    return dict(code=stripped_source)


# Run preprocess_examples over every split, one example at a time, spread
# across 64 worker processes; the returned ``code`` field becomes a new column.
dataset_csn = dataset_csn.map(function=preprocess_examples, num_proc=64)


In [None]:
def preprocess_examples(examples):
    codes = examples["code"]
    languages = examples["language"]

    contents = [
        f"{language} to summary\n{code}" for code, language in zip(codes, languages)
    ]
    model_inputs = tokenizer(contents)

    labels = tokenizer(examples["func_documentation_string"])
    model_inputs["labels"] = labels["input_ids"]

    return model_inputs


# Tokenize every split in batches of 100 examples using 64 worker processes.
csn_map_options = dict(batched=True, batch_size=100, num_proc=64)
dataset_csn = dataset_csn.map(preprocess_examples, **csn_map_options)


In [7]:
# Show the resulting splits, then write the tokenized dataset to disk.
CSN_TOKENIZED_DIR = "/data/nicolasmaier/dataset/hf_csn_1"
print(dataset_csn)
dataset_csn.save_to_disk(CSN_TOKENIZED_DIR)


DatasetDict({
    train: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url', 'code', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 990518
    })
    test: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url', 'code', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 55568
    })
    validation: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url', 'code', 'input_ids', 'attention_mask

In [None]:
# Upper bound (in tokens) applied to both the inputs and the labels.
# NOTE(review): presumably chosen to leave headroom under a 512-token model
# limit — confirm.
MAX_LENGTH = 505

# Apply the same length bound to one column at a time, printing the split
# sizes after each pass so the effect of every filter is visible.
dataset_csn_filtered = dataset_csn
for column in ("input_ids", "labels"):
    dataset_csn_filtered = dataset_csn_filtered.filter(
        lambda example, column=column: len(example[column]) <= MAX_LENGTH,
        num_proc=64,
    )
    print(dataset_csn_filtered)


In [9]:
# Show the length-filtered splits, then write them to disk.
CSN_CLEAN_DIR = "/data/nicolasmaier/dataset/hf_clean_csn_1"
print(dataset_csn_filtered)
dataset_csn_filtered.save_to_disk(CSN_CLEAN_DIR)


DatasetDict({
    train: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url', 'code', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 909090
    })
    test: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url', 'code', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 50975
    })
    validation: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url', 'code', 'input_ids', 'attention_mask

Loading cached processed dataset at /data/nicolasmaier/huggingface_cache/code_search_net/all/1.0.0/80a244ab541c6b2125350b764dc5c2b715f65f00de7a56107a28915fac173a27/cache-7c9f33ff27bec466.arrow
Loading cached processed dataset at /data/nicolasmaier/huggingface_cache/code_search_net/all/1.0.0/80a244ab541c6b2125350b764dc5c2b715f65f00de7a56107a28915fac173a27/cache-3e07b87b41575efc.arrow
Loading cached processed dataset at /data/nicolasmaier/huggingface_cache/code_search_net/all/1.0.0/80a244ab541c6b2125350b764dc5c2b715f65f00de7a56107a28915fac173a27/cache-6d5b0294cb4c1912.arrow


In [10]:
# Reload the previously saved dataset from disk and show its splits.
SEQ_SOURCE_DIR = "/data/nicolasmaier/dataset/hf_seq_dataset_3"
dataset_seq = load_from_disk(SEQ_SOURCE_DIR)
print(dataset_seq)


DatasetDict({
    train: Dataset({
        features: ['code', 'contents', 'xmi', 'originalLine', 'input_ids', 'attention_mask', 'seq', 'labels'],
        num_rows: 454273
    })
    valid: Dataset({
        features: ['code', 'contents', 'xmi', 'originalLine', 'input_ids', 'attention_mask', 'seq', 'labels'],
        num_rows: 15326
    })
    test: Dataset({
        features: ['code', 'contents', 'xmi', 'originalLine', 'input_ids', 'attention_mask', 'seq', 'labels'],
        num_rows: 26902
    })
})


In [None]:
def preprocess_examples(examples):
    """Tokenize a batch for the code-to-sequence task.

    Each input text is the fixed prefix "java to sequence", a newline and the
    code.

    Args:
        examples: a batch (dict of lists) providing ``code``.

    Returns:
        The tokenizer output for the prefixed texts.
    """
    prompts = []
    for code in examples["code"]:
        prompts.append(f"java to sequence\n{code}")

    return tokenizer(prompts)


# Tokenize every split in batches of 100 examples using 64 worker processes.
seq_map_options = dict(batched=True, batch_size=100, num_proc=64)
dataset_seq = dataset_seq.map(preprocess_examples, **seq_map_options)


In [12]:
# Show the resulting splits, then write the re-tokenized dataset to disk.
SEQ_TOKENIZED_DIR = "/data/nicolasmaier/dataset/hf_seq_1"
print(dataset_seq)
dataset_seq.save_to_disk(SEQ_TOKENIZED_DIR)


DatasetDict({
    train: Dataset({
        features: ['code', 'contents', 'xmi', 'originalLine', 'input_ids', 'attention_mask', 'seq', 'labels'],
        num_rows: 454273
    })
    valid: Dataset({
        features: ['code', 'contents', 'xmi', 'originalLine', 'input_ids', 'attention_mask', 'seq', 'labels'],
        num_rows: 15326
    })
    test: Dataset({
        features: ['code', 'contents', 'xmi', 'originalLine', 'input_ids', 'attention_mask', 'seq', 'labels'],
        num_rows: 26902
    })
})


In [None]:
# Upper bound (in tokens) applied to both the inputs and the labels.
# NOTE(review): presumably chosen to leave headroom under a 512-token model
# limit — confirm.
MAX_LENGTH = 505
# Examples are kept only when the length of their ``seq`` field exceeds this.
MIN_SEQ_LENGTH = 10

# Apply the length bound to one column at a time, printing the split sizes
# after each pass so the effect of every filter is visible.
dataset_seq_filtered = dataset_seq
for column in ("input_ids", "labels"):
    dataset_seq_filtered = dataset_seq_filtered.filter(
        lambda example, column=column: len(example[column]) <= MAX_LENGTH,
        num_proc=64,
    )
    print(dataset_seq_filtered)

# Finally drop examples whose ``seq`` field is too short.
dataset_seq_filtered = dataset_seq_filtered.filter(
    lambda example: len(example["seq"]) > MIN_SEQ_LENGTH,
    num_proc=64,
)
print(dataset_seq_filtered)


In [14]:
# Show the filtered splits, then write them to disk.
SEQ_CLEAN_DIR = "/data/nicolasmaier/dataset/hf_clean_seq_1"
print(dataset_seq_filtered)
dataset_seq_filtered.save_to_disk(SEQ_CLEAN_DIR)


DatasetDict({
    train: Dataset({
        features: ['code', 'contents', 'xmi', 'originalLine', 'input_ids', 'attention_mask', 'seq', 'labels'],
        num_rows: 366219
    })
    valid: Dataset({
        features: ['code', 'contents', 'xmi', 'originalLine', 'input_ids', 'attention_mask', 'seq', 'labels'],
        num_rows: 13021
    })
    test: Dataset({
        features: ['code', 'contents', 'xmi', 'originalLine', 'input_ids', 'attention_mask', 'seq', 'labels'],
        num_rows: 21560
    })
})


Flattening the indices:   0%|          | 0/367 [00:00<?, ?ba/s]

Flattening the indices:   0%|          | 0/14 [00:00<?, ?ba/s]

Flattening the indices:   0%|          | 0/22 [00:00<?, ?ba/s]

In [15]:
# Print both filtered DatasetDicts, then decode the first training example of
# each one (model input first, labels second) to eyeball the tokenization.
for filtered in (dataset_seq_filtered, dataset_csn_filtered):
    print(filtered)

for filtered in (dataset_seq_filtered, dataset_csn_filtered):
    first_row = filtered["train"][0]
    for field in ("input_ids", "labels"):
        print(tokenizer.decode(first_row[field]))


DatasetDict({
    train: Dataset({
        features: ['code', 'contents', 'xmi', 'originalLine', 'input_ids', 'attention_mask', 'seq', 'labels'],
        num_rows: 366219
    })
    valid: Dataset({
        features: ['code', 'contents', 'xmi', 'originalLine', 'input_ids', 'attention_mask', 'seq', 'labels'],
        num_rows: 13021
    })
    test: Dataset({
        features: ['code', 'contents', 'xmi', 'originalLine', 'input_ids', 'attention_mask', 'seq', 'labels'],
        num_rows: 21560
    })
})
DatasetDict({
    train: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url', 'code', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 909090
    })
    test: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_