In [None]:
!pip install datasets tqdm

In [None]:
import gzip
import json
import pandas as pd

from collections import defaultdict
from datasets import load_dataset
from tqdm.auto import tqdm
from random import random, randint

# Instruction prefixes for the "generate code" samples. Each entry takes a
# `lang` placeholder and ends with a newline; the generation loop picks one at
# random, formats it with the language name and appends the function summary
# right after it.
ONE_STEP_OUPUT_CODE_TEMPLATES = [
    "Can you write a program in {lang} where it\n",
    "How would you implement a function in {lang} that\n",
    "Write a {lang} function for\n",
    "Can you create a {lang} program that\n",
    "Implement a function in {lang} to\n",
    "Write a {lang} script for\n",
    "How would you code a program in {lang} to\n",
    "Create a {lang} function for\n",
    "Write a {lang} program that can\n",
    "Can you implement a function in {lang} that\n",
]

# Instruction prefixes for the "summarize / document code" samples. Each entry
# takes a `lang` placeholder and ends with a newline; the function source is
# appended right after it.
#
# NOTE: the order matters. The generation loop treats indices 0-5 as requests
# for a plain explanation (the response is the raw summary) and the remaining
# indices as requests for documentation (the response is the summary wrapped
# in triple double quotes). Keep that split in sync when editing this list.
ONE_STEP_OUPUT_SUMMARY_TEMPLATES = [
    "Explain what the following {lang} code does\n",
    "Can you tell what is the following {lang} function doing\n",
    "Here you have a function in {lang}, explain what it does\n",
    "Make a summary of the following {lang} code\n",
    "Can you generate a brief explanation for the following {lang} code\n",
    "How would you explain what the following {lang} function does\n",
    "Can you generate the documentation for the following {lang} function\n",
    "Create a docstring for the following {lang} code\n",
    "Given the following {lang} function, write the documentation\n",
    "Write a docstring for the following {lang} function\n",
]


def remove_docstring(code_function):
    """Strip the leading docstring from the source code of a function.

    The docstring is only looked for on the line immediately following the
    first line of ``code_function`` (the ``def`` line) and must be delimited
    by triple double quotes.

    Args:
        code_function: Source code of a single function, as a string.

    Returns:
        The source with the docstring lines removed. The input is returned
        unchanged when it has no second line, when the second line contains
        no triple double quotes, or when an opened docstring is never closed.
    """
    triple_quotes = '"""'
    lines = code_function.split("\n")

    # A single-line function (e.g. "def f(): pass") cannot hold a docstring
    # on its second line; bail out instead of indexing past the end.
    if len(lines) < 2:
        return code_function

    header = lines[0]
    quote_count = lines[1].count(triple_quotes)

    # There is no docstring
    if quote_count == 0:
        return code_function
    # One-line docstring: opening and closing quotes share the same line
    if quote_count == 2:
        return "\n".join([header] + lines[2:])

    # Multi-line docstring: drop everything up to and including the first
    # later line that carries the closing quotes.
    for idx in range(2, len(lines)):
        if triple_quotes in lines[idx]:
            return "\n".join([header] + lines[idx + 1 :])

    # The closing quotes were never found. Leave the code untouched rather
    # than throwing away the whole function body.
    return code_function


# Build an instruction/response dataset out of the CodeSearchNet Python split.
# Every input row yields exactly one sample, which is either a "write this
# code" prompt or an "explain / document this code" prompt.
lang = "Python 3"
dataset = load_dataset("Nan-Do/codesearchnet-python")
data = defaultdict(list)

for data_point in tqdm(dataset["train"]):
    source_code = data_point["original_string"]
    description = data_point["summary"]
    data["SOURCE"].append("codesearchnet")

    if random() > 0.5:
        # Code generation: the prompt carries the summary and the answer is
        # the original function.
        choice = randint(0, len(ONE_STEP_OUPUT_CODE_TEMPLATES) - 1)
        prompt = ONE_STEP_OUPUT_CODE_TEMPLATES[choice].format(lang=lang) + description
        answer = source_code
    else:
        # Summary generation: the model is asked to produce the summary or
        # docstring, so most of the time the docstring is removed from the
        # function shown in the prompt.
        if random() < 0.9:
            source_code = remove_docstring(source_code)
        choice = randint(0, len(ONE_STEP_OUPUT_SUMMARY_TEMPLATES) - 1)
        prompt = ONE_STEP_OUPUT_SUMMARY_TEMPLATES[choice].format(lang=lang) + source_code
        # Templates 0-5 ask for an explanation; the rest ask for a docstring,
        # so the answer is wrapped in triple quotes for those.
        answer = description if choice <= 5 else '"""' + description + '"""'

    data["INSTRUCTION"].append(prompt)
    data["RESPONSE"].append(answer)

# Persist the samples as a parquet file for the upload step.
df = pd.DataFrame(data=data)
df.to_parquet("dataset.parquet", row_group_size=100, engine="pyarrow", index=False)

In [None]:
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub from within the notebook; the dataset is
# pushed to the Hub in a later cell.
notebook_login()

In [None]:
from datasets import Dataset

# Reload the parquet file produced by the generation cell as a `Dataset` and
# upload it to the "Nan-Do/open-assistant-codesearchnet-python" Hub repository.
ds = Dataset.from_parquet("dataset.parquet")
ds.push_to_hub("Nan-Do/open-assistant-codesearchnet-python")