## CodeT Code Generation Datasets

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/LAION-AI/Open-Assistant/blob/main/notebooks/data-augmentation/codet-data/Augment_CodeT_codegen.ipynb)

This notebook contains code to parse CodeT code generation prompt and solution data and modify to `(prompt, solution)` pairs outputted in a `.jsonl` file.

Requirements: `requests`

In [1]:
import json
from pathlib import Path
import requests
from typing import List, Tuple

In [2]:
DATA_FILES: List[str] = [
    "HumanEval_for_code_generation.jsonl",
    "mbpp_sanitized_for_code_generation.jsonl",
]

OUT_FILES: List[str] = [
    "HumanEval_codegen.jsonl",
    "mbpp_codegen.jsonl",
]

Path("data/augmented").mkdir(parents=True, exist_ok=True)

FILE_PATHS: List[Path] = [Path(f"data/{data_file}") for data_file in DATA_FILES]

OUT_PATHS: List[Path] = [Path(f"data/augmented/{out_file}") for out_file in OUT_FILES]

In [3]:
def download_file(filename: str):
    url = f"https://raw.githubusercontent.com/microsoft/CodeT/main/CodeT/data/dataset/{filename}"
    response = requests.get(url)
    with open(f"data/{filename}", "wb") as f:
        f.write(response.content)


for filename in DATA_FILES:
    download_file(filename)

We can find the docstring, use its contents as the instruction (prefixed with "Write a function corresponding to the docstring:") and then use the content prior to the docstring and the canonical solution as the response.

In [4]:
def get_docstring_indices(prompt_lines: List[str]) -> Tuple[int, int]:
    docstring_start, docstring_end = None, None

    for i, line in enumerate(prompt_lines):
        if not (line.strip().startswith('"""') or line.strip().startswith("'''")):
            continue
        if docstring_start:
            docstring_end = i
            break
        docstring_start = i

    if docstring_end:
        return docstring_start, docstring_end
    raise ValueError(f"No complete docstring found!\n{prompt_lines}")


def get_before(prompt_lines: List[str], before: int) -> List[str]:
    before_lines = prompt_lines[:before]
    return before_lines


def get_between(prompt_lines: List[str], start: int, end: int) -> List[str]:
    between_lines = prompt_lines[start:end]
    return between_lines

In [5]:
def get_request_and_solution(sample: dict) -> Tuple[List[str], List[str]]:
    prompt = sample["prompt"]
    prompt_lines = prompt.splitlines()

    docstring_start, docstring_end = get_docstring_indices(prompt_lines)

    # Extract prompt
    in_docstring = get_between(prompt_lines, docstring_start, docstring_end)
    if '"""' in in_docstring[0] or "'''" in in_docstring[0]:
        in_docstring[0] = in_docstring[0].replace('"""', "").replace("...", "").strip()
    request = "Write a Python function corresponding to the docstring: " + " ".join([p.strip() for p in in_docstring])

    # Extract solution
    before_docstring = get_before(prompt_lines, docstring_start)
    after_docstring = sample["canonical_solution"].splitlines()
    solution = before_docstring + after_docstring
    # Gets rid of consecutive empty lines
    solution = [v for i, v in enumerate(solution) if v != "" or v != solution[i - 1]]
    solution = "\n".join(solution)

    return request, solution

In [6]:
def process_file(file_path: Path, out_path: Path):
    lines = file_path.read_text().splitlines()
    samples = list(map(json.loads, lines))

    output = []
    for sample in samples:
        prompt, solution = get_request_and_solution(sample)
        output.append({"prompt": prompt, "solution": solution})

    with open(out_path, "w") as f:
        for sample in output:
            f.write(json.dumps(sample))
            f.write("\n")

In [7]:
for file_path, out_path in zip(FILE_PATHS, OUT_PATHS):
    process_file(file_path, out_path)

Display a sample output from HumanEval

In [8]:
sample = json.loads(Path("data/augmented/HumanEval_codegen.jsonl").read_text().splitlines()[0])

print("Prompt")
print(sample["prompt"])
print()
print("Solution")
print(sample["solution"])

Prompt
Write a Python function corresponding to the docstring: Check if in given list of numbers, are any two numbers closer to each other than given threshold.

Solution
from typing import List

def has_close_elements(numbers: List[float], threshold: float) -> bool:
    for idx, elem in enumerate(numbers):
        for idx2, elem2 in enumerate(numbers):
            if idx != idx2:
                distance = abs(elem - elem2)
                if distance < threshold:
                    return True

    return False


Display a sample output from MBPP

In [9]:
sample = json.loads(Path("data/augmented/mbpp_codegen.jsonl").read_text().splitlines()[0])

print("Prompt")
print(sample["prompt"])
print()
print("Solution")
print(sample["solution"])

Prompt
Write a Python function corresponding to the docstring: ''' Write a function to find the shared elements from the given two lists.

Solution
def similar_elements(test_tup1, test_tup2):
  res = tuple(set(test_tup1) & set(test_tup2))
  return (res) 
