# Sample 300 instances from APPS Test Dataset

In [1]:
import json
import random

In [None]:
# Load every record of the APPS test split (one JSON object per line).
# NOTE(review): the list is bound to `prompt_template` even though it holds
# dataset rows rather than a template; the name is kept because the sampling
# cell further down passes `prompt_template` as its dataset argument.
path = 'test.jsonl'  # downloaded from https://huggingface.co/datasets/codeparrot/apps
# UTF-8 is pinned so decoding does not depend on the platform's default
# encoding (the problem statements may contain non-ASCII characters).
with open(path, encoding='utf-8') as f:
    prompt_template = [json.loads(line) for line in f]

In [None]:
def sample_instances_of_(difficulty, dataset, N=100, seed=None):
    """Randomly pick N rows of one difficulty level, without replacement.

    Args:
        difficulty: Value that a row's ``'difficulty'`` field must equal.
        dataset: Iterable of dict-like rows, each with a ``'difficulty'`` key.
        N: Number of rows to draw.
        seed: Optional seed. When given, a dedicated ``random.Random(seed)``
            generator is used so the same rows are drawn on every run and the
            global ``random`` state is left untouched. When ``None`` the
            module-level generator is used, as before.

    Returns:
        A list of N distinct rows whose difficulty matches.

    Raises:
        ValueError: If fewer than N matching rows exist (or N is negative).
    """
    candidates = [row for row in dataset if row['difficulty'] == difficulty]
    if N > len(candidates):
        # Fail with context instead of random.sample's generic message.
        raise ValueError(
            f"Requested {N} '{difficulty}' instances but only "
            f"{len(candidates)} are available"
        )
    rng = random if seed is None else random.Random(seed)
    return rng.sample(candidates, N)

In [None]:
# Draw one sample per difficulty tier, in this order, using the default N.
introductory, interview, competition = (
    sample_instances_of_(level, prompt_template)
    for level in ('introductory', 'interview', 'competition')
)

In [12]:
# Persist the sampled problems as JSON Lines: introductory rows first, then
# interview, then competition.
filtered_path = 'APPS.jsonl'

with open(filtered_path, 'w') as f:
    f.writelines(
        json.dumps(item) + '\n'
        for item in introductory + interview + competition
    )

## Understanding Benchmarks

In [2]:
def read_jsonl(path):
    """Read a JSON Lines file into a list of decoded objects.

    Args:
        path: Location of a file holding one JSON document per line.

    Returns:
        A list with one decoded object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # UTF-8 is pinned so the result does not depend on the platform's
    # default text encoding.
    with open(path, encoding='utf-8') as f:
        # Blank lines (e.g. a stray trailing newline) carry no record.
        return [json.loads(line) for line in f if line.strip()]

def print_single_sample(samples):
    """Print the first sample's keys, then each of its fields in turn.

    Every field is printed as a ``>>>>>>``-prefixed key line followed by the
    value and a blank line. Only ``samples[0]`` is inspected.
    """
    first = samples[0]
    print(first.keys())
    for name, value in first.items():
        print(f'>>>>>>{name}\n{value}\n')

In [3]:
# Load the HumanEval records from the local JSONL file and print the first
# one to see which fields each record carries.
humaneval_path = 'HumanEval.jsonl'
he_data = read_jsonl(humaneval_path)
print_single_sample(he_data)

dict_keys(['task_id', 'prompt', 'entry_point', 'canonical_solution', 'test'])
>>>>>>task_id
HumanEval/0

>>>>>>prompt
from typing import List


def has_close_elements(numbers: List[float], threshold: float) -> bool:
    """ Check if in given list of numbers, are any two numbers closer to each other than
    given threshold.
    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)
    False
    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)
    True
    """


>>>>>>entry_point
has_close_elements

>>>>>>canonical_solution
    for idx, elem in enumerate(numbers):
        for idx2, elem2 in enumerate(numbers):
            if idx != idx2:
                distance = abs(elem - elem2)
                if distance < threshold:
                    return True

    return False


>>>>>>test


METADATA = {
    'author': 'jt',
    'dataset': 'test'
}


def check(candidate):
    assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3) == True
    assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.

In [8]:
# Read the base prompt template used for HumanEval. UTF-8 is pinned so the
# text does not depend on the platform's default encoding.
HE_base_prompt = '../prompt/HumanEval_base.txt'
with open(HE_base_prompt, encoding='utf-8') as f:
    prompt_template = f.read()

# Map each task_id to the template with that task's prompt passed as the
# `problem` format field. A comprehension is used so the builtin `id` is not
# rebound at notebook scope by a loop variable.
HE_prompts = {
    problem['task_id']: prompt_template.format(problem=problem['prompt'])
    for problem in he_data
}

In [10]:
# Save the task_id -> prompt mapping as pretty-printed JSON.
with open('HumanEval_prompts.json', 'w') as f:
    f.write(json.dumps(HE_prompts, indent=4))

In [11]:
# Load the APPS subset from the local JSONL file and print the first record
# to see which fields each record carries.
apps_path = 'APPS.jsonl'
apps_data = read_jsonl(apps_path)
print_single_sample(apps_data)

dict_keys(['id', 'question', 'solutions', 'input_output', 'difficulty', 'url', 'starter_code'])
>>>>>>id
4693

>>>>>>question
You are given two integers A and B as the input. Output the value of A + B.
However, if A + B is 10 or greater, output error instead.

-----Constraints-----
 - A and B are integers.
 - 1 ≤ A, B ≤ 9

-----Input-----
Input is given from Standard Input in the following format:
A B

-----Output-----
If A + B is 10 or greater, print the string error (case-sensitive); otherwise, print the value of A + B.

-----Sample Input-----
6 3

-----Sample Output-----
9


>>>>>>solutions
["# A - Restricted\n# https://atcoder.jp/contests/abc063/tasks/abc063_a\n\nA, B = list(map(int, input().split()))\n\nresult = A + B\nif result >= 10:\n    print('error')\nelse:\n    print(result)\n", "l = input().split()\n\nans = int(l[0]) + int(l[1])\n\nif 10 <= ans:\n   print('error')\nelse:\n   print(ans)", "a, b = map(int, input().split())\nif a+b >= 10:\n  print(\"error\")\nelse:\n  print(a+

In [None]:
# Show every starter_code that is non-empty, to judge whether it is useful.
non_empty_starters = (
    row['starter_code'] for row in apps_data if row['starter_code']
)
for starter in non_empty_starters:
    print(f'Non-empty Starter:\n{starter}\n')

# starter_code doesn't seem to help for our samples

Non-empty Starter:

# Enter your code here. Read input from STDIN. Print output to STDOUT

Non-empty Starter:

class Solution:
    def spiralOrder(self, matrix: List[List[int]]) -> List[int]:
        



In [12]:
# Read the base prompt template used for APPS. UTF-8 is pinned so the text
# does not depend on the platform's default encoding.
APPS_base_prompt = '../prompt/APPS_base.txt'
with open(APPS_base_prompt, encoding='utf-8') as f:
    prompt_template = f.read()

# Map each problem id to the template with that problem's question passed as
# the `problem` format field. A comprehension is used so the builtin `id` is
# not rebound at notebook scope by a loop variable.
APPS_prompts = {
    problem['id']: prompt_template.format(problem=problem['question'])
    for problem in apps_data
}

In [14]:
# Save the problem id -> prompt mapping as pretty-printed JSON.
with open('APPS_prompts.json', 'w') as f:
    f.write(json.dumps(APPS_prompts, indent=4))