### Notebook for transforming the dataset to mimic HumanEval

In [5]:
import pandas as pd
import re
import json

import os, sys
sys.path.append(os.path.abspath(os.path.join('..')))
from leetcode_env.utils import PySubmissionFormatter, RsSubmissionFormatter, metadata_from_slug
from leetcode_env.environment import LeetCodeEnv

from ast import literal_eval

In [2]:
def to_jsonl(dict_data, file_path):
    """Append ``dict_data`` to ``file_path`` as a single JSON line.

    Args:
        dict_data: JSON-serialisable object (here: one dataset record).
        file_path: Path of the ``.jsonl`` file. It is opened in append mode,
            so repeated calls (and repeated runs of the notebook) keep adding
            lines to the same file.
    """
    # The file is opened in text mode, which already translates '\n' into the
    # platform line ending. Writing os.linesep here would be translated a
    # second time and produce '\r\r\n' on Windows, i.e. a blank line between
    # records.
    with open(file_path, 'a', encoding='utf-8') as file:
        file.write(json.dumps(dict_data) + '\n')

In [23]:
# Load the problem table. The cells below read the columns 'question_slug',
# 'description', 'python3_snippet', 'rust_snippet' and 'example_test_cases'.
data = pd.read_csv('data/with_snippets/leetcode_hard_with_snippets_uncontaminated_cleaned_tests.csv')
# Optional: work on a reproducible 100-row subset instead of the full table.
# sample = data.sample(100, random_state=1337)

In [24]:
# read_csv yields this column as text; literal_eval parses each cell back into
# a Python object. The cells below iterate it as (kwargs, expected) pairs,
# where kwargs is a dict whose values are strings.
data['example_test_cases'] = data['example_test_cases'].apply(literal_eval)

### Python Dataset

In [25]:
# Matches the identifier that follows the first `def ` in a Python snippet.
function_name_regex = r"(?<=def\s)\w+"

# JSON-style tokens found in the raw test arguments and their Python spelling.
# 'rue' presumably repairs a truncated 'true' in the source data - TODO confirm.
_PY_LITERALS = {'null': 'None', 'true': 'True', 'false': 'False', 'rue': 'True'}
_PY_LITERAL_RE = re.compile(r'\b(?:null|true|false|rue)\b')


def _to_python_literal(value):
    """Rewrite JSON-style null/true/false tokens in ``value`` as Python literals.

    Matching is done on whole words in a single pass. The previous chain of
    ``str.replace`` calls first produced 'True' and then rewrote the 'rue'
    inside it, yielding 'TTrue'; it also touched substrings of longer words.
    """
    return _PY_LITERAL_RE.sub(lambda m: _PY_LITERALS[m.group(0)], value)


# One HumanEval-style record per problem; consumed by the JSONL export cell.
lines = []

for _, row in data.iterrows():
    task_id = row['question_slug']

    # Indent every description line by four spaces so it sits inside the
    # function body as a docstring.
    description = '\n    '.join(row['description'].strip().split('\n')).strip()
    docstring = f'''    """
    {description}
    """'''
    prompt = PySubmissionFormatter.to_humaneval(row['python3_snippet']).strip('\n') + '\n' + docstring + '\n'
    print(prompt)

    match = re.search(function_name_regex, row['python3_snippet'])
    if match is None:
        raise ValueError(f"no function definition found in python3_snippet for {task_id!r}")
    entry_point = match.group(0)

    # Each example becomes one assert line; arguments are passed positionally
    # in the order they appear in the kwargs dict.
    visible_tests = []
    for kwargs, expected in row['example_test_cases']:
        args = ', '.join(_to_python_literal(v) for v in kwargs.values())
        visible_tests.append(f'''    assert {entry_point}({args}) == {expected}''')

    # NOTE(review): 'entry_point' stores the question slug, not the function
    # name extracted above, and 'cannonical_solution' is spelled with a double
    # 'n'. Both are kept as-is because consumers of the JSONL file may rely on
    # them - confirm before changing.
    line = {
        'task_id': task_id,
        'prompt': prompt,
        'entry_point': task_id,
        'cannonical_solution': '',
        'test': '',
        'visible_tests': visible_tests,
    }

    lines.append(line)


def minReverseOperations(n: int, p: int, banned: List[int], k: int) -> List[int]:
    """
    You are given an integer n and an integer p in the range [0, n - 1]. Representing a 0-indexed array arr of length n where all positions are set to 0's, except position p which is set to 1.
    You are also given an integer array banned containing some positions from the array. For the ith position in banned, arr[banned[i]] = 0, and banned[i] != p.
    You can perform multiple operations on arr. In an operation, you can choose a subarray with size k and reverse the subarray. However, the 1 in arr should never go to any of the positions in banned. In other words, after each operation arr[banned[i]] remains 0.
    Return an array ans where for each i from [0, n - 1], ans[i] is the minimum number of reverse operations needed to bring the 1 to position i in arr, or -1 if it is impossible.
    A subarray is a contiguous non-empty sequence of elements within an array.
    The values of ans[i] are ind

In [26]:
# Append every Python record built above to the HumanEval-style JSONL file.
for record in lines:
    to_jsonl(record, 'data/humaneval/leetcode-hard-py-40-uncontaminated_tests.jsonl')

### Rust Dataset

In [27]:
# Matches the identifier that follows the first `fn ` in a Rust snippet.
function_name_regex = r"(?<=fn\s)\w+"

# Token rewrites for Rust test arguments. Rust booleans are already spelled
# `true`/`false`, so they are left alone (capitalising them, as the Python
# cell does, is not valid Rust). 'rue' presumably repairs a truncated 'true'
# in the source data - TODO confirm.
_RS_LITERALS = {'null': 'None', 'rue': 'true'}
_RS_LITERAL_RE = re.compile(r'\b(?:null|rue)\b')


def _to_rust_literal(value):
    """Rewrite whole-word `null` (and a truncated `rue`) in ``value`` for Rust.

    Matching on word boundaries in a single pass avoids the 'TTrue' corruption
    produced by the earlier chained ``str.replace`` calls.
    """
    return _RS_LITERAL_RE.sub(lambda m: _RS_LITERALS[m.group(0)], value)


# One HumanEval-style record per problem; consumed by the JSONL export cell.
lines = []

for _, row in data.iterrows():
    task_id = row['question_slug']

    # The problem statement is emitted as `//` line comments above the snippet.
    comment = "\n".join([f"// {s}" for s in row['description'].strip().split("\n")])
    unformatted = comment + '\n' + row['rust_snippet']
    prompt = RsSubmissionFormatter.to_humaneval(unformatted).strip('\n') + '\n'

    match = re.search(function_name_regex, row['rust_snippet'])
    if match is None:
        raise ValueError(f"no function definition found in rust_snippet for {task_id!r}")
    entry_point = match.group(0)

    # Each example becomes one assert_eq! line; arguments are passed
    # positionally in the order they appear in the kwargs dict.
    visible_tests = []
    for kwargs, expected in row['example_test_cases']:
        args = ', '.join(_to_rust_literal(v) for v in kwargs.values())
        visible_tests.append(f'''    assert_eq!({entry_point}({args}), {expected});''')

    # NOTE(review): 'entry_point' stores the question slug, not the function
    # name extracted above, and 'cannonical_solution' is spelled with a double
    # 'n'. Both are kept as-is because consumers of the JSONL file may rely on
    # them - confirm before changing.
    line = {
        'task_id': task_id,
        'prompt': prompt,
        'entry_point': task_id,
        'cannonical_solution': '',
        'test': '',
        'visible_tests': visible_tests,
    }

    lines.append(line)

In [28]:
# Append every Rust record built above to the HumanEval-style JSONL file.
for record in lines:
    to_jsonl(record, 'data/humaneval/leetcode-hard-rs-40-uncontaminated_tests.jsonl')