### Notebook for transforming the dataset to mimic HumanEval

In [1]:
import pandas as pd
import re
import json

import os, sys
sys.path.append(os.path.abspath(os.path.join('..')))
from leetcode_env.utils import PySubmissionFormatter, RsSubmissionFormatter

In [2]:
def to_jsonl(dict_data, file_path) -> None:
    """Append *dict_data* to *file_path* as a single JSON Lines record.

    Args:
        dict_data: Any object ``json.dumps`` can serialise (here, one
            dataset record).
        file_path: Path of the ``.jsonl`` file. It is opened in append
            mode, so existing content is kept and calling this twice with
            the same record writes it twice.

    Raises:
        TypeError: If *dict_data* is not JSON-serialisable.
        OSError: If the file cannot be opened or written.
    """
    # Serialise before opening so a serialisation error never touches the file.
    json_line = json.dumps(dict_data)
    # Write a plain "\n": in text mode os.linesep would be translated again
    # and end up as "\r\r\n" on Windows. newline="\n" disables translation so
    # the file is byte-identical on every platform.
    with open(file_path, 'a', encoding='utf-8', newline='\n') as file:
        file.write(json_line + '\n')

In [3]:
# Read the source CSV (going by its name: the cleaned LeetCode-hard problems
# with code snippets) and draw 100 rows from it. The fixed random_state makes
# the draw reproducible from run to run.
data = pd.read_csv('data/leetcode_hard_with_snippets_clean.csv')
sample = data.sample(n=100, random_state=1337)

### Python Dataset

In [4]:
# Build one HumanEval-style record per sampled problem (Python flavour).
# The lookbehind makes the match start right after "def" + one whitespace
# character, so the match itself is just the function name.
function_name_regex = r"(?<=def\s)\w+"
lines = []

for ind, row in sample.iterrows():
    task_id = row['title_slug']
    snippet = row['python3_code_snippet']

    # Prefix every line of the problem statement with "# " so it sits above
    # the snippet as a comment block.
    statement_lines = row['description'].strip().split('\n')
    comment = '\n'.join(f'# {s}' for s in statement_lines)

    # Drop leading/trailing newlines from the formatter output and end the
    # prompt with exactly one newline.
    formatted = PySubmissionFormatter.to_humaneval(comment + '\n' + snippet)
    prompt = formatted.strip('\n') + '\n'

    # First function name defined in the snippet.
    entry_point = re.search(function_name_regex, snippet).group(0)

    # NOTE(review): HumanEval spells this key 'canonical_solution'; the
    # spelling below is kept as-is because it is part of the emitted schema.
    # Confirm against downstream readers before renaming.
    lines.append({
        'task_id': task_id,
        'prompt': prompt,
        'entry_point': entry_point,
        'cannonical_solution': '',
        'test': '',
    })


In [5]:
# Append each Python record to the sample file, one JSON object per line.
# to_jsonl opens the file in append mode, so re-running this cell adds the
# records again rather than replacing them.
for dict_data in lines:
    to_jsonl(dict_data, 'data/humaneval/leetcode-hard-py-100-sample.jsonl')

### Rust Dataset

In [6]:
# Build one HumanEval-style record per sampled problem (Rust flavour).
# The lookbehind makes the match start right after "fn" + one whitespace
# character, so the match itself is just the function name.
function_name_regex = r"(?<=fn\s)\w+"
lines = []

for ind, row in sample.iterrows():
    task_id = row['title_slug']
    # Prefix every line of the problem statement with "// " so it sits above
    # the snippet as a Rust line-comment block.
    comment = "\n".join([f"// {s}" for s in row['description'].strip().split("\n")])
    unformatted = comment + '\n' + row['rust_code_snippet']
    # Drop leading/trailing newlines from the formatter output and end the
    # prompt with exactly one newline.
    prompt = RsSubmissionFormatter.to_humaneval(unformatted).strip('\n') + '\n'
    # First function name defined in the snippet.
    entry_point = re.search(function_name_regex, row['rust_code_snippet']).group(0)

    line = {
        'task_id': task_id,
        'prompt': prompt,
        # Store the extracted function name, as the Python dataset does; the
        # title slug is already recorded under 'task_id'.
        'entry_point': entry_point,
        # NOTE(review): HumanEval spells this key 'canonical_solution'; the
        # spelling below is kept as-is because it is part of the emitted
        # schema. Confirm against downstream readers before renaming.
        'cannonical_solution': '',
        'test': ''
    }

    lines.append(line)

In [7]:
# Append each Rust record to the sample file, one JSON object per line.
# to_jsonl opens the file in append mode, so re-running this cell adds the
# records again rather than replacing them.
for dict_data in lines:
    to_jsonl(dict_data, 'data/humaneval/leetcode-hard-rs-100-sample.jsonl')