### Notebook for cleaning the Leetcode Question dataset
- No void functions
- No class implementations
- No external definitions
- No examples in description


In [37]:
import pandas as pd
import re
from utils import get_code_snippets, title_slug, PySubmissionFormatter, RsSubmissionFormatter

In [38]:
# Load the snippet-augmented question table (hard problems, going by the file
# name); later cells read its 'c++_code_snippet', 'python3_code_snippet' and
# 'description' columns.
data = pd.read_csv('data/with_snippets/leetcode_hard_with_snippets.csv')

### Add the missing rows

In [39]:
# Pull the two questions (ids 446 and 1411) that the notebook heading describes
# as missing from the snippets table out of the raw dataset.
raw_data = pd.read_csv('data/raw/leetcode_dataset.csv')
# .copy() detaches the selection from the full raw frame so the column writes
# below target an independent DataFrame (avoids SettingWithCopyWarning).
raw_data = raw_data[raw_data['id'].isin([446, 1411])].copy()

# Fetch the code snippets for each selected question and store every snippet
# in a '<lang>_code_snippet' column (language name lower-cased). Iterating the
# 'url' column keeps the loop independent of the columns being added.
for ind, url in raw_data['url'].items():
    snippets = get_code_snippets(url)
    for snip in snippets:
        lang = snip['lang']
        code = snip['code']
        raw_data.at[ind, f'{lang.lower()}_code_snippet'] = code

# The slug is the last '/'-separated component of the question URL.
raw_data['title_slug'] = raw_data['url'].apply(lambda x: x.split('/')[-1])

In [40]:
# Stack the re-fetched rows under the existing table and renumber the index
# 0..n-1 so that every row label is unique and equals its position.
with_missing = pd.concat([data, raw_data], ignore_index=True)

### Remove rows that require external definitions

In [41]:
# Keep only the rows whose C++ snippet has 'class' as its first
# space-delimited token; any other opening is treated as a snippet that needs
# an external definition.
no_defs_inds = [
    label
    for label, snippet in with_missing['c++_code_snippet'].items()
    if snippet.partition(' ')[0] == 'class'
]
# Labels double as positions here because with_missing carries a fresh 0..n-1 index.
no_defs = with_missing.iloc[no_defs_inds]

### Remove rows that return none

In [42]:
# Collect the labels of rows whose Python 3 snippet has a docstring delimiter
# on its third line, then drop them.
# NOTE(review): presumably that docstring is the "do not return anything"
# notice LeetCode puts in void-function stubs - confirm against the data.
ret_inds = [
    label
    for label, snippet in no_defs['python3_code_snippet'].items()
    if '\"\"\"' in snippet.split('\n')[2]
]
ret = no_defs.drop(index=ret_inds)

### Remove rows that require class implementation

In [43]:
# Matches the identifier that follows the first 'def ' in a snippet.
function_name_regex = r"(?<=def\s)\w+"

# A snippet whose first function is __init__ asks for a whole class to be
# implemented. The labels are collected from `ret` (the frame they are dropped
# from) so that every label is guaranteed to exist there; collecting them from
# `no_defs` could raise KeyError for rows the previous filter already removed.
impl_inds = []
for ind, snippet in ret['python3_code_snippet'].items():
    match = re.search(function_name_regex, snippet)
    # A snippet with no 'def' at all cannot be a class implementation; keep it.
    if match is not None and match.group(0) == '__init__':
        impl_inds.append(ind)
no_impl = ret.drop(impl_inds)

### Remove Examples from descriptions

In [44]:
# delimiters = ["Example 1:",
#               "Example1:",
#               "Example:",
#               "Example :",
#               "Examples:",
#               "Valid Code Examples:",
#               "Evaluation Example:"]

def remove_examples(desc):
    """Cut a description off at the first line that mentions 'Example'.

    Each line is stripped of surrounding whitespace; the lines preceding the
    first one containing the substring 'Example' are re-joined with newlines
    and returned. If no line contains 'Example', ``desc`` is returned
    untouched (i.e. without any stripping).
    """
    stripped = [part.strip() for part in desc.split('\n')]
    cut = next(
        (pos for pos, text in enumerate(stripped) if 'Example' in text),
        None,
    )
    if cut is None:
        return desc
    return '\n'.join(stripped[:cut])

def remove_empty(desc: str):
    """Return ``desc`` with its blank lines removed.

    A line is dropped when it is empty or whitespace-only; every remaining
    line is kept exactly as written (no stripping) and the result is re-joined
    with newlines.
    """
    non_blank = filter(str.strip, desc.split('\n'))
    return '\n'.join(non_blank)

# Truncate every description at its examples section and drop blank lines.
# Done as one column-wise transformation so the frame is not written to while
# iterrows() is still iterating over it.
no_impl['description'] = no_impl['description'].apply(
    lambda desc: remove_empty(remove_examples(desc))
)

### Save

In [48]:
# Write the cleaned table next to the input file.
# NOTE(review): to_csv also writes the DataFrame index as an unnamed leading
# column by default - confirm downstream readers expect that column.
no_impl.to_csv('data/with_snippets/leetcode_hard_with_snippets_clean.csv')