In [1]:
from datasets import load_dataset
# Load the training split of the LeetCode Python solutions dataset.
# (Plain string literal: the original f-string had no placeholders.)
docs = load_dataset("mhhmm/leetcode-solutions-python", split="train")

Found cached dataset json (C:/Users/Tommaso/.cache/huggingface/datasets/mhhmm___json/mhhmm--leetcode-solutions-python-c6d2758e3a6cc905/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)


In [2]:
import libcst as cst
import re
import random
from openbugger.bugger import Bugger, bugger_example
from time import perf_counter

In [3]:
import libcst as cst

from functools import lru_cache


def is_valid_python(code):
    """Return True when libcst can parse ``code`` as a module, else False.

    Any exception raised by the parser is treated as "not valid".
    """
    try:
        cst.parse_module(code)
    except Exception:
        return False
    else:
        return True


def extract_python_blocks(input_string, start=0, intervals=None):
    """Split text into line intervals that parse as Python and those that do not.

    The search looks for the earliest line that can open a parsable block and,
    for that line, the latest line that can close it. Blank lines and lines
    whose first non-blank character is ``#`` are never used as the first or
    last line of a candidate block. Once a block is found, the text after it
    is processed recursively.

    Args:
        input_string: Text to analyse, split on ``'\\n'``.
        start: Line offset added to every reported index (used by the
            recursive calls so indices refer to the original text).
        intervals: Accumulator dict with ``"Python"`` and ``"Non-Python"``
            lists; a fresh one is created when omitted.

    Returns:
        The accumulator dict; each list holds inclusive ``(first, last)``
        line-index tuples.
    """
    if intervals is None:
        intervals = {"Python": [], "Non-Python": []}

    lines = input_string.split('\n')
    n = len(lines)

    # Lines that may neither open nor close a candidate block.
    skippable = [
        not text.strip() or text.lstrip().startswith('#')
        for text in lines
    ]

    for first in range(n):
        if skippable[first]:
            continue
        # Try the longest candidate first, shrinking from the bottom.
        for last in range(n - 1, first - 1, -1):
            if skippable[last]:
                continue
            candidate = '\n'.join(lines[first:last + 1])
            if not is_valid_python(candidate):
                continue

            intervals["Python"].append((first + start, last + start))
            if first > 0:
                # Everything before the block in this chunk is non-Python.
                intervals["Non-Python"].append((start, first + start - 1))
            if last < n - 1:
                tail = '\n'.join(lines[last + 1:])
                return extract_python_blocks(tail, last + start + 1, intervals)
            return intervals

    # No parsable block in this chunk: the whole chunk is non-Python.
    whole_chunk = (start, start + n - 1)
    if n > 0 and whole_chunk not in intervals["Non-Python"]:
        intervals["Non-Python"].append(whole_chunk)
    return intervals



In [4]:
from babydragon.memory.frames.code_frame import CodeFrame
import polars as pl
import re

In [5]:
import libcst as cst

def cst_module(code):
    """Parse ``code`` with libcst and return the module, or None on any failure."""
    try:
        return cst.parse_module(code)
    except Exception:
        return None


def extract_strings_from_intervals(input_string, intervals):
    """Collect the text of every ``'Non-Python'`` interval into one string.

    Args:
        input_string: The original text, split on ``'\\n'``.
        intervals: Dict whose ``'Non-Python'`` entry lists inclusive
            ``(first, last)`` line-index pairs.

    Returns:
        The selected lines, one interval after another, each followed by a
        newline, with leading and trailing whitespace stripped from the result.
    """
    lines = input_string.split('\n')
    pieces = (
        '\n'.join(lines[first:last + 1]) + '\n'
        for first, last in intervals['Non-Python']
    )
    return ''.join(pieces).strip()

def extract_python_from_intervals(input_string, intervals):
    """Collect the text of every ``'Python'`` interval into one string.

    Args:
        input_string: The original text, split on ``'\\n'``.
        intervals: Dict whose ``'Python'`` entry lists inclusive
            ``(first, last)`` line-index pairs.

    Returns:
        The selected lines, one interval after another, each followed by a
        newline, with surrounding whitespace stripped; None when nothing but
        whitespace was selected.
    """
    lines = input_string.split('\n')
    chunks = [
        '\n'.join(lines[first:last + 1]) + '\n'
        for first, last in intervals['Python']
    ]
    python_code = ''.join(chunks).strip()
    # An empty result means there was no actual code to return.
    return None if python_code == '' else python_code




In [6]:
# Keep only the two text columns of the dataset, as a polars frame.
df = pl.DataFrame(
    {column: docs[column] for column in ('code_with_problem', 'code_with_data')}
)
df.head(5)

code_with_problem,code_with_data
str,str
"""# Given an arr…","""# two-sum # Tw…"
"""# You are give…","""# add-two-numb…"
"""# Given a stri…","""# longest-subs…"
"""# Given two so…","""# median-of-tw…"
"""# Given a stri…","""# longest-pali…"


In [7]:
import os

# Build "leet_code_python_preprocessed.parquet" only when it does not exist yet;
# otherwise reuse the copy already on disk.
if not os.path.exists("leet_code_python_preprocessed.parquet"):

    # extract_python_blocks tries many candidate line ranges per document and
    # each source string is needed twice (once for the code, once for the
    # prose), so memoise it instead of recomputing. The cached dicts are only
    # read by the extract_* helpers, never mutated.
    cached_blocks = lru_cache(maxsize=None)(extract_python_blocks)

    df_filtered = df.with_columns(
        pl.col('code_with_problem').apply(lambda x: extract_python_from_intervals(x, cached_blocks(x))).alias('python_code'),
        pl.col('code_with_problem').apply(lambda x: extract_strings_from_intervals(x, cached_blocks(x))).alias('non_python_text'),
        pl.col('code_with_data').apply(lambda x: extract_python_from_intervals(x, cached_blocks(x))).alias('python_code_data'),
        pl.col('code_with_data').apply(lambda x: extract_strings_from_intervals(x, cached_blocks(x))).alias('non_python_text_data')
    )
    # The memoised results are no longer needed once the columns are built.
    cached_blocks.cache_clear()

    # Drop rows for which no Python code could be extracted.
    df_filtered = df_filtered.filter(pl.col('python_code').is_not_null())
    df_filtered.write_parquet("leet_code_python_preprocessed.parquet")
else:
    df_filtered = pl.read_parquet("leet_code_python_preprocessed.parquet")



In [8]:
# Parse every extracted snippet with cst_module (None when parsing fails) and
# store the result in the "python_code_cst" column.
df_filtered_with_cst = df_filtered.with_columns(
    pl.col("python_code")
    .apply(lambda source: cst_module(source))
    .alias("python_code_cst")
)

In [9]:
from babydragon.codemods.openbugger.bugger import Bugger
from babydragon.codemods.openbugger.bugs.controlflow import ForgettingToUpdateVariableTransformer, InfiniteWhileTransformer, gen_OffByKIndexTransformer, IncorrectExceptionHandlerTransformer,MissingArgumentTransformer,ReturningEarlyTransformer
from babydragon.codemods.openbugger.bugs.data import IncorrectVariableInitializationTransformer, VariableNameTypoTransformer, MutableDefaultArgumentTransformer, UseBeforeDefinitionTransformer
from babydragon.codemods.openbugger.bugs.logical import gen_ComparisonTargetTransfomer, ComparisonSwapTransformer
from babydragon.codemods.openbugger.bugs.type import IncorrectTypeTransformer, NonExistingMethodTransformer, SwapForTransformer



In [10]:
# Bug transformers grouped by the module they were imported from.
bug_dict = {
    "controlflow": [
        ForgettingToUpdateVariableTransformer,
        InfiniteWhileTransformer,
        gen_OffByKIndexTransformer(),
        IncorrectExceptionHandlerTransformer,
        MissingArgumentTransformer,
        ReturningEarlyTransformer,
    ],
    "data": [
        IncorrectVariableInitializationTransformer,
        VariableNameTypoTransformer,
        MutableDefaultArgumentTransformer,
        UseBeforeDefinitionTransformer,
    ],
    "logical": [
        gen_ComparisonTargetTransfomer(),
        ComparisonSwapTransformer,
    ],
    "type": [
        IncorrectTypeTransformer,
        NonExistingMethodTransformer,
        SwapForTransformer,
    ],
}

# Flat list of every transformer, in the order of the groups above.
bug_list = [
    transformer
    for transformers in bug_dict.values()
    for transformer in transformers
]

In [11]:
def apply_bug(module, bugtype):
    """Inject one bug type into a parsed module, keeping it only if reversible.

    The bug is applied with a ``Bugger`` built from ``bugtype``. The result is
    kept only when (a) the module actually changed and (b) applying the bugger
    again with ``debug=True`` gives back a module equal to the original.

    Args:
        module: Parsed module exposing ``deep_equals`` (e.g. the output of
            ``cst_module``); ``None`` is accepted and yields ``None``.
        bugtype: Transformer handed to ``Bugger``; its ``__name__`` prefixes
            the keys of the result.

    Returns:
        A one-element list holding ``{"<name>_code": ..., "<name>_bugs": ...}``
        on success, otherwise ``None``.
    """
    # cst_module returns None for sources it cannot parse; nothing to bug then.
    if module is None:
        return None

    bugger = Bugger([bugtype])
    tainted = bugger.apply(module)

    # An unchanged module means the transformer found nothing to modify.
    if module.deep_equals(tainted):
        return None

    # Attempt to invert the transformation; keep the sample only if that
    # reproduces the original module.
    try:
        clean = bugger.apply(tainted, debug=True)
        if not module.deep_equals(clean):
            return None
        bugs = bugger.get_bugs()
        name = bugtype.__name__
        return [{name + "_code": tainted.code, name + "_bugs": bugs}]
    except Exception:
        # Best effort: a failed inversion discards the sample. Unlike a bare
        # ``except``, this lets KeyboardInterrupt/SystemExit propagate so a
        # long run can still be stopped.
        return None

def apply_bug_series(series, bugtype=None):
    """Run ``apply_bug`` on every module in ``series`` with one bug type.

    Args:
        series: Iterable of parsed modules.
        bugtype: Transformer forwarded to ``apply_bug``. It defaults to
            ``None`` only to keep the old one-argument signature callable;
            it is required as soon as ``series`` is non-empty.

    Returns:
        A list with one ``apply_bug`` result per input module, in order.

    Raises:
        TypeError: If ``series`` is non-empty and no ``bugtype`` was given.
    """
    modules = list(series)
    if modules and bugtype is None:
        # The previous version called apply_bug(i) without its required
        # ``bugtype`` argument, which failed with an opaque TypeError.
        raise TypeError("apply_bug_series() requires a 'bugtype' to apply")
    return [apply_bug(module, bugtype=bugtype) for module in modules]

In [12]:
df_bugged = df_filtered_with_cst

In [13]:
import time
# Apply each transformer in turn: its output columns are added to df_bugged and
# the accumulated frame is written to parquet after every transformer.
for bug in bug_list:
    start = time.time()
    bug_name = bug.__name__
    print("Bugging {bug_name}".format(bug_name=bug_name))

    # apply_bug returns a one-element list (or None); take the element and
    # unnest it into its own columns.
    bugged_column = (
        pl.col("python_code_cst")
        .apply(lambda module: apply_bug(module, bugtype=bug))
        .list.first()
        .alias(bug_name)
    )
    df_bugged = df_bugged.with_columns(bugged_column).unnest(bug_name)

    # Report how long this transformer took.
    timetobug = time.time() - start
    print("It took {timetobug}, for applying {bug_name}".format(timetobug=timetobug, bug_name=bug_name))

    # The parsed-module column is left out of the file written to disk
    # (presumably because those objects cannot be stored in parquet — confirm).
    columns_to_save = [pl.col(name) for name in df_bugged.columns if name != 'python_code_cst']
    df_bugged_filtered = df_bugged.select(columns_to_save)
    df_bugged_filtered.write_parquet("leet_code_python_bugged.parquet")


Bugging ForgettingToUpdateVariableTransformer
