In [1]:
from datasets import load_dataset
import json
import polars as pl

In [2]:
import libcst as cst
import re
import random
from openbugger.bugger import Bugger, bugger_example
from time import perf_counter

In [3]:
# Names of the bug transformers; each one is used as a column-name prefix in
# the data frames handled below.
all_bugs_names = [
    'ReturningEarlyTransformer',
    'SwapForTransformer',
    'VariableNameTypoTransformer',
    'ForgettingToUpdateVariableTransformer',
    'MutableDefaultArgumentTransformer',
    'UseBeforeDefinitionTransformer',
    'OffByKIndexTransformer',
    'ComparisonSwapTransformer',
    'InfiniteWhileTransformer',
    'MissingArgumentTransformer',
    'IncorrectExceptionHandlerTransformer',
    'IncorrectTypeTransformer',
    'ComparisonTargetTransformer',
    'IncorrectVariableInitializationTransformer',
    'NonExistingMethodTransformer',
]

In [4]:
import libcst as cst
import libcst.matchers as m
from black import format_str, FileMode

class RemoveComments(cst.CSTTransformer):
    """libcst transformer that removes every ``Comment`` node it leaves."""

    def leave_Comment(self, original_node, updated_node):
        # Returning the removal sentinel tells libcst to drop this node.
        return cst.RemovalSentinel.REMOVE

def remove_comments_and_lint(module_str: str) -> str:
    """Return ``module_str`` with comments stripped and formatted by black.

    The source is parsed with libcst, passed through ``RemoveComments`` and
    the regenerated code is formatted with black's default ``FileMode``.
    """
    comment_free_tree = cst.parse_module(module_str).visit(RemoveComments())
    return format_str(comment_free_tree.code, mode=FileMode())

def compare_modules(module1: str, module2: str) -> bool:
    """Tell whether two sources are equal once comments and layout are ignored.

    Both sources are normalised with ``remove_comments_and_lint``, re-parsed
    with libcst, and the resulting trees are compared with ``deep_equals``.
    """
    # Normalise both sources first, then parse both (same order as before).
    normalised_sources = [remove_comments_and_lint(source) for source in (module1, module2)]
    first_tree, second_tree = [cst.parse_module(source) for source in normalised_sources]
    return first_tree.deep_equals(second_tree)

In [5]:
import libcst as cst
import re
from functools import lru_cache

def is_valid_python(code):
    """Return True when ``code`` is non-empty and libcst can parse it as a module.

    Any exception raised while parsing is treated as "not valid".
    """
    if len(code) == 0:
        return False
    try:
        cst.parse_module(code)
    except Exception:
        return False
    else:
        return True

def is_single_word_line(line):
    """Return True when ``line`` contains exactly one ``\\w+`` word."""
    word_count = sum(1 for _ in re.finditer(r'\b\w+\b', line))
    return word_count == 1

def find_enclosed_newlines(input_string):
    """Locate newlines that fall inside quoted string literals.

    Four regexes are applied in turn (double-quoted, single-quoted,
    triple-double-quoted, triple-single-quoted), all with ``re.DOTALL``.

    Returns:
        A pair of parallel lists: the absolute index of each newline found
        inside a match, and the full text of the match that contains it.
        A newline matched by more than one pattern is reported once per match.
    """
    string_patterns = [r'"[^"\\]*(\\.[^"\\]*)*"', r"'[^'\\]*(\\.[^'\\]*)*'", r'"""(.*?)"""', r"'''(.*?)'''"]
    newline_positions = []
    owning_literals = []

    for pattern in string_patterns:
        for found in re.finditer(pattern, input_string, re.DOTALL):
            literal = found.group()
            offset = found.start()
            for relative_index, character in enumerate(literal):
                if character == '\n':
                    newline_positions.append(offset + relative_index)
                    owning_literals.append(literal)

    return newline_positions, owning_literals


def split_input_with_respect_to_enclosed_newlines(input_string):
    """Split ``input_string`` into lines, keeping newlines inside string literals.

    Newlines reported by ``find_enclosed_newlines`` do not end a line; every
    other newline does.

    Returns:
        List of line strings without the separating newline characters. The
        final segment is always appended, even when it is empty.
    """
    enclosed_newlines, _ = find_enclosed_newlines(input_string)
    # A set gives O(1) membership tests; the list lookup was O(len(list)) per
    # character, i.e. quadratic on long inputs.
    protected_positions = set(enclosed_newlines)
    lines = []
    line_start = 0
    for i, char in enumerate(input_string):
        if char == '\n' and i not in protected_positions:
            lines.append(input_string[line_start:i])
            line_start = i + 1
    lines.append(input_string[line_start:])  # add the last line
    return lines



def cst_module(code):
    """Parse ``code`` with libcst; return the module, or None if parsing raises."""
    try:
        return cst.parse_module(code)
    except Exception:
        return None


def extract_python_blocks(input_string, start=0, intervals=None):
    """Partition the lines of ``input_string`` into Python and non-Python runs.

    Candidate first lines are tried top to bottom. For each one, candidate
    last lines are tried from the end of the text backwards, so the longest
    span is attempted first. The first span accepted by ``is_valid_python``
    is recorded and the function recurses on the lines that follow it.

    Blank lines, lines starting with ``#`` and single-word lines are never
    used as the first or last line of a span.

    Args:
        input_string: Text to scan; it is split on newlines.
        start: Offset added to every recorded line index. Recursive calls use
            it so indices stay relative to the original text.
        intervals: Accumulator with "Python" and "Non-Python" lists; a fresh
            one is created when None.

    Returns:
        The ``intervals`` dict. Every entry is an inclusive
        ``(first_line, last_line)`` pair of line indices.
    """
    if intervals is None:
        intervals = {"Python": [], "Non-Python": []}

    lines = input_string.split('\n')
    n = len(lines)

    # Nested scan: every (i, j) pair may trigger a parse attempt.
    for i in range(n):
        if not lines[i].strip() or lines[i].lstrip().startswith('#') or is_single_word_line(lines[i]):  # skip blank, comment and single-word start lines
            continue
        for j in range(n-1, i-1, -1):
            if not lines[j].strip() or lines[j].lstrip().startswith('#') or is_single_word_line(lines[j]):  # skip blank, comment and single-word end lines
                continue
            code = '\n'.join(lines[i:j+1])
            if is_valid_python(code):
                intervals["Python"].append((i+start, j+start))
                if i > 0:
                    # Everything in this call before the block is non-Python.
                    intervals["Non-Python"].append((start, i+start-1))
                if j < n-1:
                    # Keep scanning after the block, carrying the line offset.
                    remaining = '\n'.join(lines[j+1:])
                    return extract_python_blocks(remaining, j+start+1, intervals)
                return intervals
    # No span parsed: the whole range seen by this call is non-Python.
    if n > 0 and (start, start+n-1) not in intervals["Non-Python"]:
        intervals["Non-Python"].append((start, start+n-1))
    return intervals

In [6]:



def extract_strings_from_intervals(input_string, intervals):
    """Join the text of every "Non-Python" interval into one stripped string.

    Each interval is an inclusive ``(start, end)`` pair of line indices into
    ``input_string`` split on newlines.
    """
    lines = input_string.split('\n')
    pieces = [
        '\n'.join(lines[first:last + 1]) + '\n'
        for first, last in intervals['Non-Python']
    ]
    return ''.join(pieces).strip()

def extract_python_from_intervals(input_string, intervals):
    """Return the stripped text of every "Python" interval.

    Returns:
        None when there are no Python intervals. Otherwise a list with one
        entry per interval: the stripped text, or None when that text is
        empty after stripping.
    """
    python_intervals = intervals['Python']
    if len(python_intervals) == 0:
        return None

    lines = input_string.split('\n')
    python_code_list = []
    for first, last in python_intervals:
        snippet = '\n'.join(lines[first:last + 1]).strip()
        python_code_list.append(snippet if snippet != '' else None)
    return python_code_list

In [7]:
def check_markdown(before, after):
    """Classify the fencing around a code block from its neighbouring lines.

    Args:
        before: Line immediately above the block ('' when there is none).
        after: Line immediately below the block ('' when there is none).

    Returns:
        Tuple of booleans ``(is_python_markdown, is_plain_markdown,
        is_other_markdown, is_no_markdown, is_unclosed)``.
    """
    head = before.strip()
    tail = after.strip()
    opens_fence = head.startswith('```')
    closes_fence = tail.startswith('```')

    is_python_markdown = head.startswith('```python') and closes_fence
    is_plain_markdown = head == '```' and tail == '```' and not is_python_markdown
    is_other_markdown = opens_fence and not is_python_markdown and not is_plain_markdown
    is_no_markdown = not opens_fence and not closes_fence and not is_other_markdown
    # Exactly one of the two fences is present.
    is_unclosed = opens_fence != closes_fence

    return is_python_markdown, is_plain_markdown, is_other_markdown, is_no_markdown, is_unclosed

def detect_markdown_blocks(input_string, intervals):
    """Describe the markdown fencing around every Python interval.

    Args:
        input_string: Full text; it is split on newlines.
        intervals: Dict whose "Python" entry lists inclusive ``(start, end)``
            line-index pairs.

    Returns:
        Dict mapping each interval to a dict of the five ``check_markdown``
        flags, in the order ``is_python_markdown``, ``is_plain_markdown``,
        ``is_other_markdown``, ``is_no_markdown``, ``is_unclosed``.
    """
    flag_names = (
        'is_python_markdown',
        'is_plain_markdown',
        'is_other_markdown',
        'is_no_markdown',
        'is_unclosed',
    )
    lines = input_string.split('\n')
    markdown_info = {}
    for interval in intervals['Python']:
        start, end = interval
        # A block touching the first/last line has no neighbour on that side;
        # '' then stands for "no fence". This already covers the
        # end-of-text case the old explicit branch re-checked.
        before = lines[start - 1] if start - 1 >= 0 else ''
        after = lines[end + 1] if end + 1 < len(lines) else ''
        markdown_info[interval] = dict(zip(flag_names, check_markdown(before, after)))
    return markdown_info


def uniform_markdown(input_string, markdown_info):
    """Rewrite the text so every detected code block sits in a ```python fence.

    Args:
        input_string: Full text; it is split on newlines.
        markdown_info: Mapping ``(start, end) -> flags`` as produced by
            ``detect_markdown_blocks`` (inclusive line indices; flags keyed
            ``is_plain_markdown``, ``is_other_markdown``, ``is_no_markdown``,
            ``is_unclosed``).

    Returns:
        The text with fences added or normalised.
    """
    lines = input_string.split('\n')
    for (start, end), info in markdown_info.items():
        # Read flags by name rather than relying on dict value order.
        is_plain_markdown = info['is_plain_markdown']
        is_other_markdown = info['is_other_markdown']
        is_no_markdown = info['is_no_markdown']
        is_unclosed = info['is_unclosed']

        if not (is_plain_markdown or is_other_markdown or is_no_markdown or is_unclosed):
            continue  # already a well-formed ```python block

        if is_no_markdown or is_unclosed:
            # Fences are glued onto existing list entries instead of being
            # inserted, so indices of this and later intervals stay valid.
            # NOTE(review): for an unclosed block whose opening fence exists,
            # this still adds a second ```python line after it.
            if start - 1 >= 0:
                lines[start - 1] = lines[start - 1] + '\n' + '```python'
            else:
                # Block starts on the first line. The old insert(0, ...)
                # shifted every index by one and put the closing fence before
                # the block's last line.
                lines[start] = '```python' + '\n' + lines[start]

            if end + 1 < len(lines):
                lines[end + 1] = '```' + '\n' + lines[end + 1]
            else:
                lines.append('```')
        else:
            # Fence lines exist on both sides: overwrite them with the
            # canonical opening and closing fences.
            if start - 1 >= 0:
                lines[start - 1] = '```python'
            if end + 1 < len(lines):
                lines[end + 1] = '```'
    return '\n'.join(lines)


In [8]:
def check_for_original_code(message,original_code):
    """Return True when any Python block found in ``message`` equals ``original_code``.

    Blocks come from ``extract_python_blocks`` and are compared with
    ``compare_modules``, so comments and formatting are ignored.
    """
    block_intervals = extract_python_blocks(message)
    snippets = extract_python_from_intervals(message, block_intervals)
    if snippets is None:
        return False
    return any(compare_modules(snippet, original_code) for snippet in snippets)




In [9]:
def check_and_modify(message, original_code,bug,question=False):
    """Make sure ``message`` contains ``original_code`` and normalise its fences.

    Args:
        message: Chat message to check. Non-string values yield None.
        original_code: Code the message is expected to contain. Non-string
            values yield None.
        bug: Bug name, used as prefix of the output keys.
        question: True for a user question, False for an assistant answer.
            Selects the appended lead-in sentence and the key suffix.

    Returns:
        None when either input is not a string. Otherwise a one-element list
        holding ``{<bug><suffix>: text, <bug><suffix>_status: status}`` with
        status "found" (code already present), "nocode" (no Python block in
        the message, code appended) or "replaced" (longest Python block
        swapped for ``original_code``).
    """
    if not isinstance(message, str) or not isinstance(original_code, str):
        return None
    added_string = "Here is the corrected code: " if not question else "Here is my code: "
    added_name = "_assistant_checked" if not question else "_user_checked"

    # Extract once and reuse for both the "found" check and the replacement;
    # extraction re-parses many line spans and is the expensive step.
    python_ids = extract_python_blocks(message)
    python_code = extract_python_from_intervals(message, python_ids)
    # Drop empty (None) entries so len() and the comparison never see None.
    candidates = [code for code in (python_code or []) if code is not None]

    if any(compare_modules(code, original_code) for code in candidates):
        out = message
        status = "found"
    elif not candidates:
        # If no Python code is found in the message, append the original code
        newline = chr(10)
        out = message + newline + newline + added_string + newline + original_code
        status = "nocode"
    else:
        # If non matching Python code is found in the message, replace the longest one with the original code
        max_length_code = max(candidates, key=len)
        out = message.replace(max_length_code, original_code)
        status = "replaced"

    code_blocks = extract_python_blocks(out)
    md_blocks = detect_markdown_blocks(out, code_blocks)

    cleaned_out = uniform_markdown(out, md_blocks)
    dict_out = {bug+added_name: cleaned_out, bug+added_name+"_status": status}
    return [dict_out]


In [10]:
# Load the unchecked conversations from the local parquet file.
df = pl.read_parquet("df_complete_leetcode_dirty.parquet")

In [11]:
# Goal of the following cells: check that every bug question contains the
# corresponding bugged code, and that every debugging answer contains the
# corresponding debugged code.
# Pull one row's columns for the first bug type as a sample to inspect.
i = 4
code_example = df["code"][i]
bug_code_example = df["ReturningEarlyTransformer_code"][i]
question_example = df["ReturningEarlyTransformer_user"][i]
answer_example = df["ReturningEarlyTransformer_assistant"][i]

In [12]:
# First bug name; the loops below rebind `bug` on every iteration.
bug = all_bugs_names[0]

In [13]:
import os
import time


def _check_question(row, bug):
    """Check one user question against the bugged code it should show.

    Returns None when the row has no bugged code (presumably no bug was
    generated for that problem - TODO confirm), otherwise the result of
    ``check_and_modify`` in question mode.
    """
    bugged_code = row[bug+"_code"]
    if bugged_code is None:
        return None
    return check_and_modify(row[bug+"_user"], bugged_code, bug=bug, question=True)


# Resume from the cached result if a previous run already wrote it.
if os.path.exists("df_complete_leetcode_corrected.parquet"):
    updated_df = pl.read_parquet("df_complete_leetcode_corrected.parquet")
else:
    updated_df = df
for bug in all_bugs_names :
    if bug+"_assistant_checked" not in updated_df.columns:
        print("correcting bug "+bug)
        start = time.time()
        # The user's question must show the *bugged* code, so it is checked
        # against <bug>_code. It used to be checked against the clean "code"
        # column, which put the already-correct solution into the question.
        updated_df = updated_df.with_columns(pl.struct([bug+"_code", bug+"_user"]).apply(lambda row: _check_question(row, bug)).list.first().alias(bug+"_user_corrected")).unnest(bug+"_user_corrected")
        # The assistant's answer must contain the original (debugged) code.
        updated_df = updated_df.with_columns(pl.struct(["code", bug+"_assistant"]).apply(lambda row: check_and_modify(row[bug+"_assistant"],row["code"],bug=bug,question=False)).list.first().alias(bug+"_assistant_corrected")).unnest(bug+"_assistant_corrected")
        # Checkpoint after every bug type so an interrupted run can resume.
        updated_df.write_parquet("df_complete_leetcode_corrected.parquet")
        end = time.time()
        print("time for bug "+bug+": "+str(end-start))
    else:
        print("bug "+bug+" already corrected")


bug ReturningEarlyTransformer already corrected
bug SwapForTransformer already corrected
bug VariableNameTypoTransformer already corrected
bug ForgettingToUpdateVariableTransformer already corrected
bug MutableDefaultArgumentTransformer already corrected
bug UseBeforeDefinitionTransformer already corrected
bug OffByKIndexTransformer already corrected
bug ComparisonSwapTransformer already corrected
bug InfiniteWhileTransformer already corrected
bug MissingArgumentTransformer already corrected
bug IncorrectExceptionHandlerTransformer already corrected
bug IncorrectTypeTransformer already corrected
bug ComparisonTargetTransformer already corrected
bug IncorrectVariableInitializationTransformer already corrected
bug NonExistingMethodTransformer already corrected


In [14]:
# Per bug type: how many user questions ended up with each check status.
for bug in all_bugs_names:
    print(updated_df[bug+'_user_checked_status'].value_counts())

shape: (3, 2)
┌───────────────────────────────────┬────────┐
│ ReturningEarlyTransformer_user_c… ┆ counts │
│ ---                               ┆ ---    │
│ str                               ┆ u32    │
╞═══════════════════════════════════╪════════╡
│ nocode                            ┆ 2318   │
│ replaced                          ┆ 1      │
│ null                              ┆ 29     │
└───────────────────────────────────┴────────┘
shape: (3, 2)
┌───────────────────────────────────┬────────┐
│ SwapForTransformer_user_checked_… ┆ counts │
│ ---                               ┆ ---    │
│ str                               ┆ u32    │
╞═══════════════════════════════════╪════════╡
│ replaced                          ┆ 1      │
│ nocode                            ┆ 1171   │
│ null                              ┆ 1176   │
└───────────────────────────────────┴────────┘
shape: (2, 2)
┌───────────────────────────────────┬────────┐
│ VariableNameTypoTransformer_user… ┆ counts │
│ ---             

In [15]:
# Per bug type: how many assistant answers ended up with each check status.
for bug in all_bugs_names:
    print(updated_df[bug+'_assistant_checked_status'].value_counts())

shape: (4, 2)
┌───────────────────────────────────┬────────┐
│ ReturningEarlyTransformer_assist… ┆ counts │
│ ---                               ┆ ---    │
│ str                               ┆ u32    │
╞═══════════════════════════════════╪════════╡
│ null                              ┆ 29     │
│ replaced                          ┆ 597    │
│ nocode                            ┆ 295    │
│ found                             ┆ 1427   │
└───────────────────────────────────┴────────┘
shape: (4, 2)
┌───────────────────────────────────┬────────┐
│ SwapForTransformer_assistant_che… ┆ counts │
│ ---                               ┆ ---    │
│ str                               ┆ u32    │
╞═══════════════════════════════════╪════════╡
│ found                             ┆ 669    │
│ null                              ┆ 1176   │
│ replaced                          ┆ 448    │
│ nocode                            ┆ 55     │
└───────────────────────────────────┴────────┘
shape: (4, 2)
┌─────────────────

In [16]:
# Replacement openers; substitute_beginning picks one at random for each
# stereotyped opener it finds.
alternative_begins = [
    "Embarking on my coding journey,",
    "Dipping my toes into the world of programming,",
    "Venturing into the realm of software development,",
    "Initiating my journey with code,",
    "As I navigate the landscape of coding,",
    "Programming, a new frontier for me,",
    "My initiation into the world of scripts involves,",
    "Just commencing my voyage in software creation,",
    "As a newcomer to coding,",
    "In the early stages of my exploration into programming languages,",
    "Beginning my adventure in the realm of code,",
    "Stepping into the programming world,",
    "Starting to unravel the intricacies of Python,",
    "In the midst of learning Python programming,",
    "As an apprentice of Python,",
    "Launching my Python expedition,",
    "Starting my odyssey with Python,",
    "Embarking on my adventure in Python,",
    "In the infancy of my Python experience,",
    "Taking my first steps in Python,"
]

# Openers to be replaced in the user questions.
stereotypes = ["As a beginner,","As a beginner programmer,"]


In [17]:
import random

def substitute_beginning(string, stereotypes, alternative_begins):
    """Replace each stereotyped opener present in ``string`` with a random alternative.

    For every entry of ``stereotypes`` found in the text, one alternative is
    drawn with ``random.choice`` and substituted for all its occurrences.
    """
    rewritten = string
    for opener in stereotypes:
        if opener not in rewritten:
            continue
        replacement = random.choice(alternative_begins)
        rewritten = rewritten.replace(opener, replacement)
    return rewritten




In [18]:
# Add one "<bug>_user_checked_diversified" column per bug type, holding the
# checked user question after substitute_beginning has been applied.
# The replacement is random and no seed is set in this notebook.
diversified_df = updated_df
for bug in all_bugs_names :
    diversified_df = diversified_df.with_columns(
        pl.col(bug+'_user_checked')
        .apply(lambda x: substitute_beginning(x,stereotypes=stereotypes,alternative_begins=alternative_begins),strategy="threading",return_dtype=pl.Utf8)
        .alias(bug+'_user_checked_diversified')
    )

In [19]:
# Persist the frame with the diversified question columns.
diversified_df.write_parquet("df_complete_leetcode_corrected_diversified.parquet")

In [20]:
# Display the column names of the diversified frame.
diversified_df.columns

['problem',
 'code',
 'cst',
 'ReturningEarlyTransformer_code',
 'SwapForTransformer_code',
 'VariableNameTypoTransformer_code',
 'ForgettingToUpdateVariableTransformer_code',
 'MutableDefaultArgumentTransformer_code',
 'UseBeforeDefinitionTransformer_code',
 'OffByKIndexTransformer_code',
 'ComparisonSwapTransformer_code',
 'InfiniteWhileTransformer_code',
 'MissingArgumentTransformer_code',
 'IncorrectExceptionHandlerTransformer_code',
 'IncorrectTypeTransformer_code',
 'ComparisonTargetTransformer_code',
 'IncorrectVariableInitializationTransformer_code',
 'NonExistingMethodTransformer_code',
 'ReturningEarlyTransformer_bugs',
 'SwapForTransformer_bugs',
 'VariableNameTypoTransformer_bugs',
 'ForgettingToUpdateVariableTransformer_bugs',
 'MutableDefaultArgumentTransformer_bugs',
 'UseBeforeDefinitionTransformer_bugs',
 'OffByKIndexTransformer_bugs',
 'ComparisonSwapTransformer_bugs',
 'InfiniteWhileTransformer_bugs',
 'MissingArgumentTransformer_bugs',
 'IncorrectExceptionHandlerTra

In [21]:
# Pool of system prompts; chatml_qa draws one at random for each conversation.
system_prompts = [
    "Firstly, delineate a comprehensive overview of the purpose and functionality of the code. Proceed to examine each line and elucidate how it contributes to the overall logic. Isolate complex segments and expound upon them in layman's terms, ensuring to clearly describe the role of each variable, function, or key element. Don't forget to identify any areas that might commonly be misconstrued or could be a source of error, explaining their resolution if required. Finally, offer words of motivation and tie the code's workings to real-world applications where possible.",
    
    "Begin with establishing the primary role of the code. From there, break down each segment of the code, articulating its role in the grand scheme of the functionality. Special emphasis should be given to significant variables, functions, or other crucial components, explaining their purpose in simple terms. Call attention to any usual mistakes or bugs that might occur in this context, providing an understanding of their cause and potential fix. Encourage the user by confirming that understanding complex codes is part of the learning journey and connect the theory to real-world use-cases.",

    "Commence by offering a broad understanding of the code's purpose. Then, piece-by-piece, explain the importance and role of each line of code, particularly highlighting the significance of critical elements such as variables or functions. In the process, identify common pitfalls or bugs, elucidate their cause and solution. Encourage the user by assuring them that grappling with such problems is normal during learning. Lastly, relate the concepts or operations to broader real-life applications or programming principles.",

    "Set the scene by explaining the primary function of the code. Following that, dissect each part of the code and convey its contribution to the overall operation. Pay attention to the explanation of key elements like variables or functions, making sure their purpose is understood in the simplest of terms. Identify any common mistakes, bugs, or issues, provide insight into their root cause, and explain how to fix them. Reiterate to the user that facing challenges is part of the learning curve. Conclude by illustrating how the code's workings link to real-world applications.",

    "Start by setting the context with an overview of the code's main role. Then, go line by line, detailing the functionality of each segment, with special focus on critical elements like variables or functions. Explain any common errors or issues that could arise in the given context, giving an understanding of their origin and how to address them. Affirm the user that tackling such complexities is part of the journey of learning code. Finish off by connecting the code's concept to practical applications in the real world.",

    "Kick-start your response with an overarching explanation of what the code aims to achieve. Following that, dissect the code into manageable chunks, laying bare how each segment contributes to the total functionality. Extra focus should be given to crucial components such as variables and functions, simplifying their role for the user. Shine a light on any common errors or bugs that could occur and illustrate their resolutions. Provide reassurance to the user that grappling with such issues is part and parcel of the learning journey. To wrap up, link the workings of the code to practical real-world scenarios.",

    "Start with providing a bird's eye view of the code's main purpose. Then, systematically break down its operation, explaining how each part plays a role in the overall scheme. Be sure to clarify the function of key variables or operations, simplifying the explanation as needed. Identify any likely errors or issues, explaining their source and how to fix them. Remind the user that these challenges are a normal part of the learning process. Finally, connect the principles used in the code to real-world examples where possible.",

    "Initiate your response by providing a broad description of the code's purpose. Progressively delve into each segment, clarifying its significance and role in the overall operation. Be meticulous when explaining key variables or functions, ensuring their purpose is easily grasped. Point out any areas that might cause confusion or result in bugs, discussing how to tackle these issues. Offer reassurance to the user that wrestling with these challenges is a stepping stone in the learning process. Conclude by showing how the code's concept or principles can be applied in the real world.",

    "Embark on your explanation with a comprehensive synopsis of the code's goal. Then proceed to analyse each part of the code, describing its function and how it contributes to the bigger picture. Make a special effort to simplify the role of critical components such as variables and functions. Highlight any potential pitfalls or bugs, explaining their cause and remedy. Remember to motivate the user, assuring them that encountering such complexities is normal when learning to code. As a finishing touch, draw a connection between the code's workings and their practical applications.",

    "Begin by outlining the overall aim of the code. Then take a deeper dive into each section, elaborating on its role in achieving the final outcome. Pay particular attention to key elements like variables and functions, ensuring their role is demystified. Shed light on any common bugs or errors that might occur, offering guidance on how to solve them. Reinforce the idea to the user that confronting such issues is part and parcel of the learning journey. Lastly, connect the dots between the code's operation and its real-world applications."
]




In [88]:
import random
def add_system_message(message):
    """Wrap ``message`` as a ChatML system turn."""
    return "".join(["<|im_start|>system\n", message, "<|im_end|>"])
def add_user_message(message):
    """Wrap ``message`` as a ChatML user turn."""
    return "".join(["<|im_start|>user\n", message, "<|im_end|>"])
def add_assistant_message(message):
    """Wrap ``message`` as a ChatML assistant turn."""
    return "".join(["<|im_start|>assistant\n", message, "<|im_end|>"])

def chatml_qa(user_string, assistant_string, system_prompts,bug):
    """Build one ChatML conversation (system, user, assistant) for a bug type.

    Args:
        user_string: User question.
        assistant_string: Assistant answer.
        system_prompts: Sequence of system prompts; one is drawn at random.
        bug: Bug name, used as prefix of the output keys.

    Returns:
        None when either the question or the answer is not a string.
        Otherwise a one-element list holding
        ``{<bug>_chat_ml: conversation, <bug>_system: chosen system prompt}``.
    """
    # A missing answer used to raise TypeError during string concatenation;
    # treat it like a missing question and skip the row.
    if not isinstance(user_string, str) or not isinstance(assistant_string, str):
        return None

    # Randomly select a system prompt
    system_message = random.choice(system_prompts)

    conversation = [add_system_message(system_message), add_user_message(user_string), add_assistant_message(assistant_string)]
    results = {bug+"_chat_ml": '\n'.join(conversation), bug+"_system": system_message}
    return [results]


In [23]:
# Reload the diversified frame from disk.
diversified_df = pl.read_parquet("df_complete_leetcode_corrected_diversified.parquet")

In [89]:


# For every bug type, build the ChatML conversation and unnest the result
# into "<bug>_chat_ml" and "<bug>_system" columns.
chat_ml_df = diversified_df
for bug in all_bugs_names :
    # A throwaway frame built from a sample dict supplies the struct schema,
    # which is passed to apply() as an explicit return dtype.
    struct_schema = pl.DataFrame(data={"dictdata" : {bug+"_chat_ml": "", bug+"_system": "" }})["dictdata"].struct.schema
    return_dtype=pl.List(inner=pl.Struct(fields=struct_schema))
    chat_ml_df = chat_ml_df.with_columns(
        pl.struct([bug+'_user_checked_diversified', bug+'_assistant_checked'])
        .apply(lambda row: chatml_qa(row[bug+"_user_checked_diversified"],row[bug+'_assistant_checked'],system_prompts=system_prompts,bug=bug),strategy="threading",skip_nulls=True,return_dtype=return_dtype)
        .list.first()
        .alias(bug+'_chat')
    ).unnest(bug+'_chat')

In [24]:


# chat_ml_df = diversified_df
# for bug in all_bugs_names :
#     chat_ml_df = chat_ml_df.with_columns(
#         pl.struct([bug+'_user_checked_diversified', bug+'_assistant_checked'])
#         .apply(lambda row: chatml_qa(row[bug+"_user_checked_diversified"],row[bug+'_assistant_checked'],system_prompts=system_prompts),strategy="threading",return_dtype=pl.Utf8)
#         .alias(bug+'_chat_ml')
#     )

In [90]:
# Print one generated conversation as a spot check.
print(chat_ml_df["ReturningEarlyTransformer_chat_ml"][50])

<|im_start|>system
Start with providing a bird's eye view of the code's main purpose. Then, systematically break down its operation, explaining how each part plays a role in the overall scheme. Be sure to clarify the function of key variables or operations, simplifying the explanation as needed. Identify any likely errors or issues, explaining their source and how to fix them. Remind the user that these challenges are a normal part of the learning process. Finally, connect the principles used in the code to real-world examples where possible.<|im_end|>
<|im_start|>user
Initiating my journey with code, I'm having trouble understanding the code for the n-queens puzzle. Can you explain what the code is trying to achieve? Also, how does the `isSafe()` function contribute to the solution? Finally, what might be causing any issues with the code?

Here is my code: 
```python
def solveNQueens(n):
    def isSafe(board, row, col):
        for i in range(col):
            if board[row][i] == 'Q':

In [172]:
# Persist the wide frame, including the ChatML columns.
chat_ml_df.write_parquet("bugged_leetcode_all_steps.parquet")

In [None]:
import tiktoken
# Tokenizer that tiktoken associates with the gpt-3.5-turbo model.
tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo")

def tokens(string):
    """Return the result of encoding ``string`` with the module-level ``tokenizer``."""
    return tokenizer.encode(string)
def count_tokens(string):
    """Return how many tokens the module-level ``tokenizer`` produces for ``string``."""
    encoded = tokenizer.encode(string)
    return len(encoded)

In [142]:
# Reshape the wide frame (one column group per bug type) into a long frame
# with one row per conversation, dropping rows that have no conversation.
conversation_schema = {"conversation": pl.Utf8, "system": pl.Utf8, "question": pl.Utf8, "answer": pl.Utf8, "answer_status": pl.Utf8, "original_code": pl.Utf8, "bugged_code":pl.Utf8, "bug_type":pl.Utf8}
# Starting from an empty, typed frame pins the schema and keeps the concat
# valid even if there are no bug types.
per_bug_frames = [pl.DataFrame(schema=conversation_schema)]
for bug in all_bugs_names:
    per_bug_frames.append(
        chat_ml_df.select(
            pl.col(bug+"_chat_ml").alias("conversation"),
            pl.col(bug+"_system").alias("system"),
            pl.col(bug+"_user_checked_diversified").alias("question"),
            pl.col(bug+"_assistant_checked").alias("answer"),
            pl.col(bug+"_assistant_checked_status").alias("answer_status"),
            pl.col("code").alias("original_code"),
            pl.col(bug+"_code").alias("bugged_code"),
            # A literal is broadcast to the frame height; no Python list needed.
            pl.lit(bug).alias("bug_type"),
        ).filter(pl.col("conversation").is_not_null())
    )
# Concatenate once instead of re-copying the growing frame on every iteration.
df_only_bug = pl.concat(per_bug_frames, how="vertical")

df_only_bug = df_only_bug.with_columns(pl.col("conversation").apply(count_tokens).alias("conversation_num_tokens"))
df_only_bug = df_only_bug.with_columns(pl.col("conversation").apply(tokens).alias("conversation_tokens"))

In [173]:
# Persist the long-format conversation frame.
df_only_bug.write_parquet("bugged_leetcode_all_conversations.parquet")


In [174]:
# Keep only rows whose answer status is "found" or "nocode", i.e. drop the
# rows where a code block in the answer was replaced.
df_only_no_replaced = df_only_bug.filter(pl.col("answer_status").is_in(["found", "nocode"]))
df_only_no_replaced.shape

(14113, 10)

In [175]:
# Persist the subset without replaced answers.
df_only_no_replaced.write_parquet("bugged_leetcode_no_replaced.parquet")

In [153]:
from babydragon.models.embedders.ada2 import OpenAiEmbedder
import openai
import os

# SECURITY: an API key was hard-coded in this cell. Treat that key as leaked
# and revoke it. The key is now read from the environment; a missing
# variable fails loudly with KeyError instead of sending unauthenticated
# requests.
openai.api_key = os.environ["OPENAI_API_KEY"]
embedder = OpenAiEmbedder()
conversations = df_only_bug["conversation"].to_list()


In [157]:
# Embed every conversation, passing batch_size=5 to the embedder.
embeddings = embedder.batch_embed(conversations, batch_size=5)

babydragon.utils.main_logger - INFO - Batch 1 of 4041
babydragon.utils.main_logger - INFO - Embedding batch 1 took 0.6440467834472656 seconds
babydragon.utils.main_logger - INFO - Batch 2 of 4041
babydragon.utils.main_logger - INFO - Embedding batch 2 took 0.4093821048736572 seconds
babydragon.utils.main_logger - INFO - Batch 3 of 4041
babydragon.utils.main_logger - INFO - Embedding batch 3 took 0.4184403419494629 seconds
babydragon.utils.main_logger - INFO - Batch 4 of 4041
babydragon.utils.main_logger - INFO - Embedding batch 4 took 0.41870594024658203 seconds
babydragon.utils.main_logger - INFO - Batch 5 of 4041
babydragon.utils.main_logger - INFO - Embedding batch 5 took 0.4174618721008301 seconds
babydragon.utils.main_logger - INFO - Batch 6 of 4041
babydragon.utils.main_logger - INFO - Embedding batch 6 took 0.3941681385040283 seconds
babydragon.utils.main_logger - INFO - Batch 7 of 4041
babydragon.utils.main_logger - INFO - Embedding batch 7 took 0.5733458995819092 seconds
babyd

In [159]:
# Wrap the embeddings in a single-column frame.
embeddings_df = pl.DataFrame(data={"conversation_embeddings": embeddings})

In [161]:
# Attach the embeddings column side by side with the conversations.
# NOTE(review): this assumes batch_embed returns one embedding per input, in
# input order - confirm, since rows are matched purely by position.
df_only_bug_embeddings = pl.concat([df_only_bug,embeddings_df],how="horizontal")

In [163]:
# Persist the conversations together with their embeddings.
df_only_bug_embeddings.write_parquet("bugged_leetcode_all_conversations_with_embeddings.parquet")

In [165]:
# Same "found"/"nocode" filter as before, applied to the frame with embeddings.
df_only_bug_no_replaced = df_only_bug_embeddings.filter(pl.col("answer_status").is_in(["found", "nocode"]))
df_only_bug_no_replaced.columns

['conversation',
 'system',
 'question',
 'answer',
 'answer_status',
 'original_code',
 'bugged_code',
 'bug_type',
 'conversation_num_tokens',
 'conversation_tokens',
 'conversation_embeddings']

In [176]:
# Persist the filtered subset together with its embeddings.
df_only_bug_no_replaced.write_parquet("bugged_leetcode_no_replaced_with_embeddings.parquet")

In [178]:
# Number of rows in the full conversation frame.
df_only_bug["conversation_num_tokens"].shape

(20205,)

In [179]:
# Number of rows left after dropping the replaced answers.
df_only_bug_no_replaced["conversation_num_tokens"].shape

(14113,)

In [51]:
# Print the first conversation as a spot check.
print(df_only_bug["conversation"][0])

<|im_start|>system
Set the scene by explaining the primary function of the code. Following that, dissect each part of the code and convey its contribution to the overall operation. Pay attention to the explanation of key elements like variables or functions, making sure their purpose is understood in the simplest of terms. Identify any common mistakes, bugs, or issues, provide insight into their root cause, and explain how to fix them. Reiterate to the user that facing challenges is part of the learning curve. Conclude by illustrating how the code's workings link to real-world applications.<|im_end|>
<|im_start|>user
Taking my first steps in Python, I am having trouble understanding what this code is trying to do. Can you explain it to me in simpler terms? What is the purpose of the 'map' variable and how is it being used? How is the 'complement' variable being used?

Here is my code: 
```python
def twoSum(nums, target):
    map = {}
    for i, num in enumerate(nums):
        complemen