In [None]:
from langchain_core.messages import SystemMessage
from langchain_community.llms import Ollama
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.prompts import HumanMessagePromptTemplate
import subprocess
import os
import pandas as pd

# Set up the prompt templates: a fixed system persona followed by the caller's
# text, which is substituted into the "{request}" placeholder.
human_prompt = HumanMessagePromptTemplate.from_template("{request}")
chat_prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a highly skilled data scientist with 20 years of experience. You specialize in writing clean, efficient, and error-free ML code. Generate only the code snippets without any explanations or comments."),
    human_prompt
])
# NOTE(review): no seed or temperature is passed to the active model below, so
# generations may differ between runs. A temperature=0 variant is kept here,
# commented out, for reference.
# model = Ollama(model="llama3", temperature=0)
model = Ollama(model="llama3")

# Function to generate code with LLM


def generate_code(request):
    """Send ``request`` to the LLM through the chat prompt and return its reply.

    The text fills the ``{request}`` slot of the module-level ``chat_prompt``;
    the resulting messages are passed to ``model.invoke`` and whatever that
    call returns is handed back unchanged.
    """
    messages = chat_prompt.format_prompt(request=request).to_messages()
    return model.invoke(messages)

# Function to clean and correct the code


def clean_and_correct_code(generated_code, csv_path):
    """Extract runnable Python source from an LLM reply and fill in the CSV path.

    The previous implementation deleted every fence marker and then kept only
    ``text.split("python")[1]``, which silently cut the script off at the
    second occurrence of the word "python" (a comment is enough) and kept any
    prose written after the closing fence. This version parses the fence.

    Args:
        generated_code: Raw text returned by the LLM.
        csv_path: Text substituted for every literal ``{csv_path}`` placeholder
            (callers pass an already-quoted path).

    Returns:
        The cleaned source code as a string.
    """
    import re  # local import so the function does not depend on another cell

    # Prefer the body of the first complete fenced block; an optional language
    # tag (python, py, ...) on the opening fence is discarded with the fence.
    fenced = re.search(r"```[ \t]*\w*[ \t]*\r?\n(.*?)```",
                       generated_code, flags=re.DOTALL)
    if fenced:
        cleaned_code = fenced.group(1)
    else:
        # No complete fence (e.g. the reply was cut off): remove a fence marker
        # together with its language tag when they end a line, then any stray
        # marker left elsewhere.
        cleaned_code = re.sub(r"```[ \t]*\w*[ \t]*$", "",
                              generated_code, flags=re.MULTILINE)
        cleaned_code = cleaned_code.replace("```", "")

    # Drop chatty lead-in lines such as "Here is the code:".
    kept_lines = [
        line for line in cleaned_code.strip().split("\n")
        if not line.lower().startswith("here is the")
    ]
    cleaned_code = "\n".join(kept_lines).strip()
    return cleaned_code.replace("{csv_path}", f"{csv_path}")

# Function to validate the code


def validate_code(code_filename, timeout=300):
    """Run a Python script in a subprocess and report whether it succeeded.

    The script is executed with the interpreter running this notebook
    (``sys.executable``) rather than whatever ``python`` is first on PATH, so
    it sees the same installed packages as the kernel.

    Args:
        code_filename: Path of the script to execute.
        timeout: Seconds to wait before the run is abandoned; ``None`` waits
            indefinitely.

    Returns:
        ``(True, stdout)`` when the script exits with code 0. Otherwise
        ``(False, message)``, where ``message`` is the script's stderr (falling
        back to stdout or an exit-code notice when stderr is empty), a timeout
        notice, or the OS error raised while launching the process.
    """
    import sys  # local import so the function does not depend on another cell

    interpreter = sys.executable or "python"
    try:
        result = subprocess.run(
            [interpreter, code_filename],
            capture_output=True, text=True, timeout=timeout)
    except subprocess.TimeoutExpired:
        return False, f"Execution of {code_filename} timed out after {timeout} seconds."
    except OSError as e:
        return False, str(e)

    if result.returncode != 0:
        # Never hand back an empty message: it is fed to the LLM as the error.
        message = (result.stderr or result.stdout
                   or f"Process exited with code {result.returncode}.")
        return False, message
    return True, result.stdout

# Load dataset column names


def get_dataset_columns(csv_path):
    """Return the column names of the CSV file at ``csv_path`` as a list.

    Only the header is needed, so the file is read with ``nrows=0`` and no
    data rows are loaded into the frame.
    """
    header_only = pd.read_csv(csv_path, nrows=0)
    return header_only.columns.tolist()

# Main function


def main(csv_path="/Users/ilya/Desktop/GitHub_Repositories/HW_University/Data_Mining/datasets/Iris.csv",
         max_attempts=5):
    """Generate ML code with the LLM and retry until it runs, up to a limit.

    Each attempt writes the cleaned LLM reply to ``generated_code.py`` and
    executes it with ``validate_code``. On failure the code and its error are
    sent back to the LLM as the next request. The loop used to be an unbounded
    ``while not success``, which never terminates if the model keeps producing
    broken code; it now stops after ``max_attempts`` tries.

    Args:
        csv_path: Dataset whose column names are put into the prompt and whose
            path replaces the ``{csv_path}`` placeholder in the generated code.
        max_attempts: Maximum number of generate/validate rounds.

    Returns:
        ``True`` if some attempt ran successfully (its code is then saved to
        ``validated_generated_code.py``), ``False`` otherwise.
    """
    columns = get_dataset_columns(csv_path)
    columns_info = ", ".join(columns)

    request = (
        "Write a Python code for reading a CSV file, handling missing values, encoding categorical variables, "
        "splitting the data into training and test sets, training a Decision Tree classifier, and evaluating the model's performance. "
        f"Use placeholders like {{csv_path}} for dynamic inputs. The dataset has the following columns: {columns_info}. Only return the code without any explanations."
    )

    code_filename = "generated_code.py"
    validated_code_filename = "validated_generated_code.py"
    # repr() yields a valid Python string literal even when the path contains
    # quotes or backslashes; for ordinary paths it is the same '...' form.
    path_literal = repr(str(csv_path))

    for _attempt in range(max_attempts):
        generated_code = generate_code(request)
        corrected_code = clean_and_correct_code(generated_code, path_literal)

        with open(code_filename, "w", encoding="utf-8") as file:
            file.write(corrected_code)

        success, output = validate_code(code_filename)

        if success:
            print("Code validated successfully. Output:")
            print(output)

            # Save the validated code to a .py file
            with open(validated_code_filename, "w", encoding="utf-8") as file:
                file.write(corrected_code)

            print(f"Validated code saved to {validated_code_filename}")
            return True

        print("Code validation failed with error:")
        print(output)
        # Update the request to include the error for the LLM to regenerate the code
        request = f"Fix the following code and its errors:\n{corrected_code}\nError:\n{output}"

    print(f"Giving up after {max_attempts} attempts; no validated code was saved.")
    return False


if __name__ == "__main__":
    main()

In [1]:
from langchain_core.messages import SystemMessage
from langchain_community.llms import Ollama
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.prompts import HumanMessagePromptTemplate
import pandas as pd

# Set up the prompt templates: a fixed system persona followed by the caller's
# text, which is substituted into the "{request}" placeholder.
human_prompt = HumanMessagePromptTemplate.from_template("{request}")
chat_prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a highly skilled data scientist with 20 years of experience. You specialize in writing clean, efficient, and error-free ML code. Generate only the code snippets without any explanations or comments."),
    human_prompt
])
# NOTE(review): no seed or temperature is passed to the model, so generations
# may differ between runs.
model = Ollama(model="llama3")

# Function to generate code with LLM


def generate_code(request):
    """Send ``request`` to the LLM through the chat prompt and return its reply.

    The text fills the ``{request}`` slot of the module-level ``chat_prompt``;
    the resulting messages are passed to ``model.invoke`` and whatever that
    call returns is handed back unchanged.
    """
    messages = chat_prompt.format_prompt(request=request).to_messages()
    return model.invoke(messages)

# Function to clean and correct the code


def clean_and_correct_code(generated_code, csv_path):
    """Extract runnable Python source from an LLM reply and fill in the CSV path.

    The previous implementation deleted every fence marker and then kept only
    ``text.split("python")[1]``, which silently cut the script off at the
    second occurrence of the word "python" (a comment is enough) and kept any
    prose written after the closing fence. This version parses the fence.

    Args:
        generated_code: Raw text returned by the LLM.
        csv_path: Text substituted for every literal ``{csv_path}`` placeholder
            (callers pass an already-quoted path).

    Returns:
        The cleaned source code as a string.
    """
    import re  # local import so the function does not depend on another cell

    # Prefer the body of the first complete fenced block; an optional language
    # tag (python, py, ...) on the opening fence is discarded with the fence.
    fenced = re.search(r"```[ \t]*\w*[ \t]*\r?\n(.*?)```",
                       generated_code, flags=re.DOTALL)
    if fenced:
        cleaned_code = fenced.group(1)
    else:
        # No complete fence (e.g. the reply was cut off): remove a fence marker
        # together with its language tag when they end a line, then any stray
        # marker left elsewhere.
        cleaned_code = re.sub(r"```[ \t]*\w*[ \t]*$", "",
                              generated_code, flags=re.MULTILINE)
        cleaned_code = cleaned_code.replace("```", "")

    # Drop chatty lead-in lines such as "Here is the code:".
    kept_lines = [
        line for line in cleaned_code.strip().split("\n")
        if not line.lower().startswith("here is the")
    ]
    cleaned_code = "\n".join(kept_lines).strip()
    return cleaned_code.replace("{csv_path}", f"{csv_path}")

# Load dataset column names


def get_dataset_columns(csv_path):
    """Return the column names of the CSV file at ``csv_path`` as a list.

    Only the header is needed, so the file is read with ``nrows=0`` and no
    data rows are loaded into the frame.
    """
    header_only = pd.read_csv(csv_path, nrows=0)
    return header_only.columns.tolist()

# Main function for Part 1


def main_part1(csv_path="/Users/ilya/Desktop/GitHub_Repositories/HW_University/Data_Mining/datasets/insurance.csv",
               code_filename="generated_code.py"):
    """Part 1: ask the LLM for an ML script for a dataset and save it to disk.

    The dataset's column names are embedded in the prompt, the reply is
    cleaned with ``clean_and_correct_code`` and the result is written to
    ``code_filename``. The code is not executed here.

    Args:
        csv_path: Dataset whose columns are described in the prompt and whose
            path replaces the ``{csv_path}`` placeholder in the generated code.
        code_filename: File the generated script is written to.
    """
    columns = get_dataset_columns(csv_path)
    columns_info = ", ".join(columns)

    # NOTE(review): the prompt always asks for a Decision Tree *classifier*.
    # The run recorded in this notebook on insurance.csv failed with
    # "Unknown label type: continuous" - confirm whether the wording should
    # depend on the target column.
    request = (
        "Write a Python code for reading a CSV file, handling missing values, encoding categorical variables, "
        "splitting the data into training and test sets, training a Decision Tree classifier, and evaluating the model's performance. "
        f"Use placeholders like {{csv_path}} for dynamic inputs. The dataset has the following columns: {columns_info}. Only return the code without any explanations."
    )

    generated_code = generate_code(request)
    # repr() yields a valid Python string literal even when the path contains
    # quotes or backslashes; for ordinary paths it is the same '...' form.
    corrected_code = clean_and_correct_code(generated_code, repr(str(csv_path)))

    with open(code_filename, "w", encoding="utf-8") as file:
        file.write(corrected_code)

    print(f"Generated code saved to {code_filename}")


if __name__ == "__main__":
    main_part1()

Generated code saved to generated_code.py


In [2]:
import subprocess

# Function to validate the code


def validate_code(code_filename, timeout=300):
    """Run a Python script in a subprocess and report whether it succeeded.

    The script is executed with the interpreter running this notebook
    (``sys.executable``) rather than whatever ``python`` is first on PATH, so
    it sees the same installed packages as the kernel.

    Args:
        code_filename: Path of the script to execute.
        timeout: Seconds to wait before the run is abandoned; ``None`` waits
            indefinitely.

    Returns:
        ``(True, stdout)`` when the script exits with code 0. Otherwise
        ``(False, message)``, where ``message`` is the script's stderr (falling
        back to stdout or an exit-code notice when stderr is empty), a timeout
        notice, or the OS error raised while launching the process.
    """
    import sys  # local import so the function does not depend on another cell

    interpreter = sys.executable or "python"
    try:
        result = subprocess.run(
            [interpreter, code_filename],
            capture_output=True, text=True, timeout=timeout)
    except subprocess.TimeoutExpired:
        return False, f"Execution of {code_filename} timed out after {timeout} seconds."
    except OSError as e:
        return False, str(e)

    if result.returncode != 0:
        # Never hand back an empty message: it is fed to the LLM as the error.
        message = (result.stderr or result.stdout
                   or f"Process exited with code {result.returncode}.")
        return False, message
    return True, result.stdout

# Main function for Part 2


def main_part2():
    """Part 2: execute ``generated_code.py`` and print the outcome.

    Returns:
        The ``(success, output)`` pair obtained from ``validate_code``.
    """
    passed, text = validate_code("generated_code.py")

    banner = (
        "Code validated successfully. Output:"
        if passed
        else "Code validation failed with error:"
    )
    print(banner)
    print(text)

    return passed, text


if __name__ == "__main__":
    success, output = main_part2()

Code validation failed with error:
Traceback (most recent call last):
  File "/Users/ilya/Desktop/GitHub_Repositories/Thesis/Latest_2/generated_code.py", line 24, in <module>
    clf.fit(X_train, y_train)
  File "/Users/ilya/miniconda3/envs/thesis/lib/python3.10/site-packages/sklearn/base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/ilya/miniconda3/envs/thesis/lib/python3.10/site-packages/sklearn/tree/_classes.py", line 1009, in fit
    super()._fit(
  File "/Users/ilya/miniconda3/envs/thesis/lib/python3.10/site-packages/sklearn/tree/_classes.py", line 294, in _fit
    check_classification_targets(y)
  File "/Users/ilya/miniconda3/envs/thesis/lib/python3.10/site-packages/sklearn/utils/multiclass.py", line 221, in check_classification_targets
    raise ValueError(
ValueError: Unknown label type: continuous. Maybe you are trying to fit a classifier, which expects discrete classes on a regression target with continuous values.



In [3]:
# Main function for Part 3
def main_part3():
    """Part 3: if ``generated_code.py`` fails, ask the LLM to repair it.

    The current script is re-run through ``main_part2``. When it fails, the
    script and its error are sent to the LLM, and the cleaned reply is written
    to ``validated_generated_code.py``. That file used to be reported as
    "validated" without ever being executed; it is now run through
    ``validate_code`` and the message reflects the actual result.
    """
    csv_path = "/Users/ilya/Desktop/GitHub_Repositories/HW_University/Data_Mining/datasets/insurance.csv"
    with open("generated_code.py", "r") as file:
        initial_code = file.read()

    success, output = main_part2()

    if success:
        print("No need to fix the code, it runs successfully.")
        return

    request = f"Fix the following code and its errors:\n{initial_code}\nError:\n{output}"
    generated_code = generate_code(request)
    corrected_code = clean_and_correct_code(
        generated_code, f"'{csv_path}'")

    validated_code_filename = "validated_generated_code.py"
    with open(validated_code_filename, "w") as file:
        file.write(corrected_code)

    # Actually run the repaired script before claiming that it is validated.
    fixed_ok, fixed_output = validate_code(validated_code_filename)
    if fixed_ok:
        print(f"Validated code saved to {validated_code_filename}")
    else:
        print(
            f"Regenerated code saved to {validated_code_filename}, "
            "but it still fails validation:")
        print(fixed_output)


if __name__ == "__main__":
    main_part3()

Code validation failed with error:
Traceback (most recent call last):
  File "/Users/ilya/Desktop/GitHub_Repositories/Thesis/Latest_2/generated_code.py", line 24, in <module>
    clf.fit(X_train, y_train)
  File "/Users/ilya/miniconda3/envs/thesis/lib/python3.10/site-packages/sklearn/base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/ilya/miniconda3/envs/thesis/lib/python3.10/site-packages/sklearn/tree/_classes.py", line 1009, in fit
    super()._fit(
  File "/Users/ilya/miniconda3/envs/thesis/lib/python3.10/site-packages/sklearn/tree/_classes.py", line 294, in _fit
    check_classification_targets(y)
  File "/Users/ilya/miniconda3/envs/thesis/lib/python3.10/site-packages/sklearn/utils/multiclass.py", line 221, in check_classification_targets
    raise ValueError(
ValueError: Unknown label type: continuous. Maybe you are trying to fit a classifier, which expects discrete classes on a regression target with continuous values.

Validated code