In [1]:
from langchain_core.messages import SystemMessage
from langchain.chat_models import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.prompts import HumanMessagePromptTemplate
import pandas as pd
import os

# Prompt templates: a fixed system persona followed by a single human
# message whose "{request}" slot receives the free-form task description.
human_prompt = HumanMessagePromptTemplate.from_template("{request}")
chat_prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a highly skilled data scientist with 20 years of experience. You specialize in writing clean, efficient, and error-free ML code. Generate only the code snippets without any explanations or comments."),
    human_prompt
])
# Read the API key from the environment so the secret never lives in the
# notebook; a missing variable raises KeyError here.
api_key = os.environ['OPENAI_API_KEY']
# No seed is set anywhere in this notebook. temperature=0 is used instead to
# make the completions as repeatable as the API allows (the library default
# is a non-zero temperature, which samples a different answer on every run).
model = ChatOpenAI(openai_api_key=api_key, temperature=0)

# Function to generate code with LLM


def generate_code(request):
    """Ask the chat model to answer ``request`` and return the reply text.

    Args:
        request: Text substituted into the ``{request}`` slot of the
            module-level ``chat_prompt``.

    Returns:
        The ``content`` attribute of the response returned by
        ``model.invoke``.
    """
    messages = chat_prompt.format_prompt(request=request).to_messages()
    return model.invoke(messages).content

# Function to clean and correct the code


def clean_and_correct_code(generated_code, csv_path):
    """Extract runnable code from an LLM reply and fill in the CSV path.

    The reply may wrap the code in a Markdown fence (optionally tagged
    ``python``) and may surround it with chatter. This function:

    1. keeps the contents of the first non-empty fenced block, or the whole
       reply (minus stray fence markers) when no such block exists;
    2. drops a leading language-tag line (``python``, ``python3``, ``py``);
    3. drops lines that start with "here is the" (case-insensitive);
    4. replaces every literal ``{csv_path}`` with ``csv_path``.

    Only a tag on the first line is removed, so the word "python" appearing
    inside the code itself (comments, strings, paths) is left intact.

    Args:
        generated_code: Raw text returned by the model.
        csv_path: Value substituted for the ``{csv_path}`` placeholder.

    Returns:
        The cleaned code as a string, stripped of surrounding whitespace.
    """
    text = generated_code.strip()

    # Splitting on the fence marker puts fenced contents at odd indices:
    # [before, block1, between, block2, ...].
    fenced_blocks = [seg for seg in text.split("```")[1::2] if seg.strip()]
    # An unmatched or empty fence yields no usable block; fall back to the
    # whole reply with the markers removed.
    body = fenced_blocks[0] if fenced_blocks else text.replace("```", "")

    lines = body.strip().split("\n")
    # The first line of a fenced block is its optional language tag.
    if lines[0].strip().lower() in ("python", "python3", "py"):
        lines = lines[1:]
    lines = [
        line for line in lines
        if not line.lstrip().lower().startswith("here is the")]

    cleaned_code = "\n".join(lines).strip()
    return cleaned_code.replace("{csv_path}", str(csv_path))

# Load dataset metadata: column names, dtypes, sample rows, and summary statistics


def get_dataset_info(csv_path):
    """Read a CSV file and summarise its structure.

    Args:
        csv_path: Path of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        A 4-tuple ``(columns, types, sample_data, description)``:
        the column names as a list, a column -> dtype dict, the result of
        ``head()`` as a column -> list-of-values dict, and the result of
        ``describe()`` as a nested dict.
    """
    frame = pd.read_csv(csv_path)
    return (
        frame.columns.tolist(),
        frame.dtypes.to_dict(),
        frame.head().to_dict(orient='list'),
        frame.describe().to_dict(),
    )


def main_part1(
    csv_path="/Users/ilya/Desktop/GitHub_Repositories/HW_University/Data_Mining/datasets/insurance.csv",
    code_filename="generated_code.py",
):
    """Part 1: ask the model for an ML script for a dataset and save it.

    Builds a prompt describing the dataset (columns, dtypes, first rows and
    summary statistics), sends it through ``generate_code``, post-processes
    the reply with ``clean_and_correct_code`` and writes the result to
    ``code_filename``.

    Args:
        csv_path: CSV file to describe in the prompt; also the value that
            replaces the ``{csv_path}`` placeholder in the generated code.
            Defaults to the path originally hard-coded here.
        code_filename: File the generated script is written to.
    """
    columns, types, sample_data, description = get_dataset_info(csv_path)

    columns_info = ", ".join(columns)
    types_info = "; ".join(f"{col}: {typ}" for col, typ in types.items())
    sample_data_info = "; ".join(
        f"{col}: {vals[:5]}" for col, vals in sample_data.items())
    description_info = "; ".join(
        f"{col}: {desc}" for col, desc in description.items())

    # "{{csv_path}}" is escaped so the model sees the literal placeholder
    # "{csv_path}". Unescaped, the f-string would inline the real path and
    # there would be no placeholder left for clean_and_correct_code to
    # substitute.
    request = (
        "Write a Python code for reading a CSV file, handling missing values, encoding categorical variables, "
        "splitting the data into training and test sets, training an appropriate model, and evaluating the model's performance. "
        f"Use placeholders like {{csv_path}} for dynamic inputs. The dataset has the following columns: {columns_info}. "
        f"Data types: {types_info}. Sample data (first 5 rows): {sample_data_info}. Description: {description_info}. "
        "Only return the code without any explanations."
    )

    generated_code = generate_code(request)
    corrected_code = clean_and_correct_code(generated_code, csv_path)

    with open(code_filename, "w") as file:
        file.write(corrected_code)

    print(f"Generated code saved to {code_filename}")


if __name__ == "__main__":
    main_part1()

  warn_deprecated(


Generated code saved to generated_code.py


In [2]:
import subprocess

# Function to validate the code


def validate_code(code_filename, timeout=None):
    """Run a Python script in a subprocess and report whether it succeeded.

    The script is executed with the same interpreter that runs this
    notebook, so it sees the same installed packages; a bare ``python``
    from PATH may be a different environment or missing altogether.

    NOTE: this executes the file with the caller's privileges. In this
    notebook the file is LLM-generated, so treat it as untrusted code.

    Args:
        code_filename: Path of the script to execute.
        timeout: Optional limit in seconds passed to ``subprocess.run``;
            ``None`` (the default) waits indefinitely.

    Returns:
        ``(True, stdout)`` when the process exits with code 0, otherwise
        ``(False, message)`` where ``message`` is the captured stderr, or
        the text of the exception if the process could not be run or
        timed out.
    """
    import sys  # local import: the notebook's import cells do not bring in sys

    # sys.executable can be empty in embedded interpreters; fall back to PATH.
    interpreter = sys.executable or "python"
    try:
        result = subprocess.run(
            [interpreter, code_filename],
            capture_output=True, text=True, timeout=timeout)
    except Exception as e:
        # Launch failures and timeouts are reported, not raised, so callers
        # always get the (success, output) pair.
        return False, str(e)

    if result.returncode != 0:
        return False, result.stderr
    return True, result.stdout

# Main function for Part 2


def main_part2():
    """Part 2: validate ``generated_code.py`` and print the outcome.

    Returns:
        The ``(success, output)`` pair obtained from ``validate_code``.
    """
    success, output = validate_code("generated_code.py")

    # Same two-line report either way; only the headline differs.
    headline = (
        "Code validated successfully. Output:"
        if success
        else "Code validation failed with error:"
    )
    print(headline)
    print(output)

    return success, output


if __name__ == "__main__":
    success, output = main_part2()

Code validated successfully. Output:
Mean Squared Error: 20624113.046452865



In [3]:
# Main function for Part 3
def main_part3(
    csv_path="/Users/ilya/Desktop/GitHub_Repositories/HW_University/Data_Mining/datasets/insurance.csv",
    validated_code_filename="validated_generated_code.py",
):
    """Part 3: if the generated script fails, ask the model to repair it.

    Runs ``main_part2`` to validate ``generated_code.py``. On failure, the
    script, its error output and a description of the dataset are sent to
    the model. The reply is cleaned, written to ``validated_code_filename``
    and then executed once, so the file is only reported as validated if
    it actually runs.

    Args:
        csv_path: CSV file described in the repair prompt; also the value
            that replaces any ``{csv_path}`` placeholder in the reply.
            Defaults to the path originally hard-coded here.
        validated_code_filename: File the repaired script is written to.
    """
    success, output = main_part2()

    # Nothing to repair: skip reading the dataset and the script.
    if success:
        print("No need to fix the code, it runs successfully.")
        return

    columns, types, sample_data, description = get_dataset_info(csv_path)

    columns_info = ", ".join(columns)
    types_info = "; ".join(f"{col}: {typ}" for col, typ in types.items())
    sample_data_info = "; ".join(
        f"{col}: {vals[:5]}" for col, vals in sample_data.items())
    description_info = "; ".join(
        f"{col}: {desc}" for col, desc in description.items())

    with open("generated_code.py", "r") as file:
        initial_code = file.read()

    request = (
        f"Fix the following code and its errors:\n{initial_code}\nError:\n{output}. "
        f"The dataset has the following columns: {columns_info}. "
        f"Data types: {types_info}. Sample data (first 5 rows): {sample_data_info}. Description: {description_info}."
    )
    generated_code = generate_code(request)
    # Pass the bare path, as main_part1 does: wrapping it in quotes would
    # double the quotes when the placeholder sits inside a string literal.
    corrected_code = clean_and_correct_code(generated_code, csv_path)

    with open(validated_code_filename, "w") as file:
        file.write(corrected_code)

    # Run the repaired script before calling it "validated".
    repaired_ok, repaired_output = validate_code(validated_code_filename)
    if repaired_ok:
        print(f"Validated code saved to {validated_code_filename}")
    else:
        print(
            f"Repaired code saved to {validated_code_filename}, "
            "but it still fails with error:")
        print(repaired_output)


if __name__ == "__main__":
    main_part3()

Code validated successfully. Output:
Mean Squared Error: 21108360.68260111

No need to fix the code, it runs successfully.
