In [2]:
from langchain_core.messages import SystemMessage
from langchain_community.llms import Ollama
from langchain.chat_models import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.prompts import HumanMessagePromptTemplate
import pandas as pd
import subprocess
import os

# Define the workflow steps with assigned numbers
# Keys are two-digit step numbers; the tens digit groups related steps into a
# phase and the units digit orders the steps inside that phase. Values are the
# natural-language step descriptions that get embedded into the LLM prompts.
workflow_steps = {
    # 1x: data loading
    11: "Load the CSV file into a suitable format (e.g., DataFrame)",
    # 2x: exploratory data analysis
    21: "Examine the structure and characteristics of the data",
    22: "Identify missing values, data types, and statistical summary",
    23: "Visualize the data using charts, graphs, or plots",
    24: "Gain insights and formulate hypotheses",
    # 3x: preprocessing
    31: "Handle missing values (remove or impute)",
    32: "Convert categorical variables to numerical representations",
    33: "Perform feature scaling or normalization",
    34: "Encode categorical variables (one-hot encoding, label encoding, etc.)",
    35: "Split the data into training and testing sets",
    # 4x: feature engineering
    41: "Create new features based on domain knowledge or data insights",
    42: "Combine or transform existing features",
    43: "Perform feature selection to identify relevant features",
    # 5x: model selection and training
    51: "Choose appropriate machine learning algorithms based on the problem type",
    52: "Define the model architecture and hyperparameters",
    53: "Train the selected model on the training data",
    54: "Utilize techniques like cross-validation for model evaluation",
    # 6x: model evaluation
    61: "Evaluate the trained model's performance on the testing data",
    62: "Calculate evaluation metrics (e.g., accuracy, precision, recall, F1-score)",
    63: "Visualize the model's performance using confusion matrix, ROC curve, etc.",
    64: "Fine-tune the model if necessary",
    # 7x: model interpretation
    71: "Analyze the model's coefficients or feature importances",
    72: "Visualize the model's decision boundaries or learned patterns",
    73: "Interpret the model's predictions and explain its behavior",
    # 8x: documentation and reporting
    81: "Generate unit code documentation during the code generation process",
    82: "Execute the combined code and capture relevant outputs and insights",
    83: "Create a comprehensive documentation for the entire workflow, including project overview, dataset details, selected steps, results, and interpretations",
    84: "Present the documentation to users for understanding and reference"
}

# Setup the prompt templates
# The human turn is a template with a single `{request}` placeholder that is
# filled in on every call.
human_prompt = HumanMessagePromptTemplate.from_template("{request}")
# Every rendered conversation is the fixed system instruction below followed by
# the human request.
chat_prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a highly skilled data scientist with 20 years of experience. You specialize in writing clean, efficient, and error-free ML code. Generate only the code snippets without any explanations or comments."),
    human_prompt
])

# Shared LLM handle used by every generation helper in this file
# (Ollama wrapper configured with the model name "llama3").
model = Ollama(model="llama3")

# Function to generate code with LLM


def generate_code_snippet(request):
    """Send *request* to the LLM through the shared chat prompt.

    Args:
        request: Text substituted for the ``{request}`` placeholder of the
            module-level ``chat_prompt``.

    Returns:
        Whatever ``model.invoke`` returns for the rendered message list.
    """
    messages = chat_prompt.format_prompt(request=request).to_messages()
    return model.invoke(messages)

# Function to generate code with placeholder data


def generate_code_snippet_with_placeholder(request, placeholder_data):
    """Ask the LLM for code, appending earlier step output as context.

    The prompt is ``request`` followed by a "Use the following placeholder
    data" section containing ``placeholder_data``. Prompt rendering and the
    model call are delegated to ``generate_code_snippet`` so both entry points
    share a single code path.

    Args:
        request: Base instruction for the LLM.
        placeholder_data: Text appended to the request under the placeholder
            heading (in this file: accumulated stdout of earlier steps).

    Returns:
        The value returned by ``generate_code_snippet`` for the extended
        request.
    """
    placeholder_request = (
        f"{request}\nUse the following placeholder data:\n{placeholder_data}"
    )
    return generate_code_snippet(placeholder_request)

# Function to clean and correct the code


def clean_and_correct_code(generated_code, csv_path):
    """Extract runnable Python from an LLM reply and fill in the CSV path.

    When the reply contains Markdown code fences, only the contents of the
    fenced blocks that are untagged or tagged as Python are kept, so prose
    before or after the fences never ends up in the script. When no usable
    fence is found, stray fence markers are removed and lines starting with
    "here is the" (case-insensitive) are dropped.

    Args:
        generated_code: Raw text returned by the LLM.
        csv_path: Value substituted for every literal ``{csv_path}``
            placeholder in the extracted code.

    Returns:
        The cleaned code as a string (empty if nothing remains).
    """
    import re  # local import: keeps this function self-contained

    python_tags = {"", "python", "python3", "py"}

    # Capture (language tag, body) for each fenced block; an unterminated
    # final fence is accepted by letting the body run to the end of the text.
    fenced = re.findall(
        r"```([^\n`]*)\n(.*?)(?:```|\Z)", generated_code, flags=re.DOTALL)
    code_blocks = [
        body.strip()
        for tag, body in fenced
        if tag.strip().lower() in python_tags
    ]
    code_blocks = [block for block in code_blocks if block]

    if code_blocks:
        cleaned_code = "\n\n".join(code_blocks)
    else:
        # No usable fence: fall back to stripping fence markers and the
        # typical "Here is the ..." preamble line. The word "python" is NOT
        # used as a split point, since it can legitimately occur in code
        # (comments, interpreter paths, string literals).
        unfenced = generated_code.replace("```", "").strip()
        kept_lines = [
            line for line in unfenced.split("\n")
            if not line.strip().lower().startswith("here is the")
        ]
        cleaned_code = "\n".join(kept_lines).strip()

    return cleaned_code.replace("{csv_path}", str(csv_path))

# Load dataset information


def get_dataset_info(csv_path):
    """Read a CSV file and summarise it for use in LLM prompts.

    Args:
        csv_path: Path of the CSV file, passed to ``pd.read_csv``.

    Returns:
        A 5-tuple ``(columns, types, sample_data, value_counts, description)``:
        the column names as a list, a column -> dtype dict, the ``head()``
        rows as a column -> list dict, a column -> {value: count} dict, and
        ``describe()`` converted to a nested dict.
    """
    frame = pd.read_csv(csv_path)

    per_column_counts = {}
    for name in frame.columns:
        per_column_counts[name] = frame[name].value_counts().to_dict()

    return (
        frame.columns.tolist(),
        frame.dtypes.to_dict(),
        frame.head().to_dict(orient='list'),
        per_column_counts,
        frame.describe().to_dict(),
    )

# Function to get selected workflow steps based on step numbers


def get_selected_steps(step_numbers):
    """Map step numbers to their descriptions in ``workflow_steps``.

    Args:
        step_numbers: Iterable of step numbers to look up.

    Returns:
        List of step descriptions in the order of ``step_numbers``; numbers
        that are not keys of ``workflow_steps`` are skipped.
    """
    chosen = []
    for number in step_numbers:
        if number in workflow_steps:
            chosen.append(workflow_steps[number])
    return chosen

# Function to validate a single unit code snippet


def validate_unit_code(code_filename, timeout=None):
    """Run a generated script in a subprocess and report whether it succeeded.

    The script is executed with the interpreter that is running this program
    (``sys.executable``), so it sees the same environment and installed
    packages, rather than whichever ``python`` happens to be first on PATH.

    Args:
        code_filename: Path of the Python file to execute.
        timeout: Optional limit in seconds for the run. ``None`` (the
            default) waits indefinitely, as before.

    Returns:
        ``(True, stdout)`` when the script exits with status 0, otherwise
        ``(False, message)`` where ``message`` is the script's stderr, a
        timeout notice, or the text of the error that prevented the run.
    """
    import sys  # local import: keeps this function self-contained

    interpreter = sys.executable or "python"
    try:
        result = subprocess.run(
            [interpreter, code_filename],
            capture_output=True, text=True, timeout=timeout)
    except subprocess.TimeoutExpired:
        return False, (
            f"Execution of {code_filename} timed out after {timeout} seconds")
    except (OSError, ValueError, subprocess.SubprocessError) as exc:
        # Launch or output-decoding failure: report it as a validation error
        # instead of crashing the pipeline.
        return False, str(exc)

    if result.returncode != 0:
        return False, result.stderr
    return True, result.stdout

# Function to generate documentation for a step
def generate_documentation(step, columns_info, types_info, sample_data_info, value_counts_info, description_info):
    """Ask the LLM for a short description of the work done in one step.

    Args:
        step: Description of the workflow step being documented.
        columns_info: Text listing the dataset's columns.
        types_info: Text listing the column data types.
        sample_data_info: Text showing sample values.
        value_counts_info: Text showing value counts.
        description_info: Text holding the statistical description.

    Returns:
        The value returned by ``generate_code_snippet`` for the assembled
        documentation prompt.
    """
    prompt_parts = [
        f"Provide a clear and concise description of the job performed by the code for the following step: {step}. ",
        "The description should summarize the main tasks and key points without going into the specifics of the code. ",
        f"The dataset has the following columns: {columns_info}. ",
        f"The data types are: {types_info}. Sample data: {sample_data_info}. Value counts: {value_counts_info}. ",
        f"Description: {description_info}.",
    ]
    return generate_code_snippet("".join(prompt_parts))

# Function to fix the code based on the error
def fix_code(code_snippet, error_message, csv_path):
    """Ask the LLM to repair a failing snippet and clean up its reply.

    Args:
        code_snippet: The code that failed.
        error_message: The error text produced when running it.
        csv_path: Passed through to ``clean_and_correct_code``.

    Returns:
        The LLM's reply after ``clean_and_correct_code`` has processed it.
    """
    repair_prompt = "".join([
        f"The following code snippet encountered an error:\n\n{code_snippet}\n\n",
        f"Error message:\n{error_message}\n\n",
        "Please fix the code snippet to resolve the error without providing any explanations or comments.",
    ])
    llm_reply = generate_code_snippet(repair_prompt)
    return clean_and_correct_code(llm_reply, csv_path)


# Dataset used when main() is called without an explicit path.
_DEFAULT_CSV_PATH = "/Users/ilya/Desktop/GitHub_Repositories/HW_University/Data_Mining/datasets/insurance.csv"

# Upper bound on LLM repair rounds per script, so a snippet the model cannot
# fix does not keep the pipeline looping forever.
_MAX_FIX_ATTEMPTS = 5


def _validate_with_repairs(code, code_filename, step, csv_path,
                           max_attempts=_MAX_FIX_ATTEMPTS):
    """Write *code* to *code_filename*, run it, and let the LLM repair failures.

    Each repair request is given the most recent version of the code, so the
    error message always refers to the code the model is shown.

    Args:
        code: Initial script text.
        code_filename: File the script is written to and executed from.
        step: Step description, used only in progress messages.
        csv_path: Passed through to ``fix_code``.
        max_attempts: Maximum number of repair rounds after the first run.

    Returns:
        ``(success, final_code, output)`` where ``output`` is the stdout of
        the successful run or the last error text.
    """
    current_code = code
    rounds = max(max_attempts, 0)
    for attempt in range(rounds + 1):
        # utf-8 matches how Python decodes source files by default.
        with open(code_filename, "w", encoding="utf-8") as file:
            file.write(current_code)
        success, output = validate_unit_code(code_filename)
        if success:
            return True, current_code, output
        print(f"Validation failed for step: {step}")
        print(f"Error: {output}")
        if attempt < rounds:
            current_code = fix_code(current_code, output, csv_path)
    return False, current_code, output


# Main function
def main(csv_path=_DEFAULT_CSV_PATH):
    """Generate, validate and document code for the selected workflow steps.

    For every selected step the LLM is asked for a snippet, which is cleaned,
    written to disk and executed. All steps except the last are validated as
    stand-alone scripts (``step_<n>_code.py``) and their stdout is fed into
    later prompts. The last step is appended to the code of all earlier steps
    and validated as ``combined_code.py``. Failing scripts are sent back to
    the LLM for repair a bounded number of times; if a step still fails, the
    run stops. ``documentation.txt`` is written only when every step passed.

    Args:
        csv_path: Path of the dataset CSV. Defaults to the path that was
            previously hard-coded in this function.
    """
    columns, types, sample_data, value_counts, description = get_dataset_info(
        csv_path)

    columns_info = ", ".join(columns)
    types_info = ", ".join([f"{col}: {typ}" for col, typ in types.items()])
    sample_data_info = ", ".join(
        [f"{col}: {vals[:5]}" for col, vals in sample_data.items()])
    value_counts_info = ", ".join(
        [f"{col}: {dict(list(vc.items())[:5])}" for col, vc in value_counts.items()])
    description_info = ", ".join(
        [f"{col}: {desc}" for col, desc in description.items()])

    # Hardcode the selected step numbers (replace with dynamic selection later)
    selected_step_numbers = [11, 21, 22, 31, 32, 35, 51, 52, 53, 61, 62]

    # Get selected workflow steps based on step numbers
    selected_steps = get_selected_steps(selected_step_numbers)

    # Generate and validate code snippets for each selected step
    unit_code_filenames = []
    documentation_snippets = []
    combined_code = ""
    placeholder_data = ""
    last_index = len(selected_steps) - 1
    for i, step in enumerate(selected_steps):
        # NOTE: this is an f-string, so {csv_path} below is rendered as the
        # actual dataset path inside the prompt.
        request = (
            f"Write a Python code snippet for the following step: {step}. "
            f"Use placeholders like {csv_path} for dynamic inputs. The dataset has the following columns: {columns_info}. "
            f"The data types are: {types_info}. Sample data: {sample_data_info}. Value counts: {value_counts_info}. "
            f"Description: {description_info}. Only return the code without any explanations."
        )

        # Every step, including the last one, gets its own freshly generated
        # snippet; output of earlier steps is passed along once available.
        if placeholder_data:
            code_snippet = generate_code_snippet_with_placeholder(
                request, placeholder_data)
        else:
            code_snippet = generate_code_snippet(request)
        cleaned_code_snippet = clean_and_correct_code(code_snippet, csv_path)

        if i < last_index:
            code_filename = f"step_{i+1}_code.py"
            candidate_code = cleaned_code_snippet
        else:
            # Combine code snippets and validate for the last step
            # (model training and evaluation)
            code_filename = "combined_code.py"
            candidate_code = combined_code + cleaned_code_snippet + "\n\n"

        success, final_code, output = _validate_with_repairs(
            candidate_code, code_filename, step, csv_path)
        if not success:
            print(f"Giving up on step after {_MAX_FIX_ATTEMPTS} repair attempts: {step}")
            break

        if i < last_index:
            placeholder_data += output + "\n"  # Append the output to placeholder data
            combined_code += final_code + "\n\n"

        unit_code_filenames.append(code_filename)
        documentation_snippets.append(generate_documentation(
            step, columns_info, types_info, sample_data_info, value_counts_info, description_info))

    # Save documentation to a separate file if all steps are validated successfully
    if len(unit_code_filenames) == len(selected_steps):
        with open("documentation.txt", "w", encoding="utf-8") as file:
            file.write("\n".join(documentation_snippets))
        print("Documentation saved to documentation.txt")
    else:
        print("Some code snippets failed validation. Documentation not generated.")


if __name__ == "__main__":
    main()

Validation failed for step: Handle missing values (remove or impute)
Error:   File "/Users/ilya/Desktop/GitHub_Repositories/Thesis/Unit_code_generator/step_4_code.py", line 18
    Remember to adjust the file path and column names according to your specific dataset.
             ^^
SyntaxError: invalid syntax

Validation failed for step: Handle missing values (remove or impute)
Error: Traceback (most recent call last):
  File "/Users/ilya/Desktop/GitHub_Repositories/Thesis/Unit_code_generator/step_4_code.py", line 4, in <module>
    df = pd.read_csv('/Users/ilya/Desktop/GitHub_Repositories/Thesis/Unit_code_generator/step_4_code.py')
  File "/Users/ilya/miniconda3/envs/thesis/lib/python3.10/site-packages/pandas/io/parsers/readers.py", line 1026, in read_csv
    return _read(filepath_or_buffer, kwds)
  File "/Users/ilya/miniconda3/envs/thesis/lib/python3.10/site-packages/pandas/io/parsers/readers.py", line 626, in _read
    return parser.read(nrows)
  File "/Users/ilya/miniconda3/envs/the

KeyboardInterrupt: 