In [1]:
import pandas as pd
import subprocess
import os
import requests
from dotenv import load_dotenv
from typing import Dict, List, Tuple, Any, Set
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from json import JSONDecodeError

# Load environment variables from .env file
load_dotenv()

# Define the workflow steps with assigned numbers
workflow_steps = {
    11: {
        "description": "Load the CSV file into a suitable format (e.g., DataFrame)",
        "dependencies": []
    },
    21: {
        "description": "Examine the structure and characteristics of the data",
        "dependencies": [11]
    },
    22: {
        "description": "Identify missing values, data types, and statistical summary",
        "dependencies": [11, 21]
    },
    23: {
        "description": "Visualize the data using charts, graphs, or plots",
        "dependencies": [11, 21, 22]
    },
    24: {
        "description": "Gain insights and formulate hypotheses",
        "dependencies": [11, 21, 22, 23]
    },
    31: {
        "description": "Handle missing values (remove or impute)",
        "dependencies": [11, 22]
    },
    32: {
        "description": "Convert categorical variables to numerical representations",
        "dependencies": [11, 22, 31]
    },
    33: {
        "description": "Perform feature scaling or normalization",
        "dependencies": [11, 31, 32]
    },
    34: {
        "description": "Encode categorical variables (one-hot encoding, label encoding, etc.)",
        "dependencies": [11, 32]
    },
    35: {
        "description": "Split the preprocessed data into training and testing sets. Ensure that all previous data preprocessing steps (handling missing values, encoding categorical variables, and scaling) have been completed before this step.",
        "dependencies": [11, 31, 32, 33, 34]
    },
    41: {
        "description": "Create new features based on domain knowledge or data insights",
        "dependencies": [11, 24, 35]
    },
    42: {
        "description": "Combine or transform existing features",
        "dependencies": [11, 35, 41]
    },
    43: {
        "description": "Perform feature selection to identify relevant features",
        "dependencies": [11, 35, 41, 42]
    },
    51: {
        "description": "Analyze the problem type and dataset characteristics, then select and implement the single most appropriate machine learning algorithm (choose from scikit-learn, XGBoost, LightGBM, or CatBoost). Justify your choice based on the data properties and problem requirements.",
        "dependencies": [35, 43]
    },
    52: {
        "description": "Define the model architecture and hyperparameters",
        "dependencies": [51]
    },
    53: {
        "description": "Train the selected model on the training data",
        "dependencies": [35, 51, 52]
    },
    54: {
        "description": "Utilize techniques like cross-validation for model evaluation",
        "dependencies": [35, 51, 52, 53]
    },
    61: {
        "description": "Evaluate the trained model's performance on the testing data",
        "dependencies": [35, 53, 54]
    },
    62: {
        "description": "Calculate evaluation metrics (e.g., accuracy, precision, recall, F1-score)",
        "dependencies": [61]
    },
    63: {
        "description": "Visualize the model's performance using confusion matrix, ROC curve, etc.",
        "dependencies": [61, 62]
    },
    64: {
        "description": "Fine-tune the model if necessary",
        "dependencies": [61, 62, 63]
    },
    71: {
        "description": "Analyze the model's coefficients or feature importances",
        "dependencies": [53, 61]
    },
    72: {
        "description": "Visualize the model's decision boundaries or learned patterns",
        "dependencies": [53, 61, 71]
    },
    73: {
        "description": "Interpret the model's predictions and explain its behavior",
        "dependencies": [53, 61, 71, 72]
    },
    81: {
        "description": "Generate unit code documentation during the code generation process",
        "dependencies": [11, 21, 22, 23, 24, 31, 32, 33, 34, 35, 41, 42, 43, 51, 52, 53, 54, 61, 62, 63, 64, 71, 72, 73]
    },
    82: {
        "description": "Execute the combined code and capture relevant outputs and insights",
        "dependencies": [11, 21, 22, 23, 24, 31, 32, 33, 34, 35, 41, 42, 43, 51, 52, 53, 54, 61, 62, 63, 64, 71, 72, 73]
    },
    83: {
        "description": "Create a comprehensive documentation for the entire workflow, including project overview, dataset details, selected steps, results, and interpretations",
        "dependencies": [81, 82]
    },
    84: {
        "description": "Present the documentation to users for understanding and reference",
        "dependencies": [83]
    }
}

# OpenAI API configuration
api_url = "https://openrouter.ai/api/v1"
api_key = os.getenv('OPENROUTER_API_KEY')


def openai_chat(request):
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }
    data = {
        "model": "deepseek/deepseek-chat",
        "messages": [{"role": "user", "content": request}]
    }
    try:
        response = requests.post(f"{api_url}/chat/completions", headers=headers, json=data)
        response.raise_for_status()  # Raise an exception for bad status codes
        return response.json()["choices"][0]["message"]["content"]
    except requests.exceptions.RequestException as e:
        print(f"API request failed: {e}")
        return None
    except (KeyError, IndexError, JSONDecodeError) as e:
        print(f"Error parsing API response: {e}")
        return None

In [2]:
def generate_code_snippet(request: str) -> str:
    response = openai_chat(request)
    return response

def clean_and_correct_code(generated_code: str, csv_path: str) -> str:
    cleaned_code = generated_code.replace("```", "").strip()
    cleaned_code_lines = cleaned_code.split("\n")
    cleaned_code_lines = [
        line for line in cleaned_code_lines if not line.lower().startswith("here is the")]
    cleaned_code = "\n".join(cleaned_code_lines)
    if "python" in cleaned_code:
        cleaned_code = cleaned_code.split("python")[1].strip()
    corrected_code = cleaned_code.replace("{csv_path}", f"{csv_path}")
    return corrected_code

def get_dataset_info(csv_path: str) -> Tuple[List[str], Dict[str, Any], Dict[str, List[Any]], Dict[str, Dict[Any, int]], Dict[str, Dict[str, float]]]:
    df = pd.read_csv(csv_path)
    columns = df.columns.tolist()
    types = df.dtypes.to_dict()
    sample_data = df.head().to_dict(orient='list')
    value_counts = {col: df[col].value_counts().to_dict()
                    for col in df.columns}
    description = df.describe().to_dict()
    return columns, types, sample_data, value_counts, description

def validate_unit_code(code_filename: str) -> Tuple[bool, str]:
    try:
        result = subprocess.run(
            ["python", code_filename], capture_output=True, text=True)
        if result.returncode != 0:
            raise Exception(result.stderr)
        return True, result.stdout
    except Exception as e:
        return False, str(e)

def generate_documentation(step: int, columns_info: str, types_info: str, sample_data_info: str, value_counts_info: str, description_info: str) -> str:
    request = (
        f"Provide a clear and concise description of the job performed by the code for the following step: {workflow_steps[step]['description']}. "
        f"The description should summarize the main tasks and key points without going into the specifics of the code. "
        f"The dataset has the following columns: {columns_info}. "
        f"The data types are: {types_info}. Sample data: {sample_data_info}. Value counts: {value_counts_info}. "
        f"Description: {description_info}."
    )
    documentation = generate_code_snippet(request)
    return documentation

def fix_code(code_snippet: str, error_message: str, csv_path: str, df: pd.DataFrame = None) -> Tuple[str, pd.DataFrame]:
    request = (
        f"The following code snippet encountered an error:\n\n{code_snippet}\n\n"
        f"Error message:\n{error_message}\n\n"
        f"Please fix the code snippet to resolve the error without providing any explanations or comments."
    )
    fixed_code = generate_code_snippet(request)
    if fixed_code is None:
        print("Failed to generate fixed code. Returning original code.")
        return code_snippet, df
    
    cleaned_fixed_code = clean_and_correct_code(fixed_code, csv_path)
    
    # Execute the fixed code snippet to get the updated dataframe
    if df is not None:
        local_vars = {"df": df.copy(), "pd": pd, "np": np, "LabelEncoder": LabelEncoder, "StandardScaler": StandardScaler}
        try:
            exec(cleaned_fixed_code, globals(), local_vars)
            df = local_vars["df"]
        except Exception as e:
            print(f"Error executing fixed code: {e}")
    
    return cleaned_fixed_code, df

def get_all_prerequisites(step: int, workflow_steps: Dict[int, Dict[str, Any]]) -> Set[int]:
    prerequisites = set()
    for prereq in workflow_steps[step]['dependencies']:
        prerequisites.add(prereq)
        prerequisites.update(get_all_prerequisites(prereq, workflow_steps))
    return prerequisites

def generate_code_for_step(step: int, workflow_steps: Dict[int, Dict[str, Any]], generated_code: Dict[int, str], csv_path: str, df: pd.DataFrame = None) -> Tuple[str, pd.DataFrame]:
    prerequisites = get_all_prerequisites(step, workflow_steps)
    full_code = ""
    
    if df is None:
        full_code += f"# Load the dataset\ndf = pd.read_csv('{csv_path}')\n\n"
    else:
        full_code += "# Use the preprocessed dataframe from previous steps\n\n"
    
    # Add code from prerequisites
    for prereq in sorted(prerequisites):
        if prereq not in generated_code:
            generated_code[prereq], df = generate_code_snippet_for_step(prereq, workflow_steps, csv_path, df)
        
        prereq_code = generated_code[prereq]
        full_code += f"# Code from step {prereq}: {workflow_steps[prereq]['description']}\n{prereq_code}\n\n"
    
    current_step_code, df = generate_code_snippet_for_step(step, workflow_steps, csv_path, df)
    full_code += f"# Code for current step {step}: {workflow_steps[step]['description']}\n{current_step_code}"
    
    return full_code, df

def generate_code_snippet_for_step(step: int, workflow_steps: Dict[int, Dict[str, Any]], csv_path: str, df: pd.DataFrame = None) -> Tuple[str, pd.DataFrame]:
    request = (
        f"Write a Python code snippet for the following step: {workflow_steps[step]['description']}. "
        f"Assume that the dataset has already been loaded into a DataFrame named 'df'. "
        f"Do not include any import statements or code to load the dataset. "
        f"Only return the code specific to this step without any additional explanations or comments. "
        f"If this step involves data preprocessing, make sure to return the updated dataframe."
    )
    code_snippet = generate_code_snippet(request)
    cleaned_code_snippet = clean_and_correct_code(code_snippet, csv_path)
    
    # Execute the code snippet to get the updated dataframe
    if df is not None:
        local_vars = {"df": df.copy(), "pd": pd, "np": np, "LabelEncoder": LabelEncoder, "StandardScaler": StandardScaler}
        exec(cleaned_code_snippet, globals(), local_vars)
        df = local_vars["df"]
    
    return cleaned_code_snippet, df

def main():
    csv_path = "/Users/ilya/Desktop/GitHub_Repositories/HW_University/Data_Mining/datasets/insurance.csv"
    columns, types, sample_data, value_counts, description = get_dataset_info(csv_path)

    columns_info = ", ".join(columns)
    types_info = ", ".join([f"{col}: {typ}" for col, typ in types.items()])
    sample_data_info = ", ".join([f"{col}: {vals[:5]}" for col, vals in sample_data.items()])
    value_counts_info = ", ".join([f"{col}: {dict(list(vc.items())[:5])}" for col, vc in value_counts.items()])
    description_info = ", ".join([f"{col}: {desc}" for col, desc in description.items()])

    selected_step_numbers = [11, 21, 22, 31, 32, 35, 51, 52, 53, 61, 62]
    generated_code = {}
    documentation_snippets = []
    df = None

    for step in selected_step_numbers:
        full_code, df = generate_code_for_step(step, workflow_steps, generated_code, csv_path, df)
        
        if full_code is None:
            print(f"Failed to generate code for step {step}. Skipping.")
            continue
        
        code_filename = f"step_{step}_code.py"
        with open(code_filename, "w") as file:
            file.write(full_code)
        
        success, output = validate_unit_code(code_filename)
        retry_count = 0
        max_retries = 5
        while not success and retry_count < max_retries:
            print(f"Validation failed for step {step}: {output}")
            fixed_code, df = fix_code(full_code, output, csv_path, df)
            with open(code_filename, "w") as file:
                file.write(fixed_code)
            success, output = validate_unit_code(code_filename)
            retry_count += 1
        
        if not success:
            print(f"Failed to fix code for step {step} after {max_retries} attempts. Skipping.")
            continue
        
        generated_code[step] = full_code
        documentation_snippet = generate_documentation(
            step, columns_info, types_info, sample_data_info, value_counts_info, description_info)
        documentation_snippets.append(documentation_snippet)

    # Combine all code snippets
    combined_code = "\n\n".join([generated_code[step] for step in selected_step_numbers])

    # Save the combined code to a file
    with open("combined_code.py", "w") as file:
        file.write(combined_code)

    # Validate the combined code
    success, output = validate_unit_code("combined_code.py")
    if success:
        print("Combined code validated successfully.")
    else:
        print(f"Validation failed for combined code.")
        print(f"Error: {output}")

    # Save documentation to a separate file
    with open("documentation.txt", "w") as file:
        file.write("\n\n".join(documentation_snippets))
    print("Documentation saved to documentation.txt")

if __name__ == "__main__":
    main()

Validation failed for step 11: Traceback (most recent call last):
  File "/Users/ilya/Desktop/GitHub_Repositories/Thesis/API_deepseek-chat copy/step_11_code.py", line 2, in <module>
    df = pd.read_csv('/Users/ilya/Desktop/GitHub_Repositories/HW_University/Data_Mining/datasets/insurance.csv')
NameError: name 'pd' is not defined. Did you mean: 'id'?

Validation failed for step 21: Traceback (most recent call last):
  File "/Users/ilya/Desktop/GitHub_Repositories/Thesis/API_deepseek-chat copy/step_21_code.py", line 2, in <module>
    df = pd.read_csv('/Users/ilya/Desktop/GitHub_Repositories/HW_University/Data_Mining/datasets/insurance.csv')
NameError: name 'pd' is not defined. Did you mean: 'id'?

Validation failed for step 21: Traceback (most recent call last):
  File "/Users/ilya/Desktop/GitHub_Repositories/Thesis/API_deepseek-chat copy/step_21_code.py", line 31, in <module>
    df = df.fillna(df.mean())
  File "/Users/ilya/miniconda3/envs/thesis/lib/python3.10/site-packages/pandas/co

KeyboardInterrupt: 