In [1]:
import pandas as pd
import subprocess
import os
import requests
from dotenv import load_dotenv
from typing import Dict, List, Tuple, Any, Set
import json

# Load environment variables from a .env file so that settings read from the
# environment later in this module (such as the API key) are available.
load_dotenv()

# Workflow definition. Each key is a step number; each value holds the
# natural-language description of the step and the step numbers it depends on.
# The table rows are (step number, description, dependencies).
workflow_steps = {
    number: {"description": description, "dependencies": dependencies}
    for number, description, dependencies in (
        (11, "Load the CSV file as pandas DataFrame", []),
        (21, "Examine the structure and characteristics of the data", [11]),
        (22, "Identify missing values, data types, and statistical summary", [11, 21]),
        (31, "Handle missing values (remove or impute) if there are so", [11, 22]),
        (32, "Identify if there is a need to convert categorical variables to numerical representations. If yes, then convert them.", [11, 22, 31]),
        (35, "Split the preprocessed data into training and testing sets", [11, 31, 32]),
        (51, "Implement a single most appropriate machine learning algorithm for the dataset (choose from scikit-learn, XGBoost, LightGBM, or CatBoost).", [35]),
        (52, "Fine-tune the model if necessary", [51]),
        (53, "Train the selected model on the training data and evaluate its performance on the training data", [35, 51, 52]),
        (61, "Evaluate the trained model's performance on the testing data", [35, 53]),
        (62, "Calculate evaluation metrics (e.g., accuracy, precision, recall, F1-score)", [61]),
    )
}

# Chat API configuration: requests go to the OpenRouter endpoint below, and the
# bearer token is taken from the OPENROUTER_API_KEY environment variable
# (None when the variable is not set).
api_key = os.environ.get('OPENROUTER_API_KEY')
api_url = "https://openrouter.ai/api/v1"

def openai_chat(request, timeout=120):
    """Send one user message to the chat-completions endpoint and return the reply text.

    Args:
        request: Prompt text sent as the single ``user`` message.
        timeout: Seconds to wait for the server before the request is
            abandoned. Without a timeout ``requests`` waits indefinitely, so a
            stalled connection would hang the whole workflow.

    Returns:
        The ``content`` of the first choice in the API response.

    Raises:
        RuntimeError: If no API key is configured.
        requests.RequestException: If the HTTP request fails, times out, or
            returns an error status.
        KeyError, IndexError: If the response does not have the expected
            ``choices[0].message.content`` structure.
    """
    if not api_key:
        # Fail early with a clear message instead of sending "Bearer None".
        raise RuntimeError(
            "OPENROUTER_API_KEY is not set; add it to the environment or the .env file"
        )

    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }
    data = {
        "model": "meta-llama/llama-3-70b-instruct",
        "messages": [{"role": "user", "content": request}]
    }

    # Stays None when the request fails before any response is received, so the
    # error handlers below can tell whether there is a body to print.
    response = None
    try:
        response = requests.post(
            f"{api_url}/chat/completions",
            headers=headers,
            json=data,
            timeout=timeout,
        )
        response.raise_for_status()  # Raises for 4xx/5xx status codes
        result = response.json()
        if 'choices' not in result or not result['choices']:
            raise KeyError("No 'choices' in the API response")
        return result["choices"][0]["message"]["content"]
    except requests.RequestException as e:
        print(f"API request failed: {e}")
        print(f"Response content: {response.text if response is not None else 'No response'}")
        raise
    except (KeyError, IndexError) as e:
        print(f"Unexpected API response format: {e}")
        print(f"Response content: {response.text if response is not None else 'No response'}")
        raise

def generate_code_snippet(request: str) -> str:
    """Forward ``request`` to ``openai_chat`` and return its reply unchanged."""
    reply = openai_chat(request)
    return reply

def clean_and_correct_code(generated_code: str, csv_path: str) -> str:
    """Turn a raw model reply into plain Python source.

    When the reply contains complete Markdown code fences, only the text
    inside the fences is kept, so explanations written before or after the
    code do not end up in the generated script. When there is no complete
    fenced block, any stray fence markers are removed and the rest of the
    text is kept. Lines beginning with "here is the" (case-insensitive) are
    dropped, and every ``{csv_path}`` placeholder is replaced by a Python
    string literal for ``csv_path``.

    Args:
        generated_code: Text returned by the model.
        csv_path: Dataset path substituted for ``{csv_path}`` placeholders.

    Returns:
        The cleaned source text.
    """
    import re  # local import: ``re`` is not among this file's top-level imports

    # A fence opens with ``` plus an optional info string (e.g. "python") and
    # closes at the next ```; DOTALL lets the body span multiple lines.
    fenced_blocks = re.findall(r"```[^\n`]*\n(.*?)```", generated_code, flags=re.DOTALL)
    if fenced_blocks:
        cleaned_code = "\n".join(block.strip("\n") for block in fenced_blocks)
    else:
        cleaned_code = generated_code.replace("```python", "").replace("```", "")
    cleaned_code = cleaned_code.strip()

    cleaned_code_lines = [
        line
        for line in cleaned_code.split("\n")
        if not line.lower().startswith("here is the")
    ]
    cleaned_code = "\n".join(cleaned_code_lines)

    # repr() yields a valid literal even when the path contains backslashes or
    # quotes; for ordinary paths it is the same single-quoted text as before.
    return cleaned_code.replace("{csv_path}", repr(csv_path))

def get_dataset_info(csv_path: str) -> Tuple[List[str], Dict[str, Any], Dict[str, List[Any]], Dict[str, Dict[Any, int]], Dict[str, Dict[str, float]]]:
    """Read the CSV at ``csv_path`` and collect summary information about it.

    Returns:
        A 5-tuple of:
        * the column names,
        * a mapping of column name to dtype,
        * the rows returned by ``DataFrame.head()`` as column -> list of values,
        * per-column value counts as column -> {value: count},
        * the ``DataFrame.describe()`` statistics as column -> {statistic: value}.
    """
    frame = pd.read_csv(csv_path)

    counts_by_column = {}
    for column in frame.columns:
        counts_by_column[column] = frame[column].value_counts().to_dict()

    return (
        frame.columns.tolist(),
        frame.dtypes.to_dict(),
        frame.head().to_dict(orient='list'),
        counts_by_column,
        frame.describe().to_dict(),
    )

def save_dataset_info(csv_path: str, info_file_path: str):
    """Summarise the CSV at ``csv_path`` and write the summary as JSON.

    The summary comes from ``get_dataset_info`` and is trimmed before saving:
    dtypes are converted to strings (dtype objects are not JSON serializable),
    sample values and value counts are cut to their first five entries per
    column, and only the count/mean/std/min/max statistics are kept.

    Args:
        csv_path: Path of the CSV file to summarise.
        info_file_path: Path of the JSON file to write.
    """
    kept_statistics = ['count', 'mean', 'std', 'min', 'max']
    columns, types, sample_data, value_counts, description = get_dataset_info(csv_path)

    dataset_info = {
        'columns': columns,
        'types': {name: str(dtype) for name, dtype in types.items()},
        'sample_data': {name: values[:5] for name, values in sample_data.items()},
        'value_counts': {
            name: dict(list(counts.items())[:5])
            for name, counts in value_counts.items()
        },
        'description': {
            name: {stat: value for stat, value in stats.items() if stat in kept_statistics}
            for name, stats in description.items()
        },
    }

    with open(info_file_path, 'w') as f:
        json.dump(dataset_info, f)

def validate_unit_code(code_filename: str, timeout: float = 600.0) -> Tuple[bool, str]:
    """Run a Python script in a subprocess and report whether it succeeded.

    The script runs under the interpreter executing this program
    (``sys.executable``) so it sees the same installed packages; a bare
    ``python`` could resolve to a different installation on PATH.

    Args:
        code_filename: Path of the script to execute.
        timeout: Maximum run time in seconds before the script is treated as
            failed, so a hanging script cannot block the caller forever.

    Returns:
        ``(True, stdout)`` when the script exits with status 0, otherwise
        ``(False, message)`` where ``message`` is the script's stderr or a
        description of why it could not be run to completion.
    """
    import sys  # local import: ``sys`` is not among this file's top-level imports

    try:
        result = subprocess.run(
            [sys.executable or "python", code_filename],
            capture_output=True,
            text=True,
            timeout=timeout,
        )
    except subprocess.TimeoutExpired:
        return False, f"Execution of {code_filename} timed out after {timeout} seconds"
    except (OSError, ValueError) as e:
        # The interpreter could not be started or the arguments were invalid.
        return False, str(e)

    if result.returncode != 0:
        return False, result.stderr
    return True, result.stdout

def generate_documentation(step: int, dataset_info: Dict[str, Any]) -> str:
    """Ask the model for a prose description of what a workflow step does.

    The prompt combines the description of ``step`` from the module-level
    ``workflow_steps`` with the columns, types, sample data, value counts and
    statistics found in ``dataset_info``.

    Returns:
        The model's reply, as returned by ``generate_code_snippet``.
    """
    step_description = workflow_steps[step]['description']
    sentences = [
        f"Provide a clear and concise description of the job performed by the code for the following step: {step_description}.",
        "The description should summarize the main tasks and key points without going into the specifics of the code.",
        f"The dataset has the following columns: {dataset_info['columns']}.",
        f"The data types are: {dataset_info['types']}.",
        f"Sample data: {dataset_info['sample_data']}.",
        f"Value counts: {dataset_info['value_counts']}.",
        f"Description: {dataset_info['description']}.",
    ]
    # Sentences are separated by one space, giving a single-line prompt.
    return generate_code_snippet(" ".join(sentences))

def fix_code(code_snippet: str, error_message: str, csv_path: str) -> str:
    """Ask the model to repair ``code_snippet`` given the error it produced.

    Args:
        code_snippet: Source code that failed.
        error_message: Error text reported for that code.
        csv_path: Dataset path passed on to ``clean_and_correct_code``.

    Returns:
        The model's reply after cleaning by ``clean_and_correct_code``.
    """
    sections = [
        "The following code snippet encountered an error:",
        code_snippet,
        f"Error message:\n{error_message}",
        "Please fix the code snippet to resolve the error without providing any explanations or comments.",
    ]
    # Sections are separated by a blank line.
    reply = generate_code_snippet("\n\n".join(sections))
    return clean_and_correct_code(reply, csv_path)

def get_all_prerequisites(step: int, workflow_steps: Dict[int, Dict[str, Any]]) -> Set[int]:
    """Return every direct and indirect dependency of ``step``.

    The dependency graph is walked iteratively and each step is expanded at
    most once, so shared dependencies are not re-traversed and a cyclic
    definition terminates instead of recursing without bound. ``step`` itself
    is never part of the result.

    Args:
        step: Step number whose prerequisites are wanted.
        workflow_steps: Mapping of step number to a dict with a
            ``'dependencies'`` list of step numbers.

    Returns:
        The set of prerequisite step numbers (empty when there are none).

    Raises:
        KeyError: If ``step`` or one of its dependencies is not a key of
            ``workflow_steps``.
    """
    prerequisites: Set[int] = set()
    pending = list(workflow_steps[step]['dependencies'])
    while pending:
        current = pending.pop()
        if current == step or current in prerequisites:
            continue  # already handled, or a cycle leading back to the start
        prerequisites.add(current)
        pending.extend(workflow_steps[current]['dependencies'])
    return prerequisites

def generate_code_for_step(step: int, workflow_steps: Dict[int, Dict[str, Any]], generated_code: Dict[int, str], csv_path: str, dataset_info: Dict[str, Any]) -> str:
    """Build a standalone script that runs ``step`` after all its prerequisites.

    The script imports pandas, loads ``csv_path`` into ``df``, defines one
    ``step_<n>(df)`` function per prerequisite and for ``step`` itself, and then
    calls those functions in ascending step-number order.

    Args:
        step: Step number to generate code for.
        workflow_steps: Workflow definition (step number -> description and
            dependencies).
        generated_code: Cache of step number -> code used as the body of that
            step's function. Missing prerequisites are generated and stored
            here, and the snippet generated for ``step`` is stored as well so a
            later call can reuse it. The dict is mutated in place.
        csv_path: Dataset path embedded in the script.
        dataset_info: Dataset summary passed on to the snippet generator.

    Returns:
        The full source text of the script.
    """
    def render_step_function(step_number: int, snippet: str) -> str:
        # Indent every snippet line by four spaces to form the function body.
        body = "".join(f"    {line}\n" for line in snippet.split('\n'))
        return f"def step_{step_number}(df):\n{body}    return df\n\n"

    prerequisites = get_all_prerequisites(step, workflow_steps)

    parts = [
        "import pandas as pd\n\n",
        # !r emits a valid string literal even if the path contains
        # backslashes or quotes.
        f"# Load the dataset\ndf = pd.read_csv({csv_path!r})\n\n",
    ]

    for prereq in sorted(prerequisites):
        if prereq not in generated_code:
            generated_code[prereq] = generate_code_snippet_for_step(prereq, workflow_steps, csv_path, dataset_info)
        parts.append(render_step_function(prereq, generated_code[prereq]))

    current_step_code = generate_code_snippet_for_step(step, workflow_steps, csv_path, dataset_info)
    generated_code[step] = current_step_code
    parts.append(render_step_function(step, current_step_code))

    parts.append("# Execute steps\n")
    # NOTE: execution order relies on step numbers increasing along dependencies.
    parts.extend(f"df = step_{s}(df)\n" for s in sorted(prerequisites | {step}))

    return "".join(parts)

def generate_code_snippet_for_step(step: int, workflow_steps: Dict[int, Dict[str, Any]], csv_path: str, dataset_info: Dict[str, Any]) -> str:
    """Ask the model for the code implementing one workflow step.

    The prompt contains the step description from ``workflow_steps``, the
    dataset summary from ``dataset_info``, and instructions to assume an
    existing DataFrame named ``df`` and to return code only.

    Returns:
        The model's reply after cleaning by ``clean_and_correct_code``.
    """
    step_description = workflow_steps[step]['description']
    sentences = [
        f"Write a Python code snippet for the following step: {step_description}.",
        f"The dataset has the following columns: {dataset_info['columns']}.",
        f"The data types are: {dataset_info['types']}.",
        f"Here's a sample of the data: {dataset_info['sample_data']}.",
        f"Value counts (top 5): {dataset_info['value_counts']}.",
        f"Statistical description: {dataset_info['description']}.",
        "Assume that the dataset has already been loaded into a DataFrame named 'df'.",
        "Do not include import statements or code to load the dataset.",
        "Only return the code specific to this step without any additional explanations.",
        "Use the actual column names from the dataset in your code.",
    ]
    # Sentences are separated by one space, giving a single-line prompt.
    raw_reply = generate_code_snippet(" ".join(sentences))
    return clean_and_correct_code(raw_reply, csv_path)

def main(
    csv_path: str = "/Users/ilya/Desktop/GitHub_Repositories/HW_University/Data_Mining/datasets/insurance.csv",
    info_file_path: str = "dataset_info.json",
    max_fix_attempts: int = 3,
):
    """Generate, validate and document code for every workflow step.

    For each step a standalone script is generated, written to
    ``step_<n>_code.py`` and executed. A failing script is sent back to the
    model for repair at most ``max_fix_attempts`` times; each repair request
    contains the most recent version of the script. Scripts that validate are
    concatenated into ``combined_code.py`` (which is executed once more), and
    the per-step descriptions are written to ``documentation.txt``.

    Args:
        csv_path: Path of the dataset CSV file.
        info_file_path: JSON file holding the dataset summary; it is created
            from ``csv_path`` when it does not exist yet.
        max_fix_attempts: Upper bound on repair requests per step, so a script
            the model cannot fix does not loop forever.
    """
    if not os.path.exists(info_file_path):
        save_dataset_info(csv_path, info_file_path)

    with open(info_file_path, 'r') as f:
        dataset_info = json.load(f)

    selected_step_numbers = [11, 21, 22, 31, 32, 35, 51, 52, 53, 61, 62]
    # Per-step code used as function bodies by generate_code_for_step. It is
    # kept separate from the validated full scripts so that a whole script is
    # never embedded as the body of a later step's function.
    snippet_cache = {}
    validated_programs = {}
    documentation_snippets = []

    for step in selected_step_numbers:
        try:
            full_code = generate_code_for_step(step, workflow_steps, snippet_cache, csv_path, dataset_info)

            code_filename = f"step_{step}_code.py"
            with open(code_filename, "w") as file:
                file.write(full_code)

            success, output = validate_unit_code(code_filename)
            attempts = 0
            while not success and attempts < max_fix_attempts:
                attempts += 1
                print(f"Validation failed for step {step}: {output}")
                # Repair the latest version, not the first draft.
                full_code = fix_code(full_code, output, csv_path)
                with open(code_filename, "w") as file:
                    file.write(full_code)
                success, output = validate_unit_code(code_filename)

            if attempts or not success:
                # Any cached code for this step came from the first draft,
                # which failed; drop it so later steps request fresh code.
                snippet_cache.pop(step, None)

            if not success:
                print(f"Giving up on step {step} after {attempts} fix attempts: {output}")
                continue

            validated_programs[step] = full_code
            documentation_snippet = generate_documentation(step, dataset_info)
            documentation_snippets.append(documentation_snippet)
        except Exception as e:
            print(f"Error processing step {step}: {e}")
            if step not in validated_programs:
                snippet_cache.pop(step, None)
            # Don't break the loop, continue with the next step
            continue

    # Combine all successfully validated scripts
    combined_code = "\n\n".join(
        validated_programs[step] for step in selected_step_numbers if step in validated_programs
    )

    # Save the combined code to a file
    with open("combined_code.py", "w") as file:
        file.write(combined_code)

    # Validate the combined code
    success, output = validate_unit_code("combined_code.py")
    if success:
        print("Combined code validated successfully.")
    else:
        print("Validation failed for combined code.")
        print(f"Error: {output}")

    # Save documentation to a separate file
    with open("documentation.txt", "w") as file:
        file.write("\n\n".join(documentation_snippets))
    print("Documentation saved to documentation.txt")

if __name__ == "__main__":
    main()

Validation failed for step 11: Traceback (most recent call last):
  File "/Users/ilya/Desktop/GitHub_Repositories/Thesis/API_llama-3-70b-instruct copy 2/step_11_code.py", line 19, in <module>
    df = step_11(df)
  File "/Users/ilya/Desktop/GitHub_Repositories/Thesis/API_llama-3-70b-instruct copy 2/step_11_code.py", line 7, in step_11
    df = pd.DataFrame({
  File "/Users/ilya/miniconda3/envs/thesis/lib/python3.10/site-packages/pandas/core/frame.py", line 693, in __init__
    dtype = self._validate_dtype(dtype)
  File "/Users/ilya/miniconda3/envs/thesis/lib/python3.10/site-packages/pandas/core/generic.py", line 515, in _validate_dtype
    dtype = pandas_dtype(dtype)
  File "/Users/ilya/miniconda3/envs/thesis/lib/python3.10/site-packages/pandas/core/dtypes/common.py", line 1645, in pandas_dtype
    npdtype = np.dtype(dtype)
  File "/Users/ilya/miniconda3/envs/thesis/lib/python3.10/site-packages/numpy/core/_internal.py", line 62, in _usefields
    names, formats, offsets, titles = _make

KeyboardInterrupt: 