### Step 1: Define the Graph with Inputs and Outputs

In [1]:
import subprocess
import os
import json
import requests
import pandas as pd
from dotenv import load_dotenv
from loguru import logger
import numpy as np
from sklearn.decomposition import PCA

# Configure logger
logger.add("execution.log", rotation="500 MB")

# Load environment variables from .env file
load_dotenv()

api_url = "https://openrouter.ai/api/v1"
api_key = os.getenv('OPENROUTER_API_KEY')

# Predefined parameters
raw_data = pd.read_csv(
    "/Users/ilya/Desktop/GitHub_Repositories/Thesis/datasets/complicated_case/learning-file_2.csv")
# Assume raw_data is a pandas DataFrame with 'timestamp' and 'signal' columns
signal_data = raw_data['signal'].values

# Adjust based on data size
SizeSegment = min(512, len(signal_data) // 100)
gamma = 'scale'  # Let sklearn choose an appropriate scale
nu = 0.1  # This might need domain knowledge to set appropriately
kernel = "rbf"  # This is often a good default

# PCA
# We'll use the signal data for PCA parameter calculation
pca = PCA().fit(signal_data.reshape(-1, 1))
cumulative_variance_ratio = np.cumsum(pca.explained_variance_ratio_)
NC_pca = np.argmax(cumulative_variance_ratio >= 0.95) + 1

Dec_levels = int(np.log2(SizeSegment)) - 3  # Adjust based on segment size


workflow_steps = {
    10: {
        "description": "Import raw data from CSV and segment it",
        "dependencies": [],
        "input": ["csv_path", "SizeSegment"],
        "output": ["Segments"],
        "additional_info": "Use pandas to read the CSV and create segments of size SizeSegment."
    },
    20: {
        "description": "Normalize the segmented data using MinMaxScaler",
        "dependencies": [10],
        "input": ["Segments"],
        "output": ["Segments_normalized"],
        "additional_info": "Segments is a list of 1D numpy arrays. Each segment should be normalized independently."
    },
    30: {
        "description": "Extract features using wavelet decomposition",
        "dependencies": [20],
        "input": ["Segments_normalized", "Dec_levels"],
        "output": ["Features"],
        "additional_info": "Use pywavelets (pywt) library with 'db3' wavelet and specified Dec_levels."
    },
    40: {
        "description": "Apply PCA for dimension reduction",
        "dependencies": [30],
        "input": ["Features", "NC_pca"],
        "output": ["PCA_Features", "pca"],
        "additional_info": "Use sklearn's PCA. Return both the transformed features and the PCA object."
    },
    50: {
        "description": "Train model, evaluate, and calculate metrics",
        "dependencies": [40],
        "input": ["PCA_Features", "kernel", "nu", "gamma"],
        "output": ["FittedClassifier", "Prec_learn", "Prec_test"],
        "additional_info": """
        1. Create labels: np.ones for learning data.
        2. Split data into train and test sets (80% train, 20% test).
        3. Create and fit a One-Class SVM classifier using sklearn.
        4. Predict labels for training data.
        5. Calculate error rate for training data.
        6. Predict labels for test data (assume all test data as anomaly, i.e., -1).
        7. Calculate error rate for test data.
        8. Calculate precision as 1 - error_rate for both training and test.
        Return the fitted classifier and both precision values.
        """
    }
}

# Example step and validation scripts to guide the model
example_step_script = """
import pandas as pd
import pywt
from sklearn.preprocessing import StandardScaler

def step_40(Segments_normalized, Dec_levels):
    Features = []
    for segment in Segments_normalized:
        coeffs = pywt.wavedec(segment, 'db4', level=Dec_levels)
        features = [coefficient.mean() for coefficient in coeffs]
        Features.append(features)
    return StandardScaler().fit_transform(Features)
"""

example_validation_script = """
import pandas as pd
from step_10 import step_10
from step_20 import step_20
from step_30 import step_30
from step_40 import step_40

def validate_step():
    csv_path = '/path/to/your/csv/file.csv'
    raw_data = step_10(csv_path)
    Segments = step_20(raw_data, SizeSegment=512)
    Segments_normalized = step_30(Segments)
    Features = step_40(Segments_normalized, Dec_levels=5)
    print(Features)

if __name__ == '__main__':
    validate_step()
"""

def openai_chat(request):
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }
    data = {
        "model": "meta-llama/llama-3-70b-instruct",
        "messages": [{"role": "user", "content": request}]
    }
    response = requests.post(f"{api_url}/chat/completions", headers=headers, json=data)
    
    # Log the full response for debugging
    response_json = response.json()
    logger.info(f"Full API response: {response_json}")
    
    # Check if 'choices' key exists in the response
    if "choices" in response_json and response_json["choices"]:
        return response_json["choices"][0]["message"]["content"]
    else:
        logger.error(f"Request: {data}")
        logger.error(f"Response: {response_json}")
        raise ValueError("The response does not contain 'choices'. Full response: " + str(response_json))

def generate_code_snippet(request):
    response = openai_chat(request)
    return response

def clean_and_correct_code(generated_code, csv_path):
    cleaned_code = generated_code.replace("```python", "").replace("```", "").strip()
    cleaned_code_lines = [line for line in cleaned_code.split("\n") if not line.lower().startswith("here is the")]
    cleaned_code = "\n".join(cleaned_code_lines)
    corrected_code = cleaned_code.replace("{csv_path}", f"'{csv_path}'")
    return corrected_code

def validate_unit_code(code_filename):
    try:
        result = subprocess.run(["python", code_filename], capture_output=True, text=True)
        if result.returncode != 0:
            raise Exception(result.stderr)
        return True, result.stdout
    except Exception as e:
        return False, str(e)

def fix_code(code_snippet, error_message, csv_path):
    request = (
        f"The following code snippet encountered an error:\n\n{code_snippet}\n\n"
        f"Error message:\n{error_message}\n\n"
        f"Please fix the code snippet to resolve the error without providing any explanations or comments."
    )
    fixed_code = generate_code_snippet(request)
    return clean_and_correct_code(fixed_code, csv_path)

def get_all_dependencies(step, workflow_steps):
    dependencies = set(workflow_steps[step]["dependencies"])
    for dep in workflow_steps[step]["dependencies"]:
        dependencies.update(get_all_dependencies(dep, workflow_steps))
    return dependencies


def generate_code_for_step(step, workflow_steps, csv_path, dataset_info):
    additional_info = workflow_steps[step].get("additional_info", "")
    request = (
        f"Here is an example of a good step script:\n\n{example_step_script}\n\n"
        f"Write a Python function named 'step_{step}' for the following step: {workflow_steps[step]['description']}. "
        f"The function should take {', '.join(workflow_steps[step]['input'])} as input and return {', '.join(workflow_steps[step]['output'])}. "
        f"Ensure to include necessary imports and handle edge cases. "
        f"Additional information: {additional_info}\n"
        f"The dataset has the following columns: {dataset_info['columns']}. "
        f"The data types are: {dataset_info['types']}. "
        f"Here's a sample of the data: {dataset_info['sample_data']}. "
        f"Value counts (top 5): {dataset_info['value_counts']}. "
        f"Statistical description: {dataset_info['description']}. "
        f"Use these predefined parameters if needed: SizeSegment={SizeSegment}, gamma={gamma}, nu={nu}, kernel='{kernel}', NC_pca={NC_pca}, Dec_levels={Dec_levels}. "
        f"The input 'Segments' is a list of 1D numpy arrays, each representing a segment of the signal data. "
        f"Each segment should be normalized independently using sklearn's MinMaxScaler. "
        f"The output 'Segments_normalized' should be a list of normalized 1D numpy arrays. "
        f"Only return the function definition without any additional code or explanations."
    )
    code_snippet = generate_code_snippet(request)
    return clean_and_correct_code(code_snippet, csv_path)

def generate_validation_file(step, workflow_steps):
    dependencies = get_all_dependencies(step, workflow_steps)
    input_params = workflow_steps[step]["input"]
    output_params = workflow_steps[step]["output"]

    validation_code = "import pandas as pd\n"
    for dep in sorted(dependencies):
        validation_code += f"from step_{dep} import step_{dep}\n"
    validation_code += f"from step_{step} import step_{step}\n\n"
    
    # Add predefined parameters
    validation_code += f"SizeSegment = {SizeSegment}\n"
    validation_code += f"gamma = '{gamma}'\n"
    validation_code += f"nu = {nu}\n"
    validation_code += f"kernel = '{kernel}'\n"
    validation_code += f"NC_pca = {NC_pca}\n"
    validation_code += f"Dec_levels = {Dec_levels}\n\n"

    validation_code += "def validate_step():\n"
    validation_code += "    csv_path = '/Users/ilya/Desktop/GitHub_Repositories/Thesis/datasets/complicated_case/learning-file_2.csv'\n"

    for dep in sorted(dependencies):
        dep_inputs = ", ".join(workflow_steps[dep]["input"])
        dep_outputs = ", ".join(workflow_steps[dep]["output"])
        validation_code += f"    {dep_outputs} = step_{dep}({dep_inputs})\n"

    input_values = ", ".join(input_params)
    output_values = ", ".join(output_params)
    validation_code += f"    {output_values} = step_{step}({input_values})\n"
    validation_code += f"    print({output_values})\n"

    validation_code += "\nif __name__ == '__main__':\n"
    validation_code += "    validate_step()\n"

    with open(f"validate_step_{step}.py", "w") as file:
        file.write(validation_code)

def save_dataset_info(csv_path, info_file_path):
    df = pd.read_csv(csv_path)
    columns = df.columns.tolist()
    types = df.dtypes.apply(lambda x: str(x)).to_dict()
    sample_data = df.head().to_dict(orient='list')
    value_counts = {col: df[col].value_counts().head().to_dict() for col in df.columns}
    description = df.describe().to_dict()

    dataset_info = {
        'columns': columns,
        'types': types,
        'sample_data': sample_data,
        'value_counts': value_counts,
        'description': description
    }

    with open(info_file_path, 'w') as file:
        json.dump(dataset_info, file)

def generate_main_file(workflow_steps, selected_step_numbers, csv_path):
    main_code = "import pandas as pd\n\n"
    for step in selected_step_numbers:
        main_code += f"from step_{step} import step_{step}\n"
    
    # Add predefined parameters
    main_code += f"\nSizeSegment = {SizeSegment}\n"
    main_code += f"gamma = '{gamma}'\n"
    main_code += f"nu = {nu}\n"
    main_code += f"kernel = '{kernel}'\n"
    main_code += f"NC_pca = {NC_pca}\n"
    main_code += f"Dec_levels = {Dec_levels}\n\n"

    main_code += "def main():\n"
    main_code += f"    csv_path = '{csv_path}'\n"

    for step in selected_step_numbers:
        input_params = ", ".join(workflow_steps[step]['input'])
        output_params = ", ".join(workflow_steps[step]['output'])
        main_code += f"    {output_params} = step_{step}({input_params})\n"

    main_code += "    print(f'Precision on training data: {Prec_learn:.2f}')\n"
    main_code += "    print(f'Precision on test data: {Prec_test:.2f}')\n"

    main_code += "\nif __name__ == '__main__':\n"
    main_code += "    main()"

    with open("main.py", "w") as file:
        file.write(main_code)

def main():
    csv_path = "/Users/ilya/Desktop/GitHub_Repositories/Thesis/datasets/complicated_case/learning-file_2.csv"
    info_file_path = "dataset_info.json"

    if not os.path.exists(info_file_path):
        save_dataset_info(csv_path, info_file_path)

    with open(info_file_path, 'r') as f:
        dataset_info = json.load(f)

    selected_step_numbers = [10, 20, 30, 40, 50]
    for step in selected_step_numbers:
        try:
            code_snippet = generate_code_for_step(step, workflow_steps, csv_path, dataset_info)
            with open(f"step_{step}.py", "w") as file:
                file.write(code_snippet)
            generate_validation_file(step, workflow_steps)

            success, output = validate_unit_code(f"validate_step_{step}.py")
            while not success:
                logger.info(f"Validation failed for step {step}: {output}")
                fixed_code = fix_code(code_snippet, output, csv_path)
                with open(f"step_{step}.py", "w") as file:
                    file.write(fixed_code)
                generate_validation_file(step, workflow_steps)
                success, output = validate_unit_code(f"validate_step_{step}.py")
        except Exception as e:
            logger.error(f"Error processing step {step}: {e}")
            continue

    logger.info("Validation completed successfully.")

    generate_main_file(workflow_steps, selected_step_numbers, csv_path)

    # Validate the main script
    success, output = validate_unit_code("main.py")
    if success:
        logger.info("Main script validated successfully.")
    else:
        logger.info(f"Validation failed for main script.")
        logger.info(f"Error: {output}")

if __name__ == "__main__":
    main()

[32m2024-07-31 23:10:42.346[0m | [1mINFO    [0m | [36m__main__[0m:[36mopenai_chat[0m:[36m136[0m - [1mFull API response: {'id': 'gen-rseWPwRQeyHK7RWQxV1Nw7kPjjpj', 'model': 'meta-llama/llama-3-70b-instruct', 'object': 'chat.completion', 'created': 1722453038, 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': "Here is the function definition for `step_10`:\n```\nimport pandas as pd\nimport numpy as np\nfrom sklearn.preprocessing import MinMaxScaler\n\ndef step_10(csv_path, SizeSegment):\n    df = pd.read_csv(csv_path)\n    signal = df['signal'].values\n    Segments = [signal[i:i+SizeSegment] for i in range(0, len(signal), SizeSegment)]\n    Segments_normalized = [MinMaxScaler().fit_transform(segment.reshape(-1, 1)).reshape(-1) for segment in Segments]\n    return Segments_normalized\n```"}, 'finish_reason': 'stop', 'logprobs': {'tokens': None, 'token_logprobs': None, 'top_logprobs': None, 'text_offset': None}}], 'usage': {'prompt_tokens': 692, 'completion_to