# Imports

In [1]:
import json
import os
from pathlib import Path

import numpy as np
import pandas as pd
from loguru import logger
from sklearn.decomposition import PCA

from graph import Step
from language_modeling import OpenAiLlamaApi, LlamaModel, PromptGenerator
from code_generation import ValidationCodeGenerator, MainCodeGenerator
from orchestrator import Orchestrator
from utils import get_dataset_info

# Configure logger.
# logger.add() registers a new sink on every call, so re-running this cell would
# attach "execution.log" again and every message would be written several times.
# Remember the handler id and drop the previously registered sink before adding
# a fresh one, which makes the cell safe to re-run.
try:
    logger.remove(_EXECUTION_LOG_HANDLER_ID)
except (NameError, ValueError):
    # NameError: first run of the cell (no id stored yet).
    # ValueError: the stored handler was already removed elsewhere.
    pass
# Assigning the returned handler id also keeps it from being echoed as cell output.
_EXECUTION_LOG_HANDLER_ID = logger.add("execution.log", rotation="500 MB")

1

# Constants

In [2]:
# Example step implementation handed to PromptGenerator (its first constructor
# argument in the cells below).
# NOTE(review): presumably shown to the model as a formatting example for generated
# steps — confirm against PromptGenerator. The sample is named step_40 and uses the
# 'db4' wavelet, whereas in steps_complicated_case wavelet extraction is step "30"
# with 'db3' and step "40" is PCA; `import pandas as pd` is unused inside the sample.
EXAMPLE_STEP_SCRIPT = """
import pandas as pd
import pywt
from sklearn.preprocessing import StandardScaler

def step_40(Segments_normalized, Dec_levels):
    Features = []
    for segment in Segments_normalized:
        coeffs = pywt.wavedec(segment, 'db4', level=Dec_levels)
        features = [coefficient.mean() for coefficient in coeffs]
        Features.append(features)
    return StandardScaler().fit_transform(Features)
"""

# Model identifier passed to OpenAiLlamaApi.
MODEL_TAG = "meta-llama/llama-3-70b-instruct"

# Machine-specific locations. The defaults are the original author's absolute paths,
# so behaviour is unchanged when no environment variable is set. To run the notebook
# on another machine, export CODEGEN_DATASETS_DIR / CODEGEN_PYTHON_PATH instead of
# editing this cell.
DATASETS_DIR = os.environ.get(
    "CODEGEN_DATASETS_DIR",
    "/Users/ilya/Desktop/CodeGeneration-main/datasets",
)

# Kept as plain strings: later cells embed these values into string parameters.
SIMPLE_CASE_CSV_PATH = os.path.join(DATASETS_DIR, "insurance.csv")
COMPLICATED_CASE_CSV_PATH = os.path.join(DATASETS_DIR, "learning-file_2.csv")

# Python interpreter path handed to the Orchestrator.
PYTHON_PATH = os.environ.get(
    "CODEGEN_PYTHON_PATH",
    "/Users/ilya/thesis_env/myenv/bin/python",
)

# Graphs

In [3]:
# Step graph for the simple case.
# Each row is (step_id, description, dependencies, input_vars, output_vars);
# additional_info is empty for every step of this graph.
steps_simple_case = [
    Step(
        step_id=step_id,
        description=description,
        dependencies=dependencies,
        input_vars=input_vars,
        output_vars=output_vars,
        additional_info="",
    )
    for step_id, description, dependencies, input_vars, output_vars in (
        (
            "11",
            "Load the CSV file as pandas DataFrame",
            [],
            ["csv_path"],
            ["df"],
        ),
        (
            "21",
            "Examine the structure and characteristics of the data",
            ["11"],
            ["df"],
            ["structure_info"],
        ),
        (
            "31",
            "Identify missing values, data types, and handle missing values if there are any",
            ["11", "21"],
            ["df"],
            ["df_cleaned", "data_types_info"],
        ),
        (
            "41",
            "Identify if there is a need to convert categorical variables to numerical representations. If yes, then convert them.",
            ["11", "31"],
            ["df_cleaned", "data_types_info"],
            ["df_encoded"],
        ),
        (
            "51",
            "Split the preprocessed data into training and testing sets, and implement a machine learning algorithm (choose from scikit-learn, XGBoost, LightGBM, or CatBoost).",
            ["11", "31", "41"],
            ["df_encoded"],
            ["model", "X_train", "X_test", "y_train", "y_test"],
        ),
        (
            "61",
            "Evaluate the model's performance on both training and testing data, calculate evaluation metrics (for classification: [accuracy, precision, recall, F1-score]; for regression: [R^2, MSE, RMSE]), and compare the difference.",
            ["51"],
            ["model", "X_train", "X_test", "y_train", "y_test"],
            ["evaluation_results", "metrics"],
        ),
    )
]

In [4]:
# Step graph for the complicated (signal-processing) case.
# Each row is (step_id, description, dependencies, input_vars, output_vars, additional_info).
steps_complicated_case = [
    Step(
        step_id=step_id,
        description=description,
        dependencies=dependencies,
        input_vars=input_vars,
        output_vars=output_vars,
        additional_info=additional_info,
    )
    for step_id, description, dependencies, input_vars, output_vars, additional_info in (
        (
            "10",
            "Import raw data from CSV and segment it",
            [],
            ["csv_path", "SizeSegment"],
            ["Segments"],
            "Use pandas to read the CSV and create segments of size SizeSegment.",
        ),
        (
            "20",
            "Normalize the segmented data using MinMaxScaler",
            ["10"],
            ["Segments"],
            ["Segments_normalized"],
            "Segments is a list of 1D numpy arrays. Each segment should be normalized independently.",
        ),
        (
            "30",
            "Extract features using wavelet decomposition",
            ["20"],
            ["Segments_normalized", "Dec_levels"],
            ["Features"],
            "Use pywavelets (pywt) library with 'db3' wavelet and specified Dec_levels.",
        ),
        (
            "40",
            "Apply PCA for dimension reduction",
            ["30"],
            ["Features", "NC_pca"],
            ["PCA_Features", "pca"],
            "Use sklearn's PCA. Return both the transformed features and the PCA object.",
        ),
        (
            "50",
            "Train model, evaluate, and calculate metrics",
            ["40"],
            ["PCA_Features", "kernel", "nu", "gamma"],
            ["FittedClassifier", "Prec_learn", "Prec_test"],
            # The leading whitespace inside this literal is part of the prompt text; keep it as is.
            """
        1. Create labels: np.ones for learning data.
        2. Split data into train and test sets (80% train, 20% test).
        3. Create and fit a One-Class SVM classifier using sklearn.
        4. Predict labels for training data.
        5. Calculate error rate for training data.
        6. Predict labels for test data (assume all test data as anomaly, i.e., -1).
        7. Calculate error rate for test data.
        8. Calculate precision as 1 - error_rate for both training and test.
        Return the fitted classifier and both precision values.
        """,
        ),
    )
]

# Credentials

In [5]:
# Parse the local env.json file that holds the OpenRouter credentials.
credentials_dict = json.loads(Path('env.json').read_text())

# Endpoint and key used to construct the API client.
API_URL = "https://openrouter.ai/api/v1"
API_KEY = credentials_dict["OPENROUTER_API_KEY"]

# Commons

In [6]:
# Objects shared by every case below: the API client, the model wrapper around it,
# and the validation code generator.
llama_api = OpenAiLlamaApi(API_URL, API_KEY, MODEL_TAG)
model = LlamaModel(llama_api)
validation_code_generator = ValidationCodeGenerator()
# Backward-compatible alias: other cells still refer to the original misspelled name.
validation_code_genrator = validation_code_generator


# Simple case

In [7]:
# Load the simple-case dataset and collect its dataset info
# (passed to PromptGenerator in the next cell).
csv_path = SIMPLE_CASE_CSV_PATH
raw_data = pd.read_csv(csv_path)
dataset_info_simple_case = get_dataset_info(raw_data)

# Parameter values are strings holding Python literals (hence the quotes around the path).
# repr() produces exactly the same text as f"'{csv_path}'" for ordinary paths, but also
# escapes backslashes and quotes, so e.g. Windows paths still form a valid string literal.
parameters_simple_case = {
    'csv_path': repr(csv_path),
}

In [8]:
# Build the orchestrator for the simple case.
# additional_lines is empty here; the (disabled) complicated case passes print statements
# instead — presumably lines appended to the generated main script; confirm in MainCodeGenerator.
main_code_generator = MainCodeGenerator(additional_lines=[])
# The prompt generator receives the example step script and the simple-case dataset info.
prompt_generator = PromptGenerator(EXAMPLE_STEP_SCRIPT, dataset_info_simple_case)
orchestrator = Orchestrator(
    model,
    prompt_generator,
    validation_code_genrator,
    main_code_generator,
    'out_simple_case',  # presumably the output directory for generated files — TODO confirm
    PYTHON_PATH  # presumably the interpreter used to execute generated code — TODO confirm
)

In [9]:
# Run the simple-case step graph with its parameters; the run logs through loguru
# (the captured output below shows the API response and the generated step file).
orchestrator.run_steps(steps_simple_case, parameters_simple_case)

2024-08-14 22:59:14.096 | INFO     | language_modeling:execute_request:51 - Full API response: {'id': 'gen-PQRpqxdpNL3LoQb84rKl7cozxlyp', 'model': 'meta-llama/llama-3-70b-instruct', 'object': 'chat.completion', 'created': 1723661952, 'choices': [{'logprobs': None, 'finish_reason': 'eos', 'index': 0, 'message': {'role': 'assistant', 'content': 'Here is the Python function named \'step_11\' to load the CSV file as a pandas DataFrame:\n```\nimport pandas as pd\n\ndef step_11(csv_path):\n    try:\n        df = pd.read_csv(csv_path)\n        return df\n    except FileNotFoundError:\n        print("File not found. Please check the file path.")\n        return None\n```'}}], 'usage': {'prompt_tokens': 1068, 'completion_tokens': 74, 'total_tokens': 1142}}
2024-08-14 22:59:14.102 | INFO     | orchestrator:generate_step_file:154 - generated step source for step 11, filename: step_11.py,
source:
 


# Complicated case

### The code below for the complicated case is commented out because its dataset is private and cannot be provided, so these cells cannot be run as-is. To test them, you need to request the original dataset.

In [10]:
# NOTE(review): disabled cell — it needs the private dataset to run.
# NOTE(review): `dataset_info_compicated_case` is misspelled ("compicated") here and in the
# cell that builds the PromptGenerator; rename both together if this code is re-enabled.
# csv_path = COMPLICATED_CASE_CSV_PATH
# raw_data = pd.read_csv(csv_path)
# dataset_info_compicated_case = get_dataset_info(raw_data)
# signal_data = raw_data['signal'].values


# SizeSegment = min(512, len(signal_data) // 100)
# gamma = 'scale'
# nu = 0.1
# kernel = "rbf"

# # PCA
# NOTE(review): signal_data.reshape(-1, 1) has a single column, so this PCA has exactly one
# component and NC_pca always evaluates to 1 — confirm that this is intended.
# pca = PCA().fit(signal_data.reshape(-1, 1))
# cumulative_variance_ratio = np.cumsum(pca.explained_variance_ratio_)
# NC_pca = np.argmax(cumulative_variance_ratio >= 0.95) + 1

# Dec_levels = int(np.log2(SizeSegment)) - 3

# Parameter values are strings holding Python literals: strings are quoted, numbers are not.
# parameters_complicated_case = {
#     'csv_path': f"'{csv_path}'",
#     "SizeSegment": f"{SizeSegment}",
#     "gamma": f"'{gamma}'",
#     "nu": f"{nu}",
#     "kernel" : f"'{kernel}'",
#     "NC_pca": f"{NC_pca}",
#     "Dec_levels": f"{Dec_levels}",
# }

In [11]:
# Disabled: orchestrator setup for the complicated case (needs the private dataset).
# main_code_generator = MainCodeGenerator([
#         "print(f'Precision on training data: {Prec_learn:.2f}')",
#         "print(f'Precision on test data: {Prec_test:.2f}')"
# ])
# prompt_generator = PromptGenerator(EXAMPLE_STEP_SCRIPT, dataset_info_compicated_case)

# orchestrator = Orchestrator(
#     model,
#     prompt_generator,
#     validation_code_genrator,
#     main_code_generator,
#     'out_complicated_case',
#     PYTHON_PATH
# )

In [12]:
# orchestrator.run_steps(steps_complicated_case, parameters_complicated_case)