In [1]:
import json
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import logging

In [2]:
# Root logger setup: emit INFO-and-above records, each prefixed with a timestamp.
logging.basicConfig(
    format='%(asctime)s - %(message)s',
    level=logging.INFO,
)

In [3]:
def parse_json(json_path):
    """Read a JSON file and return its decoded content.

    Parameters
    ----------
    json_path : str or path-like
        Location of the JSON file.

    Returns
    -------
    object
        Whatever ``json.load`` produces for the file (a dict for the
        configuration files this notebook consumes).

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the content is not valid JSON.
    """
    # Pin the encoding: without it open() falls back to the platform locale,
    # which mis-decodes non-ASCII configuration values on some systems.
    with open(json_path, 'r', encoding='utf-8') as file:
        return json.load(file)

In [4]:
def load_and_preprocess_data(csv_path, feature_config):
    """Load a CSV file and fill missing values in the selected numerical features.

    Only features that are selected, numerical, and whose ``missing_values``
    setting is ``'Impute'`` are touched; every other column is returned as read.

    Parameters
    ----------
    csv_path : str, path-like or file-like
        Forwarded unchanged to ``pandas.read_csv``.
    feature_config : dict
        Maps a column name to its settings dict. Keys read here:
        ``is_selected``, ``feature_variable_type`` and ``feature_details``
        (``missing_values`` plus optional ``impute_with`` / ``impute_value``).

    Returns
    -------
    pandas.DataFrame
        The loaded data with the configured imputations applied.

    Raises
    ------
    KeyError
        If a required settings key is absent, or a feature configured for
        imputation is not a column of the CSV.
    ValueError
        If ``impute_with`` is not a recognised strategy, or no usable fill
        value exists (e.g. the mean of a column with no observed values).
    """
    data = pd.read_csv(csv_path)

    for feature, settings in feature_config.items():
        if not settings['is_selected']:
            continue
        if settings['feature_variable_type'] != 'numerical':
            continue

        details = settings['feature_details']
        if details['missing_values'] != 'Impute':
            continue

        if feature not in data.columns:
            raise KeyError(f"Feature '{feature}' is not a column of the dataset.")

        strategy = details.get('impute_with', 'mean')
        column = data[feature]

        # Resolve the fill value afresh for every feature. An unrecognised
        # strategy must fail loudly here rather than fall through and reuse
        # whatever a previous loop iteration happened to set up.
        if strategy in ('Average of values', 'mean'):
            fill_value = column.mean()
        elif strategy == 'custom':
            fill_value = details.get('impute_value', 0)
        else:
            raise ValueError(
                f"Unknown impute strategy '{strategy}' for feature '{feature}'."
            )

        if not column.isna().any():
            continue  # nothing to fill in this column

        if pd.isna(fill_value):
            raise ValueError(
                f"No usable fill value for feature '{feature}' "
                f"(strategy '{strategy}')."
            )

        data[feature] = column.fillna(fill_value)

    return data

In [5]:
def _coerce_count(value):
    """Turn a numeric string such as ``"4"`` into an int; return other values unchanged."""
    return int(value) if isinstance(value, str) else value


def reduce_features(data, target, method, config=None):
    """Reduce the feature set with the requested method.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature matrix.
    target : pandas.Series
        Target values; used by ``'Tree-based'`` and ``'Corr with Target'``.
    method : str
        One of ``'PCA'``, ``'Tree-based'``, ``'Corr with Target'`` or
        ``'No Reduction'``.
    config : dict, optional
        Settings read here: ``num_of_features_to_keep`` (default 2 for PCA,
        4 otherwise), ``num_of_trees`` (default 5) and an optional
        ``random_state`` forwarded to the random forest (default ``None``,
        i.e. unseeded). ``None`` is treated as an empty dict. Counts given
        as numeric strings are converted to int.

    Returns
    -------
    pandas.DataFrame
        PCA: the projected components (integer column labels, rows labelled
        like ``data``). Tree-based / correlation: a column subset of ``data``.
        No Reduction: ``data`` itself.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    # Allow callers to omit the config entirely; every setting has a default.
    config = config or {}

    if method == 'PCA':
        n_components = _coerce_count(config.get('num_of_features_to_keep', 2))
        reduced = PCA(n_components=n_components).fit_transform(data)
        # Carry the input's row labels over so the result still lines up
        # with `target` in index-aligned pandas operations.
        return pd.DataFrame(reduced, index=getattr(data, 'index', None))

    if method == 'Tree-based':
        n_keep = _coerce_count(config.get('num_of_features_to_keep', 4))
        model = RandomForestRegressor(
            n_estimators=_coerce_count(config.get('num_of_trees', 5)),
            random_state=config.get('random_state'),
        )
        model.fit(data, target)
        ranked = np.argsort(model.feature_importances_)  # ascending importance
        # Slice from an explicit start offset: `ranked[-n_keep:]` would return
        # *every* feature when n_keep is 0 because -0 == 0.
        top = ranked[max(len(ranked) - n_keep, 0):]
        return data.iloc[:, top]

    if method == 'Corr with Target':
        n_keep = _coerce_count(config.get('num_of_features_to_keep', 4))
        strength = data.corrwith(target).abs().sort_values(ascending=False)
        return data[strength.head(n_keep).index]

    if method == 'No Reduction':
        return data

    raise ValueError("Unknown reduction method")

In [9]:
def main(json_path='algoparams_from_ui.json', csv_path='iris.csv'):
    """Run the preprocessing stage: parse the config, load the data, reduce features.

    NOTE: a later cell of this notebook defines another ``main``; once that
    cell has run, it replaces this definition.

    Parameters
    ----------
    json_path : str, optional
        Path of the JSON configuration file.
    csv_path : str, optional
        Path of the CSV dataset.

    Returns
    -------
    tuple or None
        ``(reduced_data, target)`` on success. ``None`` if any step fails;
        the failure is reported through ``logging.error``.
    """
    # Parse JSON
    try:
        config = parse_json(json_path)
    except Exception as e:
        logging.error(f"Error parsing JSON file: {e}")
        return None

    # Check that the keys this stage relies on are present
    if 'design_state_data' not in config:
        logging.error("'design_state_data' key is missing in the JSON.")
        return None

    design_state_data = config['design_state_data']

    if 'session_info' not in design_state_data or 'dataset' not in design_state_data['session_info']:
        logging.error("Necessary keys like 'session_info' or 'dataset' are missing in 'design_state_data'.")
        return None

    if 'feature_handling' not in design_state_data:
        logging.error("'feature_handling' key is missing in 'design_state_data'.")
        return None

    # Load and preprocess data
    feature_config = design_state_data['feature_handling']
    try:
        data = load_and_preprocess_data(csv_path, feature_config)
    except Exception as e:
        logging.error(f"Error loading and preprocessing data: {e}")
        return None

    # Target variable
    target_config = design_state_data.get('target', {})
    target_column = target_config.get('target', None)

    if not target_column or target_column not in data.columns:
        logging.error("Target column is missing or not found in the dataset.")
        return None

    # pop() removes the target column from `data`, leaving only the features.
    target = data.pop(target_column)

    # Feature reduction
    feature_reduction_config = design_state_data.get('feature_reduction', {})
    reduction_method = feature_reduction_config.get('feature_reduction_method', 'No Reduction')
    try:
        reduced_data = reduce_features(data, target, reduction_method, feature_reduction_config)
    except Exception as e:
        logging.error(f"Error during feature reduction: {e}")
        return None

    logging.info("Data preprocessing and feature reduction completed.")
    # Hand the result back instead of discarding it, so the stage is usable.
    return reduced_data, target


In [10]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV

In [11]:
# Lookup table: algorithm name (as looked up by get_models_and_params) -> scikit-learn estimator class.
MODEL_MAP = dict(
    RandomForestRegressor=RandomForestRegressor,
    GBTRegressor=GradientBoostingRegressor,
    LinearRegression=LinearRegression,
    RidgeRegression=Ridge,
    LassoRegression=Lasso,
    DecisionTreeRegressor=DecisionTreeRegressor,
)

In [12]:
def get_models_and_params(config, prediction_type):
    """Build estimator instances and hyperparameter grids from the configuration.

    Parameters
    ----------
    config : dict
        Must contain ``'algorithms'``: a dict mapping an algorithm name to its
        settings. Each settings dict must contain ``'is_selected'``; every
        list-valued entry is treated as a candidate hyperparameter grid.
    prediction_type : str
        Only ``"Regression"`` yields models; any other value yields ``[]``.

    Returns
    -------
    list of tuple
        ``(algo_name, estimator_instance, param_grid)`` for every selected
        algorithm that has an entry in ``MODEL_MAP``.

    Raises
    ------
    KeyError
        If ``'algorithms'`` or an algorithm's ``'is_selected'`` key is missing.
    """
    models = []
    algorithms = config['algorithms']

    if prediction_type != "Regression":
        logging.warning(f"Unsupported prediction type: {prediction_type}")
        return models

    for algo_name, algo_config in algorithms.items():
        if not algo_config['is_selected']:
            continue

        # Every entry of MODEL_MAP is a regressor, so membership alone decides
        # eligibility. Matching on the substring "Regressor" would wrongly
        # exclude the LinearRegression / RidgeRegression / LassoRegression keys.
        model_class = MODEL_MAP.get(algo_name)
        if model_class is None:
            logging.warning(f"Skipping unsupported algorithm: {algo_name}")
            continue

        model = model_class()
        accepted = model.get_params()

        # Keep only non-empty lists whose key is a real parameter of the
        # estimator; anything else would make GridSearchCV reject the grid.
        params = {}
        for param_key, param_values in algo_config.items():
            if not isinstance(param_values, list) or not param_values:
                continue
            if param_key not in accepted:
                logging.warning(
                    f"Ignoring '{param_key}' for {algo_name}: not a parameter of {model_class.__name__}"
                )
                continue
            params[param_key] = param_values

        models.append((algo_name, model, params))

    return models

In [13]:
def train_and_evaluate(models, X_train, y_train, X_test, y_test, cv=3, scoring='r2'):
    """Tune each model with GridSearchCV and score the best estimator on the test data.

    Parameters
    ----------
    models : iterable of tuple
        ``(model_name, estimator, param_grid)`` triples; ``param_grid`` maps
        estimator parameter names to lists of candidate values.
    X_train, y_train
        Training features and targets used for the grid search.
    X_test, y_test
        Held-out features and targets used for the reported metrics.
    cv : int, optional
        Cross-validation setting passed to ``GridSearchCV`` (default 3).
    scoring : str, optional
        Scoring passed to ``GridSearchCV`` (default ``'r2'``).

    Returns
    -------
    list of dict
        One entry per model with keys ``'model'``, ``'best_params'``,
        ``'r2'`` and ``'rmse'``. The same values are also logged.
    """
    results = []
    for model_name, model, param_grid in models:
        logging.info(f"Training model: {model_name}")

        # The estimator is wrapped in a one-step pipeline named "model", so
        # its parameters must be addressed as "model__<name>" in the grid.
        pipeline = Pipeline([("model", model)])
        prefixed_grid = {f"model__{name}": values for name, values in param_grid.items()}

        grid_search = GridSearchCV(pipeline, param_grid=prefixed_grid, cv=cv, scoring=scoring)
        grid_search.fit(X_train, y_train)

        y_pred = grid_search.best_estimator_.predict(X_test)
        r2 = r2_score(y_test, y_pred)
        # Take the square root explicitly: the `squared=False` argument of
        # mean_squared_error is deprecated and absent from recent scikit-learn.
        rmse = float(np.sqrt(mean_squared_error(y_test, y_pred)))

        logging.info(f"Best Params for {model_name}: {grid_search.best_params_}")
        logging.info(f"{model_name} - R2 Score: {r2:.4f}, RMSE: {rmse:.4f}")

        results.append({
            'model': model_name,
            'best_params': grid_search.best_params_,
            'r2': r2,
            'rmse': rmse,
        })

    return results

In [16]:
def _reduce_train_test(X_train, y_train, X_test, method, config):
    """Fit the feature reduction on the training split only and apply it to both splits."""
    if method == 'PCA':
        n_components = config.get('num_of_features_to_keep', 2)
        if isinstance(n_components, str):
            n_components = int(n_components)
        pca = PCA(n_components=n_components)
        reduced_train = pd.DataFrame(pca.fit_transform(X_train), index=X_train.index)
        # Project the test rows with the components learned from the training rows.
        reduced_test = pd.DataFrame(pca.transform(X_test), index=X_test.index)
        return reduced_train, reduced_test

    # For every non-PCA method reduce_features returns a column subset of its
    # input, so the columns chosen on the training split can be reused as-is.
    reduced_train = reduce_features(X_train, y_train, method, config)
    return reduced_train, X_test[reduced_train.columns]


def main(json_path='algoparams_from_ui.json', csv_path='iris.csv'):
    """Run the full workflow: load config and data, split, reduce features, train, evaluate.

    Every failure is reported through ``logging.error`` and ends the run
    early; nothing is raised to the caller.

    Parameters
    ----------
    json_path : str, optional
        Path of the JSON configuration file; it must exist at this location.
    csv_path : str, optional
        Path of the CSV dataset; it must exist at this location.

    Returns
    -------
    None
    """
    try:
        # Parse JSON
        config = parse_json(json_path)
    except Exception as e:
        logging.error(f"Error parsing JSON file: {e}")
        return

    # Check for design_state_data key
    if 'design_state_data' not in config:
        logging.error("'design_state_data' key is missing in the JSON.")
        return

    design_state_data = config['design_state_data']

    # Ensure session_info and dataset keys exist
    if 'session_info' not in design_state_data or 'dataset' not in design_state_data['session_info']:
        logging.error("Necessary keys like 'session_info' or 'dataset' are missing in 'design_state_data'.")
        return

    # Ensure feature_handling exists
    if 'feature_handling' not in design_state_data:
        logging.error("'feature_handling' key is missing in 'design_state_data'.")
        return

    # Load and preprocess data
    try:
        feature_config = design_state_data['feature_handling']
        data = load_and_preprocess_data(csv_path, feature_config)
    except Exception as e:
        logging.error(f"Error loading and preprocessing data: {e}")
        return

    # Target variable handling
    target_config = design_state_data.get('target', {})
    target_column = target_config.get('target', None)

    if not target_column or target_column not in data.columns:
        logging.error("Target column is missing or not found in the dataset.")
        return

    # pop() removes the target column from `data`, leaving only the features.
    target = data.pop(target_column)

    # Split data
    try:
        X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=42)
    except Exception as e:
        logging.error(f"Error splitting data: {e}")
        return

    # Feature reduction
    feature_reduction_config = design_state_data.get('feature_reduction', {})
    reduction_method = feature_reduction_config.get('feature_reduction_method', 'No Reduction')

    try:
        # The reduction is learned from the training split alone. Fitting it a
        # second time on the test split would use y_test (target leakage) and
        # could produce different columns / a different PCA basis than the
        # models are trained on.
        reduced_X_train, reduced_X_test = _reduce_train_test(
            X_train, y_train, X_test, reduction_method, feature_reduction_config
        )
    except Exception as e:
        logging.error(f"Error during feature reduction: {e}")
        return

    # Get prediction type and models
    prediction_type = target_config.get('prediction_type', 'Regression')
    try:
        models = get_models_and_params(design_state_data, prediction_type)
    except Exception as e:
        logging.error(f"Error creating model objects: {e}")
        return

    # Train and evaluate models
    try:
        train_and_evaluate(models, reduced_X_train, y_train, reduced_X_test, y_test)
    except Exception as e:
        logging.error(f"Error training and evaluating models: {e}")
