In [None]:
import os # Handles file paths and directories
import json # Parses and loads JSON files for rules and configurations
import unittest # Framework for writing and running unit tests

In [None]:
# Load the tabular-data rules from the JSON file.
# Every consumer below (generate_tabular_logic, the examples and the unit
# tests) reads the rules through the name `tabular_data_rules`, so that is the
# name that must be bound here; otherwise a fresh run fails with NameError.
with open("analysis/data/derivedData/rules_tabular.json", "r", encoding="utf-8") as file:
    tabular_data_rules = json.load(file)

# Backward-compatible alias for the name this cell originally defined.
numerical_data_rules = tabular_data_rules

In [None]:
# Read the configuration file that holds the dataset size thresholds
with open("analysis/data/derivedData/config.json", "r") as config_file:
    config = json.load(config_file)

# Pull the small/large dataset threshold values out of the configuration
_dataset_thresholds = config["dataset_thresholds"]
SMALL_THRESHOLD = _dataset_thresholds["small_dataset"]
LARGE_THRESHOLD = _dataset_thresholds["large_dataset"]

In [1]:
# Validation function for models
def validate_logic(models, condition=None):
    """
    Report whether any models were retrieved for a condition.

    Args:
        models (list): Models retrieved from the rules.
        condition (str, optional): Condition the models were looked up for;
            it is only used in the warning message.

    Returns:
        bool: True when `models` is non-empty. When it is empty (or otherwise
        falsy) a warning is printed and False is returned.
    """
    if models:
        return True

    print(f"Warning: No models fit the condition '{condition}'.")
    return False

# Handles logic for flat conditions (no nested structure)
def handle_flat_conditions(sub_conditions, indent_level=2):
    """
    Render one "Use <approach> models: ..." line per approach.

    Args:
        sub_conditions (dict): Maps an approach label (e.g. 'ML', 'DL') to the
            list of model names for that approach.
        indent_level (int): Number of two-space indentation steps placed in
            front of every line.

    Returns:
        str: The rendered lines, each terminated by a newline; an empty string
        when `sub_conditions` is empty.
    """
    prefix = indent_level * "  "
    return "".join(
        f"{prefix}Use {approach} models: {', '.join(models)}\n"
        for approach, models in sub_conditions.items()
    )

# Logic generator for tabular tasks
def generate_task_logic(data, task_name):
    """
    Build the decision-logic text for a single task.

    Args:
        data (dict): Rules dictionary; the task is looked up under
            data["tasks"][task_name].
        task_name (str): Key of the task. Underscores are shown as spaces in
            the heading line.

    Returns:
        str: A heading line followed, for every condition of the task, by an
        "If <condition>:" line (or "Else:" for the 'default' condition) and
        the model lines produced by handle_flat_conditions.
    """
    readable_name = task_name.replace("_", " ")
    pieces = [f"If the problem is {readable_name}:\n"]

    for condition, sub_conditions in data["tasks"][task_name].items():
        # The 'default' entry is rendered as the fallback branch.
        header = "  Else:\n" if condition == "default" else f"  If {condition}:\n"
        pieces.append(header)
        pieces.append(handle_flat_conditions(sub_conditions, indent_level=2))

    return "".join(pieces)

# Main function for generating tabular data logic
def generate_tabular_logic(task):
    """
    Produce the ML/DL decision logic text for one tabular-data task.

    The rules are read from the module-level `tabular_data_rules` dictionary.

    Args:
        task (str): Task type (e.g., 'tabular_classification', 'tabular_regression').

    Returns:
        str: Decision logic text.

    Raises:
        ValueError: If `task` is not one of the tasks defined in the rules.
    """
    supported_tasks = tabular_data_rules["tasks"].keys()
    if task in supported_tasks:
        return generate_task_logic(tabular_data_rules, task)

    raise ValueError(f"Task '{task}' is not supported. Available tasks: {', '.join(supported_tasks)}")

# Example usage: print the generated logic for three example tasks
for _example_task in ("tabular_classification", "tabular_regression", "feature_importance"):
    print(generate_tabular_logic(_example_task))

# Function to validate model choices based on dataset characteristics
def validate_model_choice(task_rules, task_name, dataset_characteristics):
    """
    Collect the models whose rule conditions match the dataset characteristics.

    Args:
        task_rules (dict): Rules dictionary with a "tasks" mapping of
            task name -> {condition: {approach: [model, ...]}}.
        task_name (str): Task to look up in task_rules["tasks"].
        dataset_characteristics: Collection of condition names describing the
            dataset (e.g. ["large_dataset"]).

    Returns:
        list | None: Models of every non-default condition that appears in
        `dataset_characteristics`, in rule order. If none were collected, the
        'default' models are used, but only when at least one characteristic
        is itself a key of the task's rules. When nothing applies, a warning
        is printed and None is returned.

    Raises:
        ValueError: If `task_name` is not defined in the rules.
    """
    tasks = task_rules["tasks"]
    if task_name not in tasks:
        raise ValueError(f"Task '{task_name}' is not supported.")

    rules = tasks[task_name]

    def _flatten(approaches):
        # Merge the per-approach model lists into one flat list.
        return [model for models in approaches.values() for model in models]

    selected = []

    # Gather models from every explicit (non-default) condition that matches.
    for condition, approaches in rules.items():
        if condition != "default" and condition in dataset_characteristics:
            selected += _flatten(approaches)

    # Fall back to the default models only when a characteristic names a key
    # of this task's rules and yet no models were gathered above.
    if not selected and "default" in rules:
        if any(characteristic in rules for characteristic in dataset_characteristics):
            selected += _flatten(rules["default"])

    if selected:
        return selected

    print(f"Warning: No suitable models found for task '{task_name}' with characteristics {dataset_characteristics}.")
    return None

# Example validation for tabular data rules: one characteristic list that
# matches a rule condition and one that does not
dataset_characteristics_tabular = ["large_dataset"]
dataset_characteristics_tabular_empty = ["undefined_characteristic"]

for _label, _characteristics in (
    ("Tabular Classification:", dataset_characteristics_tabular),
    ("Tabular Classification (No match):", dataset_characteristics_tabular_empty),
):
    print(_label, validate_model_choice(tabular_data_rules, "tabular_classification", _characteristics))

# Unit tests for tabular data logic
class TestTabularLogic(unittest.TestCase):
    """
    Unit tests for generate_tabular_logic and validate_model_choice, run
    against the module-level tabular rules.
    """

    def test_tabular_classification_logic(self):
        """
        Classification logic lists the expected ML model and default DL model.
        """
        logic = generate_tabular_logic("tabular_classification")
        expectations = (
            ("Random Forest", "Expected 'Random Forest' in tabular classification logic."),
            ("Shallow Fully Connected Networks", "Expected 'Shallow Fully Connected Networks' in default logic."),
        )
        for model_name, message in expectations:
            self.assertIn(model_name, logic, message)

    def test_tabular_regression_logic(self):
        """
        Regression logic lists the expected ML model.
        """
        logic = generate_tabular_logic("tabular_regression")
        self.assertIn("Linear Regression", logic, "Expected 'Linear Regression' in tabular regression logic.")

    def test_invalid_task(self):
        """
        An unsupported task name raises ValueError.
        """
        self.assertRaises(ValueError, generate_tabular_logic, "invalid_task")

    def test_model_validation(self):
        """
        A matching dataset characteristic yields the models of that condition.
        """
        models = validate_model_choice(tabular_data_rules, "tabular_classification", ["large_dataset"])
        self.assertIn("XGBoost", models, "Expected 'XGBoost' in the applicable models for tabular classification.")

    def test_invalid_characteristics(self):
        """
        A characteristic that matches no rule condition yields None.
        """
        models = validate_model_choice(tabular_data_rules, "tabular_classification", ["unknown_condition"])
        self.assertIsNone(models, "Expected no models to be found for unknown conditions.")

if __name__ == "__main__":
    # Pass an explicit argv so unittest does not parse the host process's
    # command line, and exit=False so the runner returns instead of calling
    # sys.exit() when the tests finish.
    unittest.main(exit=False, argv=[""])


.....
----------------------------------------------------------------------
Ran 5 tests in 0.016s

OK


If the problem is tabular classification:
  If small_dataset:
    Use ML models: Random Forest, Support Vector Machine (SVM)
  If large_dataset:
    Use ML models: Gradient Boosting Trees (GBT), XGBoost
  Else:
    Use DL models: Shallow Fully Connected Networks

If the problem is tabular regression:
  If small_dataset:
    Use ML models: Linear Regression, Gradient Boosting Trees (GBT)
  If large_dataset:
    Use ML models: Random Forest Regression
  Else:
    Use DL models: Deeper Fully Connected Networks

If the problem is feature importance:
  Else:
    Use ML models: Decision Trees, Explainable Boosting Machine (EBM)

Tabular Classification: ['Gradient Boosting Trees (GBT)', 'XGBoost']
Tabular Classification (No match): None
