In [16]:
import os # Handles file paths and directories
import json # Parses and loads JSON files for rules and configurations
import unittest # Framework for writing and running unit tests

In [None]:
# Load the numerical model-selection rules from the JSON file.
# The encoding is passed explicitly so the file is decoded the same way on every
# platform instead of with whatever the locale's default encoding happens to be.
with open("analysis/data/derivedData/rules_numerical.json", "r", encoding="utf-8") as file:
    numerical_data_rules = json.load(file)

In [None]:
# Load the configuration file that holds the dataset size thresholds.
# The encoding is passed explicitly so the file is decoded the same way on every
# platform instead of with whatever the locale's default encoding happens to be.
with open("analysis/data/derivedData/config.json", "r", encoding="utf-8") as config_file:
    config = json.load(config_file)

# Threshold values for small and large datasets, read from the
# "dataset_thresholds" section of the configuration.
SMALL_THRESHOLD = config["dataset_thresholds"]["small_dataset"]
LARGE_THRESHOLD = config["dataset_thresholds"]["large_dataset"]

In [None]:
def validate_parameters(data_type, task):
    """
    Check that the requested task is one of the tasks defined in the loaded rules.

    Args:
        data_type (str): Type of data. Not inspected by this function.
        task (str): Task key looked up among ``numerical_data_rules["tasks"]``.

    Raises:
        ValueError: If ``task`` is not a key of ``numerical_data_rules["tasks"]``.
    """
    known_tasks = numerical_data_rules["tasks"].keys()
    if task in known_tasks:
        return

    # Only build the list of choices when it is actually needed for the message.
    choices = ', '.join(known_tasks)
    raise ValueError(f"Invalid task: {task}. Choose from {choices}.")

In [None]:
def validate_logic(models, condition=None, dataset_size=None):
    """
    Report whether any models were found for a condition.

    Args:
        models (list): Models retrieved from the rules; any falsy value
            (empty collection or None) counts as "no models".
        condition (str, optional): Condition name, used only in the warning text.
        dataset_size (int, optional): Dataset size, used only in the warning text.

    Returns:
        bool: True when ``models`` is non-empty; otherwise a warning is printed
        and False is returned.
    """
    if models:
        return True

    print(f"Warning: No models fit the condition '{condition}' for dataset size '{dataset_size}'.")
    return False

In [None]:
def _format_model_lines(approaches, condition, dataset_size, indent):
    """
    Render one "Use <approach> models: ..." line per non-empty model list.

    Args:
        approaches (dict): Maps an approach label (e.g. "ML") to a list of model names.
        condition (str): Condition name forwarded to validate_logic for its warning.
        dataset_size (int, optional): Dataset size forwarded to validate_logic.
        indent (str): Leading whitespace placed in front of every rendered line.

    Returns:
        list: Rendered lines, each terminated by a newline.
    """
    rendered = []
    for approach, model_list in approaches.items():
        if validate_logic(model_list, condition, dataset_size):
            rendered.append(f"{indent}Use {approach} models: {', '.join(model_list)}\n")
    return rendered


def regression_logic(data, dataset_size=None):
    """
    Handles logic for regression tasks.

    Each condition under ``data["tasks"]["regression"]`` is a mapping whose
    entries are handled one by one:

    * a dict value is a nested sub-condition (approach -> model list), with
      "requires_regularization" holding one further level keyed by
      regularization type, where "default" is rendered as "Else";
    * a non-dict value is a flat entry, i.e. the key is the approach and the
      value is the model list that applies directly to the condition.

    Args:
        data (dict): Numerical data rules.
        dataset_size (int, optional): Dataset size; only forwarded to
            validate_logic, which uses it in its warning message.

    Returns:
        str: Logic text for regression tasks.
    """
    logic_parts = ["If the problem is a regression task:\n"]
    task_details = data["tasks"]["regression"]

    for condition, sub_conditions in task_details.items():
        logic_parts.append(f"  If the condition is {condition}:\n")

        for sub_condition, models in sub_conditions.items():
            if not isinstance(models, dict):
                # Flat entry: the key is the approach and the value is the model
                # list itself. Previously this shape ended in an AttributeError
                # because the list was treated as a nested mapping.
                if validate_logic(models, condition, dataset_size):
                    logic_parts.append(f"    Use {sub_condition} models: {', '.join(models)}\n")
                continue

            logic_parts.append(f"    If {sub_condition}:\n")

            if sub_condition != "requires_regularization":
                logic_parts.extend(
                    _format_model_lines(models, sub_condition, dataset_size, "      ")
                )
                continue

            # Regularization carries one extra level keyed by regularization type.
            for regularization_type, reg_models in models.items():
                if regularization_type == "default":
                    logic_parts.append("      Else:\n")
                else:
                    logic_parts.append(f"      If {regularization_type}:\n")
                logic_parts.extend(
                    _format_model_lines(reg_models, regularization_type, dataset_size, "        ")
                )

    return "".join(logic_parts)

In [None]:
def clustering_logic(data):
    """
    Build the decision text for clustering tasks.

    Args:
        data (dict): Numerical data rules; ``data["tasks"]["clustering"]`` maps
            each condition to a mapping of approach -> list of model names.

    Returns:
        str: Logic text for clustering tasks. Conditions whose approach mapping
        is empty are skipped (validate_logic prints a warning for them).
    """
    pieces = ["If the problem is a clustering task:\n"]
    clustering_rules = data["tasks"]["clustering"]

    for condition, approaches in clustering_rules.items():
        if not validate_logic(approaches, condition=condition):
            continue
        pieces.append(f"  If the condition is {condition}:\n")
        pieces.extend(
            f"    Use {approach} models: {', '.join(model_list)}\n"
            for approach, model_list in approaches.items()
        )

    return "".join(pieces)

In [None]:
def dimensionality_reduction_logic(data):
    """
    Build the decision text for dimensionality reduction tasks.

    Args:
        data (dict): Numerical data rules; ``data["tasks"]["dimensionality_reduction"]``
            maps each condition to a mapping of approach -> list of model names.

    Returns:
        str: Logic text for dimensionality reduction tasks. Conditions whose
        approach mapping is empty are skipped (validate_logic prints a warning
        for them).
    """
    pieces = ["If the problem is dimensionality reduction:\n"]
    reduction_rules = data["tasks"]["dimensionality_reduction"]

    for condition, approaches in reduction_rules.items():
        if not validate_logic(approaches, condition=condition):
            continue
        pieces.append(f"  If the condition is {condition}:\n")
        pieces.extend(
            f"    Use {approach} models: {', '.join(model_list)}\n"
            for approach, model_list in approaches.items()
        )

    return "".join(pieces)

In [None]:
def generate_numerical_logic(data_type, task, dataset_size=None):
    """
    Generate ML/DL decision logic text for the requested task.

    Args:
        data_type (str): Type of data (e.g., 'Numerical'). Only used in the
            error message for unsupported tasks.
        task (str): Task key, compared exactly against 'regression',
            'clustering' and 'dimensionality_reduction'.
        dataset_size (int, optional): Dataset size, forwarded to the regression
            logic only.

    Returns:
        str: Decision logic text.

    Raises:
        ValueError: If the task is rejected by validate_parameters, or is
            present in the rules but has no handler here.
    """
    validate_parameters(data_type, task)  # Reject tasks that are absent from the rules

    # Each handler is wrapped so that only the selected one is actually called.
    handlers = {
        "regression": lambda: regression_logic(numerical_data_rules, dataset_size),
        "clustering": lambda: clustering_logic(numerical_data_rules),
        "dimensionality_reduction": lambda: dimensionality_reduction_logic(numerical_data_rules),
    }

    handler = handlers.get(task)
    if handler is None:
        raise ValueError(f"Task {task} not supported for data type {data_type}.")
    return handler()

In [15]:
# Show the generated decision text for the regression task with a dataset size of 5000.
print(generate_numerical_logic(data_type="Numerical", task="regression", dataset_size=5000))

If the problem is a regression task:
  If the condition is linear:
    If two_variables:
      Use ML models: Simple Linear Regression
    If multiple_variables:
      Use ML models: Multiple Linear Regression
  If the condition is non_linear:
    If curve_fitting:
      Use ML models: Polynomial Regression
    If decision_boundaries:
      Use ML models: Support Vector Regression (SVR)
    If requires_regularization:
      If feature_selection:
        Use ML models: Lasso Regression
      If high_dimensional:
        Use ML models: Ridge Regression
      Else:
        Use ML models: Elastic Net Regression
    If default:
      Use ML models: Gradient Boosting
      Use DL models: Deep Neural Networks (DNN)



In [None]:
# Show the generated decision text for the clustering task (no dataset size is passed).
print(generate_numerical_logic("Numerical", "clustering"))

If the problem is a clustering task:
  If the condition is distinct_clusters:
    Use ML models: K-Means Clustering
  If the condition is hierarchical_structure:
    Use ML models: Hierarchical Clustering
  If the condition is arbitrary_shapes:
    Use ML models: DBSCAN
  If the condition is density_based:
    Use ML models: Mean Shift Clustering
  If the condition is default:
    Use ML models: Agglomerative Clustering
    Use DL models: Autoencoder-based Clustering



In [None]:
# Show the generated decision text for the dimensionality reduction task (no dataset size is passed).
print(generate_numerical_logic("Numerical", "dimensionality_reduction"))

If the problem is dimensionality reduction:
  If the condition is maximize_variance:
    Use ML models: Principal Component Analysis (PCA)
  If the condition is supervised_class_separation:
    Use ML models: Linear Discriminant Analysis (LDA)
  If the condition is local_structure:
    Use ML models: t-SNE
  If the condition is local_global_structure:
    Use ML models: UMAP
  If the condition is default:
    Use ML models: Independent Component Analysis (ICA)
    Use DL models: Variational Autoencoders (VAE)



In [40]:
class TestNumericalLogic(unittest.TestCase):
    """
    Checks that generate_numerical_logic mentions the expected model names and rejects unknown tasks.
    """
    def test_regression_small_dataset(self):
        """
        Regression text generated with dataset_size=500 mentions Simple Linear Regression.

        Note: the dataset size is only forwarded to validate_logic for its warning
        message, so this asserts that the model appears in the regression text, not
        that it was selected because of the dataset size.

        Raises:
            AssertionError: If the expected model is not found in the result.
        """
        small_size = 500
        logic_text = generate_numerical_logic("Numerical", "regression", small_size)
        self.assertIn("Simple Linear Regression", logic_text, "Small dataset should map to Simple Linear Regression.")

    def test_clustering_default_condition(self):
        """
        Clustering text mentions Agglomerative Clustering.

        Raises:
            AssertionError: If the expected model is not found in the result.
        """
        logic_text = generate_numerical_logic("Numerical", "clustering")
        self.assertIn("Agglomerative Clustering", logic_text, "Default clustering should map to Agglomerative Clustering.")

    def test_invalid_task(self):
        """
        An unknown task name makes generate_numerical_logic raise ValueError.

        Raises:
            AssertionError: If no ValueError is raised.
        """
        self.assertRaises(ValueError, generate_numerical_logic, "Numerical", "invalid_task")

In [41]:
# Run the unittest test cases found in the __main__ module.
# argv=[''] keeps unittest from parsing the process's own command-line arguments,
# and exit=False stops unittest.main from calling sys.exit() when the run finishes.
if __name__ == "__main__":
    unittest.main(argv=[''], exit=False)

...
----------------------------------------------------------------------
Ran 3 tests in 0.008s

OK


In [42]:
# Data-driven test cases: each entry gives the arguments passed to
# generate_numerical_logic plus one model name ("expected_model") that must
# appear somewhere in the generated text.
test_scenarios = [
    {
        "data_type": "Numerical",
        "task": "regression",
        "dataset_size": 500,
        "expected_model": "Simple Linear Regression"
    },
    {
        "data_type": "Numerical",
        "task": "clustering",
        "dataset_size": None,  # Dataset size not required for clustering
        "expected_model": "Agglomerative Clustering"
    },
    {
        "data_type": "Numerical",
        "task": "dimensionality_reduction",
        "dataset_size": None,  # Dataset size not required for dimensionality reduction
        "expected_model": "Principal Component Analysis (PCA)"
    }
]

In [43]:
class TestNumericalLogic(unittest.TestCase):
    """
    Data-driven unit tests that run generate_numerical_logic over every entry of test_scenarios.

    Note: this class reuses the name of the earlier TestNumericalLogic class in this
    notebook, so once it is defined it replaces that class and only the tests below
    are collected by a later unittest.main run.
    """
    def test_logic(self):
        """
        For each scenario, the generated text contains the scenario's expected model.

        Every scenario runs inside its own subTest, so one failing scenario does
        not stop the remaining ones from being checked.

        Raises:
            AssertionError: If the expected model is not found in the result.
        """
        for scenario in test_scenarios:
            with self.subTest(scenario=scenario):
                call_kwargs = {
                    "data_type": scenario["data_type"],
                    "task": scenario["task"],
                    "dataset_size": scenario["dataset_size"],
                }
                logic_text = generate_numerical_logic(**call_kwargs)
                failure_note = f"Failed for {scenario['task']} with dataset size {scenario['dataset_size']}"
                self.assertIn(scenario["expected_model"], logic_text, failure_note)

    def test_invalid_task(self):
        """
        An unknown task name makes generate_numerical_logic raise ValueError.

        Raises:
            AssertionError: If no ValueError is raised.
        """
        self.assertRaises(ValueError, generate_numerical_logic, "Numerical", "invalid_task")

In [44]:
# Run the unittest test cases found in the __main__ module.
# argv=[''] keeps unittest from parsing the process's own command-line arguments,
# and exit=False stops unittest.main from calling sys.exit() when the run finishes.
if __name__ == "__main__":
    unittest.main(argv=[''], exit=False)

..
----------------------------------------------------------------------
Ran 2 tests in 0.007s

OK
