In [None]:
import os # Handles file paths and directories
import json # Parses and loads JSON files for rules and configurations
import unittest # Framework for writing and running unit tests

In [None]:
# Load the categorical data rules from their JSON file.
# JSON text is UTF-8, so the encoding is stated explicitly rather than
# inherited from the platform's locale default.
with open("analysis/data/derivedData/rules_categorical.json", "r", encoding="utf-8") as file:
    categorical_data_rules = json.load(file)

In [None]:
# Load the configuration file that holds the dataset-size thresholds.
# JSON text is UTF-8, so the encoding is stated explicitly rather than
# inherited from the platform's locale default.
with open("analysis/data/derivedData/config.json", "r", encoding="utf-8") as config_file:
    config = json.load(config_file)

# classification_logic treats sizes below SMALL_THRESHOLD as "small_dataset"
# and sizes above LARGE_THRESHOLD as "large_dataset"; anything in between is
# "medium_dataset".
SMALL_THRESHOLD = config["dataset_thresholds"]["small_dataset"]
LARGE_THRESHOLD = config["dataset_thresholds"]["large_dataset"]

In [None]:
def validate_parameters(data_type, task, rules):
    """
    Check that the requested task is defined in the supplied rules.

    Args:
        data_type (str): Type of data (e.g., 'Categorical'). Part of the
            signature but not inspected by this function.
        task (str): Task key to look up (e.g., 'classification').
        rules (dict): Rules JSON for the data type; its "tasks" mapping
            lists the valid task keys.

    Raises:
        ValueError: If ``task`` is not a key of ``rules["tasks"]``.

    Returns:
        None
    """
    known_tasks = rules["tasks"].keys()
    if task in known_tasks:
        return

    choices = ", ".join(known_tasks)
    raise ValueError(f"Invalid task: {task}. Choose from {choices}.")

In [None]:
def validate_logic(models, condition=None, dataset_size=None):
    """
    Report whether any models were supplied for a condition.

    Args:
        models (list): Models retrieved from the rules for one approach.
        condition (str, optional): Condition being checked; only used in
            the warning text.
        dataset_size (int, optional): Dataset size; only used in the
            warning text.

    Returns:
        bool: True when ``models`` is non-empty. Otherwise a warning is
        printed and False is returned.
    """
    if models:
        return True

    print(f"Warning: No models fit the condition '{condition}' for dataset size '{dataset_size}'.")
    return False

In [None]:
def classification_logic(data, dataset_size):
    """
    Handles logic for classification tasks, incorporating dataset thresholds.

    Args:
        data (dict): Categorical data rules; ``data["tasks"]["classification"]``
            maps each condition to its sub-conditions, which in turn map an
            approach name to a list of model names.
        dataset_size (int or None): Size of the dataset for classification.
            When None, no size category is selected, so no condition is
            rendered as a dataset-size branch.

    Returns:
        str: Logic text for classification tasks.
    """
    classification_tasks = data["tasks"]["classification"]

    # Determine the dataset size category. A missing size cannot be compared
    # against the thresholds (None < int raises TypeError), so it maps to no
    # category instead of crashing.
    if dataset_size is None:
        size_category = None
    elif dataset_size < SMALL_THRESHOLD:
        size_category = "small_dataset"
    elif dataset_size > LARGE_THRESHOLD:
        size_category = "large_dataset"
    else:
        size_category = "medium_dataset"

    # Collect the output pieces and join once at the end.
    lines = ["If the problem is a classification task:\n"]

    for condition, sub_conditions in classification_tasks.items():
        if condition == size_category:  # Condition names the matching size category
            lines.append(f"  If the dataset is categorized as {size_category}:\n")
        elif condition == "default":
            # NOTE(review): this header is indented deeper than its sibling
            # headers; kept as-is so the generated text does not change.
            lines.append("    Else:\n")
        else:
            lines.append(f"  If the task involves {condition} classification:\n")

        # Render every sub-condition and the models listed under it.
        for sub_condition, models in sub_conditions.items():
            if sub_condition == "default":
                lines.append("      Else:\n")
            else:
                lines.append(f"      If {sub_condition}:\n")
            for approach, model_list in models.items():
                if validate_logic(model_list, sub_condition, dataset_size):
                    lines.append(f"        Use {approach} models: {', '.join(model_list)}\n")
                else:
                    lines.append(f"        Warning: No suitable models found for {sub_condition}.\n")

    return "".join(lines)

In [None]:
def clustering_logic(data):
    """
    Build the decision-logic text for clustering tasks.

    Args:
        data (dict): Categorical data rules; ``data["tasks"]["clustering"]``
            maps each condition to a mapping of approach name to model list.

    Returns:
        str: Logic text for clustering tasks.
    """
    lines = ["If the problem is a clustering task:\n"]

    for condition, approaches in data["tasks"]["clustering"].items():
        # The "default" condition is rendered as the fallback branch.
        header = "  Else:\n" if condition == "default" else f"  If the clusters are {condition}:\n"
        lines.append(header)

        for approach, model_list in approaches.items():
            if not validate_logic(model_list, condition):
                lines.append(f"    Warning: No suitable models found for {condition}.\n")
                continue
            lines.append(f"    Use {approach} models: {', '.join(model_list)}\n")

    return "".join(lines)

In [None]:
def dimensionality_reduction_logic(data):
    """
    Build the decision-logic text for dimensionality reduction tasks.

    Args:
        data (dict): Categorical data rules;
            ``data["tasks"]["dimensionality_reduction"]`` maps each condition
            to a mapping of approach name to model list.

    Returns:
        str: Logic text for dimensionality reduction tasks.
    """
    lines = ["If the problem is dimensionality reduction:\n"]

    for condition, approaches in data["tasks"]["dimensionality_reduction"].items():
        # The "default" condition is rendered as the fallback branch.
        header = "  Else:\n" if condition == "default" else f"  If {condition} is important:\n"
        lines.append(header)

        for approach, model_list in approaches.items():
            if not validate_logic(model_list, condition):
                lines.append(f"    Warning: No suitable models found for {condition}.\n")
                continue
            lines.append(f"    Use {approach} models: {', '.join(model_list)}\n")

    return "".join(lines)

In [None]:
def generate_categorical_logic(data_type, task, dataset_size=None):
    """
    Produce the ML/DL decision-logic text for a categorical data task.

    Args:
        data_type (str): Type of data (e.g., 'Categorical').
        task (str): Task key: 'classification', 'clustering' or
            'dimensionality_reduction'.
        dataset_size (int, optional): Size of the dataset; forwarded only to
            the classification logic.

    Raises:
        ValueError: If the task is not defined in the loaded rules, or is
            defined there but has no generator in this function.

    Returns:
        str: Decision logic text.
    """
    # Reject tasks the loaded rules do not define before dispatching.
    validate_parameters(data_type, task, categorical_data_rules)

    if task == "classification":
        return classification_logic(categorical_data_rules, dataset_size)
    if task == "clustering":
        return clustering_logic(categorical_data_rules)
    if task == "dimensionality_reduction":
        return dimensionality_reduction_logic(categorical_data_rules)

    # The task exists in the rules but none of the branches above handles it.
    raise ValueError(f"Task {task} not supported for data type {data_type}.")

In [None]:
# Demo: show the generated classification logic for a dataset size of 500.
print(generate_categorical_logic(data_type="Categorical", task="classification", dataset_size=500))

If the problem is a classification task:
  If the task involves binary_classification classification:
      If requires_interpretability:
        Use ML models: Logistic Regression
      If small_dataset:
        Use ML models: K-Nearest Neighbors (KNN)
      If linear_decision_boundaries:
        Use ML models: Support Vector Machines (SVM)
      Else:
        Use DL models: Feedforward Neural Network
  If the task involves multi_class classification:
      If high_dimensional:
        Use ML models: Support Vector Machines (SVM)
      If text_data:
        Use ML models: Naive Bayes
      If requires_interpretability:
        Use ML models: Decision Tree Classifier
      Else:
        Use ML models: Random Forest Classifier
      If large_complex:
        Use DL models: Transformer-based Models



In [None]:
# Demo: show the generated clustering logic (no dataset size is passed).
print(generate_categorical_logic(data_type="Categorical", task="clustering"))

If the problem is a clustering task:
  If the clusters are distinct_clusters:
    Use ML models: K-Means Clustering
  If the clusters are hierarchical_structure:
    Use ML models: Hierarchical Clustering
  If the clusters are arbitrary_shapes:
    Use ML models: DBSCAN
  If the clusters are probabilistic_boundaries:
    Use ML models: Gaussian Mixture Models (GMM)
  Else:
    Use DL models: Autoencoder-based Clustering



In [None]:
# Demo: show the generated dimensionality reduction logic (no dataset size is passed).
print(generate_categorical_logic(data_type="Categorical", task="dimensionality_reduction"))

If the problem is dimensionality reduction:
  If maximize_variance is important:
    Use ML models: Principal Component Analysis (PCA)
  If supervised_class_separation is important:
    Use ML models: Linear Discriminant Analysis (LDA)
  If local_structure is important:
    Use ML models: t-SNE
  If local_global_structure is important:
    Use ML models: UMAP
  Else:
    Use ML models: Independent Component Analysis (ICA)
    Use DL models: Variational Autoencoders (VAE)



In [None]:
# Test scenarios for logic validation.
# Each row is (task, dataset_size, expected_model); every scenario uses the
# "Categorical" data type.
test_scenarios = [
    dict(
        data_type="Categorical",
        task=task,
        dataset_size=dataset_size,
        expected_model=expected_model,
    )
    for task, dataset_size, expected_model in (
        ("classification", 500, "Logistic Regression"),  # Small dataset
        ("classification", 5000, "Feedforward Neural Network"),  # Medium dataset
        ("clustering", None, "K-Means Clustering"),  # Dataset size not required for clustering
        # Dataset size not required for dimensionality reduction
        ("dimensionality_reduction", None, "Principal Component Analysis (PCA)"),
    )
]

In [None]:
class TestCategoricalLogic(unittest.TestCase):
    """
    Scenario-driven unit tests for the categorical decision-logic generator.
    """

    def test_logic(self):
        """
        Check that each predefined scenario yields text naming its expected model.

        Every entry of ``test_scenarios`` runs as its own sub-test, so a failure
        is reported together with the scenario that caused it.
        """
        for scenario in test_scenarios:
            with self.subTest(scenario=scenario):
                data_type = scenario["data_type"]
                task = scenario["task"]
                dataset_size = scenario["dataset_size"]

                logic_text = generate_categorical_logic(data_type, task, dataset_size=dataset_size)

                expected_model = scenario["expected_model"]
                failure_note = f"Failed for {task} with dataset size {dataset_size}"
                self.assertIn(expected_model, logic_text, failure_note)

    def test_invalid_task(self):
        """
        Check that an unknown task name is rejected with a ValueError.
        """
        self.assertRaises(ValueError, generate_categorical_logic, "Categorical", "invalid_task")

In [None]:
if __name__ == "__main__":
    # argv is given explicitly so unittest does not parse the host process's
    # command-line arguments; exit=False keeps it from calling sys.exit().
    unittest.main(exit=False, argv=[""])

.
----------------------------------------------------------------------
Ran 1 test in 0.003s

OK


In [None]:
class TestCategoricalLogic(unittest.TestCase):
    """
    Unit tests for validating the rules for categorical data logic.

    NOTE(review): this class reuses the name of the earlier
    TestCategoricalLogic definition in this file, so after this definition
    runs the name no longer refers to the earlier class (which also holds
    the invalid-task test) — confirm that replacing it is intended.
    """

    def test_small_dataset_binary_classification(self):
        """
        Test that every predefined scenario maps to its expected model.

        Despite the method name, the loop covers all entries of
        ``test_scenarios`` (classification, clustering and dimensionality
        reduction), so the failure message reports the scenario's own task
        and dataset size instead of always saying "binary classification".
        """
        for scenario in test_scenarios:
            with self.subTest(scenario=scenario):
                # Generate logic using the defined function
                result = generate_categorical_logic(
                    data_type=scenario["data_type"],
                    task=scenario["task"],
                    dataset_size=scenario["dataset_size"]
                )
                # Validate that the expected model is in the result
                self.assertIn(
                    scenario["expected_model"],
                    result,
                    f"Expected model {scenario['expected_model']} not found for "
                    f"{scenario['task']} (dataset size {scenario['dataset_size']})."
                )



In [None]:
if __name__ == "__main__":
    # argv is given explicitly so unittest does not parse the host process's
    # command-line arguments; exit=False keeps it from calling sys.exit().
    unittest.main(exit=False, argv=[""])

.
----------------------------------------------------------------------
Ran 1 test in 0.006s

OK
