In [3]:
import os # Handles file paths and directories
import json # Parses and loads JSON files for rules and configurations
import unittest # Framework for writing and running unit tests

In [None]:
# Load the textual-task rules that generate_textual_logic reads.
# The path is relative, so it is resolved against the current working directory.
# JSON text is UTF-8 by specification; decode it explicitly instead of relying
# on open()'s locale-dependent default encoding.
with open("analysis/data/derivedData/rules_textual.json", "r", encoding="utf-8") as file:
    textual_data_rules = json.load(file)

In [None]:
# Load the configuration file that holds the dataset-size thresholds.
# The path is relative, so it is resolved against the current working directory.
# Decode explicitly as UTF-8 rather than using open()'s locale-dependent default.
with open("analysis/data/derivedData/config.json", "r", encoding="utf-8") as config_file:
    config = json.load(config_file)

# Boundaries read by get_dataset_category: sizes strictly below SMALL_THRESHOLD
# are 'small_dataset', sizes strictly above LARGE_THRESHOLD are 'large_dataset'.
SMALL_THRESHOLD = config["dataset_thresholds"]["small_dataset"]
LARGE_THRESHOLD = config["dataset_thresholds"]["large_dataset"]

In [8]:
def get_dataset_category(dataset_size):
    """
    Classify a dataset as small, medium, or large from its size.

    The size is compared against the module-level SMALL_THRESHOLD and
    LARGE_THRESHOLD values. Both comparisons are strict, so a size equal to
    either threshold is reported as 'medium_dataset'.

    Args:
        dataset_size (int): The size of the dataset.

    Returns:
        str: 'small_dataset' when the size is below SMALL_THRESHOLD,
            'large_dataset' when it is above LARGE_THRESHOLD, and
            'medium_dataset' otherwise.
    """
    # The small check runs first, exactly as in the threshold ordering above.
    if dataset_size < SMALL_THRESHOLD:
        return "small_dataset"
    if dataset_size > LARGE_THRESHOLD:
        return "large_dataset"
    return "medium_dataset"


In [9]:
def validate_logic(models, condition=None, dataset_size=None):
    """
    Report whether any models were found for a condition.

    Args:
        models (list): Models retrieved from the rules; any falsy value
            (empty list, None, ...) counts as "no models".
        condition (str, optional): Condition name, used only in the warning text.
        dataset_size (int, optional): Dataset size, used only in the warning text.

    Returns:
        bool: True when `models` is truthy; False otherwise, after printing a
            warning to standard output.
    """
    if models:
        return True

    # Nothing matched: tell the user which lookup came back empty.
    print(f"Warning: No models fit the condition '{condition}' for dataset size '{dataset_size}'.")
    return False

In [10]:
def handle_flat_conditions(sub_conditions, indent_level=2):
    """
    Render one "Use <approach> model: ..." line per entry of a flat mapping.

    Args:
        sub_conditions (dict): Maps an approach label (e.g. 'ML', 'DL') to the
            model names for that approach. The value is normally a list of
            strings; a single string is accepted and treated as one model name.
        indent_level (int): Number of two-space indentation units placed in
            front of every generated line.

    Returns:
        str: The generated lines, each terminated by a newline. An empty
            mapping yields an empty string.
    """
    indent = "  " * indent_level
    lines = []
    for approach, model_list in sub_conditions.items():
        # A bare string is a single model name. Joining it directly would
        # iterate its characters and render "BERT" as "B, E, R, T".
        if isinstance(model_list, str):
            model_list = [model_list]
        lines.append(f"{indent}Use {approach} model: {', '.join(model_list)}\n")
    return "".join(lines)

In [15]:
def handle_nested_conditions(sub_conditions, indent_level=2):
    """
    Recursively render a mapping of conditions that may nest to any depth.

    Each key whose value is a dict becomes a condition header followed by the
    rendering of that dict one indentation level deeper. The key 'default' is
    rendered as "Else:" (matching how generate_task_logic renders a top-level
    'default'); every other key is rendered as "If <key>:". Keys whose value is
    not a dict are passed to handle_flat_conditions as a one-entry mapping.

    Args:
        sub_conditions (dict): Mapping of condition or approach names to either
            a nested dict or a collection of model names.
        indent_level (int): Number of two-space indentation units for this level.

    Returns:
        str: Formatted logic for the whole subtree; empty for an empty mapping.
    """
    indent = "  " * indent_level
    parts = []
    for condition, models in sub_conditions.items():
        if isinstance(models, dict):
            # Nested branch: emit the header, then recurse one level deeper.
            if condition == "default":
                parts.append(f"{indent}Else:\n")
            else:
                parts.append(f"{indent}If {condition}:\n")
            parts.append(handle_nested_conditions(models, indent_level + 1))
        else:
            # Leaf: the key is an approach label and the value its models.
            parts.append(handle_flat_conditions({condition: models}, indent_level))
    return "".join(parts)

In [16]:
def generate_task_logic(data, task_name, dataset_size=None):
    """
    Generate the decision-logic text for one task from the rules mapping.

    The output starts with "If the problem is <task name>:" and then contains
    one section per condition of the task. A condition named 'default' is
    rendered as "Else:"; any other condition as "If <condition>:". The body of
    every section is produced by handle_nested_conditions, which renders flat,
    nested, mixed, and empty mappings alike.

    Args:
        data (dict): Textual data rules; must contain data["tasks"][task_name].
        task_name (str): Task name (e.g., 'text_classification').
        dataset_size (int, optional): Accepted for interface compatibility.
            It is currently not used; every condition of the task is rendered
            regardless of its value.

    Returns:
        str: Generated logic for the given task.

    Raises:
        KeyError: If 'tasks' or `task_name` is missing from `data`.
        TypeError: If the rules for a condition are not a mapping.
    """
    parts = [f"If the problem is {task_name.replace('_', ' ')}:\n"]
    task_details = data["tasks"][task_name]

    for condition, sub_conditions in task_details.items():
        if condition == "default":
            parts.append("  Else:\n")
        else:
            parts.append(f"  If {condition}:\n")

        if not isinstance(sub_conditions, dict):
            # Without an approach label there is nothing sensible to render;
            # fail with a message that points at the offending rule.
            raise TypeError(
                f"Rules for condition '{condition}' of task '{task_name}' must be a "
                f"mapping of approach or sub-condition to models, "
                f"got {type(sub_conditions).__name__}."
            )

        # The nested handler dispatches per entry, so it also covers purely flat
        # mappings and needs no peek at the first value (which failed on {}).
        parts.append(handle_nested_conditions(sub_conditions))

    return "".join(parts)

In [17]:
def generate_textual_logic(data_type, task, dataset_size=None):
    """
    Produce the ML/DL decision-logic text for a textual task.

    The task is looked up in the module-level `textual_data_rules`; the text
    itself is built by generate_task_logic.

    Args:
        data_type (str): Type of data (e.g., 'Textual'). This function does
            not read the value.
        task (str): Task type (e.g., 'text_classification').
        dataset_size (int, optional): Forwarded unchanged to generate_task_logic.

    Returns:
        str: Decision logic text for the specified task.

    Raises:
        ValueError: If `task` is not one of the tasks defined in the rules.
    """
    valid_tasks = textual_data_rules["tasks"].keys()

    # Happy path first: a known task is delegated straight away.
    if task in valid_tasks:
        return generate_task_logic(textual_data_rules, task, dataset_size)

    raise ValueError(f"Task '{task}' is not supported. Available tasks: {', '.join(valid_tasks)}")

In [18]:
# Demo: print the generated decision logic for the "text_classification" task.
# dataset_size is forwarded, but the generator does not read it.
print(generate_textual_logic("Textual", "text_classification", dataset_size=500))

If the problem is text classification:
  If independent_features:
    Use ML model: Naive Bayes
  If small_dataset:
    Use DL model: Pre-trained BERT, DistilBERT
  If high_dimensional:
    Use ML model: Support Vector Machine (SVM)
  If requires_interpretability:
    Use DL model: Transformer-based Models (e.g., BERT, GPT)
  Else:
    Use DL model: Recurrent Neural Networks (RNN), LSTMs



In [19]:
# Demo: print the generated decision logic for the
# "language_modeling_and_generation" task.
# dataset_size is forwarded, but the generator does not read it.
print(generate_textual_logic("Textual", "language_modeling_and_generation", dataset_size=10000))

If the problem is language modeling and generation:
  If requires_low_latency:
    Use DL model: DistilGPT
  If requires_large_context_window:
    Use DL model: GPT, GPT-2, GPT-3



In [20]:
# Demo: print the generated decision logic for the "text_summarization" task,
# whose rules contain a nested level of conditions.
print(generate_textual_logic("Textual", "text_summarization"))

If the problem is text summarization:
  If extractive:
    If small_dataset:
      Use ML model: TextRank
    If default:
      Use DL model: BERTSum
  If abstractive:
    Use DL model: T5, GPT



In [21]:
# Demo: print the generated decision logic for the "named_entity_recognition" task.
print(generate_textual_logic("Textual", "named_entity_recognition"))

If the problem is named entity recognition:
  If fine_grained:
    Use ML model: Conditional Random Fields (CRF)
  If small_dataset:
    Use DL model: Pre-trained BERT
  Else:
    Use DL model: Train Transformers from Scratch



In [22]:
# Demo: print the generated decision logic for the "question_answering" task.
print(generate_textual_logic("Textual", "question_answering"))

If the problem is question answering:
  If requires_complex_reasoning:
    Use DL model: BERT, RoBERTa
  Else:
    Use DL model: DistilBERT



In [23]:
# Demo: print the generated decision logic for the "machine_translation" task.
print(generate_textual_logic("Textual", "machine_translation"))

If the problem is machine translation:
  If requires_high_speed:
    Use DL model: MarianMT
  If requires_contextual_translation:
    Use DL model: T5, OpenNMT



In [24]:
# Demo: print the generated decision logic for the "topic_modeling" task.
print(generate_textual_logic("Textual", "topic_modeling"))

If the problem is topic modeling:
  If sparse_topics:
    Use ML model: Non-negative Matrix Factorization (NMF)
  Else:
    Use ML model: Latent Dirichlet Allocation (LDA)



In [25]:
# Demo: print the generated decision logic for the "text_to_speech" task.
print(generate_textual_logic("Textual", "text_to_speech"))

If the problem is text to speech:
  If requires_high_fidelity:
    Use DL model: WaveNet
  If requires_fast_processing:
    Use DL model: Tacotron 2, FastSpeech



In [26]:
# Demo: print the generated decision logic for the "speech_to_text" task.
print(generate_textual_logic("Textual", "speech_to_text"))

If the problem is speech to text:
  If requires_high_accuracy:
    Use DL model: DeepSpeech, Wav2Vec 2.0
  Else:
    Use DL model: Speech Transformers



In [37]:
# Scenarios iterated by TestTextualLogic.test_logic. Each entry names the task
# to render and a model name that must appear somewhere in the generated text.
test_scenarios = [
    {
        "data_type": "Textual",
        "task": "named_entity_recognition",
        "dataset_size": 500,  # Intended as a small dataset
        "expected_model": "Conditional Random Fields (CRF)",
    },
    {
        "data_type": "Textual",
        "task": "text_summarization",
        "dataset_size": 10000,  # Intended as a large dataset
        "expected_model": "BERTSum",
    },
]

In [33]:
class TestTextualLogic(unittest.TestCase):
    """
    Unit tests to validate textual logic against JSON rules.
    """

    def test_logic(self):
        """
        Test logic dynamically across multiple scenarios.

        This test iterates over predefined test cases to validate model mappings.
        """
        for scenario in test_scenarios:
            with self.subTest(scenario=scenario):
                result = generate_textual_logic(
                    data_type=scenario["data_type"],
                    task=scenario["task"],
                    dataset_size=scenario["dataset_size"]
                )
                self.assertIn(
                    scenario["expected_model"],
                    result,
                    f"Failed for {scenario['task']} with dataset size {scenario['dataset_size']}"
                )

    def test_invalid_task(self):
        """
        Test behavior when an invalid task is provided.

        Ensures that unsupported tasks raise a ValueError.
        """
        with self.assertRaises(ValueError):
            generate_textual_logic("Textual", "invalid_task")

In [38]:
# Run the tests in-process. argv=[''] supplies a placeholder program name so
# unittest does not parse the host process's command-line arguments, and
# exit=False keeps unittest.main from calling sys.exit() when the run finishes.
if __name__ == "__main__":
    unittest.main(argv=[''], exit=False)

..
----------------------------------------------------------------------
Ran 2 tests in 0.004s

OK
