Adding ParallelSkillSet & tests (#31)

HumanSignal · Nov 10, 2023 · f5ba24c · f5ba24c
1 parent 8ac8cc9
commit f5ba24c
Show file tree

Hide file tree

Showing 2 changed files with 208 additions and 25 deletions.
diff --git a/adala/skills/skillset.py b/adala/skills/skillset.py
@@ -29,11 +29,11 @@ def apply(
         self,
         dataset: Union[Dataset, InternalDataFrame],
         runtime: Runtime,
-        improved_skill: Optional[str] = None
+        improved_skill: Optional[str] = None,
     ) -> InternalDataFrame:
         """
         Apply the skill set to a dataset using a specified runtime.
-        
+
         Args:
             dataset (Union[Dataset, InternalDataFrame]): The dataset to apply the skill set to.
             runtime (Runtime): The runtime environment in which to apply the skills.
@@ -43,7 +43,9 @@ def apply(
         """
 
     @abstractmethod
-    def select_skill_to_improve(self, accuracy: Mapping, accuracy_threshold: Optional[float] = 1.0) -> Optional[BaseSkill]:
+    def select_skill_to_improve(
+        self, accuracy: Mapping, accuracy_threshold: Optional[float] = 1.0
+    ) -> Optional[BaseSkill]:
         """
         Select skill to improve based on accuracy.
 
@@ -90,12 +92,12 @@ class LinearSkillSet(SkillSet):
     """
     Represents a sequence of skills that are acquired in a specific order to achieve a goal.
 
-    LinearSkillSet ensures that skills are developed in a sequential manner, determined either 
+    LinearSkillSet ensures that skills are developed in a sequential manner, determined either
     by the provided skill_sequence or by the lexicographical order of skill names.
 
     Attributes:
         skills (Union[List[str], Dict[str, str], List[BaseSkill], Dict[str, BaseSkill]]): Provided skills
-        skill_sequence (List[str], optional): Ordered list of skill names indicating the order 
+        skill_sequence (List[str], optional): Ordered list of skill names indicating the order
                                               in which they should be acquired.
                                               By default, lexographical order of skill names is used.
         input_data_field (Optional[str], optional): Name of the input data field. Defaults to None.
@@ -117,8 +119,11 @@ class LinearSkillSet(SkillSet):
     skill_sequence: List[str] = None
     input_data_field: Optional[str] = None
 
-    @field_validator('skills', mode='before')
-    def skills_validator(cls, v: Union[List[str], List[BaseSkill], Dict[str, BaseSkill]]) -> Dict[str, BaseSkill]:
+    @field_validator("skills", mode="before")
+    @classmethod
+    def skills_validator(
+        cls, v: Union[List[str], List[BaseSkill], Dict[str, BaseSkill]]
+    ) -> Dict[str, BaseSkill]:
         """
         Validates and converts the skills attribute to a dictionary of skill names to BaseSkill instances.
 
@@ -140,7 +145,7 @@ def skills_validator(cls, v: Union[List[str], List[BaseSkill], Dict[str, BaseSki
                 skills[skill_name] = LLMSkill(
                     name=skill_name,
                     instructions=instructions,
-                    input_data_field=input_data_field
+                    input_data_field=input_data_field,
                 )
                 # Linear skillset creates skills pipeline - update input_data_field for next skill
                 input_data_field = skill_name
@@ -150,7 +155,7 @@ def skills_validator(cls, v: Union[List[str], List[BaseSkill], Dict[str, BaseSki
                 skills[skill_name] = LLMSkill(
                     name=skill_name,
                     instructions=instructions,
-                    input_data_field=input_data_field
+                    input_data_field=input_data_field,
                 )
                 # Linear skillset creates skills pipeline - update input_data_field for next skill
                 input_data_field = skill_name
@@ -164,8 +169,8 @@ def skills_validator(cls, v: Union[List[str], List[BaseSkill], Dict[str, BaseSki
             raise ValueError(f"skills must be a list or dictionary, not {type(skills)}")
         return skills
 
-    @model_validator(mode='after')
-    def skill_sequence_validator(self) -> 'LinearSkillSet':
+    @model_validator(mode="after")
+    def skill_sequence_validator(self) -> "LinearSkillSet":
         """
         Validates and sets the default order for the skill sequence if not provided.
         
@@ -176,9 +181,11 @@ def skill_sequence_validator(self) -> 'LinearSkillSet':
             # use default skill sequence defined by lexicographical order
             self.skill_sequence = list(self.skills.keys())
         if len(self.skill_sequence) != len(self.skills):
-            raise ValueError(f"skill_sequence must contain all skill names - "
-                             f"length of skill_sequence is {len(self.skill_sequence)} "
-                             f"while length of skills is {len(self.skills)}")
+            raise ValueError(
+                f"skill_sequence must contain all skill names - "
+                f"length of skill_sequence is {len(self.skill_sequence)} "
+                f"while length of skills is {len(self.skills)}"
+            )
         return self
 
     def apply(
@@ -189,7 +196,7 @@ def apply(
     ) -> InternalDataFrame:
         """
         Sequentially applies each skill on the dataset, enhancing the agent's experience.
-        
+
         Args:
             dataset (Dataset): The dataset to apply the skills on.
             runtime (Runtime): The runtime environment in which to apply the skills.
@@ -201,7 +208,9 @@ def apply(
         predictions = None
         if improved_skill:
             # start from the specified skill, assuming previous skills have already been applied
-            skill_sequence = self.skill_sequence[self.skill_sequence.index(improved_skill):]
+            skill_sequence = self.skill_sequence[
+                self.skill_sequence.index(improved_skill) :
+            ]
         else:
             skill_sequence = self.skill_sequence
         for i, skill_name in enumerate(skill_sequence):
@@ -210,16 +219,14 @@ def apply(
             input_dataset = dataset if i == 0 else predictions
             print_text(f"Applying skill: {skill_name}")
             predictions = skill.apply(input_dataset, runtime)
-        
+
         return predictions
 
     def select_skill_to_improve(
-        self,
-        accuracy: Mapping,
-        accuracy_threshold: Optional[float] = 1.0
+        self, accuracy: Mapping, accuracy_threshold: Optional[float] = 0.9
     ) -> Optional[BaseSkill]:
         """
-        Selects the skill with the lowest accuracy to improve.
+        Selects the first skill in the sequence with accuracy below the threshold to improve.
 
         Args:
             accuracy (Mapping): Accuracy of each skill.
@@ -236,14 +243,105 @@ def __rich__(self):
         # TODO: move it to a base class and use repr derived from Skills
         text = f"[bold blue]Total Agent Skills: {len(self.skills)}[/bold blue]\n\n"
         for skill in self.skills.values():
-            text += f'[bold underline green]{skill.name}[/bold underline green]\n' \
-                    f'[green]{skill.instructions}[green]\n'
+            text += (
+                f"[bold underline green]{skill.name}[/bold underline green]\n"
+                f"[green]{skill.instructions}[green]\n"
+            )
         return text
 
 
 class ParallelSkillSet(SkillSet):
     """
     Represents a set of skills that are acquired simultaneously to reach a goal.
+
+    In a ParallelSkillSet, each skill can be developed independently of the others. This is useful
+    for agents that require multiple, diverse capabilities, or tasks where each skill contributes a piece of
+    the overall solution.
+
+    Examples: 
+        Create a ParallelSkillSet with a list of skills specified as BaseSkill instances
+        >>> from adala.skills import ParallelSkillSet, TextClassificationSkill, TextGenerationSkill
+        >>> skillset = ParallelSkillSet(skills=[TextClassificationSkill(name='Classify sentiment', instructions='Classify the sentiment'), TextGenerationSkill(name='Summarize text', instructions='Generate a summary')])
+
+        Create a ParallelSkillSet with a dictionary of skill names to BaseSkill instances
+        >>> from adala.skills import ParallelSkillSet, TextClassificationSkill, TextGenerationSkill
+        >>> skillset = ParallelSkillSet(skills={'sentiment_analysis': TextClassificationSkill(name='Classify sentiment', instructions='Classify the sentiment'),'text_summary': TextGenerationSkill(name='Summarize text', instructions='Generate a summary')})
     """
-
-    pass
+
+    @field_validator("skills", mode="before")
+    @classmethod
+    def skills_validator(
+        cls, v: Union[List[BaseSkill], Dict[str, BaseSkill]]
+    ) -> Dict[str, BaseSkill]:
+        """
+        Validates and converts the skills attribute to a dictionary of skill names to BaseSkill instances.
+
+        Args:
+            v (Union[List[BaseSkill], Dict[str, BaseSkill]]): The skills attribute to validate.
+
+        Returns:
+            Dict[str, BaseSkill]: Dictionary mapping skill names to their corresponding BaseSkill instances.
+
+        Raises:
+            ValueError: If `v` is neither a list of BaseSkill instances nor a dictionary.
+        """
+        skills = OrderedDict()
+        if not v:
+            return skills
+
+        if isinstance(v, list) and all(isinstance(skill, BaseSkill) for skill in v):
+            # key every skill instance by its own name, preserving the list order
+            skills.update((skill.name, skill) for skill in v)
+        elif isinstance(v, dict):
+            skills = v
+        else:
+            raise ValueError(f"skills must be a list or dictionary, not {type(v)}")
+        return skills
+
+    def apply(
+        self,
+        dataset: Union[Dataset, InternalDataFrame],
+        runtime: Runtime,
+        improved_skill: Optional[str] = None,
+    ) -> InternalDataFrame:
+        """
+        Applies every skill in turn, passing the output of one skill as input to the next.
+
+        The first skill receives `dataset`; with no skills at all, None is returned.
+
+        Args:
+            dataset (Union[Dataset, InternalDataFrame]): The dataset to apply the skills on.
+            runtime (Runtime): The runtime environment in which to apply the skills.
+            improved_skill (Optional[str], optional): Unused in ParallelSkillSet. Defaults to None.
+        Returns:
+            InternalDataFrame: Output of the last applied skill.
+        """
+        predictions = None
+        next_input = dataset
+        for skill_name, skill in self.skills.items():
+            print_text(f"Applying skill: {skill_name}")
+            predictions = skill.apply(next_input, runtime)
+            # whatever this skill produced becomes the input of the following skill
+            next_input = predictions
+        return predictions
+
+    def select_skill_to_improve(
+        self, accuracy: Mapping, accuracy_threshold: Optional[float] = 0.9
+    ) -> Optional[BaseSkill]:
+        """
+        Selects the skill with the lowest accuracy among those below the threshold.
+
+        Args:
+            accuracy (Mapping): Accuracy of each skill, keyed by skill name.
+            accuracy_threshold (Optional[float], optional): Accuracy threshold. Defaults to 0.9.
+        Returns:
+            Optional[BaseSkill]: Skill to improve. None if no skill is below the threshold.
+        Raises:
+            KeyError: If `accuracy` has no entry for one of the skills.
+        """
+        candidates = [name for name in self.skills if accuracy[name] < accuracy_threshold]
+        if not candidates:
+            return None
+        # min() returns the first of equally weak skills, so skill order breaks ties
+        weakest_skill_name = min(candidates, key=accuracy.get)
+        return self.skills[weakest_skill_name]
diff --git a/tests/test_llm_parallel_skillset.py b/tests/test_llm_parallel_skillset.py
@@ -0,0 +1,85 @@
+import pandas as pd
+
+from utils import patching, PatchedCalls
+
+@patching(
+    target_function=PatchedCalls.OPENAI_MODEL_LIST.value,
+    data=[{"input": {}, "output": {"data": [{"id": "gpt-3.5-turbo-instruct"}]}}],
+)
+@patching(
+    target_function=PatchedCalls.GUIDANCE.value,
+    data=[
+        # Canned replies, two per skill (Apple text, then Obama text), in skill order
+        {
+            "input": {"text_": "Apple's latest product, the iPhone 15, was released in September 2023."},
+            "output": {"predictions": ""},  # No person mentioned
+        },
+        {
+            "input": {"text_": "Barack Obama was the 44th president of the United States."},
+            "output": {"predictions": "Barack Obama"},
+        },
+        {
+            "input": {"text_": "Apple's latest product, the iPhone 15, was released in September 2023."},
+            "output": {"predictions": "iPhone 15"},
+        },
+        {
+            "input": {"text_": "Barack Obama was the 44th president of the United States."},
+            "output": {"predictions": ""},  # No product mentioned
+        },
+        {
+            "input": {"text_": "Apple's latest product, the iPhone 15, was released in September 2023."},
+            "output": {"predictions": "September 2023"},
+        },
+        {
+            "input": {"text_": "Barack Obama was the 44th president of the United States."},
+            "output": {"predictions": ""},  # No date mentioned
+        },
+        {
+            "input": {"text_": "Apple's latest product, the iPhone 15, was released in September 2023."},
+            "output": {"predictions": ""},  # No location mentioned
+        },
+        {
+            "input": {"text_": "Barack Obama was the 44th president of the United States."},
+            "output": {"predictions": "United States"},
+        },
+    ],
+    strict=False,
+)
+def test_llm_parallel_skillset():
+    """Applying a ParallelSkillSet yields the input text plus one column per skill."""
+    from adala.skills.skillset import LLMSkill, ParallelSkillSet
+    from adala.datasets import DataFrameDataset, InternalDataFrame
+    from adala.runtimes import OpenAIRuntime
+
+    skillset = ParallelSkillSet(
+        skills=[
+            LLMSkill(name="skill_person", instructions="Extract person's name", input_data_field="text"),
+            LLMSkill(name="skill_product", instructions="Extract product name", input_data_field="text"),
+            LLMSkill(name="skill_date", instructions="Extract date", input_data_field="text"),
+            LLMSkill(name="skill_location", instructions="Extract location", input_data_field="text"),
+        ]
+    )
+    texts = [
+        "Apple's latest product, the iPhone 15, was released in September 2023.",
+        "Barack Obama was the 44th president of the United States.",
+    ]
+    dataset = DataFrameDataset(df=InternalDataFrame(texts, columns=["text"]))
+    predictions = skillset.apply(dataset=dataset, runtime=OpenAIRuntime(verbose=True))
+
+    expected = InternalDataFrame.from_records([
+        {
+            "text": "Apple's latest product, the iPhone 15, was released in September 2023.",
+            "skill_person": "",  # No person mentioned
+            "skill_product": "iPhone 15",
+            "skill_date": "September 2023",
+            "skill_location": "",  # No location mentioned
+        },
+        {
+            "text": "Barack Obama was the 44th president of the United States.",
+            "skill_person": "Barack Obama",
+            "skill_product": "",  # No product mentioned
+            "skill_date": "",  # No date mentioned
+            "skill_location": "United States",
+        },
+    ])
+    pd.testing.assert_frame_equal(expected, predictions)