LAAC-LSCP · marianne-m · Feb 24, 2022 · Feb 28, 2022 · Mar 1, 2022 · Mar 1, 2022
diff --git a/ChildProject/annotations.py b/ChildProject/annotations.py
@@ -265,6 +265,7 @@ def __init__(self, project: ChildProject):
         """
         self.project = project
         self.annotations = None
+        self.controlled_voc = pd.DataFrame()
         self.errors = []
 
         if not isinstance(project, ChildProject):
@@ -309,6 +310,10 @@ def read(self) -> Tuple[List[str], List[str]]:
                 ]
             )
 
+        controlled_voc_path = os.path.join(self.project.path, "metadata/controlled_vocabulary.csv")
+        if os.path.exists(controlled_voc_path):
+            self.controlled_voc = pd.read_csv(controlled_voc_path, index_col=0)
+
         return errors, warnings
 
     def validate_annotation(self, annotation: dict) -> Tuple[List[str], List[str]]:
@@ -385,6 +390,16 @@ def write(self):
             os.path.join(self.project.path, "metadata/annotations.csv"), index=False
         )
 
+    def write_controlled_vocabulary(self, imported_controlled_vocabulary: pd.DataFrame):
+        """
+        Merge newly imported controlled vocabularies of EAF files into
+        ``self.controlled_voc`` and save the result to
+        ``metadata/controlled_vocabulary.csv`` inside the project.
+
+        Rows are keyed by the index of ``imported_controlled_vocabulary``
+        (the vocabulary identifier). A row is only discarded when both its
+        identifier and all of its values are already present.
+
+        :param imported_controlled_vocabulary: vocabularies to add, one row each
+        :type imported_controlled_vocabulary: pd.DataFrame
+        """
+        merged = pd.concat([self.controlled_voc, imported_controlled_vocabulary])
+        # Compare everything as text: cells may hold lists (unhashable), and
+        # values reloaded from the CSV come back as strings anyway.
+        merged = merged.astype(str)
+        merged.index = merged.index.astype(str)
+        # DataFrame.drop_duplicates() ignores the index, so two distinct
+        # vocabularies with identical contents would collapse into one row and
+        # lose an identifier. Include the index in the comparison instead.
+        already_seen = merged.reset_index().duplicated().to_numpy()
+        self.controlled_voc = merged[~already_seen]
+        self.controlled_voc.to_csv(
+            os.path.join(self.project.path, "metadata/controlled_vocabulary.csv"), index=True
+        )
+
     def _import_annotation(
         self, import_function: Callable[[str], pd.DataFrame], params: dict, annotation: dict
     ):
@@ -480,6 +495,10 @@ def _import_annotation(
         )
         annotation["package_version"] = __version__
 
+        if annotation_format == "eaf":
+            controlled_voc = EafConverter.get_controlled_vocabulary(path)
+            self.write_controlled_vocabulary(controlled_voc)
+
         if pd.isnull(annotation["format"]):
             annotation["format"] = "NA"
 

diff --git a/ChildProject/converters.py b/ChildProject/converters.py
@@ -440,7 +440,30 @@ class EafConverter(AnnotationConverter):
     FORMAT = "eaf"
 
     @staticmethod
-    def convert(filename: str, filter=None, **kwargs) -> pd.DataFrame:
+    def get_controlled_vocabulary(filename: str) -> pd.DataFrame:
+        """
+        Read the controlled vocabularies declared in an EAF file.
+
+        The result has one row per key of ``Eaf.controlled_vocabularies`` and
+        the columns ``description``, ``entries`` and ``external_references``.
+        ``description`` is reduced to the second field of its first item and
+        ``entries`` to a list holding the first value of each entry.
+
+        :param filename: path to the EAF file
+        :type filename: str
+        :return: table of controlled vocabularies
+        :rtype: pd.DataFrame
+        """
+        import pympi
+
+        def first_description(descriptions):
+            # Left untouched when the vocabulary carries no description.
+            if len(descriptions) > 0:
+                return descriptions[0][1]
+            return descriptions
+
+        def first_values(entries):
+            # NOTE(review): presumably each entry is (values, ext_ref) as built
+            # by pympi -- confirm against the pympi documentation.
+            return [
+                entry[0][0] if len(entry) > 0 else entry
+                for entry in entries.values()
+            ]
+
+        vocabularies = pympi.Elan.Eaf(filename).controlled_vocabularies
+        table = pd.DataFrame.from_dict(
+            vocabularies,
+            orient='index',
+            columns=['description', 'entries', 'external_references'],
+        )
+
+        table['description'] = table['description'].transform(first_description)
+        table['entries'] = table['entries'].transform(first_values)
+
+        return table
+
+
+    @staticmethod
+    def convert(filename: str, filter: str=None, new_tiers: list=None, **kwargs) -> pd.DataFrame:
         import pympi
 
         eaf = pympi.Elan.Eaf(filename)
@@ -523,8 +546,19 @@ def convert(filename: str, filter=None, **kwargs) -> pd.DataFrame:
                     segment["vcm_type"] = value
                 elif label == "msc":
                     segment["msc_type"] = value
-                elif label in kwargs["new_tiers"]:
-                    segment[label] = value
+                elif new_tiers is not None and label in new_tiers:
+                    if label in eaf.controlled_vocabularies.keys():
+                        controlled_values = \
+                            [value[0][0][0] for value in eaf.controlled_vocabularies[label][1].values()]
+                        if value not in controlled_values:
+                            print(
+                                f'warning: {value} is not in the controlled'
+                                f'vocabulary {controlled_values} for {label}'
+                            )
+                        else:
+                            segment[label] = value
+                    else:
+                        segment[label] = value
 
         return pd.DataFrame(segments.values())
 

diff --git a/docs/source/api-annotations.rst b/docs/source/api-annotations.rst
@@ -223,9 +223,17 @@ When importing EAF annotation files, some tiers are supported by ChildProject, s
 If you want to import a tier that is not supported by ChildProject, you can use
 :meth:`~ChildProject.annotations.AnnotationManager.import_annotations` as follows :
 
-.. code-block:: python
+.. code-block:: python
+
+    >>> am.import_annotations(input, new_tiers = ['name_of_tier'])
+
+If a controlled vocabulary is defined in the EAF annotation file for this new tier, the values
+of the annotations are checked against it. If a value is not in the controlled vocabulary, it is
+not written to the converted annotation file, and a warning is printed.
+In addition, the ``metadata/controlled_vocabulary.csv`` file is either created
+with the available controlled vocabularies or updated with those of this new tier.
 
-    >>> am.import_annotations(input, new_tier = ['name_of_tier'])
+If no controlled vocabulary is defined in the EAF annotation file, the values are not checked.
 
 Validating annotations
 ~~~~~~~~~~~~~~~~~~~~~~

diff --git a/tests/data/eaf_any_tier.eaf b/tests/data/eaf_any_tier.eaf
@@ -1612,7 +1612,7 @@
     <TIER LINGUISTIC_TYPE_REF="NEWTIER2" PARENT_REF="CHI" PARTICIPANT="CHI" TIER_ID="newtier2@CHI">
         <ANNOTATION>
             <REF_ANNOTATION ANNOTATION_ID="a257" ANNOTATION_REF="a256" CVE_REF="cveid_09a9bb98-31a9-4afd-9ed7-d4fc7af658a6">
-                <ANNOTATION_VALUE>W</ANNOTATION_VALUE>
+                <ANNOTATION_VALUE>bad_value</ANNOTATION_VALUE>
             </REF_ANNOTATION>
         </ANNOTATION>
         <ANNOTATION>
@@ -2400,7 +2400,7 @@
     <TIER LINGUISTIC_TYPE_REF="NEWTIER" PARENT_REF="CHI" PARTICIPANT="CHI" TIER_ID="newtier@CHI">
         <ANNOTATION>
             <REF_ANNOTATION ANNOTATION_ID="a582" ANNOTATION_REF="a256" CVE_REF="cveid_57c69cd9-b12d-49e9-a2ee-5782f3f1b867">
-                <ANNOTATION_VALUE>S</ANNOTATION_VALUE>
+                <ANNOTATION_VALUE>bad_value</ANNOTATION_VALUE>
             </REF_ANNOTATION>
         </ANNOTATION>
         <ANNOTATION>
@@ -3324,33 +3324,6 @@
     <CONSTRAINT DESCRIPTION="1-1 association with a parent annotation" STEREOTYPE="Symbolic_Association"/>
     <CONSTRAINT
         DESCRIPTION="Time alignable annotations within the parent annotation's time interval, gaps are allowed" STEREOTYPE="Included_In"/>
-    <CONTROLLED_VOCABULARY CV_ID="newtier2">
-        <DESCRIPTION LANG_REF="und">Simplified subset of infant vocal maturity classes (distinguishing between variegated and non-variegated syllables)</DESCRIPTION>
-        <CV_ENTRY_ML CVE_ID="cveid_e7300257-f12a-479f-90f0-c2fefbf99a26">
-            <CVE_VALUE DESCRIPTION="Crying" LANG_REF="und">Y</CVE_VALUE>
-        </CV_ENTRY_ML>
-        <CV_ENTRY_ML CVE_ID="cveid_ae00bfde-d4bb-499e-8c63-81c4459f5b8a">
-            <CVE_VALUE DESCRIPTION="Laughing" LANG_REF="und">L</CVE_VALUE>
-        </CV_ENTRY_ML>
-        <CV_ENTRY_ML CVE_ID="cveid_df01bf24-04f4-4cff-9bc4-ca92a0ca945f">
-            <CVE_VALUE
-                DESCRIPTION="Non-canonical non-variegated syllable(s)" LANG_REF="und">A</CVE_VALUE>
-        </CV_ENTRY_ML>
-        <CV_ENTRY_ML CVE_ID="cveid_8675a2cf-bb35-476c-a602-8b911eb2a845">
-            <CVE_VALUE
-                DESCRIPTION="Non-canonical variegated syllable(s)" LANG_REF="und">P</CVE_VALUE>
-        </CV_ENTRY_ML>
-        <CV_ENTRY_ML CVE_ID="cveid_f1ad7cdd-4916-4914-a59a-a33d0d7052cc">
-            <CVE_VALUE DESCRIPTION="Canonical variegated syllable(s)" LANG_REF="und">V</CVE_VALUE>
-        </CV_ENTRY_ML>
-        <CV_ENTRY_ML CVE_ID="cveid_09a9bb98-31a9-4afd-9ed7-d4fc7af658a6">
-            <CVE_VALUE
-                DESCRIPTION="Canonical non-variegated syllable(s)" LANG_REF="und">W</CVE_VALUE>
-        </CV_ENTRY_ML>
-        <CV_ENTRY_ML CVE_ID="cveid_ee07af47-c822-4fb3-80d3-d842d80272b7">
-            <CVE_VALUE DESCRIPTION="Uncertain" LANG_REF="und">U</CVE_VALUE>
-        </CV_ENTRY_ML>
-    </CONTROLLED_VOCABULARY>
     <CONTROLLED_VOCABULARY CV_ID="tocode">
         <DESCRIPTION LANG_REF="und">"y" if coder thinks the clip should be coded, "n" if not</DESCRIPTION>
         <CV_ENTRY_ML CVE_ID="cveid0">

diff --git a/tests/truth/eaf_any_tier.csv b/tests/truth/eaf_any_tier.csv
@@ -1,5 +1,5 @@
 ,segment_onset,segment_offset,speaker_id,speaker_type,vcm_type,lex_type,mwu_type,addressee,transcription,words,newtier2,newtier
-0,1519558,1519992,CHI,CHI,NA,NA,NA,NA,NA,NA,W,S
+0,1519558,1519992,CHI,CHI,NA,NA,NA,NA,NA,NA,bad_value,NA
 1,1521553,1522341,CHI,CHI,NA,NA,NA,NA,NA,NA,A,S
 2,1523290,1524546,CHI,CHI,NA,NA,NA,NA,NA,NA,V,C
 3,1528034,1530395,CHI,CHI,NA,NA,NA,NA,NA,NA,V,C