Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Eaf controlled vocabulary #362

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions ChildProject/annotations.py
Original file line number Diff line number Diff line change
Expand Up @@ -265,6 +265,7 @@ def __init__(self, project: ChildProject):
"""
self.project = project
self.annotations = None
self.controlled_voc = pd.DataFrame()
self.errors = []

if not isinstance(project, ChildProject):
Expand Down Expand Up @@ -309,6 +310,10 @@ def read(self) -> Tuple[List[str], List[str]]:
]
)

controlled_voc_path = os.path.join(self.project.path, "metadata/controlled_vocabulary.csv")
if os.path.exists(controlled_voc_path):
self.controlled_voc = pd.read_csv(controlled_voc_path, index_col=0)

return errors, warnings

def validate_annotation(self, annotation: dict) -> Tuple[List[str], List[str]]:
Expand Down Expand Up @@ -385,6 +390,16 @@ def write(self):
os.path.join(self.project.path, "metadata/annotations.csv"), index=False
)

def write_controlled_vocabulary(self, imported_controlled_vocabulary: pd.DataFrame):
    """
    Merge imported controlled vocabularies into the project's table and
    save the result to ``metadata/controlled_vocabulary.csv``.

    Every value is cast to ``str`` before comparison so that rows can be
    de-duplicated. A row is only considered a duplicate when both its
    index (the vocabulary identifier) and all of its columns match an
    earlier row.

    :param imported_controlled_vocabulary: controlled vocabularies to add,
        indexed by vocabulary identifier (e.g. the dataframe returned by
        ``EafConverter.get_controlled_vocabulary``)
    :type imported_controlled_vocabulary: pd.DataFrame
    """
    combined = pd.concat(
        [self.controlled_voc, imported_controlled_vocabulary]
    ).astype(str)

    # DataFrame.drop_duplicates() compares column values only and ignores
    # the index, so two vocabularies with distinct identifiers but the same
    # description/entries would be collapsed into one. Move the index into
    # a column for the comparison so the identifier is part of the key.
    already_seen = combined.reset_index().duplicated().to_numpy()
    self.controlled_voc = combined[~already_seen]

    self.controlled_voc.to_csv(
        os.path.join(self.project.path, "metadata/controlled_vocabulary.csv"), index=True
    )

def _import_annotation(
self, import_function: Callable[[str], pd.DataFrame], params: dict, annotation: dict
):
Expand Down Expand Up @@ -480,6 +495,10 @@ def _import_annotation(
)
annotation["package_version"] = __version__

if annotation_format == "eaf":
controlled_voc = EafConverter.get_controlled_vocabulary(path)
self.write_controlled_vocabulary(controlled_voc)

if pd.isnull(annotation["format"]):
annotation["format"] = "NA"

Expand Down
40 changes: 37 additions & 3 deletions ChildProject/converters.py
Original file line number Diff line number Diff line change
Expand Up @@ -440,7 +440,30 @@ class EafConverter(AnnotationConverter):
FORMAT = "eaf"

@staticmethod
def convert(filename: str, filter=None, **kwargs) -> pd.DataFrame:
def get_controlled_vocabulary(filename: str) -> pd.DataFrame:
    """
    Read the controlled vocabularies declared in an EAF file.

    The result has one row per key of ``Eaf.controlled_vocabularies`` and
    the columns ``description``, ``entries`` and ``external_references``.

    :param filename: path to the EAF file
    :type filename: str
    :return: dataframe of controlled vocabularies, indexed by the keys of
        ``Eaf.controlled_vocabularies``
    :rtype: pd.DataFrame
    """
    import pympi

    def first_description(descriptions):
        # NOTE(review): presumably a list of (lang_ref, text) pairs, so
        # this keeps the text of the first one - confirm against pympi.
        if len(descriptions) > 0:
            return descriptions[0][1]
        return descriptions

    def first_values(entries):
        # One item per vocabulary entry: the first element of the entry's
        # first member, or the entry itself when it is empty.
        return [
            entry[0][0] if len(entry) > 0 else entry
            for entry in entries.values()
        ]

    vocabularies = pympi.Elan.Eaf(filename).controlled_vocabularies

    table = pd.DataFrame.from_dict(
        vocabularies,
        orient='index',
        columns=['description', 'entries', 'external_references'],
    )
    table['description'] = table['description'].transform(first_description)
    table['entries'] = table['entries'].transform(first_values)

    return table


@staticmethod
def convert(filename: str, filter: str=None, new_tiers: list=None, **kwargs) -> pd.DataFrame:
import pympi

eaf = pympi.Elan.Eaf(filename)
Expand Down Expand Up @@ -523,8 +546,19 @@ def convert(filename: str, filter=None, **kwargs) -> pd.DataFrame:
segment["vcm_type"] = value
elif label == "msc":
segment["msc_type"] = value
elif label in kwargs["new_tiers"]:
segment[label] = value
elif new_tiers is not None and label in new_tiers:
if label in eaf.controlled_vocabularies.keys():
controlled_values = \
[value[0][0][0] for value in eaf.controlled_vocabularies[label][1].values()]
if value not in controlled_values:
print(
f'warning: {value} is not in the controlled'
f'vocabulary {controlled_values} for {label}'
)
else:
segment[label] = value
else:
segment[label] = value

return pd.DataFrame(segments.values())

Expand Down
12 changes: 10 additions & 2 deletions docs/source/api-annotations.rst
Original file line number Diff line number Diff line change
Expand Up @@ -223,9 +223,17 @@ When importing EAF annotation files, some tiers are supported by ChildProject, s
If you want to import a tier that is not supported by ChildProject, you can use
:meth:`~ChildProject.annotations.AnnotationManager.import_annotations` as follows :

.. code-block:: python
.. code-block:: python

>>> am.import_annotations(input, new_tiers = ['name_of_tier'])

If the EAF annotation file defines a controlled vocabulary for this new tier, the values
of the annotations are checked against it. A value that is not in the controlled vocabulary
is not written to the annotation file, and a warning is printed.
Moreover, ``metadata/controlled_vocabulary.csv`` is either created with the available
controlled vocabularies or updated with this new tier.

>>> am.import_annotations(input, new_tier = ['name_of_tier'])
If no controlled vocabulary is added in the EAF annotation file, the values are not checked.

Validating annotations
~~~~~~~~~~~~~~~~~~~~~~
Expand Down
31 changes: 2 additions & 29 deletions tests/data/eaf_any_tier.eaf
Original file line number Diff line number Diff line change
Expand Up @@ -1612,7 +1612,7 @@
<TIER LINGUISTIC_TYPE_REF="NEWTIER2" PARENT_REF="CHI" PARTICIPANT="CHI" TIER_ID="newtier2@CHI">
<ANNOTATION>
<REF_ANNOTATION ANNOTATION_ID="a257" ANNOTATION_REF="a256" CVE_REF="cveid_09a9bb98-31a9-4afd-9ed7-d4fc7af658a6">
<ANNOTATION_VALUE>W</ANNOTATION_VALUE>
<ANNOTATION_VALUE>bad_value</ANNOTATION_VALUE>
</REF_ANNOTATION>
</ANNOTATION>
<ANNOTATION>
Expand Down Expand Up @@ -2400,7 +2400,7 @@
<TIER LINGUISTIC_TYPE_REF="NEWTIER" PARENT_REF="CHI" PARTICIPANT="CHI" TIER_ID="newtier@CHI">
<ANNOTATION>
<REF_ANNOTATION ANNOTATION_ID="a582" ANNOTATION_REF="a256" CVE_REF="cveid_57c69cd9-b12d-49e9-a2ee-5782f3f1b867">
<ANNOTATION_VALUE>S</ANNOTATION_VALUE>
<ANNOTATION_VALUE>bad_value</ANNOTATION_VALUE>
</REF_ANNOTATION>
</ANNOTATION>
<ANNOTATION>
Expand Down Expand Up @@ -3324,33 +3324,6 @@
<CONSTRAINT DESCRIPTION="1-1 association with a parent annotation" STEREOTYPE="Symbolic_Association"/>
<CONSTRAINT
DESCRIPTION="Time alignable annotations within the parent annotation's time interval, gaps are allowed" STEREOTYPE="Included_In"/>
<CONTROLLED_VOCABULARY CV_ID="newtier2">
<DESCRIPTION LANG_REF="und">Simplified subset of infant vocal maturity classes (distinguishing between variegated and non-variegated syllables)</DESCRIPTION>
<CV_ENTRY_ML CVE_ID="cveid_e7300257-f12a-479f-90f0-c2fefbf99a26">
<CVE_VALUE DESCRIPTION="Crying" LANG_REF="und">Y</CVE_VALUE>
</CV_ENTRY_ML>
<CV_ENTRY_ML CVE_ID="cveid_ae00bfde-d4bb-499e-8c63-81c4459f5b8a">
<CVE_VALUE DESCRIPTION="Laughing" LANG_REF="und">L</CVE_VALUE>
</CV_ENTRY_ML>
<CV_ENTRY_ML CVE_ID="cveid_df01bf24-04f4-4cff-9bc4-ca92a0ca945f">
<CVE_VALUE
DESCRIPTION="Non-canonical non-variegated syllable(s)" LANG_REF="und">A</CVE_VALUE>
</CV_ENTRY_ML>
<CV_ENTRY_ML CVE_ID="cveid_8675a2cf-bb35-476c-a602-8b911eb2a845">
<CVE_VALUE
DESCRIPTION="Non-canonical variegated syllable(s)" LANG_REF="und">P</CVE_VALUE>
</CV_ENTRY_ML>
<CV_ENTRY_ML CVE_ID="cveid_f1ad7cdd-4916-4914-a59a-a33d0d7052cc">
<CVE_VALUE DESCRIPTION="Canonical variegated syllable(s)" LANG_REF="und">V</CVE_VALUE>
</CV_ENTRY_ML>
<CV_ENTRY_ML CVE_ID="cveid_09a9bb98-31a9-4afd-9ed7-d4fc7af658a6">
<CVE_VALUE
DESCRIPTION="Canonical non-variegated syllable(s)" LANG_REF="und">W</CVE_VALUE>
</CV_ENTRY_ML>
<CV_ENTRY_ML CVE_ID="cveid_ee07af47-c822-4fb3-80d3-d842d80272b7">
<CVE_VALUE DESCRIPTION="Uncertain" LANG_REF="und">U</CVE_VALUE>
</CV_ENTRY_ML>
</CONTROLLED_VOCABULARY>
<CONTROLLED_VOCABULARY CV_ID="tocode">
<DESCRIPTION LANG_REF="und">"y" if coder thinks the clip should be coded, "n" if not</DESCRIPTION>
<CV_ENTRY_ML CVE_ID="cveid0">
Expand Down
2 changes: 1 addition & 1 deletion tests/truth/eaf_any_tier.csv
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
,segment_onset,segment_offset,speaker_id,speaker_type,vcm_type,lex_type,mwu_type,addressee,transcription,words,newtier2,newtier
0,1519558,1519992,CHI,CHI,NA,NA,NA,NA,NA,NA,W,S
0,1519558,1519992,CHI,CHI,NA,NA,NA,NA,NA,NA,bad_value,NA
1,1521553,1522341,CHI,CHI,NA,NA,NA,NA,NA,NA,A,S
2,1523290,1524546,CHI,CHI,NA,NA,NA,NA,NA,NA,V,C
3,1528034,1530395,CHI,CHI,NA,NA,NA,NA,NA,NA,V,C
Expand Down