SciSpacy plugin compatibility (#425)

* Added scispacy_model_name * Added scispacy_model_name * version bump for debugging * unnecessary docstring * added models and entityLinkers as enums * added linker cli shortcut -l * undo 2 extra slots and added 1 * cleanup * rolled back entirely * missed commenting one line * reintroduced plugin config * ran make py * rolled back version * temp change in version # * rolled back version * temp version bump * version reset to 0 * added model in config and a version bump * rolled back version * added model in CLI * added -m * rolled back version
INCATools · Jan 10, 2023 · 5d39eb8 · 5d39eb8
1 parent ea1dac7
commit 5d39eb8
Show file tree

Hide file tree

Showing 5 changed files with 29 additions and 121 deletions.
diff --git a/src/oaklib/cli.py b/src/oaklib/cli.py
@@ -1269,6 +1269,12 @@ def term_metadata(terms, predicates, reification: bool, output_type: str, output
     "-L",
     help="path to lexical index. This is recreated each time unless --no-recreate is passed",
 )
+@click.option(
+    "--model",
+    "-m",
+    required=False,
+    help="Name of trained model to use for annotation, e.g. 'en_ner_craft_md'.",
+)
 @click.option(
     "--exclude-tokens",
     "-x",
@@ -1285,6 +1291,7 @@ def annotate(
     matches_whole_text: bool,
     exclude_tokens: str,
     text_file: TextIO,
+    model: str,
     output_type: str,
 ):
     """
@@ -1330,6 +1337,11 @@ def annotate(
         if exclude_tokens:
             token_exclusion_list = get_exclusion_token_list(exclude_tokens)
             configuration.token_exclusion_list = token_exclusion_list
+        if model:
+            configuration.model = model
+        # if plugin_config:
+        #     with open(plugin_config, "r") as p:
+        #         configuration.plugin_configuration = yaml.safe_load(p)
         if words and text_file:
             raise ValueError("Specify EITHER text-file OR a list of words as arguments")
         if text_file:

diff --git a/src/oaklib/datamodels/text_annotator.py b/src/oaklib/datamodels/text_annotator.py
@@ -1,5 +1,5 @@
 # Auto generated from text_annotator.yaml by pythongen.py version: 0.9.0
-# Generation date: 2022-11-16T17:09:19
+# Generation date: 2023-01-03T13:27:29
 # Schema: text-annotator
 #
 # id: https://w3id.org/linkml/text_annotator
@@ -102,6 +102,7 @@ class TextAnnotationConfiguration(YAMLRoot):
     sources: Optional[Union[str, List[str]]] = empty_list()
     limit: Optional[int] = None
     token_exclusion_list: Optional[Union[str, List[str]]] = empty_list()
+    model: Optional[str] = None
 
     def __post_init__(self, *_: List[str], **kwargs: Dict[str, Any]):
         if self.matches_whole_text is not None and not isinstance(self.matches_whole_text, Bool):
@@ -122,6 +123,9 @@ def __post_init__(self, *_: List[str], **kwargs: Dict[str, Any]):
             v if isinstance(v, str) else str(v) for v in self.token_exclusion_list
         ]
 
+        if self.model is not None and not isinstance(self.model, str):
+            self.model = str(self.model)
+
         super().__post_init__(**kwargs)
 
 
@@ -382,6 +386,15 @@ class slots:
     range=Optional[Union[str, List[str]]],
 )
 
+slots.textAnnotationConfiguration__model = Slot(
+    uri=ANN.model,
+    name="textAnnotationConfiguration__model",
+    curie=ANN.curie("model"),
+    model_uri=ANN.textAnnotationConfiguration__model,
+    domain=None,
+    range=Optional[str],
+)
+
 slots.textAnnotationResultSet__annotations = Slot(
     uri=ANN.annotations,
     name="textAnnotationResultSet__annotations",

diff --git a/src/oaklib/datamodels/text_annotator.schema.json b/src/oaklib/datamodels/text_annotator.schema.json
@@ -1,119 +1 @@
-{
-   "$defs": {
-      "TextAnnotation": {
-         "additionalProperties": false,
-         "description": "An individual text annotation",
-         "properties": {
-            "confidence": {
-               "type": "number"
-            },
-            "info": {
-               "type": "string"
-            },
-            "is_longest_match": {
-               "type": "boolean"
-            },
-            "match_string": {
-               "type": "string"
-            },
-            "match_type": {
-               "type": "string"
-            },
-            "object_id": {
-               "type": "string"
-            },
-            "object_label": {
-               "type": "string"
-            },
-            "object_source": {
-               "type": "string"
-            },
-            "predicate_id": {
-               "type": "string"
-            },
-            "subject_end": {
-               "type": "integer"
-            },
-            "subject_label": {
-               "description": "The portion of the subject text that is matched, ranging from subject_start to subject_end",
-               "type": "string"
-            },
-            "subject_source": {
-               "type": "string"
-            },
-            "subject_start": {
-               "type": "integer"
-            },
-            "subject_text_id": {
-               "type": "string"
-            }
-         },
-         "required": [],
-         "title": "TextAnnotation",
-         "type": "object"
-      },
-      "TextAnnotationResultSet": {
-         "additionalProperties": false,
-         "description": "A collection of annotation results",
-         "properties": {
-            "annotations": {
-               "description": "all annotations",
-               "items": {
-                  "$ref": "#/$defs/TextAnnotation"
-               },
-               "type": "array"
-            }
-         },
-         "required": [],
-         "title": "TextAnnotationResultSet",
-         "type": "object"
-      },
-      "TextualElement": {
-         "additionalProperties": false,
-         "description": "",
-         "properties": {
-            "id": {
-               "type": "string"
-            },
-            "parent_document": {
-               "type": "string"
-            },
-            "source_text": {
-               "type": "string"
-            },
-            "text": {
-               "type": "string"
-            }
-         },
-         "required": [
-            "id"
-         ],
-         "title": "TextualElement",
-         "type": "object"
-      },
-      "TransformationType": {
-         "description": "A controlled datamodels of the types of transformation that can be applied to",
-         "enum": [
-            "Stemming",
-            "Lemmatization",
-            "WordOrderNormalization",
-            "Depluralization",
-            "CaseNormalization",
-            "WhitespaceNormalization",
-            "TermExpanson"
-         ],
-         "title": "TransformationType",
-         "type": "string"
-      }
-   },
-   "$id": "https://w3id.org/linkml/text_annotator",
-   "$schema": "http://json-schema.org/draft-07/schema#",
-   "additionalProperties": true,
-   "metamodel_version": "1.7.0",
-   "properties": {},
-   "required": [],
-   "title": "text-annotator",
-   "type": "object",
-   "version": null
-}
-
+{"$defs": {"TextAnnotation": {"additionalProperties": false, "description": "An individual text annotation", "properties": {"confidence": {"type": "number"}, "info": {"type": "string"}, "is_longest_match": {"type": "boolean"}, "match_string": {"type": "string"}, "match_type": {"type": "string"}, "matches_whole_text": {"type": "boolean"}, "object_id": {"type": "string"}, "object_label": {"type": "string"}, "object_source": {"type": "string"}, "predicate_id": {"type": "string"}, "subject_end": {"type": "integer"}, "subject_label": {"description": "The portion of the subject text that is matched, ranging from subject_start to subject_end", "type": "string"}, "subject_source": {"type": "string"}, "subject_start": {"type": "integer"}, "subject_text_id": {"type": "string"}}, "title": "TextAnnotation", "type": "object"}, "TextAnnotationConfiguration": {"additionalProperties": false, "description": "configuration for search", "properties": {"limit": {"type": "integer"}, "matches_whole_text": {"type": "boolean"}, "model": {"type": "string"}, "sources": {"items": {"type": "string"}, "type": "array"}, "token_exclusion_list": {"items": {"type": "string"}, "type": "array"}}, "title": "TextAnnotationConfiguration", "type": "object"}, "TextAnnotationResultSet": {"additionalProperties": false, "description": "A collection of annotation results", "properties": {"annotations": {"description": "all annotations", "items": {"$ref": "#/$defs/TextAnnotation"}, "type": "array"}}, "title": "TextAnnotationResultSet", "type": "object"}, "TextualElement": {"additionalProperties": false, "description": "", "properties": {"id": {"type": "string"}, "parent_document": {"type": "string"}, "source_text": {"type": "string"}, "text": {"type": "string"}}, "required": ["id"], "title": "TextualElement", "type": "object"}, "TransformationType": {"description": "A controlled datamodels of the types of transformation that can be applied to", "enum": ["Stemming", "Lemmatization", "WordOrderNormalization", 
"Depluralization", "CaseNormalization", "WhitespaceNormalization", "TermExpanson"], "title": "TransformationType", "type": "string"}}, "$id": "https://w3id.org/linkml/text_annotator", "$schema": "http://json-schema.org/draft-07/schema#", "additionalProperties": true, "metamodel_version": "1.7.0", "title": "text-annotator", "type": "object", "version": null}
diff --git a/src/oaklib/datamodels/text_annotator.yaml b/src/oaklib/datamodels/text_annotator.yaml
@@ -55,6 +55,8 @@ classes:
         range: integer
       token_exclusion_list:
         multivalued: true
+      model:
+        range: string
 
   TextAnnotationResultSet:
     description: A collection of annotation results

diff --git a/src/oaklib/interfaces/text_annotator_interface.py b/src/oaklib/interfaces/text_annotator_interface.py
@@ -129,7 +129,6 @@ def annotate_file(
         """Annotate text in a file.
 
         :param text_file: Text file that is iterated line-by-line.
-        :param token_exclusion_list: List of tokens to exclude.
         :param configuration: Text annotation configuration, defaults to None.
         :yield: Annotation of each line.
         """