Skip to content

Commit

Permalink
SciSpacy plugin compatibility (#425)
Browse files Browse the repository at this point in the history
* Added scispacy_model_name

* Added scispacy_model_name

* version bump for debugging

* unnecessary docstring

* added models and entityLinkers as enums

* added linker cli shortcut -l

* undo 2 extra slots and added 1

* cleanup

* rolled back entirely

* missed commenting one line

* reintroduced plugin config

* ran make py

* rolled back version

* temp change in version #

* rolled back version

* temp version bump

* version reset to 0

* added model in config and a version bump

* rolled back version

* added model in CLI

* added -m

* rolled back version
  • Loading branch information
hrshdhgd committed Jan 10, 2023
1 parent ea1dac7 commit 5d39eb8
Show file tree
Hide file tree
Showing 5 changed files with 29 additions and 121 deletions.
12 changes: 12 additions & 0 deletions src/oaklib/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -1269,6 +1269,12 @@ def term_metadata(terms, predicates, reification: bool, output_type: str, output
"-L",
help="path to lexical index. This is recreated each time unless --no-recreate is passed",
)
@click.option(
"--model",
"-m",
required=False,
help="Name of trained model to use for annotation, e.g. 'en_ner_craft_md'.",
)
@click.option(
"--exclude-tokens",
"-x",
Expand All @@ -1285,6 +1291,7 @@ def annotate(
matches_whole_text: bool,
exclude_tokens: str,
text_file: TextIO,
model: str,
output_type: str,
):
"""
Expand Down Expand Up @@ -1330,6 +1337,11 @@ def annotate(
if exclude_tokens:
token_exclusion_list = get_exclusion_token_list(exclude_tokens)
configuration.token_exclusion_list = token_exclusion_list
if model:
configuration.model = model
# if plugin_config:
# with open(plugin_config, "r") as p:
# configuration.plugin_configuration = yaml.safe_load(p)
if words and text_file:
raise ValueError("Specify EITHER text-file OR a list of words as arguments")
if text_file:
Expand Down
15 changes: 14 additions & 1 deletion src/oaklib/datamodels/text_annotator.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Auto generated from text_annotator.yaml by pythongen.py version: 0.9.0
# Generation date: 2022-11-16T17:09:19
# Generation date: 2023-01-03T13:27:29
# Schema: text-annotator
#
# id: https://w3id.org/linkml/text_annotator
Expand Down Expand Up @@ -102,6 +102,7 @@ class TextAnnotationConfiguration(YAMLRoot):
sources: Optional[Union[str, List[str]]] = empty_list()
limit: Optional[int] = None
token_exclusion_list: Optional[Union[str, List[str]]] = empty_list()
model: Optional[str] = None

def __post_init__(self, *_: List[str], **kwargs: Dict[str, Any]):
if self.matches_whole_text is not None and not isinstance(self.matches_whole_text, Bool):
Expand All @@ -122,6 +123,9 @@ def __post_init__(self, *_: List[str], **kwargs: Dict[str, Any]):
v if isinstance(v, str) else str(v) for v in self.token_exclusion_list
]

if self.model is not None and not isinstance(self.model, str):
self.model = str(self.model)

super().__post_init__(**kwargs)


Expand Down Expand Up @@ -382,6 +386,15 @@ class slots:
range=Optional[Union[str, List[str]]],
)

slots.textAnnotationConfiguration__model = Slot(
uri=ANN.model,
name="textAnnotationConfiguration__model",
curie=ANN.curie("model"),
model_uri=ANN.textAnnotationConfiguration__model,
domain=None,
range=Optional[str],
)

slots.textAnnotationResultSet__annotations = Slot(
uri=ANN.annotations,
name="textAnnotationResultSet__annotations",
Expand Down
120 changes: 1 addition & 119 deletions src/oaklib/datamodels/text_annotator.schema.json
Original file line number Diff line number Diff line change
@@ -1,119 +1 @@
{
"$defs": {
"TextAnnotation": {
"additionalProperties": false,
"description": "An individual text annotation",
"properties": {
"confidence": {
"type": "number"
},
"info": {
"type": "string"
},
"is_longest_match": {
"type": "boolean"
},
"match_string": {
"type": "string"
},
"match_type": {
"type": "string"
},
"object_id": {
"type": "string"
},
"object_label": {
"type": "string"
},
"object_source": {
"type": "string"
},
"predicate_id": {
"type": "string"
},
"subject_end": {
"type": "integer"
},
"subject_label": {
"description": "The portion of the subject text that is matched, ranging from subject_start to subject_end",
"type": "string"
},
"subject_source": {
"type": "string"
},
"subject_start": {
"type": "integer"
},
"subject_text_id": {
"type": "string"
}
},
"required": [],
"title": "TextAnnotation",
"type": "object"
},
"TextAnnotationResultSet": {
"additionalProperties": false,
"description": "A collection of annotation results",
"properties": {
"annotations": {
"description": "all annotations",
"items": {
"$ref": "#/$defs/TextAnnotation"
},
"type": "array"
}
},
"required": [],
"title": "TextAnnotationResultSet",
"type": "object"
},
"TextualElement": {
"additionalProperties": false,
"description": "",
"properties": {
"id": {
"type": "string"
},
"parent_document": {
"type": "string"
},
"source_text": {
"type": "string"
},
"text": {
"type": "string"
}
},
"required": [
"id"
],
"title": "TextualElement",
"type": "object"
},
"TransformationType": {
"description": "A controlled datamodels of the types of transformation that can be applied to",
"enum": [
"Stemming",
"Lemmatization",
"WordOrderNormalization",
"Depluralization",
"CaseNormalization",
"WhitespaceNormalization",
"TermExpanson"
],
"title": "TransformationType",
"type": "string"
}
},
"$id": "https://w3id.org/linkml/text_annotator",
"$schema": "http://json-schema.org/draft-07/schema#",
"additionalProperties": true,
"metamodel_version": "1.7.0",
"properties": {},
"required": [],
"title": "text-annotator",
"type": "object",
"version": null
}

{"$defs": {"TextAnnotation": {"additionalProperties": false, "description": "An individual text annotation", "properties": {"confidence": {"type": "number"}, "info": {"type": "string"}, "is_longest_match": {"type": "boolean"}, "match_string": {"type": "string"}, "match_type": {"type": "string"}, "matches_whole_text": {"type": "boolean"}, "object_id": {"type": "string"}, "object_label": {"type": "string"}, "object_source": {"type": "string"}, "predicate_id": {"type": "string"}, "subject_end": {"type": "integer"}, "subject_label": {"description": "The portion of the subject text that is matched, ranging from subject_start to subject_end", "type": "string"}, "subject_source": {"type": "string"}, "subject_start": {"type": "integer"}, "subject_text_id": {"type": "string"}}, "title": "TextAnnotation", "type": "object"}, "TextAnnotationConfiguration": {"additionalProperties": false, "description": "configuration for search", "properties": {"limit": {"type": "integer"}, "matches_whole_text": {"type": "boolean"}, "model": {"type": "string"}, "sources": {"items": {"type": "string"}, "type": "array"}, "token_exclusion_list": {"items": {"type": "string"}, "type": "array"}}, "title": "TextAnnotationConfiguration", "type": "object"}, "TextAnnotationResultSet": {"additionalProperties": false, "description": "A collection of annotation results", "properties": {"annotations": {"description": "all annotations", "items": {"$ref": "#/$defs/TextAnnotation"}, "type": "array"}}, "title": "TextAnnotationResultSet", "type": "object"}, "TextualElement": {"additionalProperties": false, "description": "", "properties": {"id": {"type": "string"}, "parent_document": {"type": "string"}, "source_text": {"type": "string"}, "text": {"type": "string"}}, "required": ["id"], "title": "TextualElement", "type": "object"}, "TransformationType": {"description": "A controlled datamodels of the types of transformation that can be applied to", "enum": ["Stemming", "Lemmatization", "WordOrderNormalization", 
"Depluralization", "CaseNormalization", "WhitespaceNormalization", "TermExpanson"], "title": "TransformationType", "type": "string"}}, "$id": "https://w3id.org/linkml/text_annotator", "$schema": "http://json-schema.org/draft-07/schema#", "additionalProperties": true, "metamodel_version": "1.7.0", "title": "text-annotator", "type": "object", "version": null}
2 changes: 2 additions & 0 deletions src/oaklib/datamodels/text_annotator.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,8 @@ classes:
range: integer
token_exclusion_list:
multivalued: true
model:
range: string

TextAnnotationResultSet:
description: A collection of annotation results
Expand Down
1 change: 0 additions & 1 deletion src/oaklib/interfaces/text_annotator_interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,6 @@ def annotate_file(
"""Annotate text in a file.
:param text_file: Text file that is iterated line-by-line.
:param token_exclusion_list: List of tokens to exclude.
:param configuration: Text annotation configuration, defaults to None.
:yield: Annotation of each line.
"""
Expand Down

0 comments on commit 5d39eb8

Please sign in to comment.