Skip to content

Commit

Permalink
Merge pull request #247 from JohnSnowLabs/release/514
Browse files Browse the repository at this point in the history
Release/514
  • Loading branch information
C-K-Loan committed Feb 8, 2024
2 parents 6523f6c + 2be6aae commit 6f6691a
Show file tree
Hide file tree
Showing 14 changed files with 99 additions and 13 deletions.
3 changes: 1 addition & 2 deletions nlu/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
__version__ = '5.1.3'

__version__ = '5.1.4'


import nlu.utils.environment.env_utils as env_utils
Expand Down
Empty file.
18 changes: 18 additions & 0 deletions nlu/components/classifiers/span_medical/span_medical.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
class SpanMedical:
    """Factory helpers that build a configured ``MedicalQuestionAnswering`` annotator.

    Both helpers wire the annotator to read from the ``document_question`` and
    ``context`` columns and to write its result to the ``answer`` column.
    """

    @staticmethod
    def get_default_model():
        """Return the annotator obtained from ``pretrained()`` called without arguments."""
        # Imported lazily so the licensed library is only needed when this is called.
        from sparknlp_jsl.annotator import MedicalQuestionAnswering

        model = MedicalQuestionAnswering.pretrained()
        model = model.setInputCols(["document_question", "context"])
        return model.setOutputCol("answer")

    @staticmethod
    def get_pretrained_model(name, language, bucket=None):
        """Return the annotator obtained from ``pretrained(name, language, bucket)``.

        :param name: model name forwarded to ``pretrained``
        :param language: language forwarded to ``pretrained``
        :param bucket: bucket forwarded to ``pretrained`` (``None`` by default)
        """
        # Imported lazily so the licensed library is only needed when this is called.
        from sparknlp_jsl.annotator import MedicalQuestionAnswering

        model = MedicalQuestionAnswering.pretrained(name, language, bucket)
        model = model.setInputCols(["document_question", "context"])
        return model.setOutputCol("answer")
25 changes: 25 additions & 0 deletions nlu/pipe/col_substitution/col_substitution_HC.py
Original file line number Diff line number Diff line change
Expand Up @@ -395,3 +395,28 @@ def substitute_generic_classifier_parser_cols(c, cols, is_unique=True, nlu_ident
logger.info(f'Dropping unmatched metadata_col={col} for c={c}')
# new_cols[col]= f"{new_base_name}_confidence"
return new_cols
def substitute_hc_span_classifier_cols(c, cols, nlu_identifier=True):
    """Build the column renaming map for the healthcare QA span classifier.

    Every column whose name matches one of the known output patterns is mapped
    to a name derived from the fixed prefix ``answer``. Columns that match no
    pattern are left out of the returned mapping.

    :param c: component the columns belong to (not used by this function)
    :param cols: iterable of column names to inspect
    :param nlu_identifier: currently not used; the base name is always ``answer``
    :return: dict mapping original column name -> substituted column name
    """
    base = 'answer'

    def pick_suffix(name):
        # Ordered rules: the first one that applies decides the suffix.
        if 'answer_results_score' in name:
            return '_confidence'
        if 'span_start_score' in name:
            return '_start_confidence'
        if 'span_end_score' in name:
            return '_end_confidence'
        is_score = 'score' in name
        if 'start' in name and not is_score:
            return '_start'
        if 'end' in name and not is_score:
            return '_end'
        if 'sentence' in name:
            return '_sentence'
        return None

    renamed = {}
    for col in cols:
        # The plain result column maps to the base name itself; a more
        # specific rule below may still override that entry.
        if 'answer_results' in col:
            renamed[col] = base
        suffix = pick_suffix(col)
        if suffix is not None:
            renamed[col] = base + suffix
    return renamed
2 changes: 2 additions & 0 deletions nlu/pipe/nlu_component.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ def __init__(self,
requires_image_format: bool = False, # Set to true for OCR annotators that require image format
is_visual_annotator: bool = False, # Set to true for OCR annotators that require image format
is_light_pipe_incompatible: bool = False, # Set to true for OCR annotators that require image format
prefer_light_pipe: bool = False, # Set True for annos that should run in light pipe
):
self.name = name
self.type = type
Expand Down Expand Up @@ -118,6 +119,7 @@ def __init__(self,
self.requires_image_format = requires_image_format
self.is_visual_annotator = is_visual_annotator
self.is_light_pipe_incompatible = is_light_pipe_incompatible
self.prefer_light_pipe = prefer_light_pipe

def set_metadata(self, jsl_anno_object: Union[AnnotatorApproach, AnnotatorModel],
nlu_ref: str,
Expand Down
14 changes: 11 additions & 3 deletions nlu/pipe/utils/data_conversion_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,18 +13,25 @@
from pyspark.sql.types import StringType, StructType, StructField


class NluDataParseException(Exception):
    """Raised when input data cannot be parsed by NLU.

    The text passed in is kept on the ``message`` attribute and is also
    forwarded to ``Exception`` so it shows up in ``str(err)``.
    """

    def __init__(self, message="An error occurred parsing data with NLU"):
        super().__init__(message)
        self.message = message

class DataConversionUtils:
# Modin as well, but it is optional, so we don't import the type yet
supported_types = [pyspark.sql.DataFrame, pd.DataFrame, pd.Series, np.ndarray]

@staticmethod
def except_text_col_not_found(cols):
raise ValueError(
raise NluDataParseException(
f'Could not find column named "text" in input Pandas Dataframe. Please ensure one column named such exists. Columns in DF are : {cols} ')

@staticmethod
def except_invalid_question_data_format(cols):
raise ValueError(
raise NluDataParseException(
f'You input data format is invalid for question answering with span classification.'
f'Make sure you have at least 2 columns in you dataset, named context/question for pandas Dataframes'
f'For Strings/Iterables/Tuples make sure to use the format `question|||context` or (question,context) ')
Expand Down Expand Up @@ -301,7 +308,6 @@ def to_spark_df(data, spark_sess, raw_text_column='text', is_span_data=False, is
# TODO invalid Table Data Format Exception
pass
if isinstance(data[0], str):

return DataConversionUtils.table_question_str_to_sdf(data, spark_sess)
if isinstance(data[0], pd.DataFrame):
return DataConversionUtils.table_question_pdf_to_sdf(data, spark_sess)
Expand All @@ -321,6 +327,8 @@ def to_spark_df(data, spark_sess, raw_text_column='text', is_span_data=False, is
return DataConversionUtils.question_tuple_iterable_to_sdf(data, spark_sess)
elif isinstance(data[0], str):
return DataConversionUtils.question_str_iterable_to_sdf(data, spark_sess)
except NluDataParseException as err :
raise err
except:
ValueError("Data could not be converted to Spark Dataframe for internal conversion.")
else:
Expand Down
3 changes: 3 additions & 0 deletions nlu/pipe/utils/pipe_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -668,6 +668,9 @@ def add_metadata_to_pipe(pipe: NLUPipeline):

for c in pipe.components:
# Check for OCR components
if c.prefer_light_pipe:
pipe.prefer_light = True

if c.jsl_anno_py_class in py_class_to_anno_id.keys() or c.is_visual_annotator:
pipe.contains_ocr_components = True
if c.requires_image_format:
Expand Down
8 changes: 4 additions & 4 deletions nlu/pipe/utils/predict_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from sparknlp.common import AnnotatorType

from nlu.pipe.utils.audio_data_conversion_utils import AudioDataConversionUtils
from nlu.pipe.utils.data_conversion_utils import DataConversionUtils
from nlu.pipe.utils.data_conversion_utils import DataConversionUtils, NluDataParseException
from nlu.pipe.utils.ocr_data_conversion_utils import OcrDataConversionUtils

logger = logging.getLogger('nlu')
Expand Down Expand Up @@ -211,8 +211,6 @@ def __db_endpoint_predict__(pipe, data):
1) parse pred params from first row maybe
2) serialize/deserialize img
"""
print("CUSOTM NLU MODE!")
print(data.columns)
params = PredictParams.maybe_from_pandas_df(data)
if params:
params = params.dict()
Expand Down Expand Up @@ -366,12 +364,14 @@ def __predict__(pipe, data, output_level, positions, keep_stranger_features, met
try:
return __predict_standard_spark(pipe, data, output_level, positions, keep_stranger_features, metadata,
drop_irrelevant_cols, return_spark_df, get_embeddings)
except NluDataParseException as err:
logger.warning(f"Predictions Failed={err}")
raise err
except Exception as err:
logger.warning(f"Predictions Failed={err}")
pipe.print_exception_err(err)
raise Exception("Failure to process data with NLU")


def debug_print_pipe_cols(pipe):
    """Print one ``inputs->name->outputs`` line for every component of ``pipe``.

    :param pipe: object exposing an iterable ``components`` attribute whose
        items provide ``spark_input_column_names``, ``name`` and
        ``spark_output_column_names``
    """
    for component in pipe.components:
        inputs = component.spark_input_column_names
        label = component.name
        outputs = component.spark_output_column_names
        print(f'{inputs}->{label}->{outputs}')
7 changes: 6 additions & 1 deletion nlu/spellbook.py
Original file line number Diff line number Diff line change
Expand Up @@ -10598,7 +10598,7 @@ class Spellbook:
'de.deid.pipeline': 'german_deid_pipeline_spark24',
'de.med_ner.deid_generic.pipeline': 'ner_deid_generic_pipeline'},
'en': {

'en.answer_question.clinical_notes_onnx.pipeline': 'clinical_notes_qa_base_onnx_pipeline',
'en.classify.bert_sequence.binary_rct_biobert.pipeline': 'bert_sequence_classifier_binary_rct_biobert_pipeline',
'en.classify.bert_sequence.vop_hcp_consult.pipeline': 'bert_sequence_classifier_vop_hcp_consult_pipeline',
'en.classify.bert_sequence.vop_drug_side_effect.pipeline': 'bert_sequence_classifier_vop_drug_side_effect_pipeline',
Expand Down Expand Up @@ -10634,6 +10634,7 @@ class Spellbook:
'en.explain_doc.clinical_ade': 'explain_clinical_doc_ade',
'en.explain_doc.clinical_radiology.pipeline': 'explain_clinical_doc_radiology',
'en.explain_doc.era': 'explain_clinical_doc_era',
'en.explain_doc.clinical_granular': 'explain_clinical_doc_granular',
'en.icd10_icd9.mapping': 'icd10_icd9_mapping',
'en.icd10cm.umls.mapping': 'icd10cm_umls_mapping',
'en.icd10cm_resolver.pipeline': 'icd10cm_resolver_pipeline',
Expand Down Expand Up @@ -10761,10 +10762,12 @@ class Spellbook:
'en.resolve.medication': 'medication_resolver_pipeline',
'en.resolve.medication_transform.pipeline': 'medication_resolver_transform_pipeline',
'en.rxnorm.umls.mapping': 'rxnorm_umls_mapping',
'en.rxnorm.mes.mapping': 'rxnorm_mesh_mapping',
'en.snomed.umls.mapping': 'snomed_umls_mapping',
'en.spell.clinical.pipeline': 'spellcheck_clinical_pipeline',
'en.summarize.biomedical_pubmed.pipeline':'summarizer_biomedical_pubmed_pipeline',
'en.summarize.clinical_guidelines_large.pipeline': 'summarizer_clinical_guidelines_large_pipeline',
'en.summarize.clinical_laymen_onnx.pipeline': 'summarizer_clinical_laymen_onnx_pipeline',
'en.summarize.clinical_jsl_augmented.pipeline': 'summarizer_clinical_jsl_augmented_pipeline',
'en.summarize.clinical_questions.pipeline': 'summarizer_clinical_questions_pipeline',
'en.summarize.generic_jsl.pipeline': 'summarizer_generic_jsl_pipeline',
Expand Down Expand Up @@ -11105,6 +11108,7 @@ class Spellbook:
'en.med_ner.tumour': 'nerdl_tumour_demo',
'en.med_ner.vop': 'ner_vop',
'en.med_ner.vop_emb_clinical_large': 'ner_vop_emb_clinical_large',
'en.med_ner.vop_langtest': 'ner_vop_langtest',
'en.mesh_to_umls': 'mesh_umls_mapper',
'en.ner.clinical_trials_abstracts': 'ner_clinical_trials_abstracts',
'en.ner.drug_development_trials': 'bert_token_classifier_drug_development_trials',
Expand Down Expand Up @@ -16830,6 +16834,7 @@ class Spellbook:
'nerdl_tumour_demo': 'MedicalNerModel',
'ner_vop': 'MedicalNerModel',
'ner_vop_emb_clinical_large': 'MedicalNerModel',
'ner_vop_langtest': 'MedicalNerModel',
'ngram': 'NGramGenerator',
'nl': 'RoBertaEmbeddings',
'nli_mpnet_base_v2': 'MPNetEmbeddings',
Expand Down
3 changes: 2 additions & 1 deletion nlu/universe/annotator_class_universe.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ class AnnoClassRef:
JSL_anno2_py_class: Dict[JslAnnoId, JslAnnoPyClass] = {

A_N.E5_SENTENCE_EMBEDDINGS: 'E5Embeddings',
A_N.INSTRUCTOR_SENTENCE_EMBEDDINGS:'InstructorEmbeddings',
A_N.INSTRUCTOR_SENTENCE_EMBEDDINGS: 'InstructorEmbeddings',

A_N.WHISPER_FOR_CTC: 'WhisperForCTC',
A_N.HUBERT_FOR_CTC: 'HubertForCTC',
Expand Down Expand Up @@ -240,6 +240,7 @@ class AnnoClassRef:

}
JSL_anno_HC_ref_2_py_class: Dict[JslAnnoId, JslAnnoPyClass] = {
HC_A_N.MEDICAL_QUESTION_ANSWERING: 'MedicalQuestionAnswering',
HC_A_N.MEDICAL_TEXT_GENERATOR: 'MedicalTextGenerator',
HC_A_N.MEDICAL_SUMMARIZER:'MedicalSummarizer',
HC_A_N.ZERO_SHOT_NER: 'ZeroShotNerModel',
Expand Down
23 changes: 23 additions & 0 deletions nlu/universe/component_universes.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
from nlu.components.classifiers.span_longformer.span_longformer import SpanLongFormerClassifier
from nlu.components.classifiers.span_roberta.span_roberta import SpanRobertaClassifier
from nlu.components.classifiers.span_xlm_roberta.span_xlm_roberta import SpanXlmRobertaClassifier
from nlu.components.classifiers.span_medical.span_medical import SpanMedical
from nlu.components.classifiers.token_albert.token_albert import TokenAlbert
from nlu.components.classifiers.token_bert.token_bert import TokenBert
from nlu.components.classifiers.token_bert_healthcare.token_bert_healthcare import TokenBertHealthcare
Expand Down Expand Up @@ -3278,6 +3279,27 @@ class ComponentUniverse:
computation_context=ComputeContexts.spark,
output_context=ComputeContexts.spark,
),
H_A.MEDICAL_QUESTION_ANSWERING: partial(NluComponent,
name=H_A.MEDICAL_QUESTION_ANSWERING,
jsl_anno_class_id= H_A.MEDICAL_QUESTION_ANSWERING,
jsl_anno_py_class= ACR.JSL_anno_HC_ref_2_py_class[
H_A.MEDICAL_QUESTION_ANSWERING],
node= NLP_HC_FEATURE_NODES.nodes[
H_A.MEDICAL_QUESTION_ANSWERING],
get_default_model= SpanMedical.get_default_model,
get_pretrained_model= SpanMedical.get_pretrained_model,
type= T.QUESTION_SPAN_CLASSIFIER,
pdf_extractor_methods={
'default': default_span_classifier_config,
'default_full': default_full_span_classifier_config, },
pdf_col_name_substitutor=substitute_hc_span_classifier_cols,
output_level=L.INPUT_DEPENDENT_DOCUMENT_CLASSIFIER,
description='TODO',
provider=ComponentBackends.hc,
license=Licenses.hc,
computation_context=ComputeContexts.spark,
output_context=ComputeContexts.spark,
),

A.MULTI_DOCUMENT_ASSEMBLER: partial(NluComponent,
name=A.MULTI_DOCUMENT_ASSEMBLER,
Expand Down Expand Up @@ -3880,6 +3902,7 @@ class ComponentUniverse:
),

H_A.CHUNK_MAPPER_MODEL: partial(NluComponent,
prefer_light_pipe=True,
name=H_A.CHUNK_MAPPER_MODEL,
type=T.CHUNK_MAPPER,
get_default_model=ChunkMapper.get_default_model,
Expand Down
1 change: 1 addition & 0 deletions nlu/universe/feature_node_ids.py
Original file line number Diff line number Diff line change
Expand Up @@ -303,6 +303,7 @@ class NLP_HC_NODE_IDS: # or Mode Node?
ENTITY_CHUNK_EMBEDDING = JslAnnoId('entity_chunk_embedding')
MEDICAL_SUMMARIZER = JslAnnoId('med_summarizer')
MEDICAL_TEXT_GENERATOR = JslAnnoId('med_text_generator')
MEDICAL_QUESTION_ANSWERING = JslAnnoId('med_question_answering')

class OCR_NODE_IDS:
"""All available Feature nodes in OCR
Expand Down
2 changes: 2 additions & 0 deletions nlu/universe/feature_node_universes.py
Original file line number Diff line number Diff line change
Expand Up @@ -392,6 +392,8 @@ class NLP_HC_FEATURE_NODES():
H_F = NLP_HC_FEATURES
# HC Feature Nodes
nodes = {
A.MEDICAL_QUESTION_ANSWERING: NlpFeatureNode(A.MEDICAL_QUESTION_ANSWERING, [F.DOCUMENT_QUESTION, F.DOCUMENT_QUESTION_CONTEXT], [F.CLASSIFIED_SPAN]),

A.MEDICAL_TEXT_GENERATOR: NlpFeatureNode(A.MEDICAL_TEXT_GENERATOR, [F.DOCUMENT], [F.DOCUMENT_GENERATED]),

A.MEDICAL_SUMMARIZER: NlpFeatureNode(A.MEDICAL_SUMMARIZER, [F.DOCUMENT], [F.DOCUMENT_GENERATED]),
Expand Down
3 changes: 1 addition & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,7 @@

name='nlu',

version='5.1.3',

version='5.1.4',

description='John Snow Labs NLU provides state of the art algorithms for NLP&NLU with 20000+ of pretrained models in 200+ languages. It enables swift and simple development and research with its powerful Pythonic and Keras inspired API. It is powerd by John Snow Labs powerful Spark NLP library.',

Expand Down

0 comments on commit 6f6691a

Please sign in to comment.