# MEDSTRUCT Context Extraction Notebook #

### Extracting clinical targets and context from free text (use case: pulmonary embolism) ###

Run the next block to import medstructopen

Start the spaCy and pyContextNLP services with Docker Compose before running the cells below:

    docker-compose -f docker-compose-context.yml up

In [None]:
import sys
# Add the parent directory to the module search path so that the project
# packages imported further down (e.g. nlp_datau) can be resolved.
sys.path.append("./../")

Upload or move the Excel file containing your medical reports to the `resources` folder of this project and set its path here:

In [None]:
# Path of the Excel workbook holding the reports, relative to the notebook's working directory.
source_xlsx = "./../resources/data_example/longembolie-test.xlsx"

Read source excel file

In [None]:
import pandas
# header=0: the first row of the sheet provides the column names.
df = pandas.read_excel(source_xlsx, header=0)
# Work on a copy so the frame as loaded from disk (df) stays untouched.
result_df = df.copy()

Set column of report and language:

In [None]:
# Name of the column in the source sheet that holds the free-text report.
column_text = "report"
# Language code passed to both annotation services; must be a key of the
# language maps configured in the annotation cell.
language = "nl"

from IPython.display import display, HTML
# Show the loaded reports before any annotation columns are added.
display(result_df)

Annotate context using spaCy and pyContextNLP nafflow microservices

In [None]:
# Endpoint of the spaCy service.
# NOTE(review): the host names look like docker-compose service names — confirm
# against docker-compose-context.yml.
spacy_http_url = 'http://spacy-json-nlp-medstruct-open:5001/token_list'
# spaCy model name per language code.
spacy_models = { 
    "en": "en_core_web_sm", 
    "nl": "nl_core_news_sm"}
# pyContextNLP endpoint per language code.
pycontextnlp_language_map = {
    "en": 'http://pycontext-json-nlp-datau-en:5003/json-nlp',
    "nl": 'http://pycontext-json-nlp-datau-nl:5003/json-nlp'
    }

from nlp_datau.client.pycontextnlp_jsonnlp_client import PyContextNlpClient
from nlp_datau.client.spacy_jsonnlp_client import SpacyClient
# Client objects used by get_context; both are configured from the maps above.
spacy_client = SpacyClient(spacy_http_url, spacy_models)
pycontextnlp_client = PyContextNlpClient(pycontextnlp_language_map)

def get_context(index, text):
    """Annotate a single report and return the pyContextNLP result.

    The text is first sent to the spaCy client; its output is then passed to
    the pyContextNLP client. Both calls use the notebook-level ``language``.

    Args:
        index: Report number; printed as progress and, converted to ``str``,
            used as the document identifier.
        text: Report text to annotate.

    Returns:
        Whatever ``pycontextnlp_client.annotate`` returns for this report.
    """
    print("report", index)
    return pycontextnlp_client.annotate(
        spacy_client.annotate(
            text=text,
            identifier=str(index),
            lang=language,
            document_date=None,
        ),
        language,
    )

def _context_for_row(row):
    """Return the context of the first document annotated for one report row."""
    # row.name is the row's index label; reports are numbered starting at 1.
    annotated = get_context(row.name + 1, row[column_text])
    return annotated['documents'][0]['context']


result_df['context'] = result_df.apply(_context_for_row, axis=1)

Group modifiers into present and not present categories

In [None]:
# Modifier categories that count as "target present".
# NOTE(review): this list is not referenced by any cell visible in this notebook.
modifiers_present = [ 'definite_existence', 'probable_existence', 'indication', 'ambivalent_existence']
# Modifier categories that count as "target not present"; passed to
# ContextParser.is_present further down.
modifiers_not_present = ['definite_negated_existence', 'probable_negated_existence', 'pseudoneg', 'historical', 'limited_amount']

Set target concepts to extract (as defined in pycontext_targets.yml)

In [None]:
# Target concept identifiers to extract; one 'target-<concept>' column is created per entry.
# NOTE(review): snomedct:59282003 is presumably pulmonary embolism — confirm against pycontext_targets.yml.
concepts = ['snomedct:59282003']

Extract targets

In [None]:
from nlp_datau.util.pycontextnlp_parser import ContextParser

# For every configured concept, reduce each report's context to the entries
# for that concept and store them in a 'target-<concept>' column.
for concept in concepts:
    target_column = 'target-' + concept
    result_df[target_column] = result_df.apply(
        lambda row: ContextParser.filter_concept(row['context'], concept),
        axis=1,
    )
display(result_df)

Flag whether each target is present, using the `modifiers_not_present` list

In [None]:
# Replace each 'target-<concept>' column with the presence flag returned by
# ContextParser.is_present for that concept.
#
# The filtered targets are recomputed from the 'context' column instead of
# being read back from 'target-<concept>'. This cell overwrites that column,
# so reading it would make a second run of the cell pass the flags themselves
# to is_present. Recomputing keeps the cell safe to re-run.
# NOTE(review): assumes ContextParser.filter_concept does not modify the
# context it is given, as the previous cell already relies on.
for concept in concepts:
    result_df['target-'+concept] = result_df.apply(
        lambda row: ContextParser.is_present(
            ContextParser.filter_concept(row['context'], concept),
            modifiers_not_present,
        ),
        axis=1,
    )
display(result_df)

Set the column(s) containing the reference labels used to evaluate the classification, one per concept and in the same order as `concepts` (optional)

In [None]:
# Label column per concept; column_labels[i] is compared with the predictions for concepts[i].
column_labels = ['label longembolie']

Evaluate context classification (optional)

In [None]:
# Show one confusion matrix per concept: reference labels on the rows,
# predicted presence on the columns, with totals in the margins.
for i, concept in enumerate(concepts):
    y_actu = result_df[column_labels[i]].rename('Actual')
    y_pred = result_df['target-' + concept].rename('Predicted')
    df_confusion = pandas.crosstab(
        index=y_actu,
        columns=y_pred,
        rownames=['Actual'],
        colnames=['Predicted'],
        margins=True,
    )
    display(df_confusion)

Write results to excel file

In [None]:
!{sys.executable} -m pip install openpyxl
results_xlsx = "./../resources/results/context_results.xlsx"
result_df.to_excel(results_xlsx)