Using spaCy to identify UMLS terms.

Spacy steps:
  1. Load: en_core_sci_lg (note: the pipeline cell below currently loads en_core_sci_sm; confirm which model is intended)
  2. Add to pipe: scispacy_linker
  3. Add to pipe: entity_ruler
     - Built with: filtered_umls_atoms

Loading the scispaCy EntityLinker with the UMLS knowledge base

In [1]:
import os
from scispacy.linking import EntityLinker
import spacy
from dotenv import load_dotenv
import pandas as pd
from tqdm.auto import tqdm
import pandas as pd


# Absolute path of the `_downloads` directory that sits next to the current
# working directory. The result depends on where the kernel was started; it is
# used further down to locate the UMLS atoms CSV.
dir_downloads = os.path.abspath(f'{os.getcwd()}/../_downloads')

# Load variables from a .env file (python-dotenv). No visible cell reads an
# environment variable directly.
load_dotenv()
# Build a linker with default arguments. The instance is not bound to a name;
# as the last expression of the cell only its repr is displayed.
# NOTE(review): the pipeline cell creates its own "scispacy_linker" through
# nlp.add_pipe, so this instance looks unused - confirm whether it is only
# meant to pre-fetch the linker files.
EntityLinker()

  from .autonotebook import tqdm as notebook_tqdm
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


<scispacy.linking.EntityLinker at 0x103b4ef20>

In [17]:
# Base scispaCy model; every later cell works on this `nlp` object.
nlp = spacy.load("en_core_sci_sm")

# Register the UMLS linker component. The config values are handed to scispaCy
# as-is.
# NOTE(review): "resolve_abbreviations" is enabled but no abbreviation detector
# is added anywhere in this notebook - confirm the flag has the intended effect.
nlp.add_pipe(
    "scispacy_linker",
    config={
        "resolve_abbreviations": True,
        "linker_name": "umls",
        "threshold": 0.85,
        "filter_for_definitions": False,
        # "disabling": ["tagger", "parser", "attribute_ruler", "lemmatizer"]
    },
)

# Place the (still empty) entity ruler ahead of "tok2vec"; its patterns are
# supplied later by create_umls_entity_ruler.
ruler = nlp.add_pipe("entity_ruler", before="tok2vec")

In [18]:
def create_umls_entity_ruler(ruler, df_cleaned_umls_terms):
    """Register one pattern per UMLS atom on an entity ruler.

    Every row of ``df_cleaned_umls_terms`` becomes a pattern dict whose
    ``pattern`` is the row's ``STR`` value and whose ``label`` and ``id`` are
    both the row's ``CUI`` value. All patterns are passed to
    ``ruler.add_patterns`` in a single call, in row order.

    Args:
        ruler: Object exposing ``add_patterns(list_of_dicts)``; the notebook
            passes the component returned by ``nlp.add_pipe("entity_ruler")``.
        df_cleaned_umls_terms: DataFrame with at least the columns ``CUI``
            and ``STR``.

    Returns:
        None. The ruler is modified in place.

    Raises:
        KeyError: If the ``CUI`` or ``STR`` column is missing.
    """
    print("Creating UMLS entity ruler...")
    # Walk only the two needed columns in lockstep. iterrows() builds a Series
    # for every row, which is needlessly slow on a table with over a million
    # atoms.
    cui_and_term = zip(df_cleaned_umls_terms["CUI"], df_cleaned_umls_terms["STR"])
    patterns = [
        {"label": cui, "pattern": term, "id": cui}
        for cui, term in tqdm(cui_and_term, total=df_cleaned_umls_terms.shape[0])
    ]
    ruler.add_patterns(patterns)


In [12]:
def process_product_ingredient_groups(ingredients_df, nlp):
    """Print the first UMLS link of every linked entity in each ingredient row.

    For each row, the ``display`` value is converted to ``str`` and run
    through ``nlp``. Each entity in ``doc.ents`` whose ``_.kb_ents`` is
    non-empty produces one printed dict with the keys ``code``, ``term_type``,
    ``umls_cui`` and ``umls_term``.

    ``term_type`` is "PARENT" for the first entity of the document and "CHILD"
    for all later ones. Position is counted over all entities, linked or not,
    so a row whose first entity has no link prints only "CHILD" records.

    Args:
        ingredients_df: DataFrame with at least the columns ``display`` and
            ``code``.
        nlp: Callable that turns a string into a document exposing ``ents``;
            each entity must provide ``text`` and ``_.kb_ents``.

    Returns:
        None. Results are only printed.
    """
    print("Processing docs...")
    row_count = ingredients_df.shape[0]
    for _, row in tqdm(ingredients_df.iterrows(), total=row_count):
        doc = nlp(str(row['display']))
        code = row['code']
        for position, span in enumerate(doc.ents):
            candidates = span._.kb_ents
            if not candidates:
                # Unlinked entity: nothing to report, but it still counts
                # towards the PARENT/CHILD position.
                continue
            # First element of the first candidate is used as the CUI
            # (presumably the best-scoring link - confirm against scispaCy).
            print({
                'code': code,
                'term_type': "PARENT" if position == 0 else "CHILD",
                'umls_cui': candidates[0][0],
                'umls_term': span.text.upper(),
            })
           

In [19]:
# GPU execution is left disabled (note: `torch` is not among the imports in the
# first cell, so the second line would need one before being re-enabled):
# spacy.require_gpu()
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the filtered UMLS atom table from the downloads directory.
df_cleaned_umls_terms = pd.read_csv(
    f"{dir_downloads}/umls-data/filtered_umls_atoms.csv"
)

# Turn every atom into an entity-ruler pattern.
create_umls_entity_ruler(ruler, df_cleaned_umls_terms)

Creating UMLS entity ruler...


100%|██████████| 1329481/1329481 [00:18<00:00, 70761.17it/s]


In [13]:
# Read the ingredient list (relative to the working directory) and print the
# UMLS links found for each row.
# NOTE(review): the saved output of this cell carries execution count 13, which
# is lower than the counts of the pipeline cell (17) and the ruler-loading cell
# (19), so it reflects an earlier kernel state - re-run top to bottom to refresh.
ingredients_df = pd.read_csv('ingredients.csv')
process_product_ingredient_groups(ingredients_df, nlp)


Processing docs...


 12%|█▎        | 2/16 [00:00<00:00, 15.89it/s]

{'code': 13436424, 'term_type': 'PARENT', 'umls_cui': 'C0078479', 'umls_term': 'WHEY PROTEIN CONCENTRATE'}
{'code': 95603427, 'term_type': 'PARENT', 'umls_cui': 'C0071545', 'umls_term': 'POLYDEXTROSE'}
{'code': 23796462, 'term_type': 'PARENT', 'umls_cui': 'C2247329', 'umls_term': 'WAXY'}


 25%|██▌       | 4/16 [00:00<00:00, 17.89it/s]

{'code': 23604808, 'term_type': 'PARENT', 'umls_cui': 'C0439092', 'umls_term': 'LESS'}
{'code': 62290169, 'term_type': 'PARENT', 'umls_cui': 'C0003968', 'umls_term': 'ASCORBIC ACID'}


 44%|████▍     | 7/16 [00:00<00:00, 21.20it/s]

{'code': 79334008, 'term_type': 'PARENT', 'umls_cui': 'C0994472', 'umls_term': 'DL-ALPHA-TOCOPHERYL'}
{'code': 32383276, 'term_type': 'PARENT', 'umls_cui': 'C0036140', 'umls_term': 'SALT'}
{'code': 22670585, 'term_type': 'PARENT', 'umls_cui': 'C0010028', 'umls_term': 'CORN'}


 62%|██████▎   | 10/16 [00:00<00:00, 22.90it/s]

{'code': 46300735, 'term_type': 'PARENT', 'umls_cui': 'C0077046', 'umls_term': 'SUCRALOSE'}
{'code': 57140259, 'term_type': 'PARENT', 'umls_cui': 'C0772228', 'umls_term': 'ACESULFAME'}
{'code': 45237955, 'term_type': 'PARENT', 'umls_cui': 'C0039840', 'umls_term': 'THIAMINE'}


 81%|████████▏ | 13/16 [00:00<00:00, 24.25it/s]

{'code': 47457067, 'term_type': 'PARENT', 'umls_cui': 'C0035527', 'umls_term': 'RIBOFLAVIN'}
{'code': 25900849, 'term_type': 'PARENT', 'umls_cui': 'C0028027', 'umls_term': 'NIACINAMIDE'}
{'code': 10682853, 'term_type': 'PARENT', 'umls_cui': 'C0700496', 'umls_term': 'PYRIDOXINE HYDRO-CHLORIDE'}


100%|██████████| 16/16 [00:00<00:00, 23.47it/s]

{'code': 96524214, 'term_type': 'PARENT', 'umls_cui': 'C0016410', 'umls_term': 'FOLIC ACID'}
{'code': 36152277, 'term_type': 'PARENT', 'umls_cui': 'C0042845', 'umls_term': 'CYANOCOBALAMIN'}





In [20]:
# Re-read the ingredient list and print the UMLS links again, this time after
# the entity ruler has been populated.
# NOTE(review): this cell repeats an earlier cell verbatim; on a fresh
# top-to-bottom run both would print the same thing - consider keeping one.
ingredients_df = pd.read_csv('ingredients.csv')
process_product_ingredient_groups(ingredients_df, nlp)

Processing docs...


  6%|▋         | 1/16 [00:00<00:01,  8.37it/s]

{'code': 13436424, 'term_type': 'PARENT', 'umls_cui': 'C0078479', 'umls_term': 'WHEY PROTEIN'}
{'code': 13436424, 'term_type': 'CHILD', 'umls_cui': 'C0872912', 'umls_term': 'SOY LECITHIN'}
{'code': 95603427, 'term_type': 'PARENT', 'umls_cui': 'C0071545', 'umls_term': 'POLYDEXTROSE'}
{'code': 95603427, 'term_type': 'CHILD', 'umls_cui': 'C0007332', 'umls_term': 'CASEIN'}


 19%|█▉        | 3/16 [00:00<00:00, 13.70it/s]

{'code': 23796462, 'term_type': 'PARENT', 'umls_cui': 'C2247329', 'umls_term': 'WAXY'}
{'code': 23604808, 'term_type': 'CHILD', 'umls_cui': 'C2004457', 'umls_term': 'ARTIFICIAL'}
{'code': 23604808, 'term_type': 'CHILD', 'umls_cui': 'C0596585', 'umls_term': 'FLAVOR'}


 38%|███▊      | 6/16 [00:00<00:00, 18.18it/s]

{'code': 79334008, 'term_type': 'PARENT', 'umls_cui': 'C0000975', 'umls_term': 'ACETATE'}
{'code': 32383276, 'term_type': 'PARENT', 'umls_cui': 'C0036140', 'umls_term': 'SALT'}


 56%|█████▋    | 9/16 [00:00<00:00, 20.17it/s]

{'code': 22670585, 'term_type': 'PARENT', 'umls_cui': 'C0010028', 'umls_term': 'CORN'}
{'code': 46300735, 'term_type': 'PARENT', 'umls_cui': 'C0077046', 'umls_term': 'SUCRALOSE'}
{'code': 57140259, 'term_type': 'PARENT', 'umls_cui': 'C0772228', 'umls_term': 'ACESULFAME'}
{'code': 57140259, 'term_type': 'CHILD', 'umls_cui': 'C0032821', 'umls_term': 'POTASSIUM'}


 75%|███████▌  | 12/16 [00:00<00:00, 21.84it/s]

{'code': 45237955, 'term_type': 'PARENT', 'umls_cui': 'C0770309', 'umls_term': 'THIAMINE HYDROCHLORIDE'}
{'code': 47457067, 'term_type': 'PARENT', 'umls_cui': 'C0035527', 'umls_term': 'RIBOFLAVIN'}
{'code': 25900849, 'term_type': 'PARENT', 'umls_cui': 'C0028027', 'umls_term': 'NIACINAMIDE'}


100%|██████████| 16/16 [00:00<00:00, 20.69it/s]

{'code': 10682853, 'term_type': 'PARENT', 'umls_cui': 'C0034272', 'umls_term': 'PYRIDOXINE'}
{'code': 96524214, 'term_type': 'PARENT', 'umls_cui': 'C0016410', 'umls_term': 'FOLIC ACID'}
{'code': 36152277, 'term_type': 'PARENT', 'umls_cui': 'C0042845', 'umls_term': 'CYANOCOBALAMIN'}



