In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/medical-ner/Corona2.json


In [2]:
import spacy

In [3]:
# Load the pretrained `en_core_web_sm` pipeline and show the names of its
# components in pipeline order.
nlp=spacy.load("en_core_web_sm")
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [4]:
# Run the pipeline on a sample sentence and print every predicted entity
# as "text | label | label description".
doc = nlp("meta is going to acquire twitter for $45 billion")
for entity in doc.ents:
    tag = entity.label_
    print(f"{entity.text} | {tag} | {spacy.explain(tag)}")

meta | PERSON | People, including fictional
$45 billion | MONEY | Monetary values, including unit


In [5]:
from spacy import  displacy
# Visualize the entity spans of `doc` using displaCy's "ent" style.
displacy.render(doc,style="ent")

In [6]:
# List the labels registered for the pipeline's `ner` component.
nlp.pipe_labels['ner']

['CARDINAL',
 'DATE',
 'EVENT',
 'FAC',
 'GPE',
 'LANGUAGE',
 'LAW',
 'LOC',
 'MONEY',
 'NORP',
 'ORDINAL',
 'ORG',
 'PERCENT',
 'PERSON',
 'PRODUCT',
 'QUANTITY',
 'TIME',
 'WORK_OF_ART']

In [7]:
from spacy.tokens import Span
# Hand-label two single-token spans as ORG: token 0 ("meta") and token 5
# ("twitter"). default="unmodified" keeps the annotation of every token that
# is not covered by the new spans (the MONEY entity is still present afterwards).
org_spans = [Span(doc, token_index, token_index + 1, label="ORG") for token_index in (0, 5)]
doc.set_ents(org_spans, default="unmodified")

In [8]:
# Print the entities again, as "text | label | label description", to
# confirm the manual ORG spans took effect.
for entity in doc.ents:
    tag = entity.label_
    print(f"{entity.text} | {tag} | {spacy.explain(tag)}")

meta | ORG | Companies, agencies, institutions, etc.
twitter | ORG | Companies, agencies, institutions, etc.
$45 billion | MONEY | Monetary values, including unit


In [9]:
import json
# Read the annotated corpus into `data`.
# The file is opened in binary mode so that json.load detects the JSON text
# encoding (UTF-8/16/32) itself; a text-mode open() without an explicit
# encoding would decode with the platform's locale encoding instead.
with open('/kaggle/input/medical-ner/Corona2.json', 'rb') as f:
    data = json.load(f)

In [10]:
# Look at the first record under the top-level 'examples' key.
data['examples'][0]

{'id': '18c2f619-f102-452f-ab81-d26f7e283ffe',
 'content': "While bismuth compounds (Pepto-Bismol) decreased the number of bowel movements in those with travelers' diarrhea, they do not decrease the length of illness.[91] Anti-motility agents like loperamide are also effective at reducing the number of stools but not the duration of disease.[8] These agents should be used only if bloody diarrhea is not present.[92]\n\nDiosmectite, a natural aluminomagnesium silicate clay, is effective in alleviating symptoms of acute diarrhea in children,[93] and also has some effects in chronic functional diarrhea, radiation-induced diarrhea, and chemotherapy-induced diarrhea.[45] Another absorbent agent used for the treatment of mild diarrhea is kaopectate.\n\nRacecadotril an antisecretory medication may be used to treat diarrhea in children and adults.[86] It has better tolerability than loperamide, as it causes less constipation and flatulence.[94]",
 'metadata': {},
 'annotations': [{'id': '0825a1

In [11]:
# Show which fields an example record carries.
data['examples'][0].keys()

dict_keys(['id', 'content', 'metadata', 'annotations', 'classifications'])

In [12]:
# The raw text of the first example; the annotations' start/end values are
# later used as character offsets into this string.
data['examples'][0]['content']

"While bismuth compounds (Pepto-Bismol) decreased the number of bowel movements in those with travelers' diarrhea, they do not decrease the length of illness.[91] Anti-motility agents like loperamide are also effective at reducing the number of stools but not the duration of disease.[8] These agents should be used only if bloody diarrhea is not present.[92]\n\nDiosmectite, a natural aluminomagnesium silicate clay, is effective in alleviating symptoms of acute diarrhea in children,[93] and also has some effects in chronic functional diarrhea, radiation-induced diarrhea, and chemotherapy-induced diarrhea.[45] Another absorbent agent used for the treatment of mild diarrhea is kaopectate.\n\nRacecadotril an antisecretory medication may be used to treat diarrhea in children and adults.[86] It has better tolerability than loperamide, as it causes less constipation and flatulence.[94]"

In [13]:
# The annotation records of the first example; each one carries 'start',
# 'end', 'tag_name' and 'value' among other fields.
data['examples'][0]['annotations']

[{'id': '0825a1bf-6a6e-4fa2-be77-8d104701eaed',
  'tag_id': 'c06bd022-6ded-44a5-8d90-f17685bb85a1',
  'end': 371,
  'start': 360,
  'example_id': '18c2f619-f102-452f-ab81-d26f7e283ffe',
  'tag_name': 'Medicine',
  'value': 'Diosmectite',
  'correct': None,
  'human_annotations': [{'timestamp': '2020-03-21T00:24:32.098000Z',
    'annotator_id': 1,
    'tagged_token_id': '0825a1bf-6a6e-4fa2-be77-8d104701eaed',
    'name': 'Ashpat123',
    'reason': 'exploration'}],
  'model_annotations': []},
 {'id': '145f62b1-bbf5-42f1-8ad5-9c7e08337bf0',
  'tag_id': 'c06bd022-6ded-44a5-8d90-f17685bb85a1',
  'end': 408,
  'start': 383,
  'example_id': '18c2f619-f102-452f-ab81-d26f7e283ffe',
  'tag_name': 'Medicine',
  'value': 'aluminomagnesium silicate',
  'correct': None,
  'human_annotations': [{'timestamp': '2020-03-21T00:24:43.692000Z',
    'annotator_id': 1,
    'tagged_token_id': '145f62b1-bbf5-42f1-8ad5-9c7e08337bf0',
    'name': 'Ashpat123',
    'reason': 'exploration'}],
  'model_annotations':

In [14]:
# Reshape every example into {'text': ..., 'entities': [(start, end, LABEL), ...]}.
# The entity label is the annotation's tag name converted to upper case.
training_data = [
    {
        'text': record['content'],
        'entities': [
            (tag['start'], tag['end'], tag['tag_name'].upper())
            for tag in record['annotations']
        ],
    }
    for record in data['examples']
]
training_data[0]

{'text': "While bismuth compounds (Pepto-Bismol) decreased the number of bowel movements in those with travelers' diarrhea, they do not decrease the length of illness.[91] Anti-motility agents like loperamide are also effective at reducing the number of stools but not the duration of disease.[8] These agents should be used only if bloody diarrhea is not present.[92]\n\nDiosmectite, a natural aluminomagnesium silicate clay, is effective in alleviating symptoms of acute diarrhea in children,[93] and also has some effects in chronic functional diarrhea, radiation-induced diarrhea, and chemotherapy-induced diarrhea.[45] Another absorbent agent used for the treatment of mild diarrhea is kaopectate.\n\nRacecadotril an antisecretory medication may be used to treat diarrhea in children and adults.[86] It has better tolerability than loperamide, as it causes less constipation and flatulence.[94]",
 'entities': [(360, 371, 'MEDICINE'),
  (383, 408, 'MEDICINE'),
  (104, 112, 'MEDICALCONDITION'),


In [15]:
# Text of the first converted training example.
training_data[0]['text']

"While bismuth compounds (Pepto-Bismol) decreased the number of bowel movements in those with travelers' diarrhea, they do not decrease the length of illness.[91] Anti-motility agents like loperamide are also effective at reducing the number of stools but not the duration of disease.[8] These agents should be used only if bloody diarrhea is not present.[92]\n\nDiosmectite, a natural aluminomagnesium silicate clay, is effective in alleviating symptoms of acute diarrhea in children,[93] and also has some effects in chronic functional diarrhea, radiation-induced diarrhea, and chemotherapy-induced diarrhea.[45] Another absorbent agent used for the treatment of mild diarrhea is kaopectate.\n\nRacecadotril an antisecretory medication may be used to treat diarrhea in children and adults.[86] It has better tolerability than loperamide, as it causes less constipation and flatulence.[94]"

In [16]:
# (start, end, label) tuples of the first converted training example.
training_data[0]['entities']

[(360, 371, 'MEDICINE'),
 (383, 408, 'MEDICINE'),
 (104, 112, 'MEDICALCONDITION'),
 (679, 689, 'MEDICINE'),
 (6, 23, 'MEDICINE'),
 (25, 37, 'MEDICINE'),
 (461, 470, 'MEDICALCONDITION'),
 (577, 589, 'MEDICINE'),
 (853, 865, 'MEDICALCONDITION'),
 (188, 198, 'MEDICINE'),
 (754, 762, 'MEDICALCONDITION'),
 (870, 880, 'MEDICALCONDITION'),
 (823, 833, 'MEDICINE'),
 (852, 853, 'MEDICALCONDITION'),
 (461, 469, 'MEDICALCONDITION'),
 (535, 543, 'MEDICALCONDITION'),
 (692, 704, 'MEDICINE'),
 (563, 571, 'MEDICALCONDITION')]

In [17]:
from spacy.tokens import DocBin
from tqdm import tqdm
# Rebind `nlp` to a blank "en" pipeline (it no longer refers to the
# `en_core_web_sm` pipeline loaded earlier) and create the DocBin that will
# collect the annotated docs.
nlp=spacy.blank("en")
doc_bin=DocBin()

In [18]:
from spacy.util import filter_spans

# Convert `training_data` into spaCy Doc objects with entity annotations and
# serialize them to train.spacy.

# Create the container in this cell so that re-running the cell does not
# append the same documents a second time to a DocBin left over from before.
doc_bin = DocBin()

# (example index, start, end, label) of every annotation that produced no span.
skipped_spans = []

for example_index, training_example in enumerate(tqdm(training_data)):
    text = training_example['text']
    doc = nlp.make_doc(text)
    ents = []
    for start, end, label in training_example['entities']:
        # With alignment_mode="contract" the character offsets are snapped
        # inward to token boundaries; char_span returns None when no token
        # lies completely inside the range.
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            skipped_spans.append((example_index, start, end, label))
        else:
            ents.append(span)
    # doc.ents cannot hold overlapping spans, so drop overlaps first.
    doc.ents = filter_spans(ents)
    doc_bin.add(doc)
doc_bin.to_disk('train.spacy')

# Report the dropped annotations after the loop so the messages do not
# interleave with the progress bar, and say which annotation each one was.
for example_index, start, end, label in skipped_spans:
    snippet = training_data[example_index]['text'][start:end]
    print(f"skipping example {example_index}: {label} [{start}:{end}] {snippet!r}")

100%|██████████| 31/31 [00:00<00:00, 392.42it/s]

skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping





In [19]:
# Write config.cfg for an English pipeline whose only requested component is
# `ner`, optimized for efficiency.
!python -m spacy init config config.cfg --lang en --pipeline ner --optimize efficiency --force

[38;5;3m⚠ To generate a more effective transformer-based config (GPU-only),
install the spacy-transformers package and re-run this command. The config
generated now does not use transformers.[0m
[38;5;4mℹ Generated config template specific for your use case[0m
- Language: en
- Pipeline: ner
- Optimize for: efficiency
- Hardware: CPU
- Transformer: None
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [20]:
# Train with config.cfg and write the resulting pipelines under ./output.
# NOTE(review): --paths.dev points at the same train.spacy file as
# --paths.train, so the ENTS_F / ENTS_P / ENTS_R values printed during
# training are measured on the training data, not on held-out examples.
!python -m spacy train config.cfg --output ./output --paths.train ./train.spacy --paths.dev ./train.spacy


[38;5;2m✔ Created output directory: output[0m
[38;5;4mℹ Saving to output directory: output[0m
[38;5;4mℹ Using CPU[0m
[1m
[2025-01-23 18:28:13,245] [INFO] Set up nlp object from config
[2025-01-23 18:28:13,270] [INFO] Pipeline: ['tok2vec', 'ner']
[2025-01-23 18:28:13,274] [INFO] Created vocabulary
[2025-01-23 18:28:13,275] [INFO] Finished initializing nlp object
[2025-01-23 18:28:13,714] [INFO] Initialized pipeline components: ['tok2vec', 'ner']
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  d_xhat = N * dY - sum_dy - dist * var ** (-1.0) * sum_dy_dist
  0       0          0.00    153.29    0.00    0.00    0.00    0.00
  7     200        896.72   3337.63   81.51   82.33   80.71    0.82
 14     400        209.31    580.26   94.88   94.88   94.88    0

In [21]:
# Load the trained pipeline from the `model-last` directory inside the
# ./output folder that the training command wrote to.
nlp_ner=spacy.load("output/model-last")

In [22]:
# Run the trained NER pipeline on a new sentence and visualize its entities.
# NOTE(review): "compunds" looks like a typo for "compounds" — confirm whether
# the misspelling is intentional before changing the test sentence.
doc=nlp_ner("bismuth compunds reduced the number of bowel movements in diarrhea")
spacy.displacy.render(doc,style="ent")