In [1]:
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForTokenClassification

In [2]:
tokenizer = AutoTokenizer.from_pretrained("obi/deid_roberta_i2b2")

In [3]:
model = AutoModelForTokenClassification.from_pretrained("obi/deid_roberta_i2b2")

In [4]:
sample="Physician Discharge Summary Admit date: 10/12/1982 Discharge date: 10/22/1982 Patient Information Jack Reacher, \
54 y.o. male (DOB = 1/21/1928). Home Address: 123 Park Drive, San Diego, CA, 03245. Home Phone: 202-555-0199 (home). Hospital Care \
Team Service: Orthopedics Inpatient Attending: Roger C Kelly, MD Attending phys phone: (634)743-5135 Discharge Unit: HCS843 Primary \
Care Physician: Hassan V Kim, MD 512-832-5025."

print(sample)

Physician Discharge Summary Admit date: 10/12/1982 Discharge date: 10/22/1982 Patient Information Jack Reacher, 54 y.o. male (DOB = 1/21/1928). Home Address: 123 Park Drive, San Diego, CA, 03245. Home Phone: 202-555-0199 (home). Hospital Care Team Service: Orthopedics Inpatient Attending: Roger C Kelly, MD Attending phys phone: (634)743-5135 Discharge Unit: HCS843 Primary Care Physician: Hassan V Kim, MD 512-832-5025.


In [5]:
from transformers import pipeline

In [6]:
classifier = pipeline("ner", model="obi/deid_roberta_i2b2")

In [7]:
outputs = classifier(sample)

In [8]:
def entity_name(entityvalue: str):
    return entityvalue[2:]

def process_enitity_val(entityvalue: str):
    spacechar = 'Ġ'
    if entityvalue[0] == spacechar:
        entityvalue = entityvalue[1:]
    entityvalue = entityvalue.replace(spacechar, ' ')
    return entityvalue

In [9]:
entities = []
entitytype = None
entitystart = -1
temp = ""
for idx, item in enumerate(outputs):
    if idx == 0:
        temp = item['word']
        entitytype = entity_name(item['entity'])
        entitystart = item['start']
        continue
    previtem = outputs[idx-1]
    currententity = entity_name(item['entity'])
    if (item['index'] == previtem['index'] + 1) and (currententity == entitytype):
        temp += item['word']
    else:
        entities.append((process_enitity_val(temp), entitytype, entitystart))
        temp = item['word']
        entitytype = entity_name(item['entity'])
        entitystart = item['start']

In [10]:
for e in entities:
    assert sample[e[2]] == e[0][0]
    print(e)

('10/12/1982', 'DATE', 40)
('10/22/1982', 'DATE', 67)
('Jack Reacher', 'PATIENT', 98)
('54', 'AGE', 112)
('1/21/1928', 'DATE', 132)
('123 Park Drive', 'LOC', 158)
('San Diego', 'LOC', 174)
('CA', 'LOC', 185)
('03245', 'LOC', 189)
('202-555-0199', 'PHONE', 208)
('Roger C Kelly', 'STAFF', 290)
('(634)743-5135', 'PHONE', 330)
('H', 'HOSP', 360)
('CS843', 'ID', 361)
('Hassan V Kim', 'STAFF', 391)


In [11]:
import pydicom

from os import listdir
from os.path import isfile, join
from pathlib import Path

import pandas as pd

In [12]:
def load_metadata(filename: str):
    return pd.read_csv(filename)

def load_dicoms_from_path(dicompath: str):
    alldicompaths = [f for f in listdir(dicompath) if isfile(join(dicompath, f))]
    alldicoms = []
    for dcm in alldicompaths:
        dcmpath = Path(dicompath, dcm)
        ds = pydicom.dcmread(dcmpath)
        alldicoms.append(ds)
    return alldicoms

def load_series_by_index(metadatafile: str, seriesidx: int):
    metadata = load_metadata(metadatafile)
    assert seriesidx < len(metadata), f"Patient index greater than available patient dicoms. Available patients {len(metadata)}"
    
    target_row = metadata.loc[seriesidx]
    targetdcmdir = Path(Path(metadatafile).parent, str(target_row['File Location']))
    alldicoms = load_dicoms_from_path(targetdcmdir)
    
    return alldicoms, target_row.to_dict()

In [13]:
metadatafile = '/home/r079a/Desktop/de-identification/dataset/images/manifest-1617826555824/metadata.csv'
sample_dicom_path = '/home/r079a/Desktop/de-identification/dataset/images/manifest-1617826555824/Pseudo-PHI-DICOM-Data/571403367/07-11-2019-NA-DBT Reconstructed Volume-37558/DBT slices-78838'
print(sample_dicom_path)

/home/r079a/Desktop/de-identification/dataset/images/manifest-1617826555824/Pseudo-PHI-DICOM-Data/571403367/07-11-2019-NA-DBT Reconstructed Volume-37558/DBT slices-78838


In [14]:
alldicoms, patientmetadata = load_series_by_index(metadatafile, seriesidx=14)

In [15]:
sampleds = alldicoms[0]

In [16]:
def process_element_val(element):
    elementval = ""
    if element.VM > 1:
        elementval = ', '.join([str(item) for item in element.value])
    elif element.VM == 1:
        elementval = str(element.repval)
    elementval = elementval.replace("'", '')
    if element.VR == 'PN':
        elementval = elementval.replace("^", ' ')
    return elementval 

In [17]:
dcmnote = ''
for element in sampleds:
    # print(type(element))
    # print(element.name, process_element_val(element), element.VM, element.VR)
    print(f"{element.name}: {process_element_val(element)}")
    dcmnote += f"{element.name}: {process_element_val(element)}, "

Specific Character Set: ISO_IR 100
Image Type: ORIGINAL, PRIMARY, LOCALIZER
Instance Creation Date: 19990908
Instance Creation Time: 133228
SOP Class UID: CT Image Storage
SOP Instance UID: 2.25.24988633710936769796383448125889166838
Study Date: 19990908
Series Date: 19990908
Acquisition Date: 19990908
Content Date: 19990908
Study Time: 133157
Series Time: 133157
Acquisition Time: 133221.484357
Content Time: 133228
Accession Number: 664B2583
Modality: CT
Manufacturer: GE MEDICAL SYSTEMS
Institution Name: Williams-Johnson Medical Center
Institution Address: Unit 4899 Box 3625 DPO AP 93015
Referring Physician's Name: THOMAS CHRISTOPHER
Study Description: FORFILE CT CH/AB/PEL - CD for 8155012288
Procedure Code Sequence: <Sequence, length 1>
Series Description: SCOUT
Performing Physician's Name: SMITH MEGHAN
Manufacturer's Model Name: LightSpeed VCT
Referenced Study Sequence: <Sequence, length 1>
Private Creator: GEMS_IDEN_01
Private Creator: GEIIS
Patient's Name: HOOVER RODNEY
Patient ID:

In [18]:
outputs = classifier(dcmnote)

In [19]:
entities = []
entitytype = None
entitystart = -1
temp = ""
for idx, item in enumerate(outputs):
    if idx == 0:
        temp = item['word']
        entitytype = entity_name(item['entity'])
        entitystart = item['start']
        continue
    previtem = outputs[idx-1]
    currententity = entity_name(item['entity'])
    if (item['index'] == previtem['index'] + 1) and (currententity == entitytype):
        temp += item['word']
    else:
        entities.append((process_enitity_val(temp), entitytype, entitystart))
        temp = item['word']
        entitytype = entity_name(item['entity'])
        entitystart = item['start']

In [20]:
for e in entities:
    print(f"{e[1]}: {e[0]}")

DATE: 19990908
ID: 133
DATE: 2
ID: 25
ID: 24988633710936769796383448125889166838
DATE: 19990908
DATE: 19990908
DATE: 19990908
DATE: 19990908
PHONE: 133221
PHONE: 484
ID: 664B25
HOSP: GE MEDICAL SYSTEM
HOSP: Williams-Johnson Medical Center
ID: 4899 Box 36
ID: 93
STAFF: THOMAS CHRISTOPHER
ID: 8155012288
PATIENT: SMITH M
STAFF: EG
PATIENT: HAN
PATIENT: HOOVER RODNEY
ID: 81550122
ID: 194207
PHONE: 0
AGE: 57
ID: 87009
ID: 660
ID: 636
