In [6]:
import function as f
from glob import glob
import pandas as pd
import re
import pydicom
import matplotlib.pyplot as plt
import numpy as np
import shutil
from tqdm import tqdm
import os

In [7]:
def get_dicom(path):
    dicom = pydicom.read_file(path, force=True, stop_before_pixels=False)
    return dicom
def get_image(input, remove_max=False):
    if type(input)==str: # is path
        dicom = pydicom.read_file(input, force=True)
    else:
        dicom = input

    try:
        image = dicom.pixel_array
    except AttributeError:
        dicom.file_meta.TransferSyntaxUID = pydicom.uid.ImplicitVRLittleEndian
        image = dicom.pixel_array
    except:
        dicom.decompress()
        image = dicom.pixel_array

    if remove_max is True:
        image[image == image.max()] = 0
        image[image <= threshold] = 0
    return image
def plot(input, cmap=plt.cm.gray) :
    if type(input) == np.ndarray:
        image = input
    else:
        image = get_image(input)

    fig = plt.figure()
    ax = plt.imshow(image, cmap=cmap)
    ax = plt.Axes(fig,[0,0,1,1])
    plt.axis('off')
    plt.show()

In [8]:
list_asan = glob("dicom/asan/*")
list_asan.sort()
len(list_asan)

3785

In [9]:
list_asan_new = []
for i in tqdm(range(len(list_asan))):
    path = list_asan[i]
    dicom = get_dicom(path)
    try:
        ID = dicom.PatientID
    except:
        ID = "NA"
    try:
        LR = dicom.ImageLaterality
    except:
        LR = "NA"
    try:
        VIEW = dicom.ViewPosition
    except:
        VIEW = "NA"
    try:
        DATE = dicom.StudyDate
    except:
        DATE = "NA"
    path_new = ID + '_' + LR + VIEW + '_' + DATE + '.dcm'
    list_asan_new.append(path_new)
    os.rename(path, 'dicom/asan/' + path_new)
    #shutil.copy(path, target)
df_asan = pd.DataFrame(zip([i.split('/')[-1] for i in list_asan],list_asan_new), columns=['original','new'])
df_asan['center'] = 'asan'
df_asan.to_csv("csv/tmp_asan.csv", index=False)

100%|██████████| 3785/3785 [03:23<00:00, 18.62it/s]


In [10]:
list_brmh = glob("dicom/brmh/*")
list_brmh.sort()
len(list_brmh)

5674

In [11]:
list_brmh_new = []
for i in tqdm(range(len(list_brmh))):
    path = list_brmh[i]
    dicom = get_dicom(path)
    try:
        #ID = dicom.PatientID
        ID = path.split('/')[-1].split("_")[0]
    except:
        ID = "NA"
    try:
        LR = dicom.ImageLaterality
    except:
        LR = "NA"
    try:
        VIEW = dicom.ViewPosition
    except:
        VIEW = "NA"
    try:
        DATE = dicom.StudyDate
    except:
        DATE = "NA"
    path_new = ID + '_' + LR + VIEW + '_' + DATE + '.dcm'
    list_brmh_new.append(path_new)
    os.rename(path, 'dicom/brmh/' + path_new)
df_brmh = pd.DataFrame(zip([i.split('/')[-1] for i in list_brmh],list_brmh_new), columns=['original','new'])
df_brmh['center'] = 'brmh'
df_brmh.to_csv("csv/tmp_brmh.csv", index=False)

100%|██████████| 5674/5674 [10:41<00:00,  8.85it/s]


In [12]:
list_ilsan = glob("dicom/ilsan/*")
list_ilsan.sort()
len(list_ilsan)

402

In [13]:
list_ilsan_new = []
for i in tqdm(range(len(list_ilsan))):
    path = list_ilsan[i]
    dicom = get_dicom(path)
    try:
        ID = dicom.PatientID
    except:
        ID = "NA"
    try:
        LR = dicom.ImageLaterality
    except:
        LR = "NA"
    try:
        VIEW = dicom.ViewPosition
    except:
        VIEW = "NA"
    try:
        DATE = dicom.StudyDate
    except:
        DATE = "NA"
    path_new = ID + '_' + LR + VIEW + '_' + DATE + '.dcm'
    list_ilsan_new.append(path_new)
    os.rename(path, 'dicom/ilsan/' + path_new)
df_ilsan = pd.DataFrame(zip([i.split('/')[-1] for i in list_ilsan],list_ilsan_new), columns=['original','new'])
df_ilsan['center'] = 'ilsan'
df_ilsan.to_csv("csv/tmp_ilsan.csv", index=False)

100%|██████████| 402/402 [00:44<00:00,  8.94it/s]


### NCC2018

In [14]:
list_ncc2018_1st = []
for path, currentDirectory, files in os.walk("dicom/ncc2018/NationalCancer1st/"):
    for file in files:
        list_ncc2018_1st.append(os.path.join(path, file))
len(list_ncc2018_1st)

0

In [15]:
list_ncc2018_1st_new = []
for i in tqdm(range(len(list_ncc2018_1st))):
    path = list_ncc2018_1st[i]
    dicom = get_dicom(path)
    try:
        ID = dicom.PatientID
        ID = re.sub('-','',ID)
        if ID == "ANONYMOUS" or "ANONYMIZED":
            ID = list_ncc2018_1st[i].split('/')[5]
        
    except:
        "NA"
    try:
        LR = dicom.ImageLaterality
    except:
        LR = "NA"
    try:
        VIEW = dicom.ViewPosition
    except:
        VIEW = "NA"
    try:
        DATE = dicom.StudyDate
    except:
        DATE = "NA"
    path_new = ID + '_' + LR + VIEW + '_' + DATE + '.dcm'
    list_ncc2018_1st_new.append(path_new)
    target = "/".join(path.split('/')[:-1])+'/'+path_new
    os.rename(path, target)
df_ncc2018_1st = pd.DataFrame(zip(ID, [i.split('/')[-1] for i in list_ncc2018_1st],list_ncc2018_1st_new), columns=['ID','original','new'])
df_ncc2018_1st['center'] = 'ncc2018_1st'
df_ncc2018_1st.to_csv("csv/tmp_ncc2018_1st.csv", index=False)

0it [00:00, ?it/s]


In [7]:
list_ncc2018_1st = glob("dicom/ncc2018/NationalCancer1st/*")
list_ncc2018_1st = [i for i in list_ncc2018_1st if len(i.split('/')[-1].split('_')[0])!=8]
list_ncc2018_1st[:10]

[]

In [345]:
list_ncc2018_1st_new = []
for i in tqdm(range(len(list_ncc2018_1st))):
    path = list_ncc2018_1st[i]
    dicom = get_dicom(path)
    try:
        ID = dicom.PatientID
        ID = re.sub('-','',ID)
        if ID == "ANONYMOUS" or "ANONYMIZED":
            ID = list_ncc2018_1st[i].split('/')[5]
        
    except:
        "NA"
    try:
        LR = dicom.ImageLaterality
    except:
        LR = "NA"
    try:
        VIEW = dicom.ViewPosition
    except:
        VIEW = "NA"
    try:
        DATE = dicom.StudyDate
    except:
        DATE = "NA"
    path_new = ID + '_' + LR + VIEW + '_' + DATE + '.dcm'
    list_ncc2018_1st_new.append(path_new)
    target = "/".join(path.split('/')[:-1])+'/'+path_new
    os.rename(path, target)

100%|██████████| 151/151 [00:00<00:00, 227.50it/s]


In [8]:
list_ncc2018_2nd = []
for path, currentDirectory, files in os.walk("dicom/ncc2018/NationalCancer2nd/"):
    for file in files:
        list_ncc2018_2nd.append(os.path.join(path, file))
len(list_ncc2018_2nd)

0

In [284]:
list_ncc2018_2nd_new = []
for i in tqdm(range(len(list_ncc2018_2nd))):
    path = list_ncc2018_2nd[i]
    dicom = get_dicom(path)
    try:
        ID = dicom.PatientID
        ID = re.sub('-','',ID)
        if ID == "ANONYMOUS":
            ID = list_ncc2018_2nd[i].split('/')[3]
    except:
        "NA"
    try:
        LR = dicom.ImageLaterality
    except:
        LR = "NA"
    try:
        VIEW = dicom.ViewPosition
    except:
        VIEW = "NA"
    try:
        DATE = dicom.StudyDate
    except:
        DATE = "NA"
    path_new = ID + '_' + LR + VIEW + '_' + DATE + '.dcm'
    list_ncc2018_2nd_new.append(path_new)
    target = "/".join(path.split('/')[:-1])+'/'+path_new
    os.rename(path, target)
df_ncc2018_2nd = pd.DataFrame(zip([i.split('/')[-1] for i in list_ncc2018_2nd],list_ncc2018_2nd_new), columns=['original','new'])
df_ncc2018_2nd['center'] = 'ncc2018_2nd'
df_ncc2018_2nd.to_csv("csv/tmp_ncc2018_2nd.csv", index=False)

100%|██████████| 2808/2808 [00:11<00:00, 236.86it/s]


In [16]:
list_ncc2018 = glob("dicom/ncc2018/*/*/*")

In [17]:
for i in tqdm(range(len(list_ncc2018))):
    path = list_ncc2018[i]
    target = re.sub(".dcm","_"+path.split('/')[3].upper() +".dcm",path)
    os.rename(path, target)

0it [00:00, ?it/s]


### NCC2020

In [18]:
list_ncc2020 = glob("dicom/ncc2020/*/*")
list_ncc2020.sort()
len(list_ncc2020)

0

In [550]:
list_ncc2020_new = []
for i in tqdm(range(len(list_ncc2020))):
    path = list_ncc2020[i]
    dicom = get_dicom(path)
    try:
        ID = list_ncc2020[i].split('/')[-1].split('_')[0]
    except:
        "NA"
    try:
        LR = dicom.ImageLaterality
    except:
        LR = "NA"
    try:
        VIEW = dicom.ViewPosition
    except:
        VIEW = "NA"
    try:
        DATE = dicom.StudyDate
    except:
        DATE = "NA"
    path_new = ID + '_' + LR + VIEW + '_' + DATE + '.dcm'
    list_ncc2020_new.append(path_new)
    target = "/".join(path.split('/')[:-1])+'/'+path_new
    os.rename(path, target)

100%|██████████| 30000/30000 [20:45<00:00, 24.09it/s]


### Samsung

In [19]:
list_samsung = glob("dicom/samsung/*")

In [20]:
for i in tqdm(range(len(list_samsung))):
    path = list_samsung[i]
    i = path.split('_')
    target = i[0]+"_"+i[2].split('.')[0]+"_"+i[1]+'.dcm'
    os.rename(path, target)

100%|██████████| 15650/15650 [00:00<00:00, 57756.92it/s]


In [21]:
list_twin = glob("dicom/twin/*")

### twin

In [22]:
for i in tqdm(range(len(list_twin))):
    path = list_twin[i]
    i = path.split('_')
    target = i[0]+"_"+i[2].split('.')[0]+"_"+i[1]+'.dcm'
    os.rename(path, target)

100%|██████████| 3697/3697 [00:00<00:00, 37228.41it/s]


### get Birthday

In [23]:
list_dicom = sorted(glob("dicom/*/*"))
len(list_dicom)

67749

In [24]:
df_dicom = pd.DataFrame(list_dicom,columns=['file'])
df_dicom['ID'] = [i.split('/')[-1].split('_')[0] for i in df_dicom.file]
df_dicom['CENTER'] = [i.split('/')[-2] for i in df_dicom.file]
df_dicom = df_dicom.drop_duplicates(['ID','CENTER'])

In [27]:
result = {}
list_dicom = df_dicom.file.tolist()
for i in tqdm(range(len(list_dicom)),miniters=50, mininterval=5):
    file = list_dicom[i]
    try:
        bd = get_dicom(list_dicom[i]).PatientBirthDate
    except:
        bd = ""
    result[file] = bd

100%|██████████| 17892/17892 [16:27<00:00, 18.11it/s]


In [30]:
df_dicom.file

0         dicom/asan/10512416_LCC_20070829.dcm
1         dicom/asan/10516254_LCC_20080709.dcm
2         dicom/asan/10531695_LCC_20090205.dcm
3         dicom/asan/10531811_LCC_20080218.dcm
4         dicom/asan/10532852_LCC_20071214.dcm
                         ...                  
17887    dicom/twin/E2005131_20071119_LMLO.dcm
17888    dicom/twin/E2005171_20071120_LMLO.dcm
17889    dicom/twin/E2005191_20071122_LMLO.dcm
17890    dicom/twin/E2005201_20071122_LMLO.dcm
17891     dicom/twin/E3000450_20120910_LCC.dcm
Name: file, Length: 17892, dtype: object

In [31]:
df_dicom = pd.DataFrame.from_dict(result, orient="index", columns=['DATE_birth']).reset_index().rename(columns={"index":"file"})
df_dicom['CENTER'] = [i.split('/')[1] for i in df_dicom.file]
df_dicom['ID'] = [i.split('/')[-1].split('_')[0] for i in df_dicom.file]

In [35]:
df_dicom[["CENTER","ID","DATE_birth"]].to_csv("csv/birth.csv",index=False)

### check file

In [36]:
list_files = glob("dicom/*/*")
list_files.sort()
len(list_files)

67749

In [43]:
total = {}
for file in tqdm(list_files):
    center = file.split("/")[1]
    dicom = file.split("/")[2]
    d = dicom.split("_")
    if len(d)==3:
        ID = d[0]
        LR = d[1]
        DATE = re.sub(".dcm","",d[2])
        OTHER = ""
    elif len(d)==4:
        ID = d[0]
        LR = d[1]
        DATE = d[2]
        OTHER = re.sub(".dcm","",d[3])
    total[file] = [center, dicom, ID, LR, DATE, OTHER]

100%|██████████| 67749/67749 [00:00<00:00, 161727.22it/s]


In [44]:
df_total = pd.DataFrame.from_dict(total, orient="index", columns=["CENTER","DICOM","ID","LR","DATE","MEMO"])
df_total

Unnamed: 0,CENTER,DICOM,ID,LR,DATE,MEMO
dicom/asan/10512416_LCC_20070829.dcm,asan,10512416_LCC_20070829.dcm,10512416,LCC,20070829,
dicom/asan/10512416_RCC_20070829.dcm,asan,10512416_RCC_20070829.dcm,10512416,RCC,20070829,
dicom/asan/10516254_LCC_20080709.dcm,asan,10516254_LCC_20080709.dcm,10516254,LCC,20080709,
dicom/asan/10531695_LCC_20090205.dcm,asan,10531695_LCC_20090205.dcm,10531695,LCC,20090205,
dicom/asan/10531695_RCC_20090205.dcm,asan,10531695_RCC_20090205.dcm,10531695,RCC,20090205,
...,...,...,...,...,...,...
dicom/twin/E2005201_20071122_RMLO.dcm,twin,E2005201_20071122_RMLO.dcm,E2005201,20071122,RMLO,
dicom/twin/E3000450_20120910_LCC.dcm,twin,E3000450_20120910_LCC.dcm,E3000450,20120910,LCC,
dicom/twin/E3000450_20120910_LMLO.dcm,twin,E3000450_20120910_LMLO.dcm,E3000450,20120910,LMLO,
dicom/twin/E3000450_20120910_RCC.dcm,twin,E3000450_20120910_RCC.dcm,E3000450,20120910,RCC,


In [45]:
list_vendor = []
for i in tqdm(range(len(df_total))):
    path = df_total.index[i]
    try:
        dicom = get_dicom(path)
    except:
        vendor = "NA"
        pass

    try:
        vendor = dicom.Manufacturer
    except:
        vendor = "NA"
    list_vendor.append(vendor)

100%|██████████| 67749/67749 [1:16:01<00:00, 14.85it/s]


In [46]:
df_total["VENDOR"] = list_vendor

In [47]:
df_total.value_counts("VENDOR")

VENDOR
GE MEDICAL SYSTEMS            31960
HOLOGIC, Inc.                 29923
LORAD                          5829
SIEMENS                          19
FUJIFILM Corporation              6
MEDI-FUTURE MEDICAL SYSTEM        4
LUMISYS                           3
FUJI PHOTO FILM Co., ltd.         2
Agfa-Gevaert AG                   1
KODAK                             1
KONICA MINOLTA                    1
dtype: int64

In [48]:
list_vendor[6000]

'HOLOGIC, Inc.'

In [49]:
df_total.to_csv("csv/dicom.csv", index_label="FILE")