In [47]:
from pathlib import Path
import multiprocessing as mp
import itertools
import pydicom
import pandas as pd

num_workers = mp.cpu_count() - 1

In [14]:
p = Path("/nfs/masi/khanms/massion/test")

dcm_list = list(p.glob("**/*.dcm"))

In [15]:
dcm_list[0]

PosixPath('/nfs/masi/khanms/massion/test/10291207324/54922574/402/1168.dcm')

# Simple example

In [16]:
test = [('sami', 'blue', 'a', 3), ('sami', 'blue', 'a', 50), ('bob', 'blue', 'a', 10), ('sami', 'green', 'a', 1)]

In [17]:
key, mx, mn, ct = [], [], [], []
def inst_info(l):
    key = [k for k,v in itertools.groupby(l, key=lambda x:(x[0], x[1], x[2]))]
    mx = [max(v)[-1] for k,v in itertools.groupby(l, key=lambda x:(x[0], x[1], x[2]))]
    mn = [min(v)[-1] for k,v in itertools.groupby(l, key=lambda x:(x[0], x[1], x[2]))]
    ct = [len(list(v)) for k,v in itertools.groupby(l, key=lambda x:(x[0], x[1], x[2]))]
    return zip(key, mx, mn, ct)

In [22]:
list( inst_info(test) )

[(('sami', 'blue', 'a'), 50, 3, 2),
 (('bob', 'blue', 'a'), 10, 10, 1),
 (('sami', 'green', 'a'), 1, 1, 1)]

# Generate info for each instance

In [23]:
def dcm_instance(dcm_file):
    '''
    For each dcm file -> (Subject, Session, Instance, InstanceNumber)
    '''
    ds = pydicom.dcmread(str(dcm_file))
    return( Path(dcm_file).parts[-4], Path(dcm_file).parts[-3], Path(dcm_file).parts[-2], int(ds[0x20, 0x13].value))

In [27]:
%%time

res = list( map(dcm_instance, dcm_list) )

CPU times: user 21.5 s, sys: 3.27 s, total: 24.8 s
Wall time: 35.7 s


In [28]:
%%time

pool = mp.Pool(processes=num_workers)
results = pool.map(dcm_instance, dcm_list)

CPU times: user 67.8 ms, sys: 59.6 ms, total: 127 ms
Wall time: 4.19 s


In [29]:
assert res == results #proof we generate the same output

Get the instance number, DICOM count, and diff for each.

In [121]:
def inst_info(l):
    key, mx, mn, ct = [], [], [], []
    key = [k for k,v in itertools.groupby(l, key=lambda x:(x[0], x[1], x[2]))]
    mx = [max(v)[-1] for k,v in itertools.groupby(l, key=lambda x:(x[0], x[1], x[2]))]
    mn = [min(v)[-1] for k,v in itertools.groupby(l, key=lambda x:(x[0], x[1], x[2]))]
    ct = [len(list(v)) for k,v in itertools.groupby(l, key=lambda x:(x[0], x[1], x[2]))]
    
    df = pd.DataFrame( zip(key, ct, mx, mn) , columns=['key', 'dcmN', 'max_instN', 'min_instN'])
    df['subject'], df['session'], df['inst'] = zip(*df['key'])
    df = df.assign(instanceN = df['max_instN'] - df['min_instN'] + 1,
                   delta_dcmN_instN = df['max_instN'] - df['min_instN'] + 1 - df['dcmN'])
    
    return df[['subject', 'session', 'dcmN', 'instanceN', 'delta_dcmN_instN']]

In [123]:
%%timeit

inst_info(res)

19.7 ms ± 77.8 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
