# UC1: Data

Tools

In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import regex as re
import numpy as np

In [4]:
# Series table and segmentation table (both files are date-stamped 20230519).
ds = pd.read_parquet(path='ecrfs-series-20230519.parquet')
seg_ds = pd.read_parquet(path='segments-20230519.parquet')

## Remove older duplicates of each series_uid (keep the latest series_date)

In [5]:
# Row count before de-duplication.
ds.shape[0]

69957

In [6]:
# For every series_uid keep the row with the most recent series_date,
# then restore the original row order.
ds = (
    ds.sort_values(by=['series_uid', 'series_date'], ascending=[True, False])
      .drop_duplicates(subset='series_uid', keep='first')
      .sort_index()
)
len(ds)

69750

## Check segmentation availability

In [9]:
# Count the series, and how many distinct series are referenced by a segmentation row.
t = ds['series_uid'].tolist()
print(f'Series found:{len(t)}')
has_known_source = seg_ds['source_series_uid'].isin(t)
mt = seg_ds.loc[has_known_source].copy()
print(f'Segmentations found:{mt.source_series_uid.nunique()}')
checkt = mt['source_series_uid'].unique().tolist()

Series found:69750
Segmentations found:681


# Filter main dataset

### Exclude coil

In [None]:
# Receive-coil names to exclude, upper-cased so they can be matched
# case-insensitively against the upper-cased coil column.
erc_list = [
    coil_name.upper()
    for coil_name in (
        "8EIS_eCoil BodyL",
        "ATDTORSO",
        "Body E24 AA1",
        "Body E24 AA2",
        "Body E24 AA3",
        "EIS_eCoil Torso",
    )
]

In [None]:
# Series types to keep. Defined here because the only other definition in the
# notebook is commented out, which made this cell fail with NameError on a
# fresh kernel (Restart & Run All).
include_list_catboost = ['T2', 'ADC', 'DWI']

# Mask 1: the catboost heuristic label (upper-cased) is one of the wanted types.
statement1 = ds.catboost_series_type_heuristics.str.upper().isin(include_list_catboost)

# Mask 2: the receive coil is NOT one of the coils in erc_list.
# na=False makes rows with a missing coil name evaluate to "no match" instead of
# NaN, so the `~` negation always operates on a proper boolean mask.
pattern = '|'.join(erc_list)
statement2 = ~ds.receive_coil_name.str.upper().str.contains(pattern, na=False)

In [15]:
# A series is kept only when it passes both masks.
statements = statement1 & statement2

In [16]:
# Independent copy of the rows selected by the combined mask.
filtered_ds = ds.loc[statements].copy()

Other data to consider excluding:

- T2 sequence: 
    * series_description: fs dixon,spair,spir,bh,star,kidneys,whole pelvis 
    * scan_options: fs
- ADC sequence: eADC
- DWI: PI-RADS recommends higher b-values: series description: DWI_Synthetic

In [17]:
# Number of series left after filtering.
filtered_ds.shape[0]

36439

In [13]:
# Draft include/exclude lists, currently disabled (every line is commented out);
# kept for reference only.
# exclude_list_series_description =['EADC','DIXON','SPAIR','SPIR','KIDNEYS','PELVIS','SYNTHETIC']
# exclude_list_scan_options = ['FS']
# exclude_list_scan_options_onlyT2 = ['T2']
# include_list_catboost = ['T2','ADC','DWI']
# include_list_image_plane = ['AX']

In [14]:
# Draft boolean masks, currently disabled (every line is commented out);
# kept for reference only.
# NOTE(review): include_list_user (last line) is not defined in any visible cell.
# pattern1 = '|'.join(exclude_list_series_description)
# statement1 = ~ds.series_description.str.upper().str.contains(pattern1)

# statement2 = ds.scan_options.str.upper().isin(exclude_list_scan_options) 
# statement2 = ~statement2

# statement3 = ds.catboost_series_type_heuristics.str.upper().isin(include_list_catboost)

# statement4 = ds.image_plane.str.upper().isin(include_list_image_plane)




# pattern2 = '|'.join(erc_list)
# statement5 = ~ds.receive_coil_name.str.upper().str.contains(pattern2)


# statement6 = ds.user_series_type.isin(include_list_user)


### and remove duplicates from segmentation parquet

In [19]:
# Segmentation DF: keep rows whose labels mention "lesion" (case-insensitive),
# then keep only the most recent transformation_dt per source series.
is_lesion = seg_ds['labels'].str.lower().str.contains('lesion')
onlylesion = (
    seg_ds[is_lesion]
    .sort_values(by=['source_series_uid', 'transformation_dt'], ascending=[True, False])
    .drop_duplicates(subset='source_series_uid', keep='first')
    .sort_index()
    .copy()
)
print(len(onlylesion))
source_series_uid_list = list(onlylesion['source_series_uid'])



493


### and keep patients with series that contain lesion(s)

In [20]:
# Series DF: the filtered series that are the source of a lesion segmentation
# (original author's note: this keeps only T2).
has_lesion_seg = filtered_ds['series_uid'].isin(source_series_uid_list)
onlyseg = filtered_ds.loc[has_lesion_seg].copy()
# Patients owning those series are the ones carried forward.
patient_id_list_with_seg = list(onlyseg['patient_id'])

In [21]:
# All filtered series that belong to a patient with a segmented series.
new_df_patient_with_lesions = filtered_ds.loc[
    filtered_ds['patient_id'].isin(patient_id_list_with_seg)
].copy()

### and keep segmentations whose series are in "new_df_patient_with_lesions"

In [22]:
# Series UIDs of the retained patients.
check_series = new_df_patient_with_lesions['series_uid'].tolist()

In [23]:
# Lesion segmentations whose source series is among the retained series.
existed_segs = onlylesion.loc[onlylesion['source_series_uid'].isin(check_series)]

Just checking

In [24]:
# Number of distinct patients retained.
new_df_patient_with_lesions['patient_id'].nunique()

440

In [25]:
# Distinct source series vs. total rows; equal values mean one row per source series.
print(existed_segs['source_series_uid'].nunique())
print(existed_segs.shape[0])


440
440


Number of studies for each patient

In [26]:
# Distinct studies per patient, then how many patients have each count.
grouped = (
    new_df_patient_with_lesions
    .groupby('patient_id')['study_uid']
    .nunique()
)
grouped.value_counts()

1    440
Name: study_uid, dtype: int64

In [27]:
# Distinct series per (patient, study), then how many studies have each count.
grouped = (
    new_df_patient_with_lesions
    .groupby(['patient_id', 'study_uid'])['series_uid']
    .nunique()
)
grouped.value_counts()

3     220
4      47
7      39
6      39
9      35
8      30
5      15
11      4
10      4
14      2
2       2
13      1
12      1
17      1
Name: series_uid, dtype: int64

# Take T2 for all the patients based on segmentation source

In [28]:
# The T2 table is made of the series that the retained segmentations were drawn on.
source_list = existed_segs['source_series_uid'].tolist()
is_seg_source = new_df_patient_with_lesions['series_uid'].isin(source_list)
T2 = new_df_patient_with_lesions[is_seg_source]
len(T2)

440

In [29]:
# Manual series-type labels of the T2 rows (value_counts omits missing labels).
T2['user_series_type'].value_counts()

T2AX    313
Name: user_series_type, dtype: int64

In [30]:
# Heuristic series-type labels of the T2 rows.
T2['catboost_series_type_heuristics'].value_counts()

T2    440
Name: catboost_series_type_heuristics, dtype: int64

In [32]:
# Pick one ADC series per (patient, study).
#   1. Prefer a series manually annotated as 'ADC' (first one in the group).
#   2. Otherwise fall back to the catboost heuristic, but only when exactly one
#      series is labelled 'ADC'; ambiguous groups are printed and skipped.
# The explicit emptiness check replaces a bare `except:` that was used as control
# flow and would also have hidden unrelated errors (e.g. a missing column).
grouped = new_df_patient_with_lesions.groupby(['patient_id','study_uid'])
manual_exist = 0
catboost_exist = 0
no_exist = 0
adc_rows = []  # one single-row frame per selected series; concatenated once below

for _, group_data in grouped:

    manual_hits = group_data[group_data.user_series_type == 'ADC']
    if not manual_hits.empty:
        manual_exist += 1
        adc_rows.append(manual_hits.iloc[[0]])
        continue

    heuristic_hits = group_data[group_data.catboost_series_type_heuristics.isin(['ADC'])]
    if heuristic_hits.empty:
        no_exist += 1
    elif len(heuristic_hits) == 1:
        catboost_exist += 1
        adc_rows.append(heuristic_hits)
    else:
        # Several heuristic candidates and no rule to choose between them:
        # show them for manual review and select nothing for this study.
        print(heuristic_hits.series_description)

# Single concat instead of growing the frame inside the loop.
ADC = pd.concat(adc_rows) if adc_rows else pd.DataFrame()

print(f'ADC missing in {no_exist}')
print(f'ADC manual annotated {manual_exist}')
print(f'ADC catboost exist {catboost_exist}')
print(f'Total {len(ADC)}')

ADC missing in 39
ADC manual annotated 288
ADC catboost exist 113
Total 401


In [33]:
def extract_numbers_from_string(string):
    """Return the largest integer written as a run of digits inside `string`.

    Every maximal run of digits is read as a non-negative integer and the
    biggest one is returned, whatever it denotes in the text.

    Raises:
        ValueError: if `string` contains no digits.
    """
    return max(int(digits) for digits in re.findall(r'\d+', string))

In [34]:
# Pick one DWI series per (patient, study).
#   1. Prefer a series manually annotated as 'DWI' (first one in the group).
#   2. Otherwise fall back to the catboost heuristic. With several candidates,
#      take the one whose series_description contains the largest number
#      (presumably the b-value; confirm against the naming convention).
#      If a description cannot be parsed, the group is printed and skipped.
# Explicit checks replace the bare `except:` clauses, which were used as control
# flow and would also have hidden unrelated errors.
grouped = new_df_patient_with_lesions.groupby(['patient_id','study_uid'])
manual_exist = 0
catboost_exist = 0
no_exist = 0
dwi_rows = []  # one single-row frame per selected series; concatenated once below

for (a_value, b_value), group_data in grouped:

    manual_hits = group_data[group_data.user_series_type == 'DWI']
    if not manual_hits.empty:
        manual_exist += 1
        dwi_rows.append(manual_hits.iloc[[0]])
        continue

    heuristic_hits = group_data[group_data.catboost_series_type_heuristics.isin(['DWI'])]
    if heuristic_hits.empty:
        no_exist += 1
    elif len(heuristic_hits) == 1:
        catboost_exist += 1
        dwi_rows.append(heuristic_hits)
    else:
        try:
            bval = [extract_numbers_from_string(b) for b in heuristic_hits.series_description]
        except (ValueError, TypeError):
            # ValueError: a description without digits; TypeError: a non-string
            # (e.g. missing) description. Report the group and select nothing.
            print(a_value,b_value)
            print(f'{heuristic_hits.series_description}\n')
        else:
            catboost_exist += 1
            # argmax returns the first position holding the maximum.
            dwi_rows.append(heuristic_hits.iloc[[int(np.argmax(bval))]])

# Single concat instead of growing the frame inside the loop.
DWI = pd.concat(dwi_rows) if dwi_rows else pd.DataFrame()

print(f'DWI missing in {no_exist}')
print(f'DWI manual annotated {manual_exist}')
print(f'DWI catboost exist {catboost_exist}')
print(f'Total {len(DWI)}')

DWI missing in 4
DWI manual annotated 322
DWI catboost exist 114
Total 436


In [35]:
# Stack the three per-sequence selections and order the rows by their index labels.
selected_frames = [T2, ADC, DWI]
final_ds = pd.concat(selected_frames).sort_index().copy()

In [36]:
# Distinct studies per patient in the final table.
ngrouped = (
    final_ds
    .groupby('patient_id')['study_uid']
    .nunique()
)
ngrouped.value_counts()

1    440
Name: study_uid, dtype: int64

In [37]:
# Distinct series per (patient, study) in the final table.
ngrouped = (
    final_ds
    .groupby(['patient_id', 'study_uid'])['series_uid']
    .nunique()
)
ngrouped.value_counts()

3    399
2     39
1      2
Name: series_uid, dtype: int64

In [38]:
# Write the selected series and their segmentations, without the row index.
for frame, out_path in (
    (final_ds, 'UseCase1-v1.parquet'),
    (existed_segs, 'Seg_UseCase1-v1.parquet'),
):
    frame.to_parquet(out_path, index=False)