In [None]:
## Establish connection to Orthanc server

import os

from utils import queryStudies
from pyorthanc import Orthanc, RemoteModality

# Connection settings are read from the environment so real credentials do
#  not have to be written into the notebook. The fallbacks are the values
#  that used to be hard-coded here, so behaviour is unchanged when the
#  variables are unset.
ORTHANC_URL = os.environ.get('ORTHANC_URL', 'http://orthanc:8042')
ORTHANC_USER = os.environ.get('ORTHANC_USER', 'user')
ORTHANC_PASSWORD = os.environ.get('ORTHANC_PASSWORD', 'password')

orthanc = Orthanc(ORTHANC_URL)
orthanc.setup_credentials(ORTHANC_USER, ORTHANC_PASSWORD)

# Show the modalities reported by the Orthanc server.
print(orthanc.get_modalities())

# Name of the remote modality (PACS) that the later cells query.
PACS = "FMC"
remote_modality = RemoteModality(orthanc, PACS)

# Print the result of an echo request to the remote modality.
print(remote_modality.echo())

In [None]:
## Get the database values.
## Choose which Modalities you want to access.
## Use this as a reference: https://www.dicomlibrary.com/dicom/modality/

import pandas as pd
from utils import create_dateRanges, queryStudies, checkpointer

start = pd.Timestamp('20000101')
end = pd.Timestamp('20220101')
# NOTE(review): the meaning of the third argument (100) is defined by
#  utils.create_dateRanges -- confirm whether it is a count or a range size.
date_ranges = create_dateRanges(start, end, 100)

# Include "*chest*" in search because this is ubiquitous in final study
#  descriptions post filtering.
print('\nBeginning CR search...')


def query_CRchest_studies(x):
    """Query the PACS for chest studies whose modalities match "*CR*".

    Parameters
    ----------
    x
        Value forwarded unchanged as the ``StudyDate`` filter.

    Returns
    -------
    Whatever ``queryStudies`` returns for this query.
    """
    # `orthanc` and `PACS` are notebook globals set by the connection cell;
    #  they are looked up at call time.
    return queryStudies(
        orthanc,
        PACS,
        StudyDate=x,
        StudyDescription="*chest*",
        ModalitiesInStudy="*CR*"
    )


# A named function already carries __name__ == 'query_CRchest_studies', so
#  no manual patching of the attribute is needed.
# NOTE(review): checkpointer's behaviour and its last argument (1) are
#  defined in utils -- confirm there.
studies_CRchest = checkpointer(query_CRchest_studies, date_ranges, 1)

In [None]:
print('\nBeginning DX search...')


def query_DXchest_studies(x):
    """Query the PACS for chest studies whose modalities match "*DX*".

    Parameters
    ----------
    x
        Value forwarded unchanged as the ``StudyDate`` filter.

    Returns
    -------
    Whatever ``queryStudies`` returns for this query.
    """
    # `orthanc` and `PACS` are notebook globals set by the connection cell;
    #  they are looked up at call time.
    return queryStudies(
        orthanc,
        PACS,
        StudyDate=x,
        StudyDescription="*chest*",
        ModalitiesInStudy="*DX*"
    )


# A named function already carries __name__ == 'query_DXchest_studies', so
#  no manual patching of the attribute is needed.
# NOTE(review): checkpointer's behaviour and its last argument (1) are
#  defined in utils -- confirm there.
studies_DXchest = checkpointer(query_DXchest_studies, date_ranges, 1)

# Alternative kept for debugging: call queryStudies directly on the first
#  entry of date_ranges instead of going through checkpointer.
# studies_DXchest = queryStudies(
#     orthanc,
#     PACS,
#     StudyDate=date_ranges[:1],
#     StudyDescription="*chest*",
#     ModalitiesInStudy="*DX*"
# )

In [None]:
print('\nBeginning CT search...')


def query_CTchest_studies(x):
    """Query the PACS for chest studies whose modalities match "*CT*".

    Parameters
    ----------
    x
        Value forwarded unchanged as the ``StudyDate`` filter.

    Returns
    -------
    Whatever ``queryStudies`` returns for this query.
    """
    # `orthanc` and `PACS` are notebook globals set by the connection cell;
    #  they are looked up at call time.
    return queryStudies(
        orthanc,
        PACS,
        StudyDate=x,
        StudyDescription="*chest*",
        ModalitiesInStudy="*CT*"
    )


# A named function already carries __name__ == 'query_CTchest_studies', so
#  no manual patching of the attribute is needed.
# NOTE(review): checkpointer's behaviour and its last argument (1) are
#  defined in utils -- confirm there.
studies_CTchest = checkpointer(query_CTchest_studies, date_ranges, 1)

# Alternative kept for debugging: call queryStudies directly on the first
#  entry of date_ranges instead of going through checkpointer.
# studies_CTchest = queryStudies(
#     orthanc,
#     PACS,
#     StudyDate=date_ranges[:1],
#     StudyDescription="*chest*",
#     ModalitiesInStudy="*CT*"
# )

In [None]:
# Gather the per-modality query results in a fixed order: CR, DX, CT.
studies_all = [studies_CRchest, studies_DXchest, studies_CTchest]


In [None]:
# Print the length of each of the three result lists (CR, DX, CT order).
for modality_index in range(3):
    print(len(studies_all[modality_index]))

In [None]:
import itertools

# Flatten one level: concatenate the entries of every per-modality list.
studies_list = [entry for group in studies_all for entry in group]
print(len(studies_list))

# Keep only the entries that are not an empty list.
reduced_list = [entry for entry in studies_list if entry != []]
print(len(reduced_list))

In [None]:
from utils import studieslist2df, months_between_df
import itertools 
import pandas as pd

# Convert the list of query results into a dataframe, then drop rows that
#  contain a NaN and keep one row per AccessionNumber.
# NOTE(review): studieslist2df presumably flattens studies_all and skips
#  empty results before building the frame -- confirm in utils.
studies_df = (
    studieslist2df(studies_all)
    .dropna()
    .drop_duplicates(subset='AccessionNumber')
)

# Pair each birth date with its study date; these pairs are the input for
#  the age computation below.
datePairs = list(zip(studies_df['PatientBirthDate'], studies_df['StudyDate']))

# Lowercase the study descriptions and attach the date pairs as a column.
# NOTE(review): 'datePairs' stays in the frame and is therefore written out
#  by the save cell -- confirm whether that is intended.
studies_df = studies_df.assign(
    StudyDescription=studies_df['StudyDescription'].str.lower(),
    datePairs=datePairs,
)

# Add the 'PatientAgeMonths' column by applying months_between_df to each
#  (birth date, study date) pair.
studies_df = studies_df.assign(
    PatientAgeMonths=studies_df['datePairs'].apply(months_between_df)
)
studies_df


In [None]:
## Filter studies
from utils import months_between_df

# Start from the full frame. Every filter below rebinds studies_df_filter to
#  a new selection, so studies_df itself is left untouched by this cell.
studies_df_filter = studies_df

## Filter based on study descriptions
# To filter, write a term you want to remove in the remove_terms, then rerun 
#  this cell. The remaining study descriptions will be displayed, allowing
#  for the next terms to be selected.  
# The target here is to remove images which aren't useful. We want to be 
#  specific, not sensitive, meaning it's better to remove some possibly 
#  useful images if it means removing more of the disruptive CTs.
# keep_terms=['chest', 'torso', 'thora']

# remove_terms=['abdomen', 'pelvis', 'right', 'left', 'spine', 'brain', 'head', 'external', 'angiogram', 'whole body', 'skeletal', 'neck']
# keep_string="|".join(keep_terms)
# remove_string = "|".join(remove_terms)

# studies_df_filter = studies_df_filter[
#     ~studies_df_filter['StudyDescription'].str.contains(remove_string, regex=True)
#     & studies_df_filter['StudyDescription'].str.contains(keep_string, regex=True)
#     ]
print(studies_df_filter['StudyDescription'].value_counts())


# Keep only the rows whose PatientSex is exactly 'M' or 'F'.
sex_is_m_or_f = studies_df_filter['PatientSex'].isin(['M', 'F'])
studies_df_filter = studies_df_filter.loc[sex_is_m_or_f]
print(studies_df_filter['PatientSex'].value_counts())

# # Filter by modalities
# # print(studies_df['ModalitiesInStudy'].value_counts())
# modalities = studies_df_filter[['ModalitiesInStudy']].values_counts()
# unique_modalities = np.unique(modalities)
# individual_modalities = [modality.split("\\") for modality in unique_modalities]
# individual_modalities = list(set(itertools.chain(*individual_modalities)))
# print(individual_modalities)

# # Check these against the DICOM format to see which should be removed
# # https://www.dicomlibrary.com/dicom/modality/
# remove_modalities = "PT"
# studies_df_filter = studies_df_filter[~studies_df_filter['ModalitiesInStudy'].str.contains(remove_modalities, regex=True)]
# print(studies_df_filter['ModalitiesInStudy'].value_counts())

# Filter based on age limits of 18 - 90
# 18 to 90 years is equivalent to 216 to 1080 months. Both comparisons are
#  strict, so ages of exactly 216 or exactly 1080 months are dropped.
age_months = studies_df_filter['PatientAgeMonths']
studies_df_filter = studies_df_filter.loc[(age_months > 216) & (age_months < 1080)]

# Histogram of the remaining ages, followed by the filtered frame itself.
studies_df_filter['PatientAgeMonths'].hist(bins=100)
studies_df_filter

In [None]:
## Check how many studies have been filtered out
n_studies_initial = len(studies_df)
n_studies_final = len(studies_df_filter)

print("Initial number of studies: {}".format(n_studies_initial))
print("Final number of studies {}".format(n_studies_final))
print("Number of studies removed = {}".format(n_studies_initial - n_studies_final))

In [None]:
## Save minimum necessary information for studies
# Column subset that was considered for saving; currently the full filtered
#  frame is written instead.
# studies_df_save = studies_df_filter[['AccessionNumber','ModalitiesInStudy','PatientSex','PatientAgeMonths','StudyDescription', 'NumberOfStudyRelatedSeries','StudyInstanceUID']]

# Write the same frame twice: first as JSON, then as CSV.
for write_frame, destination in (
    (studies_df_filter.to_json, '../data_to_get/all_chest_studies.json'),
    (studies_df_filter.to_csv, '../data_to_get/all_chest_studies.csv'),
):
    write_frame(destination)


In [2]:
# Test data was properly saved
import pandas as pd

# Reload the JSON written by the save cell. Note that this rebinds
#  studies_df to the reloaded frame.
studies_df = pd.read_json('../data_to_get/all_chest_studies.json')

# Row count of the reloaded frame.
print(studies_df.shape[0])

582612


In [5]:
import itertools
import time

# Scratch check of itertools.chain: only one level of nesting is removed, so
#  the top-level empty lists disappear while the [] nested inside [[], 2]
#  is kept as an element.
a = [[2,3,4], [], [12,3], [],[],[5,6], [[],2]]
flattened_once = list(itertools.chain(*a))
print(flattened_once)

# Rebind studies_df to None, dropping this notebook's reference to the frame
#  that was bound to that name.
studies_df = None


[2, 3, 4, 12, 3, 5, 6, [], 2]
