In [1]:
# Dependencies used by this notebook, plus two display settings.
import numpy as np
import pandas as pd
from pyirr import intraclass_correlation
# Raise the number of rows pandas displays before truncating a table.
pd.options.display.max_rows = 200
import matplotlib.pyplot as plt
import seaborn as sns
import copy
# Apply seaborn's 'whitegrid' style.
sns.set_style('whitegrid')

In [2]:
# load data
# NOTE: absolute, machine-specific location of the extractions; edit this one
# line when running the notebook somewhere else.
extractionsDir = '/Users/morton/Dicom Files/RADSARC_R/XNAT/extractions'
featuresCsv = 'radiomicFeatures/radiomicFeatures.csv'

# Column-name regexes that are removed from both tables.
droppedColumnPatterns = [
    'source',
    'diagnostic',
    # remove high/mid/low_enhancing features
    'low_enhancing_original',
    'mid_enhancing_original',
    'high_enhancing_original',
    # remove these features
    'calcificationDeleted',
]


def loadFeatureTable(extractionName):
    """Read one extraction's feature CSV and drop the unwanted columns.

    extractionName is the sub-directory of extractionsDir to read from.
    Returns a DataFrame without any column whose name matches one of
    droppedColumnPatterns.
    """
    table = pd.read_csv(extractionsDir + '/' + extractionName + '/' + featuresCsv)
    for pattern in droppedColumnPatterns:
        table = table.drop(columns=list(table.filter(regex=pattern)))
    return table


def keepSubjects(table, ids):
    """Restrict a feature table to the given subjects and tidy it up.

    Keeps the rows whose StudyPatientName is in ids, sorts them on
    StudyPatientName, resets the index and strips the 'lesion_original_'
    prefix from the column names. Returns a new DataFrame; table is not
    modified.
    """
    kept = table.loc[table['StudyPatientName'].isin(ids)]
    # Non-inplace sort: sorting a filtered slice in place can raise SettingWithCopyWarning.
    kept = kept.sort_values('StudyPatientName', axis=0)
    # reset_index() without drop=True stores the old row labels in an 'index'
    # column; the ICC cell skips that column by name, so it is kept.
    kept = kept.reset_index()
    return kept.rename(columns=lambda x: x.replace('lesion_original_', ''))


# df = pd.read_csv('/Users/morton/Dicom Files/RADSARC_R/XNAT/extractions/extractions__20220910_1006_allRegions/radiomicFeatures/radiomicFeatures.csv')
df = loadFeatureTable('extractions__20221109_2030_allRegions')

# dfRep = pd.read_csv('/Users/morton/Dicom Files/RADSARC_R/XNAT/extractions/extractions__20220914_1117_repro/radiomicFeatures/radiomicFeatures.csv')
dfRep = loadFeatureTable('extractions__20221109_1137_repro')

# Subjects present in both tables. Only the key column is merged, which gives
# the same IDs as merging the full tables without joining every feature column.
subjectIDs = df[['StudyPatientName']].merge(dfRep[['StudyPatientName']], on='StudyPatientName').StudyPatientName

# select rows and make sure they are both sorted on SubjectID
df = keepSubjects(df, subjectIDs)
dfRep = keepSubjects(dfRep, subjectIDs)

# remove texture features
# groups = 'shape|firstorder|sarcomaFeature|StudyPatientName'
# df = df.filter(regex = groups)
# dfRep = dfRep.filter(regex = groups)

# The ICC cell pairs df and dfRep row by row, so the two tables must list the
# same subjects in the same order; fail loudly here instead of silently
# comparing mismatched rows later.
assert list(df['StudyPatientName']) == list(dfRep['StudyPatientName']), \
    'df and dfRep are not row-aligned on StudyPatientName (duplicated or unmatched subjects?)'

In [3]:
# take logs of some features
# For every column whose name contains one of these feature names, a companion
# '<column>_log10' column is added to both df and dfRep (the original column is kept).
logFeatures = ['shape_MeshVolume', 'shape_SurfaceArea', 'shape_SurfaceVolumeRatio', 'shape_VoxelVolume', 
               'firstorder_Energy', 'firstorder_TotalEnergy', 'firstorder_Variance']
logSuffix = '_log10'
# Iterate over a snapshot of the column names because columns are added inside the loop.
for feat in list(df.columns):
    # Columns created by a previous run of this cell also contain the feature
    # name; skip them so re-running the cell does not add '*_log10_log10' columns.
    if feat.endswith(logSuffix):
        continue
    if any(y in feat for y in logFeatures):
        df.loc[:, feat + logSuffix] = np.log10(df[feat])
        dfRep.loc[:, feat + logSuffix] = np.log10(dfRep[feat])

In [4]:
# Compute one ICC per feature column, pairing each df column with the dfRep
# column of the same name.
skipColumns = {'StudyPatientName', 'index'}  # columns that are not features
featNames, iccValues = [], []
for col in df.columns:
    if col not in skipColumns:
        # two-column array: first column from df, second from dfRep
        data = np.stack((df[col], dfRep[col]), axis=1)
        iccResult = intraclass_correlation(data, "twoway", "agreement")
        featNames.append(col)
        iccValues.append(iccResult.value)
iccDf = pd.DataFrame({'Feature': featNames, 'ICC': iccValues})

# display for feature with ICC under some threshold

iccThreshold = 0.75


def hideStylerIndex(styler):
    """Return styler with its index hidden, on both old and new pandas.

    Styler.hide_index() was deprecated in pandas 1.4 and removed in 2.0 in
    favour of Styler.hide(axis='index'); use whichever one is available.
    """
    if hasattr(styler, 'hide'):
        return styler.hide(axis='index')
    return styler.hide_index()


def reportIccGroup(label, groupDf, onlyBelowThreshold=True):
    """Print the summary line for one feature group and display its ICC table.

    label              -- group name shown in the summary line
    groupDf            -- rows of iccDf belonging to the group
    onlyBelowThreshold -- if True, only features with ICC < iccThreshold are
                          displayed; if False, all features are displayed

    Returns the displayed table, sorted by descending ICC.
    """
    # The count is inclusive (ICC >= iccThreshold) even though the label prints
    # '>'; the label text is left unchanged so stored outputs still match.
    nAtOrAbove = np.sum(groupDf.ICC >= iccThreshold)
    print('Group = ' + str(label).ljust(12) + ': # features with ICC > ' + str(iccThreshold) + ' = ' + str(nAtOrAbove) + '/' + str(groupDf.shape[0]))

    shown = groupDf.loc[groupDf.ICC < iccThreshold, :] if onlyBelowThreshold else groupDf
    # Non-inplace sort: sorting a filtered slice in place can raise SettingWithCopyWarning.
    shown = shown.sort_values('ICC', ascending=False)
    display(hideStylerIndex(shown.style))
    return shown


# VolumeFraction features: show every feature, not only those below the threshold
iccDfgroup = reportIccGroup('VolumeFraction',
                            iccDf.loc[iccDf['Feature'].str.contains('VolumeFraction'), :],
                            onlyBelowThreshold=False)
print('\n\n\n')


for group in ['shape', 'firstorder']:
    iccDfgroup = reportIccGroup(group, iccDf[iccDf['Feature'].str.contains(group)])
    print('\n\n\n')

# do the same for all features except any in the groups listed (i.e. texture features)
nonTextureGroups = ['VolumeFraction', 'shape', 'firstorder', 'histogram']
# A single alternation regex excludes every listed group at once.
isTexture = ~iccDf.Feature.str.contains('|'.join(nonTextureGroups))
iccDfgroup = reportIccGroup('texture', iccDf.loc[isTexture, :])


Group = VolumeFraction: # features with ICC > 0.75 = 4/4


Feature,ICC
lesion_sarcomaFeature_high enhancingVolumeFraction,0.999634
lesion_sarcomaFeature_calcificationVolumeFraction,0.999527
lesion_sarcomaFeature_low enhancingVolumeFraction,0.999387
lesion_sarcomaFeature_mid enhancingVolumeFraction,0.998163






Group = shape       : # features with ICC > 0.75 = 18/18


Feature,ICC






Group = firstorder  : # features with ICC > 0.75 = 20/21


Feature,ICC
firstorder_Minimum,0.55863






Group = texture     : # features with ICC > 0.75 = 54/73


Feature,ICC
glszm_SmallAreaLowGrayLevelEmphasis,0.709189
glrlm_ShortRunLowGrayLevelEmphasis,0.683499
gldm_LowGrayLevelEmphasis,0.625164
glrlm_LowGrayLevelRunEmphasis,0.610192
gldm_LargeDependenceLowGrayLevelEmphasis,0.572606
glszm_LowGrayLevelZoneEmphasis,0.540183
glrlm_LongRunHighGrayLevelEmphasis,0.527294
glrlm_LongRunLowGrayLevelEmphasis,0.434403
gldm_LargeDependenceHighGrayLevelEmphasis,0.418765
ngtdm_Busyness,0.414329
