In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 500, 'display.max_rows', 500)
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, make_scorer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold, StratifiedKFold, RepeatedStratifiedKFold
from sklearn.model_selection import cross_validate
import os, warnings, copy
import matplotlib.pyplot as plt
import sys

sys.path.append('/data/users/morton/git/icrpythonradiomics/machineLearning')
from featureSelection import featureSelection_correlation, featureSelection_groupName


In [2]:
# Read the clinical spreadsheet and keep only the patient identifier
# ('Anon Code') together with the 'Grade' and 'subtype' columns.
clinicalSpreadsheet = '/Users/morton/Dicom Files/RADSARC_R/ClinicalData/Clinical data for analysis.xlsx'
dfClinical = pd.read_excel(
    clinicalSpreadsheet,
    sheet_name='220818_Completed segs',
    engine='openpyxl',
).loc[:, ['Anon Code', 'Grade', 'subtype']]

In [3]:
# Read the extracted radiomic features, then discard every column whose name
# contains 'source' or 'diagnostic'.
dfRad = pd.read_csv('/Users/morton/Dicom Files/RADSARC_R/XNAT/extractions/extractions__20220910_1006_allRegions/radiomicFeatures/radiomicFeatures.csv')
dfRad = dfRad.drop(columns=list(dfRad.filter(regex='source')))
dfRad = dfRad.drop(columns=list(dfRad.filter(regex='diagnostic')))

# Column-name patterns selecting which feature groups are kept.
# Alternative selections tried previously:
#   ['lesion_original_shape', 'lesion_original_firstorder']
#   ['lesion_original_firstorder']
#   ['lesion_original_shape_SurfaceVolumeRatio',
#    'lesion_original_shape_Sphericity',
#    'lesion_original_firstorder_Mean',
#    'lesion_original_firstorder_10Percentile']
featureSets = ['lesion_original']

# Keep the selected feature columns plus the patient identifier (needed as the
# merge key), then strip the common 'lesion_original_' prefix from the names.
# Alternative (disabled): strip each entry of featureSets + '_' instead of the
# fixed prefix.
dfRad = (
    dfRad
    .filter(regex=f"{'|'.join(featureSets)}|StudyPatientName")
    .rename(columns=lambda name: name.replace('lesion_original_', ''))
)

In [4]:
# Name of the column to be predicted.
target = 'subtype'

# Join clinical labels to radiomic features on the patient identifier, then
# remove both identifier columns and the unused 'Grade' label so that only the
# target and the feature columns remain.
df = (
    dfClinical
    .merge(dfRad, left_on='Anon Code', right_on='StudyPatientName')
    .drop(columns=['Anon Code', 'StudyPatientName', 'Grade'])
)

In [5]:
# add pair-wise interactions
# features = list(set(df.columns) - set([target]))
# for i in range(len(features)-1):
#     for j in range(i+1, len(features)):
#         df['interaction_' + features[i] + '_x_' + features[j]] = df[features[i]].multiply(df[features[j]])

In [6]:
# Show the modelling table (the 'subtype' target plus the radiomic feature
# columns) as a visual sanity check before fitting.
df

Unnamed: 0,subtype,shape_Elongation,shape_Flatness,shape_LeastAxisLength,shape_MajorAxisLength,shape_Maximum2DDiameterColumn,shape_Maximum2DDiameterRow,shape_Maximum2DDiameterSlice,shape_Maximum3DDiameter,shape_MeshVolume,shape_MinorAxisLength,shape_Sphericity,shape_SurfaceArea,shape_SurfaceVolumeRatio,shape_VoxelVolume,firstorder_10Percentile,firstorder_90Percentile,firstorder_Energy,firstorder_Entropy,firstorder_InterquartileRange,firstorder_Kurtosis,firstorder_Maximum,firstorder_MeanAbsoluteDeviation,firstorder_Mean,firstorder_Median,firstorder_Minimum,firstorder_Range,firstorder_RobustMeanAbsoluteDeviation,firstorder_RootMeanSquared,firstorder_Skewness,firstorder_TotalEnergy,firstorder_Uniformity,firstorder_Variance,glcm_Autocorrelation,glcm_JointAverage,glcm_ClusterProminence,glcm_ClusterShade,glcm_ClusterTendency,glcm_Contrast,glcm_Correlation,glcm_DifferenceAverage,glcm_DifferenceEntropy,glcm_DifferenceVariance,glcm_JointEnergy,glcm_JointEntropy,glcm_Imc1,glcm_Imc2,glcm_Idm,glcm_Idmn,glcm_Id,glcm_Idn,glcm_InverseVariance,glcm_MaximumProbability,glcm_SumEntropy,glcm_SumSquares,glrlm_GrayLevelNonUniformity,glrlm_GrayLevelNonUniformityNormalized,glrlm_GrayLevelVariance,glrlm_HighGrayLevelRunEmphasis,glrlm_LongRunEmphasis,glrlm_LongRunHighGrayLevelEmphasis,glrlm_LongRunLowGrayLevelEmphasis,glrlm_LowGrayLevelRunEmphasis,glrlm_RunEntropy,glrlm_RunLengthNonUniformity,glrlm_RunLengthNonUniformityNormalized,glrlm_RunPercentage,glrlm_RunVariance,glrlm_ShortRunEmphasis,glrlm_ShortRunHighGrayLevelEmphasis,glrlm_ShortRunLowGrayLevelEmphasis,glszm_GrayLevelNonUniformity,glszm_GrayLevelNonUniformityNormalized,glszm_GrayLevelVariance,glszm_HighGrayLevelZoneEmphasis,glszm_LargeAreaEmphasis,glszm_LargeAreaHighGrayLevelEmphasis,glszm_LargeAreaLowGrayLevelEmphasis,glszm_LowGrayLevelZoneEmphasis,glszm_SizeZoneNonUniformity,glszm_SizeZoneNonUniformityNormalized,glszm_SmallAreaEmphasis,glszm_SmallAreaHighGrayLevelEmphasis,glszm_SmallAreaLowGrayLevelEmphasis,glszm_ZoneEntropy,glszm_
ZonePercentage,glszm_ZoneVariance,gldm_DependenceEntropy,gldm_DependenceNonUniformity,gldm_DependenceNonUniformityNormalized,gldm_DependenceVariance,gldm_GrayLevelNonUniformity,gldm_GrayLevelVariance,gldm_HighGrayLevelEmphasis,gldm_LargeDependenceEmphasis,gldm_LargeDependenceHighGrayLevelEmphasis,gldm_LargeDependenceLowGrayLevelEmphasis,gldm_LowGrayLevelEmphasis,gldm_SmallDependenceEmphasis,gldm_SmallDependenceHighGrayLevelEmphasis,gldm_SmallDependenceLowGrayLevelEmphasis,ngtdm_Busyness,ngtdm_Coarseness,ngtdm_Complexity,ngtdm_Contrast,ngtdm_Strength,histogram_5Percentile,histogram_15Percentile,histogram_20Percentile,histogram_25Percentile,histogram_30Percentile,histogram_35Percentile,histogram_40Percentile,histogram_45Percentile,histogram_55Percentile,histogram_60Percentile,histogram_65Percentile,histogram_70Percentile,histogram_75Percentile,histogram_80Percentile,histogram_85Percentile,histogram_95Percentile
0,LMS,0.832921,0.798058,50.502192,63.281392,72.945185,72.532751,64.257295,75.099933,116595.4,52.708409,0.757343,15239.796482,0.130707,116855.0,4.681213,58.148197,36141820.0,1.941701,27.260132,5.443797,106.783615,17.366682,31.794965,33.647402,-125.144635,231.928249,11.371108,39.324787,-0.879974,180709100.0,0.320175,535.51905,61.405001,7.812533,31.676884,-2.984671,2.436771,0.959456,0.434987,0.701928,1.430707,0.466753,0.119122,3.63113,-0.076343,0.500399,0.674708,0.992265,0.688946,0.943123,0.504461,0.243837,2.628258,0.849057,80555.29,0.27447,1.175761,60.717691,2.801018,173.914765,0.04752,0.018233,3.285204,163533.3,0.557195,0.697669,0.746542,0.775021,46.649809,0.014521,1172.94868,0.219694,2.231184,57.885746,266.227571,16945.54,4.2063,0.021081,1778.477056,0.333111,0.59854,33.436616,0.013806,4.772793,0.228446,247.065839,4.365159,3731.801506,0.159677,3.035793,7482.814,0.939881,61.318985,15.892302,996.34085,0.260451,0.017668,0.213099,12.351136,0.00447,29.030057,0.000273,41.397643,0.011702,0.019347,-12.0,7.0,12.0,16.0,20.0,24.0,27.0,30.0,37.0,40.0,43.0,47.0,50.0,54.0,59.0,74.0
1,LPS,0.724761,0.479183,124.883675,260.617946,269.473561,281.391187,227.407124,295.824272,3514855.0,188.885821,0.487473,229335.573304,0.065248,3516425.0,-98.073802,26.661653,3049740000.0,2.791134,72.078512,2.4466,210.281818,39.91146,-46.043387,-57.512397,-265.466942,475.74876,30.597563,65.851509,0.633067,15248700000.0,0.168574,2216.427734,96.504503,9.650105,484.867231,34.593569,14.015937,0.496017,0.93164,0.444018,1.118878,0.298865,0.076366,4.234324,-0.481562,0.965314,0.783179,0.998765,0.786342,0.978968,0.401414,0.188339,3.667722,3.627989,1082025.0,0.153843,3.642908,97.866819,5.329758,513.964346,0.063802,0.011844,4.582618,3002686.0,0.426924,0.555591,2.090177,0.677537,66.980161,0.007958,12586.826704,0.152449,3.835399,101.40904,2425.16575,220718.5,30.457324,0.011594,23595.545928,0.285785,0.551557,56.045378,0.006462,5.793374,0.117398,2352.608426,5.737755,89853.230143,0.127762,4.418613,118555.9,3.624034,96.93161,27.15412,2618.001247,0.324499,0.011933,0.124438,12.399264,0.001467,216.689211,1.8e-05,88.745656,0.010782,0.003792,-108.0,-95.0,-91.0,-86.0,-81.0,-77.0,-71.0,-65.0,-49.0,-39.0,-30.0,-22.0,-13.0,-1.0,14.0,43.0
2,LPS,0.683847,0.546616,132.578843,242.544779,266.300582,283.086559,230.262893,290.210269,3137952.0,165.863518,0.436705,237352.075561,0.075639,3139355.0,-104.045551,32.642492,3064230000.0,2.928552,77.887246,2.519586,264.719625,42.92026,-47.175847,-58.670628,-202.018093,466.737718,31.911059,69.859503,0.671622,15321150000.0,0.149631,2654.789687,62.219854,7.623476,712.97579,46.420516,16.904654,0.494786,0.943126,0.432143,1.126578,0.308039,0.06631,4.394081,-0.501418,0.973221,0.790171,0.998769,0.793812,0.979556,0.382437,0.149186,3.8235,4.34986,1010228.0,0.148218,4.29177,61.897989,4.281195,274.312222,0.088252,0.021329,4.519584,3173752.0,0.465645,0.60308,1.531717,0.708234,43.929634,0.015091,11447.382715,0.136423,4.531268,64.017125,898.529406,60044.92,17.858886,0.021238,21612.200129,0.257561,0.520101,33.992002,0.011161,6.10608,0.133644,842.540362,5.88248,83818.488494,0.133496,4.172807,93948.91,4.331687,62.292465,24.365475,1542.6149,0.506132,0.021194,0.134973,8.574264,0.002884,330.964366,2e-05,84.984713,0.018246,0.003553,-115.0,-100.0,-95.0,-90.0,-84.0,-79.0,-72.0,-66.0,-51.0,-42.0,-33.0,-23.0,-11.0,2.0,15.0,56.0
3,LPS,0.914293,0.651282,207.671486,318.865755,350.772006,372.11423,353.865794,372.651312,12710670.0,291.536877,0.585162,450111.311638,0.035412,12713500.0,-79.886667,-9.796667,7668966000.0,2.244373,36.721389,10.191297,615.746667,22.296779,-46.587201,-49.333333,-422.026667,1037.773333,15.251287,54.918774,1.048527,38344830000.0,0.25287,845.704459,245.669359,15.635863,262.764421,11.816963,5.234308,0.477704,0.832737,0.419118,1.097041,0.302044,0.118405,3.650442,-0.371181,0.900318,0.795913,0.99973,0.798669,0.990283,0.379697,0.234354,3.127266,1.428003,6055455.0,0.227341,1.744282,249.13987,4.671734,1129.871217,0.019656,0.004128,4.012187,11863730.0,0.445401,0.581972,1.719197,0.692161,174.242093,0.002839,49675.034889,0.171594,3.295894,260.577645,4764.349192,1120099.0,20.390083,0.004039,73325.724188,0.253292,0.512673,135.683942,0.00206,5.777437,0.113852,4687.202002,5.154302,341056.905603,0.134132,4.045203,642972.5,1.434425,245.957525,25.62188,6174.9473,0.108012,0.004161,0.121079,31.068774,0.000493,269.253295,3e-06,466.474154,0.001017,0.009293,-95.0,-79.0,-73.0,-69.0,-65.0,-61.0,-57.0,-53.0,-46.0,-41.0,-37.0,-32.0,-27.0,-21.0,-14.0,8.0
4,LMS,0.514362,0.358723,20.466013,57.052381,42.201896,58.258047,43.139309,60.049979,18691.04,29.345593,0.557026,6114.546056,0.327138,18950.0,15.246591,65.220718,9186810.0,1.857361,23.712974,29.78007,90.936911,17.807089,39.167666,43.334965,-312.507242,403.444152,10.014164,49.233733,-3.937494,45934050.0,0.356598,889.854422,229.431505,15.111955,640.860515,-38.635198,4.724548,0.483263,0.814408,0.396062,1.102137,0.326398,0.194021,3.072236,-0.302742,0.815962,0.810257,0.998347,0.813674,0.97824,0.343476,0.345429,2.609531,1.301953,13875.32,0.312282,1.957408,225.484538,3.234471,749.161505,0.014789,0.005148,3.406472,21414.12,0.481953,0.651305,0.877079,0.717094,159.299514,0.003921,134.088235,0.197189,4.398527,211.716176,136.352941,32085.83,0.585582,0.007043,176.247059,0.259187,0.522669,106.420972,0.004765,5.345775,0.17942,105.288711,4.54091,547.305013,0.144408,3.685273,1351.508,1.486287,228.556992,21.343008,5001.381266,0.092353,0.00486,0.159095,33.881026,0.001154,1.443743,0.001864,77.332146,0.007155,0.772468,12.0,26.0,30.0,33.0,36.0,39.0,41.0,44.0,48.0,50.0,52.0,55.0,58.0,61.0,65.0,76.0
5,LPS,0.630863,0.51952,111.371132,214.373209,224.017856,242.266382,177.259696,242.540718,2048612.0,135.24011,0.509798,153012.146349,0.074691,2049485.0,-54.807791,52.715176,762706100.0,2.414934,37.726614,3.544287,215.419327,31.145898,15.213962,28.103298,-241.765991,457.185318,19.65194,43.136136,-1.15319,3813531000.0,0.254141,1629.261605,125.994407,11.120143,365.617023,-39.07879,10.004611,0.657341,0.876694,0.546284,1.248311,0.358915,0.111957,3.956414,-0.352266,0.90321,0.737926,0.99819,0.744611,0.972948,0.455995,0.267509,3.210257,2.665488,878090.8,0.197035,2.976494,123.302259,4.296364,560.82397,0.036141,0.009156,4.11798,2113465.0,0.474241,0.604017,1.555407,0.715334,86.997756,0.006683,11222.900604,0.179285,3.811234,120.331736,3402.329547,486441.2,24.001327,0.009664,19075.53334,0.304731,0.57031,70.762195,0.005367,5.40663,0.152716,3359.452135,5.100636,55658.552044,0.135787,3.962826,104171.8,2.691243,126.10943,22.767649,2975.436895,0.191357,0.008862,0.152404,18.758316,0.001424,145.928903,2.2e-05,90.56192,0.010211,0.004697,-76.0,-38.0,-17.0,-1.0,8.0,14.0,19.0,23.0,30.0,33.0,36.0,40.0,44.0,48.0,52.0,67.0
6,LPS,0.744668,0.493732,54.086218,109.545797,117.647779,117.170815,107.703296,127.992187,337873.5,81.575226,0.67957,34521.191252,0.102172,338300.0,-40.073765,121.252259,457060000.0,3.081677,101.291019,2.123543,298.176667,52.678867,54.993447,73.561605,-111.861728,410.038395,41.241843,82.190311,-0.581984,2285300000.0,0.137128,3730.967921,65.677598,7.739102,1206.482929,-70.220314,23.678698,0.543093,0.955157,0.465995,1.167574,0.325942,0.062613,4.593564,-0.503837,0.977072,0.774674,0.998134,0.779119,0.974334,0.405192,0.167161,3.988715,6.055448,99384.18,0.120241,6.019861,60.516334,3.109125,227.224081,0.078206,0.031559,4.429741,450601.0,0.545165,0.678671,0.938018,0.766832,43.962817,0.025383,1540.17574,0.118094,5.439916,51.555283,298.397638,27541.92,4.187885,0.036622,3196.043245,0.245058,0.503755,25.988613,0.018613,6.120407,0.192758,271.483803,5.7948,9886.769642,0.146124,3.757051,9278.05,6.051463,65.313243,19.416465,1484.792285,0.462309,0.029155,0.172804,9.490201,0.005995,37.346355,0.000182,73.435466,0.039489,0.018747,-61.0,-27.0,-11.0,6.0,22.0,37.0,52.0,63.0,82.0,90.0,96.0,101.0,107.0,112.0,118.0,133.0
7,LPS,0.650335,0.509524,101.872264,199.936056,187.480666,228.547588,163.168624,229.499455,1682282.0,130.025329,0.518659,131886.742207,0.078398,1682980.0,-90.651826,-49.428509,1836179000.0,1.618469,20.074689,13.284648,198.362348,13.820128,-71.161893,-74.391696,-440.843817,639.206164,8.491806,73.858925,2.009251,9180893000.0,0.393319,391.125804,245.141887,15.640356,62.188537,6.461244,2.404793,0.320221,0.764977,0.302466,0.926368,0.228735,0.234095,2.672792,-0.331169,0.808592,0.85051,0.999527,0.851462,0.988821,0.289195,0.356762,2.333802,0.681254,985093.7,0.3376,0.994682,250.283035,7.557522,1820.954618,0.031599,0.004062,3.843494,1044919.0,0.358102,0.481608,3.246182,0.616917,156.988567,0.002477,5235.175456,0.201082,2.444069,266.692913,6107.478702,1455972.0,25.729496,0.003928,7259.992971,0.278855,0.54085,146.587719,0.002131,5.400409,0.077348,5940.330104,4.539923,42056.907575,0.124948,4.736594,132389.4,0.71488,245.591222,34.314026,8260.440668,0.143473,0.00412,0.091236,23.69962,0.000365,74.032379,2.1e-05,116.338893,0.00117,0.017858,-99.0,-90.0,-87.0,-85.0,-83.0,-80.0,-78.0,-76.0,-72.0,-70.0,-68.0,-65.0,-62.0,-59.0,-54.0,-35.0
8,LPS,0.637484,0.61746,93.868179,152.023016,143.808901,155.128979,146.727639,164.015243,972233.3,96.912286,0.717046,66188.745988,0.068079,972810.0,4.400585,47.796336,193461100.0,1.668912,22.262421,5.512442,168.702887,13.826938,25.802316,25.364374,-305.678061,474.380949,9.267818,31.53318,0.092814,967305300.0,0.374864,328.581922,211.485016,14.533459,15.102324,0.517725,1.72011,0.665764,0.441912,0.550585,1.255172,0.36262,0.159391,3.153278,-0.095083,0.519669,0.736194,0.998345,0.743123,0.974029,0.456956,0.237703,2.396916,0.596468,703078.6,0.330038,0.783339,212.370708,4.017609,846.688479,0.01923,0.004788,3.362191,986772.9,0.463209,0.608289,1.31502,0.706146,150.21829,0.003389,5726.112531,0.225775,1.917701,213.018137,2258.343033,471910.0,10.859333,0.00491,8773.295955,0.345923,0.609772,129.363426,0.003043,4.750982,0.130354,2199.492683,4.337196,28391.432284,0.145925,3.46036,72934.34,0.61067,211.761613,22.021566,4637.693902,0.105243,0.004783,0.144249,30.644015,0.000705,102.491477,2.9e-05,60.959941,0.003768,0.006907,-8.0,5.0,9.0,12.0,15.0,18.0,20.0,23.0,28.0,30.0,33.0,36.0,39.0,43.0,47.0,61.0
9,LMS,0.793122,0.706206,71.152299,100.752894,102.449988,115.602768,107.335921,122.298814,386684.4,79.909372,0.612343,41917.379923,0.108402,387250.0,32.053149,95.946167,390006600.0,2.127275,31.551802,4.509484,197.362392,20.148559,65.838719,68.763307,-89.383425,286.745817,13.263064,70.961904,-0.775875,1950033000.0,0.279043,700.85482,51.820373,7.160973,57.081328,-6.119438,3.370516,1.207167,0.472586,0.797336,1.557933,0.571423,0.092353,4.003687,-0.091662,0.565055,0.64182,0.991844,0.662171,0.940848,0.505777,0.165883,2.825214,1.144421,243286.2,0.247139,1.444659,51.431173,2.683905,142.03796,0.055511,0.02273,3.446801,554002.6,0.562777,0.706125,0.67834,0.778762,39.611266,0.018338,3223.489539,0.184263,2.608329,49.190865,237.031897,12753.31,4.509088,0.02701,6095.978393,0.348461,0.612744,30.147529,0.017606,5.020146,0.225875,217.431512,4.589437,13007.136062,0.167942,2.724514,21611.89,1.202573,52.086895,14.814254,793.433441,0.292437,0.021817,0.217542,10.896287,0.005688,100.708945,7.9e-05,49.266899,0.013495,0.005657,11.0,35.0,42.0,47.0,52.0,57.0,61.0,65.0,72.0,76.0,80.0,84.0,88.0,93.0,98.0,116.0


In [13]:
# Fix the seed used by the estimator below and seed NumPy's global generator.
random_state = 42
np.random.seed(random_state)

# Separate the label column from the feature matrix.
y = df[target]
X = df.drop(columns=target)

# Correlation-based feature selection (project-local helper).
# NOTE(review): featureGroupHierarchy presumably sets which feature/group is
# preferred when correlated features are pruned - confirm in featureSelection.
correlationHierarchy = ['shape_MeshVolume', 'shape', 'firstorder']
fsc = featureSelection_correlation(
    threshold=0.9,
    exact=False,
    featureGroupHierarchy=correlationHierarchy,
)

# L1-penalised logistic regression; the regularisation strength is chosen by
# an internal 5-fold cross-validation over 20 candidate values of C.
model = LogisticRegressionCV(
    penalty='l1',
    multi_class='multinomial',
    solver='saga',
    Cs=20,
    cv=5,
    random_state=random_state,
    max_iter=50000,
)

# Selection -> standardisation -> classifier, fitted as one estimator.
pipe = Pipeline(steps=[
    ('fsc', fsc),
    ('scaler', StandardScaler()),
    ('classifier', model),
])

In [14]:
# Fit the whole pipeline on all of the data to inspect which features the
# L1-penalised model retains.
pipe.fit(X, y)

# NOTE(review): mask_ is assumed to be a boolean mask over X's columns marking
# the features kept by the correlation selector - confirm in featureSelection.
selectedFeatures = np.array(X.columns)[pipe.named_steps['fsc'].mask_]

# Only the first row of coef_ is reported.
coef = pd.DataFrame({
    'Feature': list(selectedFeatures),
    'Coef': list(pipe.named_steps['classifier'].coef_[0]),
})

# Order by absolute coefficient size (largest first), then drop the features
# whose coefficient is exactly zero.
coef = coef.sort_values(by='Coef', ascending=False, key=abs)
coef = coef[coef['Coef'] != 0]
coef

Unnamed: 0,Feature,Coef
4,firstorder_90Percentile,-0.690006
16,glrlm_RunEntropy,0.595666
14,glcm_InverseVariance,0.191322
3,shape_Sphericity,-0.136674
2,shape_MeshVolume,0.079317
0,shape_Elongation,-0.070658
10,firstorder_RootMeanSquared,-0.061169
7,firstorder_Maximum,0.047391
20,glszm_SmallAreaEmphasis,0.046037


In [None]:
# Estimate out-of-sample AUROC of the full pipeline with repeated stratified
# k-fold cross-validation.
n_splits = 10
n_repeats = 1

# Seed the splitter explicitly so the fold assignment (and therefore the
# reported AUROC) is reproducible and does not depend on how much of NumPy's
# global random state earlier cells have consumed.
validation = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=random_state)

# Suppress warnings for cross_validate only, and put everything back
# afterwards. PYTHONWARNINGS is set as well as the in-process filter,
# presumably so that the worker processes started for n_jobs=-1 are silenced
# too - TODO confirm.
previousPythonWarnings = os.environ.get("PYTHONWARNINGS")
os.environ["PYTHONWARNINGS"] = "ignore"
try:
    # catch_warnings() reinstates the session's original filters on exit,
    # rather than replacing them with a blanket 'default' filter.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        cv = cross_validate(pipe, X, y, cv=validation, scoring='roc_auc', n_jobs=-1)
finally:
    # Restore the environment variable exactly as it was, even if
    # cross_validate raised.
    if previousPythonWarnings is None:
        os.environ.pop("PYTHONWARNINGS", None)
    else:
        os.environ["PYTHONWARNINGS"] = previousPythonWarnings

# Mean test AUROC over all folds and repeats.
print('AUROC (CV) = ' + str(np.round(np.mean(cv['test_score']),5)))