In [1]:
import numpy as np
import pandas as pd
# pd.set_option('display.max_columns', 500, 'display.max_rows', 500)
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, make_scorer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold, StratifiedKFold, RepeatedStratifiedKFold, GridSearchCV
from sklearn.model_selection import cross_validate
import os, warnings, copy, sys
import matplotlib.pyplot as plt
import dill

sys.path.append('/data/users/morton/git/icrpythonradiomics/machineLearning')
from featureSelection import featureSelection_correlation, featureSelection_groupName


In [2]:
# function to make group strings easier to read
def groupStrsDisp(strGroups):
    """Return display-friendly versions of feature-group filter strings.

    The long texture alternation 'glcm|gldm|glszm|glrlm|ngtdm' is shortened to
    'texture', 'firstorder|histogram' is shortened to 'firstorder', and an
    empty string is shown as 'all'.

    Parameters
    ----------
    strGroups : str or iterable of str
        A single group filter string, or a collection of them (list, tuple,
        numpy array, ...).

    Returns
    -------
    str or list of str
        A tidied string when a single string is passed, otherwise a list with
        one tidied string per input element, in the same order.

    Raises
    ------
    TypeError
        If strGroups is neither a string nor an iterable.
    """

    def tidyGroup(strGroup):
        # an empty filter string is displayed as 'all'
        if strGroup == '':
            strGroup = 'all'
        strGroup = strGroup.replace('glcm|gldm|glszm|glrlm|ngtdm', 'texture')
        strGroup = strGroup.replace('firstorder|histogram', 'firstorder')
        return strGroup

    # a str is itself iterable, so it must be handled before the generic case
    if isinstance(strGroups, str):
        return tidyGroup(strGroups)

    # accept any iterable rather than only list, so tuples / arrays do not
    # silently fall through and return None
    try:
        groups = iter(strGroups)
    except TypeError:
        raise TypeError('strGroups must be a str or an iterable of str, got '
                        + type(strGroups).__name__) from None

    return [tidyGroup(x) for x in groups]

In [3]:
# read the clinical spreadsheet and keep only the patient code, grade and subtype columns
clinicalSpreadsheet = '/Users/morton/Dicom Files/RADSARC_R/ClinicalData/Clinical data for analysis.xlsx'
dfClinical = pd.read_excel(
    clinicalSpreadsheet,
    sheet_name='220818_Completed segs',
    engine='openpyxl',
)[['Anon Code', 'Grade', 'subtype']]

In [4]:
# read the radiomic feature table
dfRad = pd.read_csv('/Users/morton/Dicom Files/RADSARC_R/XNAT/extractions/extractions__20220910_1006_allRegions/radiomicFeatures/radiomicFeatures.csv')

# discard every column whose name contains 'source', 'diagnostic' or 'histogram'
dfRad = dfRad.drop(columns=list(dfRad.filter(regex='source|diagnostic|histogram')))

# keep the whole-lesion feature set plus the patient identifier used for merging
# (narrower alternative: 'lesion_sarcomaFeature|lesion_original_shape|lesion_original_firstorder')
featureSet = 'lesion_sarcomaFeature|lesion_original_'
dfRad = dfRad.filter(regex=featureSet + '|StudyPatientName')

# shorten column names: strip the 'lesion_' and 'original_' tags, then replace spaces with underscores
dfRad = dfRad.rename(
    columns=lambda name: name.replace('lesion_', '').replace('original_', '').replace(' ', '_')
)

In [5]:
# merge clinical and radiomics data into a single frame, matching the clinical
# 'Anon Code' to the radiomics 'StudyPatientName', then discard the two
# identifier columns and the unused 'Grade' column
df = (
    dfClinical
    .merge(dfRad, left_on='Anon Code', right_on='StudyPatientName')
    .drop(columns=['Anon Code', 'StudyPatientName', 'Grade'])
)
target = 'subtype'

# all cases with sarcomaFeature_low_enhancingVolumeFraction>0 are LMS, so stratify and only fit cases
# with sarcomaFeature_low_enhancingVolumeFraction == 0.
# The stratifying column is constant (zero) after the filter, so it is removed.
# drop() is used without inplace=True so that a new frame is returned; mutating
# the filtered selection in place triggers pandas' SettingWithCopyWarning.
df = (
    df
    .loc[df['sarcomaFeature_low_enhancingVolumeFraction'] == 0, :]
    .drop(columns='sarcomaFeature_low_enhancingVolumeFraction')
)

# df.drop('sarcomaFeature_mid_enhancingVolumeFraction', axis=1, inplace=True)

In [6]:
# outcome is the target column; predictors are every other column
y = df[target]
X = df.drop(columns=target)

# passed to the correlation selector as its featureGroupHierarchy
correlationHierarchy = ['sarcomaFeature', 'shape']

# candidate feature-group filters searched over by the grid search;
# NOTE(review): the empty string presumably selects all features - confirm in featureSelection_groupName
textureStr = 'glcm|gldm|glszm|glrlm|ngtdm'
groupHierarchy = [
    'sarcomaFeature|shape',
    'firstorder',
    textureStr,
    'sarcomaFeature|shape|firstorder',
    f'sarcomaFeature|shape|{textureStr}',
    f'firstorder|{textureStr}',
    '',
]

# correlation filter -> group-name filter -> standardisation -> L1-penalised logistic regression
pipe = Pipeline(steps=[
    ('correlationSelector', featureSelection_correlation(threshold=0.9, exact=False, featureGroupHierarchy=correlationHierarchy)),
    ('groupSelector', featureSelection_groupName()),
    ('scaler', StandardScaler()),
    ('lr', LogisticRegression(penalty='l1', solver="liblinear", max_iter=10000)),
])

# hyper-parameter grid: 20 log-spaced values of C between 0.01 and 100, crossed with every group filter
p_grid = dict(
    lr__C=np.logspace(np.log10(0.01), np.log10(100), 20),
    groupSelector__groupFilter=groupHierarchy,
)

In [7]:
random_state = 42
np.random.seed(random_state)

# tune C and the feature-group filter by 5-fold stratified grid search on all
# data, scoring by negative log loss, then refit the best setting on all data
inner_cv = StratifiedKFold(n_splits=5)
model = GridSearchCV(estimator=pipe, param_grid=p_grid, cv=inner_cv, refit=True, verbose=0, scoring='neg_log_loss', n_jobs=-1)
model.fit(X, y)

# set to 1 ready for using n_jobs = -1 for cross validation
model.n_jobs = 1

# outer validation: the whole grid search is repeated inside every split
# NOTE(review): no random_state is passed to the splitter, so the repeated
# splits presumably rely on the global NumPy seed set above - confirm
n_splits = 10
n_repeats = 2
validation = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats)

# suppress warnings for cross_validate only. The previous warning filters and
# the previous PYTHONWARNINGS value are restored even if the run fails or is
# interrupted, so an aborted run cannot leave warnings silenced for the rest
# of the session.
previousPythonWarnings = os.environ.get("PYTHONWARNINGS")
os.environ["PYTHONWARNINGS"] = "ignore"
try:
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        cv_result = cross_validate(model, X, y, cv=validation, scoring='roc_auc', return_estimator=True, n_jobs=-1)
finally:
    if previousPythonWarnings is None:
        os.environ.pop("PYTHONWARNINGS", None)
    else:
        os.environ["PYTHONWARNINGS"] = previousPythonWarnings

# mean and standard deviation of the per-split test AUROC
cv_mean = np.round(np.mean(cv_result['test_score']),3)
cv_std = np.round(np.std(cv_result['test_score']),4)

print('AUROC = ' + str(cv_mean) + ' \u00B1 ' + str(cv_std))


KeyboardInterrupt: 

In [None]:
# feature-group filter chosen by the grid search fitted to all data, in readable form
groupStrsDisp(model.best_estimator_.named_steps['groupSelector'].groupFilter)

In [None]:
# from fit to all data get the non-zero LR coefficients

# Compose the two selection masks: colMask0 flags the columns of X kept by the
# first pipeline step, colMask1 flags which of those survive the second step.
# Writing colMask1 into the True positions of colMask0 gives a mask over the
# original columns of X for the features that reach the classifier.
colMask0 = copy.deepcopy(model.best_estimator_.steps[0][1].mask_)
colMask1 = model.best_estimator_.steps[1][1].colMask_
colMask0[colMask0] = colMask1

# scatter the fitted coefficients back to their positions in X.columns
bc = np.zeros((X.shape[1]))
bc[colMask0] = model.best_estimator_._final_estimator.coef_.ravel()

# keep only features with a non-zero coefficient, largest magnitude first
nonZero = bc != 0
bestCoef = pd.DataFrame({'Feature':list(np.array(X.columns)[nonZero]), 'Coef':list(bc[nonZero])})
bestCoef = bestCoef.sort_values(by='Coef', ascending=False, key=abs)

# Styler.hide_index() was removed in pandas 2.0; use Styler.hide(axis='index')
# when available and fall back to hide_index() on older pandas
bestCoefStyler = bestCoef.style
bestCoefStyler.hide(axis='index') if hasattr(bestCoefStyler, 'hide') else bestCoefStyler.hide_index()

In [None]:
# get frequency that each feature is selected

# bc[n, j] holds the LR coefficient of column j of X in the model fitted on
# cross-validation split n, and stays zero where the feature was not used
bc = np.zeros((len(cv_result['estimator']), X.shape[1]))
for n, ecv in enumerate(cv_result['estimator']):
    # compose the masks of the two selection steps into one mask over X.columns
    colMask0 = copy.deepcopy(ecv.best_estimator_.steps[0][1].mask_)
    colMask1 = ecv.best_estimator_.steps[1][1].colMask_
    colMask0[colMask0] = colMask1
    bc[n, colMask0] = ecv.best_estimator_._final_estimator.coef_.ravel()

# percentage of fitted models in which each feature has a non-zero coefficient
pdFreq = pd.DataFrame({'Feature':X.columns, 'Frequency':np.mean(bc != 0, axis=0)*100})

# add coeff values for best fit; features absent from the best fit show an empty string
bestCoefByFeature = dict(zip(bestCoef.Feature, bestCoef.Coef))
pdFreq['Coef'] = pd.Series([bestCoefByFeature.get(feature, '') for feature in pdFreq.Feature],
                           index=pdFreq.index, dtype=object)

# keep features selected at least once, most frequently selected first
pdFreq = pdFreq.loc[pdFreq.Frequency>0,:].sort_values(by='Frequency', ascending=False, key=abs)

pd.set_option('display.precision', 2)

# Styler.hide_index() was removed in pandas 2.0; use Styler.hide(axis='index')
# when available and fall back to hide_index() on older pandas
pdFreqStyler = pdFreq.style
pdFreqStyler.hide(axis='index') if hasattr(pdFreqStyler, 'hide') else pdFreqStyler.hide_index()

In [None]:
# value of C held by the 'lr' step of the best pipeline found on each cross-validation split
C = [fitted.best_estimator_.named_steps['lr'].C for fitted in cv_result['estimator']]
np.round(C, 3)

In [None]:
# coefficient of every feature (columns) in every cross-validation model (rows).
# Styler.hide_index() was removed in pandas 2.0; use Styler.hide(axis='index')
# when available and fall back to hide_index() on older pandas
bcStyler = pd.DataFrame(data=bc, columns=X.columns).style
bcStyler.hide(axis='index') if hasattr(bcStyler, 'hide') else bcStyler.hide_index()

In [None]:
# NOTE(review): group_cv_counts is never populated in the visible cells; kept
# in case a later cell refers to it
group_cv_counts = {}

# readable name of the feature-group filter chosen on each cross-validation split
group_cv = [groupStrsDisp(x.best_estimator_.steps[1][1].groupFilter) for x in cv_result['estimator']]

# number of splits on which each candidate group was chosen
# (raw counts, not normalised to fractions)
groupLabels = groupStrsDisp(groupHierarchy)
groupCounts = [group_cv.count(label) for label in groupLabels]

# Styler.hide_index() was removed in pandas 2.0; use Styler.hide(axis='index')
# when available and fall back to hide_index() on older pandas
groupCountStyler = pd.DataFrame(columns=groupLabels, data=[groupCounts]).style
groupCountStyler.hide(axis='index') if hasattr(groupCountStyler, 'hide') else groupCountStyler.hide_index()

In [None]:
# number of LMS and LPS cases in the fitted cohort. df was already restricted
# to sarcomaFeature_low_enhancingVolumeFraction == 0 when it was built, and
# that column was then dropped, so indexing it again here raises KeyError.
[np.sum(df.subtype == 'LMS'), np.sum(df.subtype == 'LPS')]