# Retroperitoneal sarcoma radiomics study
# Prediction of tumour type and grade
# Logistic regression with group selection

In [5]:
import os
# show the current user's home directory
print(os.path.expanduser("~"))
# NOTE: `user` is only assigned in the setup cell further down; evaluating it
# before that cell has run raises the NameError recorded in the output below.
user

/Users/adminehann


NameError: name 'user' is not defined

In [3]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 500, 'display.max_rows', 500, 'display.precision', 3)
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, make_scorer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold, StratifiedKFold, RepeatedStratifiedKFold, GridSearchCV
from sklearn.model_selection import cross_validate
import os, warnings, copy, sys
import matplotlib.pyplot as plt
import dill
import pickle

# Resolve project locations relative to the user's home directory.
# os.path.expanduser("~") returns the home path WITHOUT a trailing separator
# (e.g. '/Users/name'), so the pieces must be combined with os.path.join;
# plain string concatenation would produce '/Users/nameDocuments/...'.
user = os.path.expanduser("~")
rootFolder = os.path.join(user, 'Documents', 'Matt', 'radsarc')

# make the project's machineLearning helper modules importable
sys.path.append(os.path.join(user, 'git', 'git_icrpythonradiomics', 'machineLearning'))
from featureSelection import featureSelection_correlation, featureSelection_groupName

from fit_LR_groupSelection_correlationThresholds import fit_LR_groupSelection_correlationThresholds, displayResultExperiments, displayOneExperiment


### Utility functions

In [None]:
def groupStrsDisp(strGroups):
    """Turn feature-group regex strings into short display labels.

    The following substitutions are applied to each group string:
      * ''                             -> 'all'
      * 'glcm|gldm|glszm|glrlm|ngtdm'  -> 'texture'
      * 'firstorder|histogram'         -> 'firstorder'

    Parameters
    ----------
    strGroups : str, list or tuple
        A single group string, or a sequence of group strings.

    Returns
    -------
    str, list or None
        A tidied string when a string is given, a list of tidied strings when
        a list or tuple is given, and None for any other input type (this
        fall-through is kept so existing callers see no new exceptions).
    """

    def tidyGroup(strGroup):
        # an empty pattern is displayed as 'all'
        if strGroup == '':
            strGroup = 'all'
        strGroup = strGroup.replace('glcm|gldm|glszm|glrlm|ngtdm', 'texture')
        strGroup = strGroup.replace('firstorder|histogram', 'firstorder')
        return strGroup

    if isinstance(strGroups, str):
        return tidyGroup(strGroups)

    # tuples are accepted as well as lists; the result is always a list
    if isinstance(strGroups, (list, tuple)):
        return [tidyGroup(x) for x in strGroups]

    # unsupported input type
    return None

## Load data

In [None]:
# Read the '220818_Completed segs' sheet of the clinical workbook and keep
# only the 'Anon Code', 'Grade' and 'subtype' columns.
clinicalSpreadsheet = os.path.join(rootFolder, 'Clinical data for analysis.xlsx')
dfClinical = pd.read_excel(
    clinicalSpreadsheet,
    sheet_name='220818_Completed segs',
    engine='openpyxl',
)[['Anon Code', 'Grade', 'subtype']]

In [None]:
# Load the extracted radiomic features, then remove every column whose name
# matches 'source', 'diagnostic' or 'histogram'.
dfRad = pd.read_csv(os.path.join(rootFolder, 'extractions/extractions__20220910_1006_allRegions/radiomicFeatures/radiomicFeatures.csv'))
for unwantedPattern in ('source', 'diagnostic', 'histogram'):
    dfRad.drop(columns=list(dfRad.filter(regex=unwantedPattern)), inplace=True)

## Run classifications

In [None]:
# folder holding the pickled experiment results that the cells below re-load
# instead of re-running a fit when a matching file already exists
quickLoadFolder = os.path.join(rootFolder, 'experiments/LR_GroupSelection')

### Tumour type: LMS vs. LPS
### Standard radiomics features

In [None]:
# Join the clinical table to the radiomics table ('Anon Code' matched against
# 'StudyPatientName') and discard both key columns once the rows are aligned.
df = (
    dfClinical
    .merge(dfRad, left_on='Anon Code', right_on='StudyPatientName')
    .drop(columns=['Anon Code', 'StudyPatientName'])
)

target = 'subtype'
featureSet = 'lesion_original'

# keep only the columns whose name matches the feature-set pattern or the target
df = df.filter(regex=featureSet + '|' + target)

In [None]:
# Settings handed to fit_LR_groupSelection_correlationThresholds.
# 'thresholds' runs from 0.60 to 1.00 in steps of 0.01 (rounded to 2 decimals).
settings = dict(
    n_splits=10,
    n_repeats=20,
    thresholds=np.round(np.arange(0.6,1.00001,0.01), 2),
)

# Candidate feature groups expressed as regex alternations: each single group,
# each pair of groups, and finally the empty pattern.
textureStr = 'glcm|gldm|glszm|glrlm|ngtdm'
settings['groupHierarchy'] = [
    'shape',
    'firstorder',
    textureStr,
    'shape|firstorder',
    '|'.join(['shape', textureStr]),
    '|'.join(['firstorder', textureStr]),
    '',
]

quickLoadFile = os.path.join(quickLoadFolder, 'LR_GroupSelection_Standard_TumourType.pickle')

# Run the fit only when no pickled result is available; otherwise re-load it.
# NOTE(review): unpickling can execute arbitrary code - only load result files
# that were produced by this project.
if not os.path.exists(quickLoadFile):
    result = fit_LR_groupSelection_correlationThresholds(df, target, settings)

    # saving the freshly fitted result is currently disabled in this cell:
#     with open(quickLoadFile, 'wb') as handle:
#         pickle.dump(result, handle)
else:
    with open(quickLoadFile, 'rb') as handle:
        result = pickle.load(handle)

# store an independent copy under an experiment-specific name, then drop the
# temporary so a later cell cannot pick up this experiment's result by accident
resultStandardType = copy.deepcopy(result)
del result
    

In [None]:
# Display the results of all experiments, then the single experiment at threshold=0.8
displayResultExperiments(resultStandardType, titleStr=target + ': LMS vs. LPS, standard radiomics features')
bestCoef, pdFreq = displayOneExperiment(resultStandardType, threshold=0.8)
# bestCoef.style.hide_index()

# Styler.hide_index() was deprecated in pandas 1.4 and removed in pandas 2.0;
# Styler.hide(axis='index') replaces it. Use whichever this pandas provides.
# (assumes pdFreq is a pandas DataFrame, as the original .style call implies)
freqStyler = pdFreq.style
freqStyler.hide(axis='index') if hasattr(freqStyler, 'hide') else freqStyler.hide_index()

### Tumour type: LMS vs. LPS
### Volume fraction and standard radiomics features

In [None]:
# Join the clinical table to the radiomics table ('Anon Code' matched against
# 'StudyPatientName') and discard both key columns once the rows are aligned.
df = (
    dfClinical
    .merge(dfRad, left_on='Anon Code', right_on='StudyPatientName')
    .drop(columns=['Anon Code', 'StudyPatientName'])
)

target = 'subtype'
featureSet = 'lesion_original|sarcomaFeature'

# keep only the columns whose name matches the feature-set pattern or the target
df = df.filter(regex=featureSet + '|' + target)

In [None]:
# Settings handed to fit_LR_groupSelection_correlationThresholds.
# 'thresholds' runs from 0.60 to 1.00 in steps of 0.01 (rounded to 2 decimals).
# NOTE(review): unlike the standard-feature cells, no 'groupHierarchy' entry is
# set here - confirm the fitting function handles its absence as intended.
settings = dict(
    n_splits=10,
    n_repeats=20,
    thresholds=np.round(np.arange(0.6,1.00001,0.01), 2),
)

quickLoadFile = os.path.join(quickLoadFolder, 'LR_GroupSelection_VolumeFractions_TumourType.pickle')

# Run the fit only when no pickled result is available; otherwise re-load it.
# NOTE(review): unpickling can execute arbitrary code - only load result files
# that were produced by this project.
if not os.path.exists(quickLoadFile):
    result = fit_LR_groupSelection_correlationThresholds(df, target, settings)

    # saving the freshly fitted result is currently disabled in this cell:
#     with open(quickLoadFile, 'wb') as handle:
#         pickle.dump(result, handle)
else:
    with open(quickLoadFile, 'rb') as handle:
        result = pickle.load(handle)

# store an independent copy under an experiment-specific name, then drop the
# temporary so a later cell cannot pick up this experiment's result by accident
resultVolFracType = copy.deepcopy(result)
del result
    

In [None]:
# Display the results of all experiments, then the single experiment at threshold=0.8
displayResultExperiments(resultVolFracType, titleStr = target + ': LMS vs. LPS, volume fraction + standard radiomics features')

bestCoef, pdFreq = displayOneExperiment(resultVolFracType, threshold=0.8)
# bestCoef.style.hide_index()

# Styler.hide_index() was deprecated in pandas 1.4 and removed in pandas 2.0;
# Styler.hide(axis='index') replaces it. Use whichever this pandas provides.
# (assumes pdFreq is a pandas DataFrame, as the original .style call implies)
freqStyler = pdFreq.style
freqStyler.hide(axis='index') if hasattr(freqStyler, 'hide') else freqStyler.hide_index()

### Tumour grade: 1 vs. 2 or 3
### Standard radiomics features

In [None]:
# Join the clinical table to the radiomics table ('Anon Code' matched against
# 'StudyPatientName') and discard both key columns once the rows are aligned.
df = (
    dfClinical
    .merge(dfRad, left_on='Anon Code', right_on='StudyPatientName')
    .drop(columns=['Anon Code', 'StudyPatientName'])
)

target = 'Grade1vs23'
featureSet = 'lesion_original'

# Binary label: True where Grade equals 1, False otherwise.
# NOTE(review): a missing Grade compares unequal to 1 and would be labelled
# False - confirm the Grade column has no missing values.
df[target] = df['Grade'].eq(1)

# keep only the columns whose name matches the feature-set pattern or the target
df = df.filter(regex=featureSet + '|' + target)

In [None]:
# Settings handed to fit_LR_groupSelection_correlationThresholds.
# 'thresholds' runs from 0.60 to 1.00 in steps of 0.01 (rounded to 2 decimals).
settings = dict(
    n_splits=10,
    n_repeats=20,
    thresholds=np.round(np.arange(0.6,1.00001,0.01), 2),
)

# Candidate feature groups expressed as regex alternations: each single group,
# each pair of groups, and finally the empty pattern.
textureStr = 'glcm|gldm|glszm|glrlm|ngtdm'
settings['groupHierarchy'] = [
    'shape',
    'firstorder',
    textureStr,
    'shape|firstorder',
    '|'.join(['shape', textureStr]),
    '|'.join(['firstorder', textureStr]),
    '',
]

quickLoadFile = os.path.join(quickLoadFolder, 'LR_GroupSelection_Standard_Grade1vs23.pickle')

# Run the fit only when no pickled result is available; otherwise re-load it.
# NOTE(review): unpickling can execute arbitrary code - only load result files
# that were produced by this project.
if not os.path.exists(quickLoadFile):
    result = fit_LR_groupSelection_correlationThresholds(df, target, settings)

    # saving the freshly fitted result is currently disabled in this cell:
#     with open(quickLoadFile, 'wb') as handle:
#         pickle.dump(result, handle)
else:
    with open(quickLoadFile, 'rb') as handle:
        result = pickle.load(handle)

# store an independent copy under an experiment-specific name, then drop the
# temporary so a later cell cannot pick up this experiment's result by accident
resultStandardGrade_1vs23 = copy.deepcopy(result)
del result
    

In [None]:
# Display the results of all experiments, then the single experiment at threshold=0.8
displayResultExperiments(resultStandardGrade_1vs23, titleStr = 'grade 1 vs. 2 or 3 standard radiomics features')

bestCoef, pdFreq = displayOneExperiment(resultStandardGrade_1vs23, threshold=0.8)
# bestCoef.style.hide_index()

# Styler.hide_index() was deprecated in pandas 1.4 and removed in pandas 2.0;
# Styler.hide(axis='index') replaces it. Use whichever this pandas provides.
# (assumes pdFreq is a pandas DataFrame, as the original .style call implies)
freqStyler = pdFreq.style
freqStyler.hide(axis='index') if hasattr(freqStyler, 'hide') else freqStyler.hide_index()

### Tumour grade: 1 or 2 vs. 3
### Standard radiomics features

In [None]:
# Join the clinical table to the radiomics table ('Anon Code' matched against
# 'StudyPatientName') and discard both key columns once the rows are aligned.
df = (
    dfClinical
    .merge(dfRad, left_on='Anon Code', right_on='StudyPatientName')
    .drop(columns=['Anon Code', 'StudyPatientName'])
)

target = 'Grade12vs3'
featureSet = 'lesion_original'

# Binary label: True where Grade is anything other than 3, False where it is 3.
# NOTE(review): a missing Grade compares unequal to 3 and would be labelled
# True - confirm the Grade column has no missing values.
df[target] = df['Grade'].ne(3)

# keep only the columns whose name matches the feature-set pattern or the target
df = df.filter(regex=featureSet + '|' + target)

In [None]:
# Settings handed to fit_LR_groupSelection_correlationThresholds.
# 'thresholds' runs from 0.60 to 1.00 in steps of 0.01 (rounded to 2 decimals).
settings = dict(
    n_splits=10,
    n_repeats=20,
    thresholds=np.round(np.arange(0.6,1.00001,0.01), 2),
)

# Candidate feature groups expressed as regex alternations: each single group,
# each pair of groups, and finally the empty pattern.
textureStr = 'glcm|gldm|glszm|glrlm|ngtdm'
settings['groupHierarchy'] = [
    'shape',
    'firstorder',
    textureStr,
    'shape|firstorder',
    '|'.join(['shape', textureStr]),
    '|'.join(['firstorder', textureStr]),
    '',
]

quickLoadFile = os.path.join(quickLoadFolder, 'LR_GroupSelection_Standard_Grade12vs3.pickle')

# Run the fit only when no pickled result is available; otherwise re-load it.
# NOTE(review): unpickling can execute arbitrary code - only load result files
# that were produced by this project.
if not os.path.exists(quickLoadFile):
    result = fit_LR_groupSelection_correlationThresholds(df, target, settings)

    # saving the freshly fitted result is currently disabled in this cell:
#     with open(quickLoadFile, 'wb') as handle:
#         pickle.dump(result, handle)
else:
    with open(quickLoadFile, 'rb') as handle:
        result = pickle.load(handle)

# store an independent copy under an experiment-specific name, then drop the
# temporary so a later cell cannot pick up this experiment's result by accident
resultStandardGrade_12vs3 = copy.deepcopy(result)
del result
    

In [None]:
# Display the results of all experiments, then the single experiment at threshold=0.8
displayResultExperiments(resultStandardGrade_12vs3, titleStr = 'grade 1 or 2 vs. 3 standard radiomics features')

bestCoef, pdFreq = displayOneExperiment(resultStandardGrade_12vs3, threshold=0.8)
# bestCoef.style.hide_index()

# Styler.hide_index() was deprecated in pandas 1.4 and removed in pandas 2.0;
# Styler.hide(axis='index') replaces it. Use whichever this pandas provides.
# (assumes pdFreq is a pandas DataFrame, as the original .style call implies)
freqStyler = pdFreq.style
freqStyler.hide(axis='index') if hasattr(freqStyler, 'hide') else freqStyler.hide_index()

### Tumour grade: 1 vs. 2 or 3
### Volume fractions and standard radiomics features

In [None]:
# Join the clinical table to the radiomics table ('Anon Code' matched against
# 'StudyPatientName') and discard both key columns once the rows are aligned.
df = (
    dfClinical
    .merge(dfRad, left_on='Anon Code', right_on='StudyPatientName')
    .drop(columns=['Anon Code', 'StudyPatientName'])
)

target = 'Grade1vs23'
featureSet = 'lesion_original|sarcomaFeature'

# Binary label: True where Grade equals 1, False otherwise.
# NOTE(review): a missing Grade compares unequal to 1 and would be labelled
# False - confirm the Grade column has no missing values.
df[target] = df['Grade'].eq(1)

# keep only the columns whose name matches the feature-set pattern or the target
df = df.filter(regex=featureSet + '|' + target)

In [None]:
# Settings handed to fit_LR_groupSelection_correlationThresholds.
# 'thresholds' runs from 0.60 to 1.00 in steps of 0.01 (rounded to 2 decimals).
# NOTE(review): unlike the standard-feature cells, no 'groupHierarchy' entry is
# set here - confirm the fitting function handles its absence as intended.
settings = {'n_splits':10,
            'n_repeats':20,
            'thresholds':np.round(np.arange(0.6,1.00001,0.01), 2)}

quickLoadFile = os.path.join(quickLoadFolder, 'LR_GroupSelection_VolumeFractions_Grade1vs23.pickle')

# Re-load a previously pickled result when one exists; otherwise fit and cache it.
# NOTE(review): unpickling can execute arbitrary code - only load result files
# that were produced by this project.
if os.path.exists(quickLoadFile):
    with open(quickLoadFile, 'rb') as handle:
        result = pickle.load(handle)
else:
    result = fit_LR_groupSelection_correlationThresholds(df, target, settings)

    # Make sure the cache folder exists before writing: without this, a missing
    # folder raises FileNotFoundError only after the fit has already finished.
    os.makedirs(os.path.dirname(quickLoadFile), exist_ok=True)
    with open(quickLoadFile, 'wb') as handle:
        pickle.dump(result, handle)

# store an independent copy under an experiment-specific name, then drop the
# temporary so a later cell cannot pick up this experiment's result by accident
resultVolumeFractionGrade_1vs23 = copy.deepcopy(result)

del result
    

In [None]:
# Display the results of all experiments, then the single experiment at threshold=0.8
displayResultExperiments(resultVolumeFractionGrade_1vs23, titleStr = 'grade 1 vs. 2 or 3 volume fractions + standard radiomics features')

bestCoef, pdFreq = displayOneExperiment(resultVolumeFractionGrade_1vs23, threshold=0.8)
# bestCoef.style.hide_index()

# Styler.hide_index() was deprecated in pandas 1.4 and removed in pandas 2.0;
# Styler.hide(axis='index') replaces it. Use whichever this pandas provides.
# (assumes pdFreq is a pandas DataFrame, as the original .style call implies)
freqStyler = pdFreq.style
freqStyler.hide(axis='index') if hasattr(freqStyler, 'hide') else freqStyler.hide_index()

### Tumour grade: 1 or 2 vs. 3
### Volume fractions + standard radiomics features 

In [None]:
# Join the clinical table to the radiomics table ('Anon Code' matched against
# 'StudyPatientName') and discard both key columns once the rows are aligned.
df = (
    dfClinical
    .merge(dfRad, left_on='Anon Code', right_on='StudyPatientName')
    .drop(columns=['Anon Code', 'StudyPatientName'])
)

target = 'Grade12vs3'
featureSet = 'lesion_original|sarcomaFeature'

# Binary label: True where Grade is anything other than 3, False where it is 3.
# NOTE(review): a missing Grade compares unequal to 3 and would be labelled
# True - confirm the Grade column has no missing values.
df[target] = df['Grade'].ne(3)

# keep only the columns whose name matches the feature-set pattern or the target
df = df.filter(regex=featureSet + '|' + target)

In [None]:
# Settings handed to fit_LR_groupSelection_correlationThresholds.
# 'thresholds' runs from 0.60 to 1.00 in steps of 0.01 (rounded to 2 decimals).
# NOTE(review): unlike the standard-feature cells, no 'groupHierarchy' entry is
# set here - confirm the fitting function handles its absence as intended.
settings = {'n_splits':10,
            'n_repeats':20,
            'thresholds':np.round(np.arange(0.6,1.00001,0.01), 2)}

quickLoadFile = os.path.join(quickLoadFolder, 'LR_GroupSelection_VolumeFractions_Grade12vs3.pickle')

# Re-load a previously pickled result when one exists; otherwise fit and cache it.
# NOTE(review): unpickling can execute arbitrary code - only load result files
# that were produced by this project.
if os.path.exists(quickLoadFile):
    with open(quickLoadFile, 'rb') as handle:
        result = pickle.load(handle)
else:
    result = fit_LR_groupSelection_correlationThresholds(df, target, settings)

    # Make sure the cache folder exists before writing: without this, a missing
    # folder raises FileNotFoundError only after the fit has already finished.
    os.makedirs(os.path.dirname(quickLoadFile), exist_ok=True)
    with open(quickLoadFile, 'wb') as handle:
        pickle.dump(result, handle)

# store an independent copy under an experiment-specific name, then drop the
# temporary so a later cell cannot pick up this experiment's result by accident
resultVolumeFractionGrade_12vs3 = copy.deepcopy(result)

del result
    

In [None]:
# Display the results of all experiments, then the single experiment at threshold=0.8
# (the call below previously ended with an unbalanced extra ')' - a SyntaxError)
displayResultExperiments(resultVolumeFractionGrade_12vs3, titleStr = 'grade 1 or 2 vs. 3 volume fraction + standard radiomics features')

bestCoef, pdFreq = displayOneExperiment(resultVolumeFractionGrade_12vs3, threshold=0.8)
# bestCoef.style.hide_index()

# Styler.hide_index() was deprecated in pandas 1.4 and removed in pandas 2.0;
# Styler.hide(axis='index') replaces it. Use whichever this pandas provides.
# (assumes pdFreq is a pandas DataFrame, as the original .style call implies)
freqStyler = pdFreq.style
freqStyler.hide(axis='index') if hasattr(freqStyler, 'hide') else freqStyler.hide_index()