In [1]:
import os
import pandas as pd
import numpy as np
import random

#import logistic regression tools
from sklearn.linear_model import Lasso, Ridge

#import other classifier tools
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import confusion_matrix, mean_squared_error
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier

import warnings
warnings.filterwarnings("ignore")

In [2]:
# initialize output directory
output_dir = "./results"
if not os.path.exists(output_dir):
    os.mkdir(output_dir)

In [3]:
#load radiomics data
feat_data_dir = "./Final_ResampledClassificationData.csv"
feat_data = pd.read_csv(feat_data_dir)

In [4]:
#interrogate data if needed
#print(feat_data)

In [5]:
#split data to training and testing dataframe
train_df = feat_data[feat_data["Group"] == "train"]
val_df = feat_data[feat_data["Group"] == "val"]
test_df = feat_data[feat_data["Group"] == "test"]
merge_train = feat_data[feat_data["Group"] != "test"]

In [6]:
#select radiomics features from df (starting from original_shape_Elongation)
features = merge_train.columns.values[40:148]
print(features)

['original_shape_Elongation' 'original_shape_Flatness'
 'original_shape_LeastAxisLength' 'original_shape_MajorAxisLength'
 'original_shape_Maximum2DDiameterColumn'
 'original_shape_Maximum2DDiameterRow'
 'original_shape_Maximum2DDiameterSlice'
 'original_shape_Maximum3DDiameter' 'original_shape_MeshVolume'
 'original_shape_MinorAxisLength' 'original_shape_Sphericity'
 'original_shape_SurfaceArea' 'original_shape_SurfaceVolumeRatio'
 'original_shape_VoxelVolume' 'original_firstorder_10Percentile'
 'original_firstorder_90Percentile' 'original_firstorder_Energy'
 'original_firstorder_Entropy' 'original_firstorder_InterquartileRange'
 'original_firstorder_Kurtosis' 'original_firstorder_Maximum'
 'original_firstorder_MeanAbsoluteDeviation' 'original_firstorder_Mean'
 'original_firstorder_Median' 'original_firstorder_Minimum'
 'original_firstorder_Range'
 'original_firstorder_RobustMeanAbsoluteDeviation'
 'original_firstorder_RootMeanSquared' 'original_firstorder_Skewness'
 'original_firstor

In [7]:
#Get Respective Training and Test Predictor and Explanatory variable
X_train = merge_train.loc[:, features]
Y_train = merge_train.loc[:, "NE_Score"]

X_test = test_df.loc[:, features]
Y_test = test_df.loc[:,"NE_Score"]

In [8]:
#Use LASSO Regression to Select Features
clf = Lasso(alpha=0.35)
clf.fit(X_train, Y_train)

#Apply Test Set
results = clf.predict(X_test)
test_results = {"File" : test_df["File"].tolist(), "Predicted_NEScore" : results.tolist(), "True_NEScore": Y_test.tolist()}
#print("Predicted NE Scores for X_test:")
#print(test_results)

In [9]:
#Eval Performance on Test Data
clf.score(X_test, Y_test, sample_weight=None)
R2_value = np.corrcoef(Y_test, results)
print(R2_value)

[[1.         0.15739315]
 [0.15739315 1.        ]]


In [10]:
# Get Correlation Coefficient Matrix
coeff_mat = np.array(clf.coef_) #coefficient for each feature
features_select = {"Feature": features.tolist(), "CorrelationCoefficient": coeff_mat.tolist()}

In [11]:
#Get All Features Identified in Lasso
features_df = pd.DataFrame(features_select)
QOI = features_df[features_df["CorrelationCoefficient"] != 0]
print(QOI)

print("\n\nNumber of relevant features: ")
print(len(QOI["Feature"].tolist()))

                                               Feature  CorrelationCoefficient
3                       original_shape_MajorAxisLength            7.589864e-05
5                  original_shape_Maximum2DDiameterRow            2.964812e-04
8                            original_shape_MeshVolume           -4.984148e-07
9                       original_shape_MinorAxisLength            4.214767e-04
11                          original_shape_SurfaceArea           -3.287866e-08
13                          original_shape_VoxelVolume           -3.788453e-08
16                          original_firstorder_Energy           -4.318938e-12
29                     original_firstorder_TotalEnergy           -9.429379e-14
31                        original_firstorder_Variance           -3.450927e-05
57               original_gldm_DependenceNonUniformity            1.293782e-05
60                original_gldm_GrayLevelNonUniformity           -1.954947e-07
63               original_gldm_LargeDependenceEmphas

In [12]:
#look at top 5 (raw coefficient)
ind = np.argpartition(coeff_mat, -5)[-5:]
top5_values = coeff_mat[ind]
top5_labels = features[ind]
print(top5_values)
print(top5_labels)

[1.29378240e-05 7.41156063e-05 7.58986404e-05 2.96481240e-04
 4.21476657e-04]
['original_gldm_DependenceNonUniformity'
 'original_gldm_LargeDependenceEmphasis' 'original_shape_MajorAxisLength'
 'original_shape_Maximum2DDiameterRow' 'original_shape_MinorAxisLength']


In [13]:
#look at correlation
print(coeff_mat)
magnitude_coeff = abs(coeff_mat)
print("\n\nLooking at magnitude only: ")
print(magnitude_coeff)

[ 0.00000000e+00 -0.00000000e+00 -0.00000000e+00  7.58986404e-05
 -0.00000000e+00  2.96481240e-04 -0.00000000e+00 -0.00000000e+00
 -4.98414806e-07  4.21476657e-04  0.00000000e+00 -3.28786635e-08
 -0.00000000e+00 -3.78845317e-08  0.00000000e+00 -0.00000000e+00
 -4.31893830e-12 -0.00000000e+00 -0.00000000e+00 -0.00000000e+00
 -0.00000000e+00 -0.00000000e+00 -0.00000000e+00 -0.00000000e+00
 -0.00000000e+00 -0.00000000e+00 -0.00000000e+00 -0.00000000e+00
  0.00000000e+00 -9.42937866e-14  0.00000000e+00 -3.45092704e-05
  0.00000000e+00 -0.00000000e+00 -0.00000000e+00 -0.00000000e+00
  0.00000000e+00 -0.00000000e+00  0.00000000e+00 -0.00000000e+00
  0.00000000e+00 -0.00000000e+00 -0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00 -0.00000000e+00  0.00000000e+00
  0.00000000e+00 -0.00000000e+00 -0.00000000e+00 -0.00000000e+00
 -0.00000000e+00  0.00000000e+00 -0.00000000e+00 -0.00000000e+00
  0.00000000e+00  1.29378240e-05 -0.00000000e+00  0.00000000e+00
 -1.95494749e-07 -0.00000

In [14]:
#look at top 5 (abs value of coefficient)
ind = np.argpartition(magnitude_coeff, -5)[-5:]
top5_values = coeff_mat[ind]
top5_labels = features[ind]
print(top5_values)
print(top5_labels)

[-3.45092704e-05  7.41156063e-05  7.58986404e-05  2.96481240e-04
  4.21476657e-04]
['original_firstorder_Variance' 'original_gldm_LargeDependenceEmphasis'
 'original_shape_MajorAxisLength' 'original_shape_Maximum2DDiameterRow'
 'original_shape_MinorAxisLength']


In [15]:
#save Selected features
QOI.to_csv(os.path.join(output_dir, "LASSO_SelectedFeatures.csv"), index_label = "FeatureData_Index") 

In [16]:
print(ind)

[31 63  3  5  9]


In [17]:
Q5 = features_df.iloc[ind]
print(Q5)
Q5.to_csv(os.path.join(output_dir, "LASSO_SelectedFeatures_top5.csv"), index_label = "FeatureData_Index")

                                  Feature  CorrelationCoefficient
31           original_firstorder_Variance               -0.000035
63  original_gldm_LargeDependenceEmphasis                0.000074
3          original_shape_MajorAxisLength                0.000076
5     original_shape_Maximum2DDiameterRow                0.000296
9          original_shape_MinorAxisLength                0.000421
