In [1]:
import pandas as pd
import numpy as np
from sklearn.utils import resample
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso, Ridge
from sklearn.model_selection import train_test_split
import os
import json
import time
import joblib

In [9]:
# Record the wall-clock start time of the run.
st = time.time()
file_name = 'T-OS_st1-2_rad_dataset-corr'

# Project root directory. Defaults to the original location; set the
# SYNERGIQC_OS_DIR environment variable to run this notebook on another
# machine without editing the cell.
main_dir = os.environ.get(
    "SYNERGIQC_OS_DIR",
    "/home/ulaval.ca/lesee/projects/Project2-synergiqc/OS/",
)

# Load the spreadsheet (clinical + radiomics table, judging by its file name).
data = pd.read_excel(os.path.join(main_dir, 'data/T-SynergiQc_annotated-clinical-radiomics1713-harmonized.xlsx'))

# Keep only the rows whose 'Stage' value is 1 or 2.
filtered_data = data[data['Stage'].isin([1, 2])]

# Build the working table without mutating anything in place:
#   1. drop identifier columns, 'Stage', and the recurrence / PFS columns;
#   2. drop every row that still contains at least one missing value.
# `drop` and `dropna` each return a new DataFrame, so `filtered_data` and
# `data` are left untouched and the cell can be re-run safely.
filtered_data_copy = (
    filtered_data
    .drop(columns=['PatientName', 'PatientID', 'StudyInstanceUID', 'Stage', 'Recurrence', 'PFS-months', 'PFS-days'])
    .dropna()
)

# Everything except the two OS outcome columns is treated as a feature.
features = filtered_data_copy.drop(columns=['OS-months', 'OS-days'])
print("number of features = ", np.shape(features))

number of features =  (1000, 1408)


In [11]:
# Target variable: the 'OS-months' column of the cleaned table
# (rows were restricted to Stage 1 and 2 when the table was built).
target = filtered_data_copy.loc[:, 'OS-months']

In [12]:
# Discard constant columns: a column is kept when its variance differs from 0.
feature_variances = features.var()
is_non_constant = feature_variances != 0.0
features = features.loc[:, is_non_constant]

# Remember the names of the surviving columns and report how many remain.
feature_names = features.columns.tolist()
print(len(feature_names))


1313


In [13]:
# Clinical (non-radiomic) columns that are separated from the radiomics later on.
columns_clinical_to_drop = ['Smoking', 'Age', 'Subtype', 'Sex', 'VitalStatus']

# Substrings that mark a texture feature name.
_TEXTURE_TAGS = ("_glcm_", "_gldm_", "_glrlm_", "_glszm_")


def _is_texture(name):
    """Return True when `name` contains at least one of the texture tags."""
    return any(tag in name for tag in _TEXTURE_TAGS)


# Name-based feature groups; each list keeps the order of `feature_names`.
log_features = [name for name in feature_names if "log-sigma" in name]
wavelet_features = [name for name in feature_names if "wavelet" in name]
shape_features = [name for name in feature_names if "_shape_" in name]
intensity_features = [name for name in feature_names if "_firstorder_" in name]
texture_features = [name for name in feature_names if _is_texture(name)]

# Same groups restricted to names that start with "original".
intensity_original_features = [name for name in intensity_features if name.startswith("original")]
texture_original_features = [name for name in texture_features if name.startswith("original")]

# `X_rads` is another name for the full feature table (no copy is made).
X_rads = features
print(np.shape(X_rads))
print(len(log_features))

(1000, 1313)
172


In [14]:
# Select the desired feature groups from the whole dataset by radiomics type.
X_rads_intensity_original = X_rads[intensity_original_features]
X_rads_texture_original = X_rads[texture_original_features]
X_rads_wavelet = X_rads[wavelet_features]
X_rads_intensity = X_rads[intensity_features]
X_rads_texture = X_rads[texture_features]
X_rads_shape = X_rads[shape_features]
X_rads_LoG = X_rads[log_features]
X_clinical = X_rads[columns_clinical_to_drop]

# Stack the LoG radiomics and the clinical columns side by side so that one
# split keeps their rows aligned. The LoG columns come first and the clinical
# columns form the trailing block of the combined array.
X_rads_clinical = np.concatenate((X_rads_LoG, X_clinical), axis=1)

# Column position where the clinical block starts. It is derived from the
# actual LoG block width instead of a hard-coded count, so it stays correct if
# the list of clinical columns changes (and also when that list is empty).
n_radiomic_columns = X_rads_LoG.shape[1]

# Split the data into training and testing sets (70 % / 30 %, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X_rads_clinical, target, test_size=0.3, random_state=42)

# Radiomics part: every column before the clinical block.
X_train_filtered = X_train[:, :n_radiomic_columns]
X_test_filtered = X_test[:, :n_radiomic_columns]
# Clinical part: the trailing columns taken from `columns_clinical_to_drop`.
X_train_clinical = X_train[:, n_radiomic_columns:]
X_test_clinical = X_test[:, n_radiomic_columns:]

print("Shape of the training features =", np.shape(X_train_filtered))
print("Shape of the test features =", np.shape(X_test_filtered))

Shape of the training features = (700, 172)
Shape of the test features = (300, 172)


In [15]:
# Pairwise correlation between the columns of the training matrix
# (rowvar=False: each column is a variable, each row an observation).
corr_matrix = np.corrcoef(X_train_filtered, rowvar=False)

# Keep only the part strictly above the diagonal so each pair appears once.
upper = np.triu(corr_matrix, k=1)

# (row, column) positions of the pairs whose absolute correlation exceeds the
# threshold. 0.90 is used here; 0.75 is the alternative for larger feature sets.
exceeds_threshold = np.abs(upper) > 0.90
columns_to_drop_indices = np.where(exceeds_threshold)

# For every flagged pair the column-side member is removed from the training set.
correlated_train_columns = np.unique(columns_to_drop_indices[1])
X_train_filtered_1 = np.delete(X_train_filtered, correlated_train_columns, axis=1)

print("Shape of the training features after removing correlated features =", X_train_filtered_1.shape)


Shape of the training features after removing correlated features = (700, 74)


In [16]:
# Remove from the test set the same columns that were flagged on the training
# set, so both matrices keep an identical column layout.
correlated_test_columns = np.unique(columns_to_drop_indices[1])
X_test_filtered_1 = np.delete(X_test_filtered, correlated_test_columns, axis=1)

print("Shape of the testing features after removing correlated features =", X_test_filtered_1.shape)

Shape of the testing features after removing correlated features = (300, 74)


In [17]:
# Fit the standardization on the training matrix only, then apply the same
# fitted transform to both the training and the test matrices.
scaler = StandardScaler()
scaler.fit(X_train_filtered_1)
X_train_scaled = scaler.transform(X_train_filtered_1)
X_test_scaled = scaler.transform(X_test_filtered_1)

# Shapes of the scaled radiomics matrices.
print(X_train_scaled.shape)
print(X_test_scaled.shape)

# Shapes of the clinical matrices.
print(X_train_clinical.shape)
print(X_test_clinical.shape)

(700, 74)
(300, 74)
(700, 5)
(300, 5)


In [38]:
# Convert the targets to plain NumPy arrays. `np.asarray` accepts a pandas
# Series as well as an array, so re-running this cell is safe; the previous
# `.values` access raised AttributeError on a second run because the name had
# already been rebound to an ndarray.
y_train = np.asarray(y_train)
y_test = np.asarray(y_test)

# Clinical columns of each split. They are built from bare arrays, so the
# CSV headers are positional integers rather than the clinical column names.
X_train_clinical_df = pd.DataFrame(X_train_clinical)
X_test_clinical_df = pd.DataFrame(X_test_clinical)
# Save the clinical data for training and test sets to CSV files.
X_train_clinical_df.to_csv(os.path.join(main_dir,'data/T-train_data_os_st1_2_clinical_corr_log_harmonized.csv'), index=False, float_format='%.7f')
X_test_clinical_df.to_csv(os.path.join(main_dir,'data/T-test_data_os_st1_2_clinical_corr_log_harmonized.csv'), index=False, float_format='%.7f')

# Scaled radiomics features and their targets, one frame each.
X_train_df = pd.DataFrame(X_train_scaled)
y_train_df = pd.DataFrame({'y_train': y_train})

X_test_df = pd.DataFrame(X_test_scaled)
y_test_df = pd.DataFrame({'y_test': y_test})

# Place the target as the last column next to the features. Both frames carry
# a fresh default index, so the rows line up by position.
combined_df_train = pd.concat([X_train_df, y_train_df], axis=1)
combined_df_test = pd.concat([X_test_df, y_test_df], axis=1)

# Save the combined feature + target tables to CSV files.
combined_df_train.to_csv(os.path.join(main_dir,'data/T-train_data_os_st1_2_rad_corr_log_harmonized.csv'), index=False, float_format='%.7f')
combined_df_test.to_csv(os.path.join(main_dir,'data/T-test_data_os_st1_2_rad_corr_log_harmonized.csv'), index=False, float_format='%.7f')