In [30]:
import pandas as pd
import numpy as np
from sklearn.utils import resample
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV, RepeatedKFold, KFold, RepeatedStratifiedKFold
from sklearn.feature_selection import SelectKBest, mutual_info_regression, f_regression
from skrebate import ReliefF, SURF, MultiSURF
from sklearn.linear_model import Lasso, Ridge
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import (DotProduct, WhiteKernel, RBF, Matern, ConstantKernel, ExpSineSquared, RationalQuadratic, Product)
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, make_scorer
#from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import VarianceThreshold
from scipy.stats import pearsonr
from imblearn.under_sampling import RandomUnderSampler
from lifelines import CoxPHFitter
from lifelines.utils import concordance_index
#from skopt import BayesSearchCVba
import os
import json
import time
import joblib

In [64]:
# get the start time
st = time.time()
file_name = 'T-OS_st1-2_dataset-corr'
#Load the data
main_dir= "/home/ulaval.ca/lesee/projects/Project2-synergiqc/OS/"

data = pd.read_excel(os.path.join(main_dir,'data/T-SynergiQc_annotated-clinical-radiomics1713-normal.xlsx'))
# Identify and drop columns with all NaN values
#data.dropna(axis=1, how='all', inplace=True)

# Filter the data for stage
filtered_data = data[data['Stage'].isin([1,2])]

# Explicitly create a copy of the data
filtered_data_copy = filtered_data.copy()

filtered_data_copy.drop(columns=['PatientName', 'PatientID', 'StudyInstanceUID', 'Stage',  'Recurrence', 'PFS-months', 'PFS-days'], inplace=True)

#Remove rows with missing data
filtered_data_copy.dropna(inplace=True)
                                            
features = filtered_data_copy.drop(columns=['OS-months', 'OS-days'])
print("number of features = ", np.shape(features))

number of features =  (1000, 1235)


In [65]:
# Set the target variable for stage 1 and stage 2
target = filtered_data_copy['OS-months']
#event = filtered_data_copy['VitalStatus']
#print("number of features = ", np.shape(features))

In [66]:
#Remove constant radiomic features
#constant_features = features.columns[features.nunique() == 1]
#features.drop(constant_features, axis=1, inplace=True)
features = features.loc[:, features.var() != 0.0]
#print("number of non-constant features = ", np.shape(features))
# Store the feature names
feature_names = features.columns.tolist()
#print(feature_names)


In [67]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test= train_test_split(features, target, test_size=0.3, random_state=42)
# Define the list of columns to drop
columns_clinical_to_drop = ['Smoking', 'Age', 'Subtype', 'Sex']
# Drop the clinical data from X_train_scaled and X_test_scaled
X_train_filtered = X_train.drop(columns=columns_clinical_to_drop)
X_test_filtered = X_test.drop(columns=columns_clinical_to_drop)

#Feature selection based on training set
#remove all features that are constant 
#X_train_filtered = X_train_filtered.loc[:, X_train_filtered.var() != 0.0]

print("Shape of the training features after removing constant features =", np.shape(X_train_filtered))

Shape of the training features after removing constant features = (700, 1231)


In [68]:
#get correlations of each features in dataset and remove one of each highly correlated to each other 
corr_matrix = X_train_filtered.corr()

# Select upper triangle of correlation matrix
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# Find name of feature columns with correlation greater than 0.9 which may be dropped
column_to_drop = [column for column in upper.columns if any(upper[column] > 0.75)]

#drop the desired fetures among the ones with high correlation 
X_train_filtered.drop(labels = column_to_drop, axis=1, inplace=True)
print("Shape of the training features after removing correlated features =", np.shape(X_train_filtered))

Shape of the training features after removing correlated features = (700, 197)


In [69]:
X_test_filtered = X_test_filtered[X_train_filtered.columns]
#print(X_test_filtered)

print("Shape of the test features after dropping high correlated features =", np.shape(X_test_filtered))

Shape of the test features after dropping high correlated features = (300, 197)


In [70]:
scaler = preprocessing.StandardScaler().fit(X_train_filtered)
X_train_scaled = scaler.transform(X_train_filtered)
X_test_scaled = scaler.transform(X_test_filtered)

print(np.shape(X_train_scaled))
print(np.shape(X_test_scaled))

# Extract the columns to be dropped from X_train_scaled and X_test_scaled
X_train_clinical = X_train[columns_clinical_to_drop]
X_test_clinical = X_test[columns_clinical_to_drop]


#scaler = preprocessing.StandardScaler().fit(X_train_clinical)
#X_train_clinical_scaled = scaler.transform(X_train_clinical)
#X_test_clinical_scaled = scaler.transform(X_test_clinical)
print(np.shape(X_train_clinical))
print(np.shape(X_test_clinical))

(700, 197)
(300, 197)
(700, 4)
(300, 4)


In [71]:
y_train = y_train.values
y_test = y_test.values

X_train_clinical_df = pd.DataFrame(X_train_clinical)
X_test_clinical_df = pd.DataFrame(X_test_clinical)
# Save the clinical data for training and test sets to CSV files
X_train_clinical_df.to_csv(os.path.join(main_dir,'data/T-train_data_os_st1_2_clinical_corr_harmonized.csv'), index=False, float_format='%.7f')
X_test_clinical_df.to_csv(os.path.join(main_dir,'data/T-test_data_os_st1_2_clinical_corr_harmonized.csv'), index=False, float_format='%.7f')

X_train_df = pd.DataFrame(X_train_scaled)
y_train_df = pd.DataFrame({'y_test': y_train})

X_test_df = pd.DataFrame(X_test_scaled)
y_test_df = pd.DataFrame({'y_test': y_test})

# Concatenate the DataFrames horizontally (side by side)
combined_df_train = pd.concat([X_train_df, y_train_df], axis=1)
combined_df_test = pd.concat([X_test_df, y_test_df], axis=1)

# Save the combined DataFrame to a CSV file
combined_df_train.to_csv(os.path.join(main_dir,'data/T-train_data_os_st1_2_rad_corr_harmonized.csv'), index=False, float_format='%.7f')
combined_df_test.to_csv(os.path.join(main_dir,'data/T-test_data_os_st1_2_rad_corr_harmonized.csv'), index=False, float_format='%.7f')