In [None]:
# Importing the necessary libraries
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler

from imblearn.over_sampling import SMOTE

from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest, SelectPercentile
from sklearn.feature_selection import mutual_info_classif

from sklearn.model_selection import StratifiedKFold

## Getting data and preprocessing it

In [2]:
# Load the three tab-separated LUAD input tables.

# Gene expression: the file's 'sample' column becomes the index, then the table is transposed
luad_ge = pd.read_csv('TCGA.LUAD.sampleMap_HiSeqV2', sep='\t', index_col='sample').transpose()

# Survival data (columns used below: sample, OS, _PATIENT, OS.time)
luad_survival = pd.read_csv('TCGA-LUAD.survival.tsv', sep='\t')

# Clinical data
luad_ccv = pd.read_csv('clinical.tsv', sep='\t')

In [3]:
# De-duplicate the survival table: first one row per sample ID, then one row per patient.
# NOTE(review): keep='first' retains some sample IDs ending in '-11' (see the output below);
# confirm that the first-listed sample per patient is the one intended for modelling.

# Strip the last character of every sample ID
luad_survival['sample'] = luad_survival['sample'].map(lambda sample_id: sample_id[:-1])

luad_survival = (
    luad_survival
    .drop_duplicates(subset=['sample'], keep='first')
    .drop_duplicates(subset=['_PATIENT'], keep='first')
    .reset_index(drop=True)  # fresh 0..n-1 index; later cells address rows by this label
)
luad_survival

Unnamed: 0,sample,OS,_PATIENT,OS.time
0,TCGA-NJ-A4YI-01,1,TCGA-NJ-A4YI,4
1,TCGA-55-8506-11,0,TCGA-55-8506,11
2,TCGA-NJ-A55O-01,0,TCGA-NJ-A55O,13
3,TCGA-35-3615-01,0,TCGA-35-3615,14
4,TCGA-NJ-A55A-01,0,TCGA-NJ-A55A,15
...,...,...,...,...
504,TCGA-78-7143-01,1,TCGA-78-7143,4961
505,TCGA-49-AARR-11,0,TCGA-49-AARR,4992
506,TCGA-49-AARQ-11,0,TCGA-49-AARQ,6732
507,TCGA-78-8640-11,0,TCGA-78-8640,7062


In [4]:
# Keep a single clinical record per case: the first row seen for each case_submitter_id
luad_ccv = luad_ccv.drop_duplicates(subset=['case_submitter_id'], keep='first')
luad_ccv

Unnamed: 0,case_id,case_submitter_id,project_id,age_at_index,age_is_obfuscated,cause_of_death,cause_of_death_source,country_of_residence_at_enrollment,days_to_birth,days_to_death,...,treatment_arm,treatment_dose,treatment_dose_units,treatment_effect,treatment_effect_indicator,treatment_frequency,treatment_intent_type,treatment_or_therapy,treatment_outcome,treatment_type
0,cbf1f718-6bb7-4daf-b9d6-fb294281decb,TCGA-97-8172,TCGA-LUAD,75,'--,'--,'--,'--,-27416,'--,...,'--,'--,'--,'--,'--,'--,'--,no,'--,"Radiation Therapy, NOS"
2,a905d275-9283-4fa6-bbbf-46019bd1bcb7,TCGA-78-8655,TCGA-LUAD,77,'--,'--,'--,'--,-28379,'--,...,'--,'--,'--,'--,'--,'--,'--,no,'--,"Radiation Therapy, NOS"
4,1427cd18-5ad3-491a-9981-908e31ae49db,TCGA-91-6829,TCGA-LUAD,78,'--,'--,'--,'--,-28841,1258,...,'--,'--,'--,'--,'--,'--,'--,no,'--,"Radiation Therapy, NOS"
6,c0e263eb-1a83-4dc8-8abe-3dd2a59bae1b,TCGA-86-8672,TCGA-LUAD,59,'--,'--,'--,'--,-21682,19,...,'--,'--,'--,'--,'--,'--,'--,no,'--,"Radiation Therapy, NOS"
8,df576520-a6b6-4c9b-8d06-3f59cc5342fd,TCGA-75-7030,TCGA-LUAD,'--,'--,'--,'--,'--,'--,'--,...,'--,'--,'--,'--,'--,'--,'--,no,'--,"Radiation Therapy, NOS"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1034,81a0b2ff-a3d3-41bb-9ce6-765e6ae894af,TCGA-64-1679,TCGA-LUAD,58,'--,'--,'--,'--,-21310,'--,...,'--,'--,'--,'--,'--,'--,'--,no,'--,"Radiation Therapy, NOS"
1036,aee86a89-0377-4080-b16c-408bfbe78687,TCGA-69-7980,TCGA-LUAD,70,'--,'--,'--,'--,-25583,'--,...,'--,'--,'--,'--,'--,'--,'--,no,'--,"Pharmaceutical Therapy, NOS"
1038,4ef872e1-82c9-4939-9248-41ed9d3085b2,TCGA-78-7145,TCGA-LUAD,52,'--,'--,'--,'--,-19080,826,...,'--,'--,'--,'--,'--,'--,'--,yes,'--,"Pharmaceutical Therapy, NOS"
1040,42432463-8e92-4f25-b72a-f03953527aa5,TCGA-O1-A52J,TCGA-LUAD,74,'--,'--,'--,'--,-27223,1798,...,'--,'--,'--,'--,'--,'--,'--,no,'--,"Radiation Therapy, NOS"


In [5]:
# Manually chosen clinical columns (the order here is the column order of the cohort table)
clinical_feat = [
    # patient identifier
    'case_submitter_id',
    # demographics
    'age_at_index', 'gender',
    # AJCC pathologic staging
    'ajcc_pathologic_m', 'ajcc_pathologic_n', 'ajcc_pathologic_t',
    # diagnosis and history
    'primary_diagnosis', 'prior_malignancy', 'tissue_or_organ_of_origin',
    # treatment
    'treatment_type',
]

In [6]:
# Clinical cohort: cases that also appear in the survival table, restricted to the
# manually selected clinical columns, with a fresh 0..n-1 index.
has_survival = luad_ccv['case_submitter_id'].isin(luad_survival['_PATIENT'])
luad_ccv_cohort = (
    luad_ccv.loc[has_survival]
    .replace("'--", np.nan)  # the "'--" placeholder string becomes NaN
    .loc[:, clinical_feat]
    .reset_index(drop=True)
)
luad_ccv_cohort

Unnamed: 0,case_submitter_id,age_at_index,gender,ajcc_pathologic_m,ajcc_pathologic_n,ajcc_pathologic_t,primary_diagnosis,prior_malignancy,tissue_or_organ_of_origin,treatment_type
0,TCGA-97-8172,75,female,M0,N0,T2a,Acinar cell carcinoma,yes,"Upper lobe, lung","Radiation Therapy, NOS"
1,TCGA-78-8655,77,female,M0,N0,T1,Adenocarcinoma with mixed subtypes,no,"Lower lobe, lung","Radiation Therapy, NOS"
2,TCGA-91-6829,78,male,MX,N0,T2,"Bronchiolo-alveolar carcinoma, non-mucinous",no,"Lower lobe, lung","Radiation Therapy, NOS"
3,TCGA-86-8672,59,male,M0,N0,T3,"Adenocarcinoma, NOS",no,"Upper lobe, lung","Radiation Therapy, NOS"
4,TCGA-62-8398,55,male,M0,N2,T2,Adenocarcinoma with mixed subtypes,no,"Lower lobe, lung","Pharmaceutical Therapy, NOS"
...,...,...,...,...,...,...,...,...,...,...
504,TCGA-64-1679,58,female,M0,N2,T1,"Adenocarcinoma, NOS",no,"Upper lobe, lung","Radiation Therapy, NOS"
505,TCGA-69-7980,70,female,M0,N0,T1b,"Adenocarcinoma, NOS",no,"Upper lobe, lung","Pharmaceutical Therapy, NOS"
506,TCGA-78-7145,52,female,M1,N1,T4,"Adenocarcinoma, NOS",no,"Middle lobe, lung","Pharmaceutical Therapy, NOS"
507,TCGA-O1-A52J,74,female,MX,N0,T1,"Adenocarcinoma, NOS",no,"Lower lobe, lung","Radiation Therapy, NOS"


In [7]:
# Save the clinical feature (column) names to a text file, one name per line
clinic_feat = luad_ccv_cohort.columns.tolist()

# Create the output folder if it is missing so the cell also runs on a fresh checkout
os.makedirs('data_experiments_SMOTE', exist_ok=True)

# The context manager closes the file even if a write raises
with open('data_experiments_SMOTE/clinical_feature_names.txt', 'w') as file:
    for item in clinic_feat:
        file.write(item+"\n")

In [9]:
# Label-encode the categorical clinical columns (string categories -> integer codes).
# The encoding is written into a copy; luad_ccv_cohort itself keeps the original strings.
luad_ccv_cohort_ohe = luad_ccv_cohort.copy(deep=True)

le_list = [
    'gender',
    'ajcc_pathologic_m',
    'ajcc_pathologic_n',
    'ajcc_pathologic_t',
    'primary_diagnosis',
    'prior_malignancy',
    'tissue_or_organ_of_origin',
    'treatment_type',
]

le = LabelEncoder()

# column name -> array of classes found by the encoder (reused later to name one-hot columns)
class_label_list = {}

for col in le_list:
    # Fit on the untouched cohort and store the integer codes in the working copy
    codes = le.fit_transform(luad_ccv_cohort.loc[:, col])
    luad_ccv_cohort_ohe.loc[:, col] = codes
    class_label_list[col] = le.classes_
    print(f'{col} - {le.classes_}')

gender - ['female' 'male']
ajcc_pathologic_m - ['M0' 'M1' 'M1a' 'M1b' 'MX' nan]
ajcc_pathologic_n - ['N0' 'N1' 'N2' 'N3' 'NX' nan]
ajcc_pathologic_t - ['T1' 'T1a' 'T1b' 'T2' 'T2a' 'T2b' 'T3' 'T4' 'TX']
primary_diagnosis - ['Acinar cell carcinoma' 'Adenocarcinoma with mixed subtypes'
 'Adenocarcinoma, NOS' 'Bronchio-alveolar carcinoma, mucinous'
 'Bronchiolo-alveolar adenocarcinoma, NOS'
 'Bronchiolo-alveolar carcinoma, non-mucinous'
 'Clear cell adenocarcinoma, NOS' 'Micropapillary carcinoma, NOS'
 'Mucinous adenocarcinoma' 'Papillary adenocarcinoma, NOS'
 'Signet ring cell carcinoma' 'Solid carcinoma, NOS']
prior_malignancy - ['no' 'yes']
tissue_or_organ_of_origin - ['Lower lobe, lung' 'Lung, NOS' 'Main bronchus' 'Middle lobe, lung'
 'Overlapping lesion of lung' 'Upper lobe, lung']
treatment_type - ['Pharmaceutical Therapy, NOS' 'Radiation Therapy, NOS']


In [10]:
# Display the clinical cohort with its original string categories
# (the label-encoding cell wrote its codes into the copy luad_ccv_cohort_ohe)
luad_ccv_cohort

Unnamed: 0,case_submitter_id,age_at_index,gender,ajcc_pathologic_m,ajcc_pathologic_n,ajcc_pathologic_t,primary_diagnosis,prior_malignancy,tissue_or_organ_of_origin,treatment_type
0,TCGA-97-8172,75,female,M0,N0,T2a,Acinar cell carcinoma,yes,"Upper lobe, lung","Radiation Therapy, NOS"
1,TCGA-78-8655,77,female,M0,N0,T1,Adenocarcinoma with mixed subtypes,no,"Lower lobe, lung","Radiation Therapy, NOS"
2,TCGA-91-6829,78,male,MX,N0,T2,"Bronchiolo-alveolar carcinoma, non-mucinous",no,"Lower lobe, lung","Radiation Therapy, NOS"
3,TCGA-86-8672,59,male,M0,N0,T3,"Adenocarcinoma, NOS",no,"Upper lobe, lung","Radiation Therapy, NOS"
4,TCGA-62-8398,55,male,M0,N2,T2,Adenocarcinoma with mixed subtypes,no,"Lower lobe, lung","Pharmaceutical Therapy, NOS"
...,...,...,...,...,...,...,...,...,...,...
504,TCGA-64-1679,58,female,M0,N2,T1,"Adenocarcinoma, NOS",no,"Upper lobe, lung","Radiation Therapy, NOS"
505,TCGA-69-7980,70,female,M0,N0,T1b,"Adenocarcinoma, NOS",no,"Upper lobe, lung","Pharmaceutical Therapy, NOS"
506,TCGA-78-7145,52,female,M1,N1,T4,"Adenocarcinoma, NOS",no,"Middle lobe, lung","Pharmaceutical Therapy, NOS"
507,TCGA-O1-A52J,74,female,MX,N0,T1,"Adenocarcinoma, NOS",no,"Lower lobe, lung","Radiation Therapy, NOS"


In [11]:
# One-hot encode the multi-class clinical columns.
# 'gender' and 'prior_malignancy' are not listed here, so they keep the integer
# codes assigned by the label-encoding cell.
ohe_list = ['ajcc_pathologic_m',
'ajcc_pathologic_n',
'ajcc_pathologic_t',
'primary_diagnosis',
'tissue_or_organ_of_origin',
'treatment_type']

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output` and
# 1.4 removed the old name. Try the new spelling first and fall back for older releases,
# so the cell runs on both.
try:
    ohe = OneHotEncoder(sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(sparse=False)

for col in ohe_list:
    # Indicator column names are built from the classes recorded during label encoding
    col_names = [f'{col}___{i}' for i in class_label_list[col]]
    indicators = pd.DataFrame(
        ohe.fit_transform(luad_ccv_cohort.loc[:, col].values.reshape(-1, 1)),
        columns=col_names,
    )
    # Replace the integer-coded column by its indicator columns.
    # .iloc[:, :-1] drops the last indicator of every feature: the nan indicator for
    # columns whose classes end in nan, otherwise the last category.
    luad_ccv_cohort_ohe.drop([col], axis=1, inplace=True)
    luad_ccv_cohort_ohe = pd.concat([luad_ccv_cohort_ohe, indicators.iloc[:, :-1]], axis=1)

In [12]:
# Display the clinical cohort after label encoding and one-hot encoding
luad_ccv_cohort_ohe

Unnamed: 0,case_submitter_id,age_at_index,gender,prior_malignancy,ajcc_pathologic_m___M0,ajcc_pathologic_m___M1,ajcc_pathologic_m___M1a,ajcc_pathologic_m___M1b,ajcc_pathologic_m___MX,ajcc_pathologic_n___N0,...,"primary_diagnosis___Micropapillary carcinoma, NOS",primary_diagnosis___Mucinous adenocarcinoma,"primary_diagnosis___Papillary adenocarcinoma, NOS",primary_diagnosis___Signet ring cell carcinoma,"tissue_or_organ_of_origin___Lower lobe, lung","tissue_or_organ_of_origin___Lung, NOS",tissue_or_organ_of_origin___Main bronchus,"tissue_or_organ_of_origin___Middle lobe, lung",tissue_or_organ_of_origin___Overlapping lesion of lung,"treatment_type___Pharmaceutical Therapy, NOS"
0,TCGA-97-8172,75,0,1,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,TCGA-78-8655,77,0,0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,TCGA-91-6829,78,1,0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,TCGA-86-8672,59,1,0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,TCGA-62-8398,55,1,0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
504,TCGA-64-1679,58,0,0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
505,TCGA-69-7980,70,0,0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
506,TCGA-78-7145,52,0,0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
507,TCGA-O1-A52J,74,0,0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


## Manual Labelling

For long-term lung cancer survival prediction, we labeled each sample using the OS time and OS event recorded in its clinical data. A sample with an OS time longer than 60 months was labeled 0. A sample with an OS time shorter than 60 months and an OS event equal to 1 was labeled 1. Samples that met neither condition were removed: no event had occurred for them, but their OS time was less than 60 months, so their 5-year outcome is unknown and they could not be labeled for training. Samples that did not have both kinds of clinical data were also removed. (In the code below, 60 months is counted as 60 × 30 = 1800 days.)

In [None]:
# Manual labelling: 0 = OS time above the 5-year cut-off, 1 = event before the cut-off,
# NaN = neither (such rows cannot be labeled)
days = 60*30  # 60 months counted as 30-day months
remove_samples = []  # never filled in this cell


def _survival_label(os_time, os_event):
    """Return the label for one survival record: 0, 1, or NaN when neither rule applies."""
    if os_time > days:
        return 0
    if os_time < days and os_event == 1:
        return 1
    return np.nan


sample_label = [
    _survival_label(os_time, os_event)
    for os_time, os_event in zip(luad_survival['OS.time'], luad_survival['OS'])
]
luad_survival['manual_labels'] = sample_label

In [None]:
# Drop every row containing a missing value (this removes the samples whose
# manual label is NaN), then index the table by sample ID
luad_survival = luad_survival.dropna(axis=0).set_index('sample')

In [None]:
# Join gene expression with the survival columns and labels on the shared index;
# only index values present in both tables are kept (default inner join)
luad_survival_ge = luad_ge.merge(luad_survival, left_index=True, right_index=True)

In [None]:
# Final clinical cohort: patients that also have gene expression and a label,
# with every row that still contains a missing value removed
in_expression_cohort = luad_ccv_cohort_ohe['case_submitter_id'].isin(luad_survival_ge['_PATIENT'])
luad_ccv_cohort_ohe_final = (
    luad_ccv_cohort_ohe.loc[in_expression_cohort]
    .dropna(axis=0)
    .copy(deep=True)
)

In [None]:
# Save the final patient ids, one per line, in the row order of the final clinical cohort
final_patient_ids = luad_ccv_cohort_ohe_final['case_submitter_id'].values.tolist()

# Create the output folder if it is missing so the cell also runs on a fresh checkout
os.makedirs('data_experiments_SMOTE', exist_ok=True)

# The context manager closes the file even if a write raises
with open('data_experiments_SMOTE/patient_id_list.txt', 'w') as file:
    for item in final_patient_ids:
        file.write(item+"\n")

In [None]:
# Drop the patient identifier so that only predictor columns remain
luad_ccv_cohort_ohe_final = luad_ccv_cohort_ohe_final.drop(columns=['case_submitter_id'])

In [None]:
# Report the number of missing values in every clinical predictor column
missing_per_column = luad_ccv_cohort_ohe_final.isna().sum()
for col, n_missing in missing_per_column.items():
    print(col, n_missing)

In [None]:
# Restrict the expression/label table to the final clinical cohort AND put its rows in
# the order of final_patient_ids (the row order of luad_ccv_cohort_ohe_final).
# The two tables are later stacked side by side purely by position, so without this
# reordering a patient's clinical features would be paired with another patient's
# gene expression and label.
cohort_rows = luad_survival_ge[luad_survival_ge['_PATIENT'].isin(final_patient_ids)]
patient_rank = {pid: pos for pos, pid in enumerate(final_patient_ids)}
row_order = np.argsort(cohort_rows['_PATIENT'].map(patient_rank).values, kind='stable')
luad_survival_ge = cohort_rows.iloc[row_order].copy(deep=True)

# Fail loudly if the two tables do not match one-to-one (missing or duplicated patients)
assert luad_survival_ge['_PATIENT'].tolist() == final_patient_ids, \
    'gene-expression rows do not match the clinical cohort one-to-one'

# Remove the survival bookkeeping columns; the label column stays as the last column
luad_survival_ge.drop(['OS', '_PATIENT', 'OS.time'], axis=1, inplace=True)

In [None]:
# Names of the clinical predictor columns in the final cohort
list(luad_ccv_cohort_ohe_final.columns)

## Feature Selection and Cross Validation Splitting

In [None]:
# Predictors are every column except the last; the target is the last column
# (presumably 'manual_labels', the only survival column not dropped earlier - verify)
X, y = luad_survival_ge.iloc[:, :-1], luad_survival_ge.iloc[:, -1]

In [None]:
# Discard low-variance features (VarianceThreshold with a threshold of 10)
selector = VarianceThreshold(threshold=10).fit(X)
X_filter = selector.transform(X)

In [None]:
# Names of the features that passed the variance filter
genes_name = X.columns.values.tolist()
select_name_index0 = selector.get_support(indices=True)  # positions of the kept columns
select_name0 = [genes_name[i] for i in select_name_index0]
len(select_name0)

In [None]:
# Filtered data as a frame with one row per kept feature (rows labeled by feature name);
# the preview is transposed back for display
X_T = pd.DataFrame(X_filter.T, index=select_name0)
display(X_T.T.head())

In [None]:
# Remove features whose values are identical to an earlier feature's (if any).
# X_T has one row per feature, so duplicated() flags repeated features.
duplicated_features = X_T.duplicated()
features_kept = (~duplicated_features).tolist()
X_unique = X_T[features_kept].T  # transposed back: one column per feature
X_unique.shape

In [None]:
# Print the name of every feature that contains at least one missing value
for col in X_unique.columns:
    if X_unique[col].isna().any():
        print(col)

In [None]:
# Column labels of X_unique, in column order
X_cols = list(X_unique.columns)

In [None]:
# Combined design matrix: X_unique columns first, clinical columns after.
# NOTE(review): the two tables are joined by row position only; this assumes both list
# the patients in the same order - nothing in this cell checks it.
X_all = np.concatenate(
    [X_unique.to_numpy(), luad_ccv_cohort_ohe_final.to_numpy()],
    axis=1,
)
X_all.shape

In [None]:
# Number of samples per label value in y
y.value_counts()

In [None]:
def mi_based_selector(X_train, y_train, X_test, k):
    """Select features by mutual information with the labels.

    The selector is fitted on the training data only and then applied
    unchanged to both the training and the test data.

    Parameters
    ----------
    X_train, y_train : training features and labels used to fit the selector.
    X_test : features transformed with the already-fitted selector.
    k : forwarded unchanged to ``SelectKBest``.

    Returns
    -------
    tuple
        ``(X_train_fs, X_test_fs, fs)`` - the reduced training features, the
        reduced test features and the fitted ``SelectKBest`` instance.
    """
    fs = SelectKBest(score_func=mutual_info_classif, k=k).fit(X_train, y_train)
    return fs.transform(X_train), fs.transform(X_test), fs

In [None]:
# Build 5 stratified train/test splits. For each split: scale, oversample the training
# fold with SMOTE, select genetic features by mutual information, and save everything.
SEED = 42  # fixed seed so folds, SMOTE samples and MI scores are reproducible across runs

# mi_based_selector passes no random_state to mutual_info_classif, which then draws
# from NumPy's global random state - seed it as well.
np.random.seed(SEED)

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

# X_all holds the X_unique (genetic) columns first and the clinical columns after them
n_gen = X_unique.shape[1]

for i, (train_index, test_index) in enumerate(kf.split(X_all,y)):
    X_comb_train = X_all[train_index]
    X_comb_test = X_all[test_index]

    # The scaler is fitted on the training fold only and then applied to the test fold
    ss = StandardScaler()
    X_comb_train = ss.fit_transform(X_comb_train)
    X_comb_test = ss.transform(X_comb_test)

    y_train = y.values[train_index]
    y_test = y.values[test_index]

    # SMOTE is applied to the training fold only; the test fold keeps its original samples
    oversample = SMOTE(random_state=SEED)
    X_comb_train, y_train = oversample.fit_resample(X_comb_train, y_train)

    # Split the combined matrix back into its genetic and clinical parts
    X_gen_train = X_comb_train[:, :n_gen]
    X_all_train = X_comb_train[:, n_gen:]

    X_gen_test = X_comb_test[:, :n_gen]
    X_all_test = X_comb_test[:, n_gen:]

    path = f'data_experiments_SMOTE/split{i+1}'
    os.makedirs(path, exist_ok=True)  # np.save does not create missing folders

    # Clinical features and labels do not depend on k: save them once per split
    np.save(os.path.join(path, 'clinical_features_train'), X_all_train)
    np.save(os.path.join(path, 'labels_train'), y_train)
    np.save(os.path.join(path, 'clinical_features_test'), X_all_test)
    np.save(os.path.join(path, 'labels_test'), y_test)

    for k in [10,50, 'all']:
        X_gen_train_fs, X_gen_test_fs, fs = mi_based_selector(X_gen_train, y_train, X_gen_test, k=k)
        np.save(os.path.join(path, f'genetic_features_{k}_train'), X_gen_train_fs)
        np.save(os.path.join(path, f'genetic_features_{k}_test'), X_gen_test_fs)

        # The selector was fitted on a plain array, so without input_features it would
        # only report placeholder names (x0, x1, ...); X_cols supplies the real names.
        kbest = fs.get_feature_names_out(input_features=X_cols).tolist()

        # The selected features differ between splits, so a copy is stored inside each
        # split folder. The original top-level file is still written for compatibility
        # (it is overwritten by every split and ends up holding the last split's names).
        for names_path in (os.path.join(path, f'k_{k}.txt'), f'data_experiments_SMOTE/k_{k}.txt'):
            with open(names_path, 'w') as file:
                for item in kbest:
                    file.write(item+"\n")