In [1]:
%pip install matplotlib
%pip install scikit-learn
%pip install tensorflow



In [69]:
# importing libraries
from time import time
import multiprocessing
import numpy as np
import pandas as pd
from tqdm import tqdm
import os
from glob import glob
import matplotlib.pyplot as plt
import sys
import warnings
from sklearn.utils import shuffle
import random
import logging
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split
warnings.filterwarnings("ignore", category=DeprecationWarning)
logger = logging.getLogger("radiomics")
logger.setLevel(logging.ERROR)
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [70]:
# loadnig the data and removing the useless column 'Unnamed: 0'
whole_data = pd.read_csv('/content/gdrive/MyDrive/radiomics/Chexpert_synced_data_nan_free.csv')
whole_data.head(5)


Unnamed: 0.1,Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,...,Lung Lesion,Edema,Consolidation,Pneumonia,Atelectasis,Pneumothorax,Pleural Effusion,Pleural Other,Fracture,Support Devices
0,0,-669.603949,685.781964,10595200000.0,6.0176,866.092155,1.713989,934.117676,428.763596,10.27966,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1,-801.474548,564.486328,12747140000.0,6.14663,954.811813,1.756523,1020.768616,468.058052,-132.958831,...,0.0,1.0,1.0,-1.0,-1.0,0.0,1.0,0.0,0.0,1.0
2,2,-844.417249,307.704349,9364581000.0,5.848908,690.78833,1.91363,589.552979,354.572276,-308.642302,...,0.0,1.0,-1.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0
3,4,-742.036096,673.796082,11322590000.0,6.076017,1038.149048,1.627926,933.427429,467.972161,-17.9099,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
4,5,-802.322357,560.980408,14105470000.0,6.061123,783.288322,2.001557,869.513733,412.838385,-210.721353,...,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0


In [71]:
# excludiung Ages with Nans
whole_data = whole_data[~pd.isnull(whole_data['Age'])]
whole_data.shape

(188420, 489)

In [72]:
random_seed = 21202
del whole_data['Unnamed: 0']
whole_data.head()
whole_data = whole_data.sample(frac=1.0, random_state=random_seed)

In [73]:
# finding total number of samples:
n_samples = whole_data.shape[0]
print("number of samples are: {}".format(n_samples))

number of samples are: 188420


In [74]:
# names of all targets:
targets_names = list(whole_data.iloc[:,464:])
print(targets_names)

['index', 'Path', 'Patient_id', 'Study_Number', 'Sex', 'Age', 'Race', 'Ethnicity', 'Frontal/Lateral', 'AP/PA', 'No Finding', 'Enlarged Cardiomediastinum', 'Cardiomegaly', 'Lung Opacity', 'Lung Lesion', 'Edema', 'Consolidation', 'Pneumonia', 'Atelectasis', 'Pneumothorax', 'Pleural Effusion', 'Pleural Other', 'Fracture', 'Support Devices']


In [75]:
# finding the number of unique subjects:
unique_subjects = pd.unique(whole_data['Patient_id'])
print('The number of unique subjects are: {0}'.format(len(unique_subjects)))

The number of unique subjects are: 64484


In [76]:
# preallocating the frequency of session of the subjects:
# this is written to make sure that we dont use a subject in both training and testing split.
test_split = 0.06       # precent out of total samples
valid_split = 0.1       # precent out of total samples (this is the sumation of both training and validation splits)

n_test = n_samples * test_split
n_whole_train = n_samples - n_test

frequencies = np.ones((unique_subjects.shape[0], 4)) * np.nan
for i, sbj in tqdm(enumerate(unique_subjects)):
  frequencies[i, 0] = sbj
  frequencies[i, 1] = (whole_data['Patient_id'] == sbj).sum()
cumolative_sums_of_samples = np.cumsum(frequencies[:,1])
frequencies[:, 2] = cumolative_sums_of_samples
frequencies[:, 3] = cumolative_sums_of_samples/cumolative_sums_of_samples[~0]

64484it [00:25, 2497.08it/s]


In [77]:
# visualizing the frequency  of subjects and images taken from each
vis_frequencies = pd.DataFrame(frequencies,columns=['subject_ID','n_of_sessions/images','cumolative_sessions','preccent_of_number_of_images'])
vis_frequencies.head(15)

Unnamed: 0,subject_ID,n_of_sessions/images,cumolative_sessions,preccent_of_number_of_images
0,14707.0,3.0,3.0,1.6e-05
1,59349.0,2.0,5.0,2.7e-05
2,38301.0,38.0,43.0,0.000228
3,17270.0,1.0,44.0,0.000234
4,37343.0,3.0,47.0,0.000249
5,35867.0,3.0,50.0,0.000265
6,28124.0,1.0,51.0,0.000271
7,35272.0,13.0,64.0,0.00034
8,27312.0,2.0,66.0,0.00035
9,26974.0,3.0,69.0,0.000366


In [78]:
# separating the testing split without having any mutual_subjects:
testing_subjects_masking = vis_frequencies['preccent_of_number_of_images'] < test_split
testing_subjects = vis_frequencies[testing_subjects_masking]['subject_ID'].values.tolist()
whole_train_subjects = vis_frequencies[~testing_subjects_masking]['subject_ID'].values.tolist()

In [79]:
# excluding irrelevant features
excluding2ds =  np.concatenate( (np.arange(32,41,dtype=int),
                                 np.arange(32,41,dtype=int)+ 116,
                                 np.arange(32,41,dtype=int)+116*2,
                                 np.arange(32,41,dtype=int)+116*3))

whole_data.drop(whole_data.columns[excluding2ds.tolist()],axis = 1,inplace=True)

In [80]:
# separating test_data and whole_trainig_data
Test = whole_data[whole_data['Patient_id'].isin(testing_subjects)]
X_test, y_test = Test.iloc[:,:464-36].values, Test['Age'].values

# separating test_data and whole_trainig_data
Train = whole_data[whole_data['Patient_id'].isin(whole_train_subjects)]

# the size of training/testing_split:
print('The size of the testing split is: {}'.format(Test.shape))
print('The size of the whole trainig (train + valid) split is: {}'.format(Train.shape))

The size of the testing split is: (11303, 452)
The size of the whole trainig (train + valid) split is: (177117, 452)


In [81]:
# splitting the validation split:
X_train, X_val, y_train, y_val = train_test_split(Train.iloc[:,:464-36].values,
                                                  Train['Age'].values,
                                                  test_size=valid_split,
                                                  random_state=random_seed)

In [82]:
# checking the number of nans:
pd.isnull(X_train).sum(), pd.isnull(X_test).sum(), pd.isnull(X_val).sum()

(0, 0, 0)

In [83]:
print('The size of the validation split is: {}'.format(X_val.shape))
X_train.shape, X_val.shape, y_train.shape, y_val.shape

The size of the validation split is: (17712, 428)


((159405, 428), (17712, 428), (159405,), (17712,))

In [84]:
# segment separation
segment_number = 0
X_train = X_train[:, 107*segment_number:107*(segment_number + 1)]
X_val = X_val[:, 107*segment_number:107*(segment_number + 1)]
X_test = X_test[:, 107*segment_number:107*(segment_number + 1)]


# normalizing the data
mean_val = X_train.mean(axis=0)
std_val = X_train.std(axis=0)

In [85]:
# excluding invariant variables:
mask_excluded = ~(std_val==0)

In [86]:
X_train[:,mask_excluded] -= mean_val[mask_excluded]
X_train[:,mask_excluded] /= std_val[mask_excluded]

X_val[:,mask_excluded] -= mean_val[mask_excluded]
X_val[:,mask_excluded] /= std_val[mask_excluded]

X_test[:,mask_excluded] -= mean_val[mask_excluded]
X_test[:,mask_excluded] /= std_val[mask_excluded]

In [87]:
saving_path = '/content/gdrive/MyDrive/radiomics/age_chexpert/chexpert_age_segment_' + str(segment_number)+ '_'
belongings={'mean':mean_val,
            'segment_number':segment_number,
            'std':std_val,
            'mask_excluded':mask_excluded,
            'x_test':X_test,
            'y_test':y_test}
np.save(saving_path + 'belongings.npy',belongings)

In [88]:
# defining the model structure for the age
# %% building uncompiled mdodel

checkpoint_filepath = saving_path +'_mdl.h5'
model_checkpoint_callback_1 = tf.keras.callbacks.ModelCheckpoint(
                                                                filepath=checkpoint_filepath,
                                                                save_weights_only=False,
                                                                monitor='val_loss',
                                                                save_best_only=True,
                                                                save_freq='epoch',
                                                                verbose=1
                                                            )


def classifer_1(features_number: int) -> callable:
    model = keras.Sequential()
    model.add(keras.layers.Dense(units=100, activation="relu", input_dim=features_number))
    model.add(keras.layers.Dropout(0.0))
    model.add(keras.layers.Dense(units=100, activation="relu"))
    model.add(keras.layers.Dropout(0.0))
    model.add(keras.layers.Dense(units=1))
    model.summary()
    return model

def compiler_1(X_train_, Y_train_, X_valid_, Y_valid_):
  initial_learning_rate = 0.0001
  epochs = 500
  batch_size = 128
  model = classifer_1(features_number=X_train_.shape[1])
  # lss = tf.keras.losses.BinaryCrossentropy(from_logits=False)
  # metrics = ['accuracy',
  #                       tf.keras.metrics.AUC(curve='PR',name='PR-curve'),
  #                       tf.keras.metrics.AUC(curve='ROC',name='ROC-curve')]
  # metrics = metrics=[tf.keras.metrics.RootMeanSquaredError()]
  metrics = [tf.keras.metrics.RootMeanSquaredError()]
  lss = 'mean_absolute_error'
  model.compile(optimizer=keras.optimizers.Adam(learning_rate=initial_learning_rate,
                                                beta_1=0.9,
                                                beta_2=0.999,
                                                epsilon=1e-08),
                                                loss=lss,
                metrics=metrics)
  model.fit(x=X_train_,
            y=Y_train_,
            epochs=epochs,
            batch_size=batch_size,
            validation_data=(X_valid_, Y_valid_),
            verbose=2,
            shuffle=True,
            callbacks=[model_checkpoint_callback_1])
  return model

In [None]:
model_1 = compiler_1(X_train_ = X_train[:,mask_excluded],
                    Y_train_ = y_train,
                    X_valid_ = X_val[:,mask_excluded],
                    Y_valid_ = y_val)

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_9 (Dense)             (None, 100)               10600     
                                                                 
 dropout_6 (Dropout)         (None, 100)               0         
                                                                 
 dense_10 (Dense)            (None, 100)               10100     
                                                                 
 dropout_7 (Dropout)         (None, 100)               0         
                                                                 
 dense_11 (Dense)            (None, 1)                 101       
                                                                 
Total params: 20,801
Trainable params: 20,801
Non-trainable params: 0
_________________________________________________________________
Epoch 1/500

Epoch 1: val_loss improved from inf t