In [None]:
# we need the following libraries, so let's install them
%pip install matplotlib
%pip install scikit-learn
%pip install tensorflow



In [1]:
# importing libraries
from time import time
import multiprocessing
import numpy as np
import pandas as pd
from tqdm import tqdm
import os
from glob import glob
import matplotlib.pyplot as plt
import sys
import warnings
from sklearn.utils import shuffle
import random
import logging
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split
warnings.filterwarnings("ignore", category=DeprecationWarning)
logger = logging.getLogger("radiomics")
logger.setLevel(logging.ERROR)
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
# loadnig the data and removing the useless column 'Unnamed: 0'
whole_data = pd.read_csv('/content/gdrive/MyDrive/radiomics/NIH_synced_data_nan_free.csv')
whole_data.head(5)

Unnamed: 0.1,Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,...,Effusion,Emphysema,Fibrosis,Hernia,Infiltration,Mass,Nodule,Pleural_Thickening,Pneumonia,Pneumothorax
0,0,-405.629779,49.662019,2341686000.0,4.640159,330.131351,1.720449,196.329468,157.375266,-175.517427,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,-184.213315,696.721558,9120388000.0,5.511916,546.484323,1.844329,865.631409,276.160265,250.186759,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,-676.460406,214.247101,12031170000.0,5.623317,525.821607,2.576996,818.862732,290.994217,-295.582659,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,-741.963159,322.000031,9176606000.0,5.65821,558.405931,2.510349,694.25885,322.914612,-331.656328,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,-608.359229,232.310629,3715405000.0,5.523623,506.580425,2.096828,489.584351,266.226002,-238.386621,...,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0


In [7]:
# excludiung Ages with Nans
whole_data = whole_data[~pd.isnull(whole_data['Patient Age'])]
whole_data.shape

(111086, 486)

In [8]:
random_seed = 21202
del whole_data['Unnamed: 0']
whole_data.head()
whole_data = whole_data.sample(frac=1.0, random_state=random_seed)

In [9]:
# finding total number of samples:
n_samples = whole_data.shape[0]
print("number of samples are: {}".format(n_samples))

number of samples are: 111086


In [10]:
# names of all targets:
targets_names = list(whole_data.iloc[:,464:])
print(targets_names)

['index', 'Image Index', 'Patient ID', 'Patient Age', 'Patient Gender', 'View Position', 'No Finding', 'Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema', 'Effusion', 'Emphysema', 'Fibrosis', 'Hernia', 'Infiltration', 'Mass', 'Nodule', 'Pleural_Thickening', 'Pneumonia', 'Pneumothorax']


In [11]:
# finding the number of unique subjects:
unique_subjects = pd.unique(whole_data['Patient ID'])
print('The number of unique subjects are: {0}'.format(len(unique_subjects)))

The number of unique subjects are: 30703


In [12]:
# preallocating the frequency of session of the subjects:
# this is written to make sure that we dont use a subject in both training and testing split.
test_split = 0.06       # precent out of total samples
valid_split = 0.1       # precent out of total samples (this is the sumation of both training and validation splits)

n_test = n_samples * test_split
n_whole_train = n_samples - n_test

frequencies = np.ones((unique_subjects.shape[0], 4)) * np.nan
for i, sbj in tqdm(enumerate(unique_subjects)):
  frequencies[i, 0] = sbj
  frequencies[i, 1] = (whole_data['Patient ID'] == sbj).sum()
cumolative_sums_of_samples = np.cumsum(frequencies[:,1])
frequencies[:, 2] = cumolative_sums_of_samples
frequencies[:, 3] = cumolative_sums_of_samples/cumolative_sums_of_samples[~0]

30703it [00:09, 3372.54it/s]


In [13]:
# visualizing the frequency  of subjects and images taken from each
vis_frequencies = pd.DataFrame(frequencies,columns=['subject_ID','n_of_sessions/images','cumolative_sessions','preccent_of_number_of_images'])
vis_frequencies.head(15)

Unnamed: 0,subject_ID,n_of_sessions/images,cumolative_sessions,preccent_of_number_of_images
0,27115.0,4.0,4.0,3.6e-05
1,2046.0,6.0,10.0,9e-05
2,6991.0,15.0,25.0,0.000225
3,19339.0,2.0,27.0,0.000243
4,13670.0,163.0,190.0,0.00171
5,5365.0,30.0,220.0,0.00198
6,6179.0,1.0,221.0,0.001989
7,22618.0,4.0,225.0,0.002025
8,16043.0,6.0,231.0,0.002079
9,19550.0,19.0,250.0,0.002251


In [14]:
# separating the testing split without having any mutual_subjects:
testing_subjects_masking = vis_frequencies['preccent_of_number_of_images'] < test_split
testing_subjects = vis_frequencies[testing_subjects_masking]['subject_ID'].values.tolist()
whole_train_subjects = vis_frequencies[~testing_subjects_masking]['subject_ID'].values.tolist()

In [15]:
# excluding irrelevant features
excluding2ds =  np.concatenate( (np.arange(32,41,dtype=int),
                                 np.arange(32,41,dtype=int)+ 116,
                                 np.arange(32,41,dtype=int)+116*2,
                                 np.arange(32,41,dtype=int)+116*3))

whole_data.drop(whole_data.columns[excluding2ds.tolist()],axis = 1,inplace=True)

In [16]:
# separating test_data and whole_trainig_data
Test = whole_data[whole_data['Patient ID'].isin(testing_subjects)]
X_test, y_test = Test.iloc[:,:464-36].values, Test['Patient Age'].values

# separating test_data and whole_trainig_data
Train = whole_data[whole_data['Patient ID'].isin(whole_train_subjects)]

# the size of training/testing_split:
print('The size of the testing split is: {}'.format(Test.shape))
print('The size of the whole trainig (train + valid) split is: {}'.format(Train.shape))

The size of the testing split is: (6664, 449)
The size of the whole trainig (train + valid) split is: (104422, 449)


In [17]:
# splitting the validation split:
X_train, X_val, y_train, y_val = train_test_split(Train.iloc[:,:464-36].values,
                                                  Train['Patient Age'].values,
                                                  test_size=valid_split,
                                                  random_state=random_seed)

In [None]:
# checking the number of nans:
pd.isnull(X_train).sum(), pd.isnull(X_test).sum(), pd.isnull(X_val).sum()

(0, 0, 0)

In [None]:
print('The size of the validation split is: {}'.format(X_val.shape))
X_train.shape, X_val.shape, y_train.shape, y_val.shape

The size of the validation split is: (10443, 428)


((93979, 428), (10443, 428), (93979,), (10443,))

In [None]:
# segment separation
segment_number = 0
X_train = X_train[:, 107*segment_number:107*(segment_number + 1)]
X_val = X_val[:, 107*segment_number:107*(segment_number + 1)]
X_test = X_test[:, 107*segment_number:107*(segment_number + 1)]


# normalizing the data
mean_val = X_train.mean(axis=0)
std_val = X_train.std(axis=0)


In [None]:
# excluding invariant variables:
mask_excluded = ~(std_val==0)

In [None]:
X_train[:,mask_excluded] -= mean_val[mask_excluded]
X_train[:,mask_excluded] /= std_val[mask_excluded]

X_val[:,mask_excluded] -= mean_val[mask_excluded]
X_val[:,mask_excluded] /= std_val[mask_excluded]

X_test[:,mask_excluded] -= mean_val[mask_excluded]
X_test[:,mask_excluded] /= std_val[mask_excluded]

In [None]:

saving_path = '/content/gdrive/MyDrive/radiomics/second_results/nih_age_segment_' + str(segment_number)+ '_'
belongings={'mean':mean_val,
            'segment_number':segment_number,
            'std':std_val,
            'mask_excluded':mask_excluded,
            'x_test':X_test,
            'y_test':y_test}
np.save(saving_path + 'belongings.npy',belongings)

In [None]:
# defining the model structure for the age
# %% building uncompiled mdodel

checkpoint_filepath = saving_path +'_mdl.h5'
model_checkpoint_callback_1 = tf.keras.callbacks.ModelCheckpoint(
                                                                filepath=checkpoint_filepath,
                                                                save_weights_only=False,
                                                                monitor='val_loss',
                                                                save_best_only=True,
                                                                save_freq='epoch',
                                                                verbose=1
                                                            )


def classifer_1(features_number: int) -> callable:
    model = keras.Sequential()
    model.add(keras.layers.Dense(units=200, activation="tanh", input_dim=features_number))
    model.add(keras.layers.Dropout(0.0))
    model.add(keras.layers.Dense(units=200, activation="tanh"))
    model.add(keras.layers.Dropout(0.0))
    model.add(keras.layers.Dense(units=1))
    model.summary()
    return model

def compiler_1(X_train_, Y_train_, X_valid_, Y_valid_):
  initial_learning_rate = 0.00001
  epochs = 500
  batch_size = 256
  model = classifer_1(features_number=X_train_.shape[1])
  # lss = tf.keras.losses.BinaryCrossentropy(from_logits=False)
  # metrics = ['accuracy',
  #                       tf.keras.metrics.AUC(curve='PR',name='PR-curve'),
  #                       tf.keras.metrics.AUC(curve='ROC',name='ROC-curve')]
  # metrics = metrics=[tf.keras.metrics.RootMeanSquaredError()]
  metrics = [tf.keras.metrics.RootMeanSquaredError()]
  lss = 'mean_absolute_error'
  model.compile(optimizer=keras.optimizers.Adam(learning_rate=initial_learning_rate,
                                                beta_1=0.9,
                                                beta_2=0.999,
                                                epsilon=1e-08),
                                                loss=lss,
                metrics=metrics)
  model.fit(x=X_train_,
            y=Y_train_,
            epochs=epochs,
            batch_size=batch_size,
            initial_epoch=0,
            validation_data=(X_valid_, Y_valid_),
            verbose=2,
            shuffle=True,
            callbacks=[model_checkpoint_callback_1])
  return model

In [None]:
model_1 = compiler_1(X_train_ = X_train[:,mask_excluded],
                    Y_train_ = y_train,
                    X_valid_ = X_val[:,mask_excluded],
                    Y_valid_ = y_val)

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_9 (Dense)             (None, 200)               21200     
                                                                 
 dropout_6 (Dropout)         (None, 200)               0         
                                                                 
 dense_10 (Dense)            (None, 200)               40200     
                                                                 
 dropout_7 (Dropout)         (None, 200)               0         
                                                                 
 dense_11 (Dense)            (None, 1)                 201       
                                                                 
Total params: 61,601
Trainable params: 61,601
Non-trainable params: 0
_________________________________________________________________
Epoch 1/500

Epoch 1: val_loss improved from inf t