In [15]:
%reset

In [22]:
import ast
import importlib
import itertools
import os

import tensorflow as tf
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

import module_imports
importlib.reload(module_imports)
from module_imports import *

Using tensorflow version: 2.4.0


In [23]:
# Tag embedded in the names of the artefact files this notebook writes
# (search space, fold metadata, results, exceptions log).
SCRIPT_ID = 'TS1'

# dirs
# Each module_paths.Path wraps a directory name and create_dir() is called on
# it right away (presumably creating the directory if missing - TODO confirm
# against module_paths). Nested paths are assembled with os.path.join instead
# of a hard-coded '\\' so the notebook is not tied to Windows separators.
results_dir = module_paths.Path(f'Results_{module_time.ymd()}')
results_dir.create_dir()
cv_results_dir = module_paths.Path(f'CVResults_{module_time.ymd()}')
cv_results_dir.create_dir()
exception_dir = module_paths.Path('Exceptions')
exception_dir.create_dir()
metadata_dir = module_paths.Path('Metadata')
metadata_dir.create_dir()
metadata_folds_dir = module_paths.Path(os.path.join('Metadata', 'Folds'))
metadata_folds_dir.create_dir()
metadata_models_dir = module_paths.Path(os.path.join('Metadata', 'Models'))
metadata_models_dir.create_dir()
trained_models_dir = module_paths.Path('TrainedModels')
trained_models_dir.create_dir()


# Text file intended to collect exception messages for this script id.
exceptions_file_path = os.path.join(exception_dir.path, f'Exceptions_{SCRIPT_ID}.txt')

In [24]:
# DATASET
data_dir = '../data/'
# os.path.join avoids the mixed '/' + '\\' separators the path used to contain,
# which only resolved on Windows.
dataset = pd.read_excel(os.path.join(data_dir, 'hrv_24h.xlsx'))
# Reassign instead of mutating in place so re-running the cell always starts
# from the freshly loaded frame.
dataset = dataset.drop(columns = 'HbA1C(%)')
# Maps the string values 'GD' -> 0 and 'BD' -> 1 wherever they occur in the
# frame (presumably the 'Class' labels - TODO confirm no other column uses them).
dataset = dataset.replace({'GD': 0, 'BD': 1})

In [25]:
# SEARCH SPACE
# Each key is one hyper-parameter and each list holds the candidate values.
# The Cartesian product of all lists is enumerated below, one model per
# combination, and persisted to an Excel sheet together with a 'Trained'
# status column that the training cell reads and updates.


def _dense_relu_stack(*unit_counts):
    """Return a layer spec: one relu dense-layer dict per entry in unit_counts."""
    return [{'units': units, 'type': 'dense', 'activation': 'relu'} for units in unit_counts]


search_space = dict(
        patience = [100],
        decay=[1e-2],
        early_stopping_flag = [True],
        validation_split = [0.1],
        epochs = [100,500,1000],
        optimizer = ['Nadam', 'Adam', 'RMSProp'],
        batch_size = [50,100,200,500],
        initializer = ['GlorotNormal', 'VarianceScaling'],
        learning_rate = [0.2, 0.1, 0.02],
        layers_list = [
            _dense_relu_stack(8, 4),
            _dense_relu_stack(9, 6, 3),
            _dense_relu_stack(10, 8, 6, 4),
            _dense_relu_stack(10, 8, 6, 4, 2),
        ],
        loss = ['CategoricalHinge'],
        # Stored as source strings; the training cell turns them into objects.
        regularizer = ['tf.keras.regularizers.l1_l2(l1=0.01, l2=0.01)', 'tf.keras.regularizers.l1(l1=0.01)', 'tf.keras.regularizers.l2(l2=0.01)']
)

# combinations
# itertools.product walks the value lists in dict order, so model numbering is
# stable as long as the key order and list orders above are unchanged.
keys, values = zip(*search_space.items())
combinations_dicts = [dict(zip(keys, v)) for v in itertools.product(*values)]

# tabular combinations
model_names = [f'M{i}' for i in range(1, len(combinations_dicts) + 1)]
combinations_df = pd.DataFrame(combinations_dicts)
combinations_df.insert(0, 'Model', model_names)
combinations_df['Trained'] = 'No'

# save
# NOTE: this overwrites any existing sheet, resetting every 'Trained' status.
search_space_path = os.path.join(metadata_models_dir.path, f'search_space_{SCRIPT_ID}.xlsx')
combinations_df.to_excel(search_space_path, index = False)

In [26]:
# FOLDS
# Builds one patient-grouped train/test split per entry in random_states,
# standardises and PCA-projects the features (both fitted on the training part
# only), and stores the resulting arrays in split_sets_cv under the keys
# X_train_<fold>, X_test_<fold>, y_train_<fold>, y_test_<fold>.

random_states = [11,22,33,44,55,66,77,88,99,111]

# A split is accepted only if the test part holds at least this many samples
# of each class.
MIN_TEST_SAMPLES_PER_CLASS = 90
# Upper bound on re-seeded attempts per fold so an unsatisfiable threshold
# raises instead of looping forever.
MAX_SPLIT_ATTEMPTS = 1000
# Number of principal components kept after standardisation.
N_PCA_COMPONENTS = 12

split_sets_cv = dict()
train_folds_analysis_dict = dict()
test_folds_analysis_dict = dict()

for fold_id in range(len(random_states)):

    train = None
    test = None
    found_good_test_set = False
    attempts = 0

    while found_good_test_set is False:

        if attempts >= MAX_SPLIT_ATTEMPTS:
            raise RuntimeError(
                f'Fold {fold_id}: no split with at least {MIN_TEST_SAMPLES_PER_CLASS} '
                f'test samples per class found after {MAX_SPLIT_ATTEMPTS} attempts'
            )
        attempts += 1

        train, test = module_train_test.find_best_group_split(dataframe = dataset,
                                                             target_feature = 'Class',
                                                             group_by_feature = 'Patient_ID',
                                                             balance_focus = 'train_with_test_threshold',
                                                             test_balance_treshold = 3,
                                                             num_splits_to_try = 200,
                                                             random_state = random_states[fold_id],
                                                             test_size = 0.04)

        train_folds_analysis_dict[fold_id] = module_dataset_analysis.quantitative_analysis(df = train,
                                                                                          dataset_name = f'fold{fold_id}',
                                                                                          class_feature = 'Class',
                                                                                          classes = [0, 1],
                                                                                          ratios = True)
        test_folds_analysis_dict[fold_id] = module_dataset_analysis.quantitative_analysis(df = test,
                                                                                         dataset_name = f'fold{fold_id}',
                                                                                         class_feature = 'Class',
                                                                                         classes = [0, 1],
                                                                                         ratios = True)

        fold_test_analysis = test_folds_analysis_dict[fold_id]
        if (fold_test_analysis['0 samples'] < MIN_TEST_SAMPLES_PER_CLASS
                or fold_test_analysis['1 samples'] < MIN_TEST_SAMPLES_PER_CLASS):
            # Too few test samples of one class: bump the seed and retry. The
            # final seed is what gets saved to random_states_<SCRIPT_ID>.xlsx.
            random_states[fold_id] += 1
        else:
            found_good_test_set = True

    train.reset_index(inplace = True, drop = True)
    test.reset_index(inplace = True, drop = True)

    train_patients = train.pop('Patient_ID')
    test_patients = test.pop('Patient_ID')

    # Leakage guards: no patient may appear on both sides, and together the two
    # sides must account for every patient in the dataset.
    overlap = list(set(train_patients.unique()).intersection(set(test_patients.unique())))
    print('Overlap: ' + str(overlap))
    assert not overlap, f'Patients present in both train and test: {overlap}'
    assert dataset['Patient_ID'].nunique() == train_patients.nunique() + test_patients.nunique(), 'Problem with patients'

    split_sets_cv[f'y_train_{fold_id}'] = train.pop('Class').to_numpy()
    split_sets_cv[f'y_test_{fold_id}'] = test.pop('Class').to_numpy()

    standard_scaler = StandardScaler()
    pca = PCA(n_components = N_PCA_COMPONENTS)

    # Fit on the training part only; the test part is transformed with the
    # training statistics.
    train = standard_scaler.fit_transform(train)
    test = standard_scaler.transform(test)

    train = pca.fit_transform(train)
    test = pca.transform(test)

    split_sets_cv[f'X_train_{fold_id}'] = train
    split_sets_cv[f'X_test_{fold_id}'] = test


# Persist the seeds actually used and the per-fold analyses.
pd.Series(random_states, name = 'random_states').to_excel(os.path.join(metadata_folds_dir.path, f'random_states_{SCRIPT_ID}.xlsx'), index = False)
pd.DataFrame(train_folds_analysis_dict).T.to_excel(os.path.join(metadata_folds_dir.path, f'train_folds_analysis_{SCRIPT_ID}.xlsx'), index = False)
pd.DataFrame(test_folds_analysis_dict).T.to_excel(os.path.join(metadata_folds_dir.path, f'test_folds_analysis_{SCRIPT_ID}.xlsx'), index = False)

Overlap: []
Overlap: []
Overlap: []
Overlap: []
Overlap: []
Overlap: []
Overlap: []
Overlap: []
Overlap: []
Overlap: []


In [27]:
# TRAINING
# Repeatedly takes the first configuration still marked 'No' in the
# search-space sheet, marks it 'InProgress', trains it on all 10 folds, writes
# the per-fold and fold-averaged results to Excel, and finally marks the
# configuration 'Yes'. The loop ends once no 'No' rows remain.
# NOTE: a configuration left 'InProgress' by an interrupted run is not picked
# up again by this loop.

start_time = module_time.hmymd()
more_models_left_to_train = True

fold_results_list = list()
cv_results_list = list()

while more_models_left_to_train:

    search_space_state = pd.read_excel(search_space_path)

    untrained_models = search_space_state.loc[search_space_state['Trained'] == 'No']
    if untrained_models.empty:
        # Nothing left to claim: stop cleanly instead of failing on .iloc[0].
        more_models_left_to_train = False
        break

    model_to_train = untrained_models.iloc[0]
    print(f'Training model:\n {model_to_train}')

    search_space_state.loc[search_space_state['Model'] == model_to_train['Model'], 'Trained'] = 'InProgress'
    search_space_state.to_excel(search_space_path, index = False)

    model_fold_results_list = list()
    for fold_id in range(10):

        y_train = split_sets_cv[f'y_train_{fold_id}']
        y_test = split_sets_cv[f'y_test_{fold_id}']
        X_train = split_sets_cv[f'X_train_{fold_id}']
        X_test = split_sets_cv[f'X_test_{fold_id}']

        model_id = f"{model_to_train['Model']}_Fold{fold_id}"

        # The sheet stores layers_list as the text of a Python literal, so it is
        # parsed with ast.literal_eval rather than eval.
        layers_list = ast.literal_eval(model_to_train['layers_list'])
        # SECURITY NOTE: the regularizer column holds a Python expression string
        # (e.g. 'tf.keras.regularizers.l2(l2=0.01)') and is still eval'd. This is
        # only acceptable while the sheet is produced by this notebook; do not
        # point search_space_path at an untrusted file.
        regularizer = eval(model_to_train['regularizer'])

        binary_classifier = module_dl.BinaryClassificationDL(X_train = X_train,
                                                            y_train = y_train,
                                                            X_test = X_test,
                                                            y_test = y_test,
                                                            epochs = model_to_train['epochs'],
                                                            batch_size = model_to_train['batch_size'],
                                                            learning_rate = model_to_train['learning_rate'],
                                                            decay = model_to_train['decay'],
                                                            validation_split = model_to_train['validation_split'],
                                                            loss_name = model_to_train['loss'],
                                                            initializer_name = model_to_train['initializer'],
                                                            optimizer_name = model_to_train['optimizer'],
                                                            # Honour the search-space value instead of a hard-coded True.
                                                            early_stopping_flag = bool(model_to_train['early_stopping_flag']),
                                                            patience = model_to_train['patience'],
                                                            layers_list = layers_list,
                                                            model_type = 'sequential',
                                                            regularizer = regularizer,
                                                            trained_models_dir = trained_models_dir,
                                                            model_id = model_id
                                                            )
        binary_classifier.fit_model()
        test_report = binary_classifier.predict(return_metrics = True)

        fold_results_list.append(dict(model = model_id, **test_report, **model_to_train.to_dict()))
        model_fold_results_list.append(test_report)

    # Fold-averaged metrics followed by the configuration itself. pd.concat
    # replaces the chained Series.append calls, which pandas 2.0 removed.
    model_cv_results = pd.concat([
        pd.Series({'Model': model_to_train['Model']}),
        pd.DataFrame(model_fold_results_list).mean(),
        model_to_train,
    ]).to_dict()

    cv_results_list.append(model_cv_results)

    # Rewrite both result sheets after every model so partial progress is on disk.
    pd.DataFrame(fold_results_list).to_excel(os.path.join(results_dir.path, f'Fold_Results_{SCRIPT_ID}.xlsx'))
    pd.DataFrame(cv_results_list).to_excel(os.path.join(cv_results_dir.path, f'CV_Results_{SCRIPT_ID}.xlsx'))

    # Re-read before updating so the status write is based on the current sheet.
    search_space_state = pd.read_excel(search_space_path)
    search_space_state.loc[search_space_state['Model'] == model_to_train['Model'], 'Trained'] = 'Yes'
    search_space_state.to_excel(search_space_path, index = False)

Training model:
 Model                                                                 M1
patience                                                             100
decay                                                               0.01
early_stopping_flag                                                 True
validation_split                                                     0.1
epochs                                                               100
optimizer                                                          Nadam
batch_size                                                            50
initializer                                                 GlorotNormal
learning_rate                                                        0.2
layers_list            [{'units': 8, 'type': 'dense', 'activation': '...
loss                                                    CategoricalHinge
regularizer                tf.keras.regularizers.l1_l2(l1=0.01, l2=0.01)
Trained                           



Model: "sequential_13"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_39 (Dense)             (None, 8)                 104       
_________________________________________________________________
dense_40 (Dense)             (None, 4)                 36        
_________________________________________________________________
dense_41 (Dense)             (None, 1)                 5         
Total params: 145
Trainable params: 145
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100



Model: "sequential_14"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_42 (Dense)             (None, 8)                 104       
_________________________________________________________________
dense_43 (Dense)             (None, 4)                 36        
_________________________________________________________________
dense_44 (Dense)             (None, 1)                 5         
Total params: 145
Trainable params: 145
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100

  .append(pd.DataFrame(model_fold_results_list).mean())\


Training model:
 Model                                                                 M2
patience                                                             100
decay                                                               0.01
early_stopping_flag                                                 True
validation_split                                                     0.1
epochs                                                               100
optimizer                                                          Nadam
batch_size                                                            50
initializer                                                 GlorotNormal
learning_rate                                                        0.2
layers_list            [{'units': 8, 'type': 'dense', 'activation': '...
loss                                                    CategoricalHinge
regularizer                            tf.keras.regularizers.l1(l1=0.01)
Trained                           

  npv = round(true_negatives / (true_negatives + false_negatives), 4)


Training model:
 Model                                                                 M4
patience                                                             100
decay                                                               0.01
early_stopping_flag                                                 True
validation_split                                                     0.1
epochs                                                               100
optimizer                                                          Nadam
batch_size                                                            50
initializer                                                 GlorotNormal
learning_rate                                                        0.2
layers_list            [{'units': 9, 'type': 'dense', 'activation': '...
loss                                                    CategoricalHinge
regularizer                tf.keras.regularizers.l1_l2(l1=0.01, l2=0.01)
Trained                           

  precision = round(true_positives / (true_positives + false_positives), 4)


Model: "sequential_60"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_197 (Dense)            (None, 9)                 117       
_________________________________________________________________
dense_198 (Dense)            (None, 6)                 60        
_________________________________________________________________
dense_199 (Dense)            (None, 3)                 21        
_________________________________________________________________
dense_200 (Dense)            (None, 1)                 4         
Total params: 202
Trainable params: 202
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/1



Model: "sequential_103"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_399 (Dense)            (None, 10)                130       
_________________________________________________________________
dense_400 (Dense)            (None, 8)                 88        
_________________________________________________________________
dense_401 (Dense)            (None, 6)                 54        
_________________________________________________________________
dense_402 (Dense)            (None, 4)                 28        
_________________________________________________________________
dense_403 (Dense)            (None, 2)                 10        
_________________________________________________________________
dense_404 (Dense)            (None, 1)                 3         
Total params: 313
Trainable params: 313
Non-trainable params: 0
______________________________________________________

KeyboardInterrupt: 