# Perform 5-fold CV

- In a script: 
    - First do the train-test split
    - Then, Stratified KFold
    - For each split,
        - Save the train set
        - Save the val set

- For each split, train a model on that split's training set and validate on its validation set
    - Record results somewhere
    - Don't need to save model

- Once you have done that
    - Do a final training with the full training dataset
    - Evaluate with test set

In [2]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold, train_test_split

# Build the hold-out test split and the 5 stratified CV folds, and write each
# subset to disk as a two-column CSV ('Image', 'DX').

# Source CSV; only its 'Image' and 'DX' columns are used below.
data_filepath = r"C:\Users\ammar\Documents\Python ML neuropysch disease\Diagnosing-ADHD-With-ConvLSTM-master\References\model_data.csv"
# Per-fold output templates; `{}` is replaced by the 1-based fold number.
training_filepath = r"C:\Users\ammar\Documents\Python ML neuropysch disease\Diagnosing-ADHD-With-ConvLSTM-master\Data\training_data_{}"
# NOTE(review): "validatation" looks like a typo for "validation". It is kept
# unchanged because other scripts may read files under this name - confirm
# before renaming.
validation_filepath = r"C:\Users\ammar\Documents\Python ML neuropysch disease\Diagnosing-ADHD-With-ConvLSTM-master\Data\validatation_data_{}"
# The path separator between "Data" and "testing_data" was missing, which wrote
# the test split next to (not inside) the Data folder.
testing_filepath = r"C:\Users\ammar\Documents\Python ML neuropysch disease\Diagnosing-ADHD-With-ConvLSTM-master\Data\testing_data"

model_csv = data_filepath
model_data = pd.read_csv(model_csv)

X = model_data['Image']
y = model_data['DX']

# Hold out 20% as the final test set, keeping the class balance of 'DX'.
# NOTE(review): no random_state is set, so the split differs between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, stratify=y)

skf = StratifiedKFold(n_splits=5)
for fold_number, (train_index, val_index) in enumerate(skf.split(X_train, y_train), start=1):
    # split() yields *positional* indices into X_train / y_train. They must be
    # applied with .iloc to the training subset; indexing the full X / y by
    # label would pull in held-out test rows and mix up the folds.
    X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

    # concat(axis=1) already returns a DataFrame with columns 'Image' and 'DX'.
    model_train_data = pd.concat([X_train_fold, y_train_fold], axis=1)
    model_val_data = pd.concat([X_val_fold, y_val_fold], axis=1)

    model_train_data.to_csv(training_filepath.format(fold_number), index=False)
    model_val_data.to_csv(validation_filepath.format(fold_number), index=False)

model_test_data = pd.concat([X_test, y_test], axis=1)
model_test_data.to_csv(testing_filepath, index=False)

In [3]:
# Silence every Python warning for the rest of this session.
import warnings
warnings.filterwarnings("ignore")

# Add the parent directory to the module search path so that the `Code`
# package imported below can be resolved; this must run before that import.
import sys
sys.path.append("..")

# Project-local batch generator used to feed the model during training.
from Code.data_generator import FMRIDataGenerator

In [3]:
import numpy as np
import pandas as pd
import os
from datetime import datetime

import tensorflow as tf

from tensorflow.keras.layers import Conv3D, MaxPool3D, TimeDistributed, Flatten, LSTM, Dense
from tensorflow.keras import Sequential
from tensorflow.keras import optimizers
from tensorflow.keras.callbacks import CSVLogger

import tensorflow.keras as keras

import logging

# Reduce TensorFlow log output to fatal messages only, for both the native
# (C++) logger and the Python 'tensorflow' logger.
# NOTE(review): TF_CPP_MIN_LOG_LEVEL is set after `import tensorflow`; it may
# need to be set before the import to take effect - confirm.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # FATAL
logging.getLogger('tensorflow').setLevel(logging.FATAL)

# ============================ DATA WORK ============================

# Identifier substituted into the CSV file names, and the directory handed to
# the data generators.
file_num = "peking" #1
dataset_dir = r"D:/Peking_1"

# Load the training and validation tables for this identifier.
train_csv_path = r"C:\Users\ammar\Documents\Python ML neuropysch disease\Diagnosing-ADHD-With-ConvLSTM-master\Data\training_{}.csv".format(file_num)
val_csv_path = r"C:\Users\ammar\Documents\Python ML neuropysch disease\Diagnosing-ADHD-With-ConvLSTM-master\Data\validation_{}.csv".format(file_num)
model_train_data = pd.read_csv(train_csv_path)
model_val_data = pd.read_csv(val_csv_path)

# 'Image' values of each subset, keyed by subset name.
partition = {
    'train': model_train_data['Image'].values,
    'validation': model_val_data['Image'].values,
}

# Map each training 'Image' value to its 'DX' label.
train_labels = {
    record['Image']: record['DX']
    for _, record in model_train_data.iterrows()
}

# Map each validation 'Image' value to its 'DX' label.
val_labels = {
    record['Image']: record['DX']
    for _, record in model_val_data.iterrows()
}

# ============================ MODEL META ============================

# Training hyper-parameters.
epochs = 500
batch_size = 6
# presumably (time steps, x, y, z, channels) - confirm against the generator
input_shape = (177, 28, 28, 28, 1)

# Whole batches per epoch; a trailing partial batch is dropped by the floor
# division.
train_steps_per_epoch = len(model_train_data) // batch_size
validate_steps_per_epoch = len(model_val_data) // batch_size

# Batch generators for the training and validation subsets.
training_generator = FMRIDataGenerator(
    partition['train'], train_labels, dataset_dir, batch_size
)
validation_generator = FMRIDataGenerator(
    partition['validation'], val_labels, dataset_dir, batch_size
)

# Per-epoch metrics are appended to a CSV whose name carries the run
# identifier and a timestamp.
# NOTE(review): this is a POSIX path while the data paths in this notebook are
# Windows paths - confirm the target directory exists on the machine used.
curr_time = datetime.now().strftime('%H-%M-%S%z_%m%d%Y')
logger_template = "/pylon5/cc5614p/deopha32/Saved_Models/adhd-fmri-history_cv{num}_{time}.csv"
logger_path = logger_template.format(num=file_num, time=curr_time)

csv_logger = CSVLogger(logger_path, append=True)
callbacks = [csv_logger]

# ============================ MODEL ARCHITECTURE ============================

# Build the Conv3D -> MaxPool3D -> Flatten -> LSTM -> Dense model layer by
# layer. Each group of layers is added inside an explicit tf.device scope.

# Convolutional front end, added under the GPU device scope.
with tf.device('/gpu:0'):
    cnn_lstm_model = Sequential()

    # TimeDistributed applies the same 3D convolution (64 filters, 3x3x3
    # kernel, ReLU) to every step along the first axis of input_shape.
    cnn_lstm_model.add(TimeDistributed(Conv3D(filters=64,kernel_size=(3,3,3),activation='relu'),
                                  input_shape=input_shape, name="Input_Conv_Layer"))

    # 2x2x2 max pooling with stride 2 and no padding, again per step.
    cnn_lstm_model.add(TimeDistributed(MaxPool3D(
                                    pool_size=(2, 2, 2),
                                    strides=(2, 2, 2),
                                    padding='valid'
                                    ), name="Pool_Layer_1"))

    # Flatten each step's pooled feature maps into one vector for the LSTM.
    cnn_lstm_model.add(TimeDistributed(Flatten(), name="Flatten_Layer"))
    
# The recurrent layer is added under the CPU device scope.
# NOTE(review): the reason for placing the LSTM on the CPU is not visible
# here (possibly GPU memory limits) - confirm.
with tf.device('/cpu:0'):

    # 10-unit LSTM with 30% input dropout and 30% recurrent dropout.
    cnn_lstm_model.add(LSTM(10, dropout = 0.3, recurrent_dropout = 0.3, name="LSTM_Layer"))

# Output layer and compilation, under the GPU device scope.
with tf.device('/gpu:0'):

    # Single sigmoid unit: binary classification output.
    cnn_lstm_model.add(Dense(1, activation = 'sigmoid', name="Output_Dense_Layer"))

    # NOTE(review): newer Keras releases name this argument `learning_rate`
    # rather than `lr` - confirm against the installed TensorFlow version.
    cnn_lstm_model.compile(optimizer=optimizers.Adam(lr=0.0001),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

NameError: name 'FMRIDataGenerator' is not defined

In [None]:
# Train from the training generator, validating against the validation
# generator after each epoch; the callbacks list is passed through unchanged.
# NOTE(review): `fit_generator` is deprecated in newer TensorFlow releases in
# favour of `fit` - confirm against the installed version before switching.
cnn_lstm_model.fit_generator(
    generator=training_generator,
    steps_per_epoch=train_steps_per_epoch,
    epochs=epochs,
    verbose=1,
    callbacks=callbacks,
    validation_data=validation_generator,
    validation_steps=validate_steps_per_epoch,
)