In [3]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, callbacks
from tensorflow.keras.layers.experimental import preprocessing
from sklearn.model_selection import train_test_split
from os import listdir, makedirs
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report, confusion_matrix
import time
import random

In [4]:
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
    dataframe = dataframe.copy()
    labels = dataframe.pop('targetValue')
    ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(dataframe))
    ds = ds.batch(batch_size)
    ds = ds.prefetch(batch_size)
    return ds

def get_normalization_layer(name, dataset):
    # Create a Normalization layer for our feature.
    normalizer = preprocessing.Normalization()
    # Prepare a Dataset that only yields our feature.
    feature_ds = dataset.map(lambda x, y: x[name])
    # Learn the statistics of the data.
    normalizer.adapt(feature_ds)
    return normalizer

def load_subsample_data(sample_array, subsample_array):
    """
    %author Łukasz Ozimek
    Function to use arrays of sample numbers to load in data of subsample audio features.
    
    Parameter:
    sample_array - array of sample numbers
    subsample_array - array of subsample data
    
    Returns:
    final_array - array of subsample data for selected samples
    """
    final_array = np.zeros(shape=(int(len(sample_array)*5),len(subsample_array.columns)))
    idx = 0
    for i in sample_array:
        num = i*5
        for n in range(0,5):
            final_array[idx]=subsample_array[num+n:num+n+1]
            idx+=1
    return final_array

In [5]:
def data_model(path, save_path, save_name_start = 'train', iter_num = 5, epoch_num =20):
    """
    Function that creates datasets out of csv files and normalizes them using Keras layer preprocessing. Then a model is 
    created     and trained multiple times. Then it's metrics are saved and their summary is printed in the console.
    
    Parameters:
    path - filepath of the csv document with features
    save_path - directory where model metrics will be saved
    save_name_start - beginning of each saved file
    iter_num - number of Kfold splits
    epoch_num - number of epochs
    neurons - number of neurons on first dense layer
    
    Returns:
    df - Padnas DataFrame with best metrics from each cycle
    summary - model summary
    """
    try:
        makedirs(save_path)
    except:
        pass
    save_path = save_path+save_name_start
    file = path
    df = pd.read_csv(file)
    df.drop('Unnamed: 0', axis=1, inplace = True)
    base = list(range(0,int(len(df)/5))) #array of numbers equal to number of whole samples
    
    # Defining list to store best values
    best_loss = []
    best_acc = []
    best_loss_val = []
    best_acc_val = []
    batch_size = 20
    iterator = 0
    kfold = KFold(n_splits=iter_num, shuffle=True)
    
    for train_base, val_base in kfold.split(base):
        train = pd.DataFrame(load_subsample_data(train_base, df),columns=df.columns)
        val = pd.DataFrame(load_subsample_data(val_base, df),columns=df.columns)
        train_ds = df_to_dataset(train, batch_size=batch_size)
        column_list = train.columns[0:-1]
        all_inputs = []
        encoded_features = []
        count = 0
        for header in column_list:
            count += 1
            print('Processed %d column out of %d' % (count, len(column_list)), end="\r", flush=True)
            numeric_col = tf.keras.Input(shape=(1,), name=header)
            normalization_layer = get_normalization_layer(header, train_ds)
            encoded_numeric_col = normalization_layer(numeric_col)
            all_inputs.append(numeric_col)
            encoded_features.append(encoded_numeric_col)

        print("")
        # Build model
        all_features = tf.keras.layers.concatenate(encoded_features)
        x = tf.keras.layers.Dense(128, activation="relu")(all_features)
        x = tf.keras.layers.Dense(128, activation="relu")(x)
        x = tf.keras.layers.Dropout(0.4)(x)
        output = tf.keras.layers.Dense(1)(x)
        val_ds = df_to_dataset(val, shuffle=False, batch_size=batch_size)
        iterator += 1
        print('Training cycle %d out of %d' % (iterator, iter_num), end="\r", flush=True)
        model = tf.keras.Model(all_inputs, output)
        model.compile(optimizer='adam',
                      loss=tf.keras.losses.BinaryCrossentropy(),
                      metrics=["accuracy"])
        
        # Callback to save best model
        checkpoint_val_acc = callbacks.ModelCheckpoint(
        'model_val_acc_'+str(iterator)+'.h5', monitor='val_accuracy', verbose=0, save_best_only=True,
        save_weights_only=False,  save_freq='epoch' )
        
        model.fit(train_ds, epochs=epoch_num, validation_data=val_ds, verbose=0,callbacks=[checkpoint_val_acc])
        df2 = pd.DataFrame()
        for part in ['loss', 'accuracy', 'val_loss', 'val_accuracy']: 
            df2[part] = model.history.history[part]
        df2.index.names = ['Epoch']
        df2.to_csv(save_path+'_training'+str(iterator)+'.csv')
        
        # Save best values
        best_loss.append(max(model.history.history['loss']))
        best_acc.append(max(model.history.history['accuracy']))
        best_loss_val.append(max(model.history.history['val_loss']))
        best_acc_val.append(max(model.history.history['val_accuracy']))
        print("")
    
    # Post training
    summary = model.summary # Every model is the same so the summary can be called post training loop
    df = pd.DataFrame()
    df['Best_Loss'] = best_loss
    df['Best_Acc'] = best_acc
    df['Best_Loss_Val'] = best_loss_val
    df['Best_Acc_Val'] = best_acc_val
    df.index.names = ['Model_Num']
    df.to_csv(save_path+'_best'+'.csv')
    return df, summary

In [6]:
df, summary = data_model('./csvs/eGeMAPS/ReadText_eGeMAPSv01b.csv', './model_testing/ReadText_eGeMAPSv01b/',
                         'Metrics', iter_num=8, epoch_num=100)


Processed 88 column out of 88
Training cycle 1 out of 8
Processed 88 column out of 88
Training cycle 2 out of 8
Processed 88 column out of 88
Training cycle 3 out of 8
Processed 88 column out of 88
Training cycle 4 out of 8
Processed 88 column out of 88
Training cycle 5 out of 8
Processed 88 column out of 88
Training cycle 6 out of 8
Processed 88 column out of 88
Training cycle 7 out of 8
Processed 88 column out of 88
Training cycle 8 out of 8


In [7]:
df

Unnamed: 0_level_0,Best_Loss,Best_Acc,Best_Loss_Val,Best_Acc_Val
Model_Num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,6.493167,1.0,6.169979,0.92
1,3.980464,1.0,3.867504,0.8
2,4.057045,0.99375,2.753671,0.96
3,5.829454,0.99375,4.551526,1.0
4,5.818818,0.99375,7.454044,0.8
5,6.210269,1.0,5.80001,0.85
6,2.220295,0.993939,9.029058,0.65
7,3.835428,1.0,2.503509,0.85


In [6]:
df, summary = data_model('./csvs/Whole files/WholeReadText_eGeMAPSv01b.csv', './whole/',
                         'Metrics', iter_num=5, epoch_num=100)# There are 7 sapmles in a set, so number of splits must be smaller
df

Processed 88 column out of 88
Training cycle 1 out of 5
Processed 88 column out of 88
Training cycle 2 out of 5
Processed 88 column out of 88
Training cycle 3 out of 5
Processed 88 column out of 88
Training cycle 4 out of 5
Processed 88 column out of 88
Training cycle 5 out of 5


Unnamed: 0_level_0,Best_Loss,Best_Acc,Best_Loss_Val,Best_Acc_Val
Model_Num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,4.50061,1.0,1.860439,0.9
1,5.799523,1.0,5.005357,1.0
2,6.878037,1.0,6.833271,0.8
3,5.320766,1.0,12.234532,0.6
4,5.726707,1.0,9.149543,0.8
