In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental import preprocessing
from sklearn.model_selection import train_test_split
from os import listdir, makedirs
import time

In [3]:
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
    dataframe = dataframe.copy()
    labels = dataframe.pop('targetValue')
    ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(dataframe))
    ds = ds.batch(batch_size)
    ds = ds.prefetch(batch_size)
    return ds

def get_normalization_layer(name, dataset):
    # Create a Normalization layer for our feature.
    normalizer = preprocessing.Normalization()
    # Prepare a Dataset that only yields our feature.
    feature_ds = dataset.map(lambda x, y: x[name])
    # Learn the statistics of the data.
    normalizer.adapt(feature_ds)
    return normalizer

In [54]:
def data_model(path, save_path, save_name_start = 'train', whole_files = False, iter_num = 5, epoch_num =20):
    """
    Function that creates datasets out of csv files and normalizes them using Keras layer preprocessing. Then a model is 
    created     and trained multiple times. Then it's metrics are saved and their summary is printed in the console.
    
    Parameters:
    path - filepath of the csv document with features
    save_path - directory where model metrics will be saved
    save_name_start - beginning of each saved file
    whole_files - whether the model is trained on whole or split files
    iter_num - number of model training iterations
    iter_num - number of epochs
    
    Returns:
    df - Padnas DataFrame with best metrics from each cycle
    summary - model summary
    """
    try:
        makedirs(save_path)
    except:
        pass
    save_path = save_path+save_name_start
    file = path
    df = pd.read_csv(file)
    df.drop('Unnamed: 0', axis=1, inplace = True)
    if not whole_files:
        test_pd = df[0:10]
        test_hc = df[len(df)-10:len(df)]
        test = test_pd.append(test_hc)
        a = list(np.linspace(0,9,10).astype(int))
        df.drop(a, inplace=True)
        a = list(np.linspace(len(df)-10,len(df),10).astype(int))
        df.drop(a,inplace=True)
    train, val = train_test_split(df, test_size=0.3, random_state=55)
    batch_size = 20
    train_ds = df_to_dataset(train, batch_size=batch_size)
    column_list = train.columns[0:-1]
    all_inputs = []
    encoded_features = []
    count = 0
    for header in column_list:
        count += 1
        print('Processed %d column out of %d' % (count, len(column_list)), end="\r", flush=True)
        numeric_col = tf.keras.Input(shape=(1,), name=header)
        normalization_layer = get_normalization_layer(header, train_ds)
        encoded_numeric_col = normalization_layer(numeric_col)
        all_inputs.append(numeric_col)
        encoded_features.append(encoded_numeric_col)
        
    print("")
    # Build model
    all_features = tf.keras.layers.concatenate(encoded_features)
    x = tf.keras.layers.Dense(32, activation="relu")(all_features)
    x = tf.keras.layers.Dropout(0.4)(x)
    output = tf.keras.layers.Dense(1)(x)
    val_ds = df_to_dataset(val, shuffle=False, batch_size=batch_size)
    if not whole_files:
        test_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size)
    best_loss = []
    best_acc = []
    best_loss_val = []
    best_acc_val = []
    if not whole_files:
        test_acc = []
        test_loss = []
    iterator = 0
    while iterator<iter_num:
        iterator += 1
        print('Training cycle %d out of %d' % (iterator, iter_num), end="\r", flush=True)
        model = tf.keras.Model(all_inputs, output)
        model.compile(optimizer='adam',
                      loss=tf.keras.losses.BinaryCrossentropy(),
                      metrics=["accuracy"])
        model.fit(train_ds, epochs=epoch_num, validation_data=val_ds, verbose=0)
        df = pd.DataFrame()
        for part in ['loss', 'accuracy', 'val_loss', 'val_accuracy']: 
            df[part] = model.history.history[part]
        df.index.names = ['Epoch']
        df.to_csv(save_path+'_training'+str(iterator)+'.csv')
        
        # Save best values
        best_loss.append(max(model.history.history['loss']))
        best_acc.append(max(model.history.history['accuracy']))
        best_loss_val.append(max(model.history.history['val_loss']))
        best_acc_val.append(max(model.history.history['val_accuracy']))
        if not whole_files:
            eval_metrics = model.evaluate(train_ds, verbose=0)
            test_acc.append(eval_metrics[1])
            test_loss.append(eval_metrics[0])
        summary = model.summary # Every model is the same so the summary can be called post training loop
        del model
    print("")
    # Post training
    df = pd.DataFrame()
    df['Best_Loss'] = best_loss
    df['Best_Acc'] = best_acc
    df['Best_Loss_Val'] = best_loss_val
    df['Best_Acc_Val'] = best_acc_val
    if not whole_files:
        df['Test_Loss'] = test_loss
        df['Test_Acc'] = test_acc
    df.index.names = ['Model_Num']
    df.to_csv(save_path+'_best'+'.csv')
    return df, summary

In [45]:
start_time = time.time()
df, summary = data_model('./csvs/GeMAPS/ReadText_GeMAPSv01b.csv', './train_outputs/ReadText_GeMAPSv01b/',
                         'Metrics', iter_num=3)
print('Elapsed time: ' , time.time() - start_time)
display(df)

Processed 62 column out of 62
Training cycle 3 out of 3
Elapsed time:  14.01936936378479


Unnamed: 0_level_0,Best_Loss,Best_Acc,Best_Loss_Val,Best_Acc_Val,Test_Loss,Test_Acc
Model_Num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,6.103929,0.713043,3.850484,0.8,2.588799,0.704348
1,3.491644,0.843478,2.331122,0.78,0.442858,0.886957
2,1.975386,0.886957,1.223402,0.86,0.206845,0.956522


In [11]:
print(summary()) # It's in new cell because of the summary's size

Model: "functional_25"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
F0semitoneFrom27.5Hz_sma3nz_ame [(None, 1)]          0                                            
__________________________________________________________________________________________________
F0semitoneFrom27.5Hz_sma3nz_std [(None, 1)]          0                                            
__________________________________________________________________________________________________
F0semitoneFrom27.5Hz_sma3nz_per [(None, 1)]          0                                            
__________________________________________________________________________________________________
F0semitoneFrom27.5Hz_sma3nz_per [(None, 1)]          0                                            
______________________________________________________________________________________

In [22]:
# Compare set works much slower and there's something wrong with some names in it
# Returns "ValueError: 'audSpec_Rfilt_sma[0]_range' is not a valid scope name" error at 249 column
# start_time = time.time()
# df, summary = data_model('./csvs/ReadText_ComParE2016.csv', './csvs/train_outputs/ReadText_Compare/',
#                          'Metrics', iter_num=3)
# print('Elapsed time: ' , time.time() - start_time)
# display(df)

Processed 249 column out of 6373

ValueError: 'audSpec_Rfilt_sma[0]_range' is not a valid scope name

In [55]:
# Training models for GeMAPS and eGeMAPS csv datasets
start_time_full = time.time()
for directory in ['./csvs/GeMAPS/','./csvs/eGeMAPS/']:
    file_list = listdir(directory)
    print('File list:')
    print(file_list)
    print('Model training: ')
    for file in file_list:
        a = file[0:-4]
        print('')
        print('---------------------------------------------------------------------------------------------------------------')
        print('')
        print('\t \t \t Processing file ', a)
        print('')
        print('---------------------------------------------------------------------------------------------------------------')
        print('')
        start_time = time.time()
        df, summary = data_model(directory+file, './train_outputs/'+a+'/', 'Metrics')
        print('Elapsed time: ' , time.time() - start_time)

print('Full elapsed time: ' , time.time() - start_time_full)
    

File list:
['ReadText_GeMAPSv01b.csv', 'SpontaneousDialogue_GeMAPSv01b.csv']
Model training: 

---------------------------------------------------------------------------------------------------------------

	 	 	 Processing file  ReadText_GeMAPSv01b

---------------------------------------------------------------------------------------------------------------

Processed 62 column out of 62
Training cycle 5 out of 5
Elapsed time:  31.18152379989624

---------------------------------------------------------------------------------------------------------------

	 	 	 Processing file  SpontaneousDialogue_GeMAPSv01b

---------------------------------------------------------------------------------------------------------------

Processed 62 column out of 62
Training cycle 5 out of 5
Elapsed time:  33.37437415122986
File list:
['ReadText_eGeMAPSv01b.csv', 'SpontaneousDialogue_eGeMAPSv01b.csv']
Model training: 

------------------------------------------------------------------------------