In [99]:
import tensorflow as tf
import pandas as pd
import os
from pathlib import Path
import numpy as np
import tensorflow_hub as hub
import tensorflow_io as tfio

In [100]:
# !python3 -m pip install tensorflow-hub

In [101]:
tf.__version__

'2.5.0'

In [102]:
physical_devices = tf.config.list_physical_devices('GPU')
for device in physical_devices:
    tf.config.experimental.set_memory_growth(device, True)

In [103]:
# load yamnet model
yamnet_model_handle = 'https://tfhub.dev/google/yamnet/1'
yamnet_model = hub.load(yamnet_model_handle)

# vvgish model
vggish_model_handle = "https://tfhub.dev/google/vggish/1"
vggish_model = hub.load(vggish_model_handle)

In [104]:
# Utility functions for loading audio files and making sure the sample rate is correct.

@tf.function
def load_wav_16k_mono(filename):
    """ Load a WAV file, convert it to a float tensor, resample to 16 kHz single-channel audio. """
    file_contents = tf.io.read_file(filename)
    wav, sample_rate = tf.audio.decode_wav(
          file_contents,
          desired_channels=1)
    wav = tf.squeeze(wav, axis=-1)
    sample_rate = tf.cast(sample_rate, dtype=tf.int64)
    wav = tfio.audio.resample(wav, rate_in=sample_rate, rate_out=16000)
    return wav

# Data creation

In [105]:
# read train and val data

train_df = pd.read_csv("first_impression_audio_train.csv")
test_df = pd.read_csv("first_impression_audio_val.csv")

In [106]:
from sklearn.model_selection import train_test_split


train_df, val_df = train_test_split(train_df, test_size=0.1)

In [107]:
train_df.reset_index(inplace=True)
val_df.reset_index(inplace=True)

In [108]:
# seperate x and ys i.e. data and latrain_test_split_name = train_df.file_path.values
file_name = train_df.file_path
targets = train_df[['openness',  'conscientiousness', 'extraversion', 'agreeableness','neuroticism']]

# openness = train_df['openness']
# cons = train_df['conscientiousness']
# extr = train_df['extraversion']
# agree = train_df['agreeableness']
# neur = train_df['neuroticism']


# val data
val_file_name = val_df.file_path
val_targets = val_df[['openness',  'conscientiousness', 'extraversion', 'agreeableness','neuroticism']]

# val_openness = val_df['openness']
# val_cons = val_df['conscientiousness']
# val_extr = val_df['extraversion']
# val_agree = val_df['agreeableness']
# val_neur = val_df['neuroticism']


# test data
test_file_name = test_df.file_path
test_targets = test_df[['openness',  'conscientiousness', 'extraversion', 'agreeableness','neuroticism']]

# test_openness = test_df['openness']
# test_cons = test_df['conscientiousness']
# test_extr = test_df['extraversion']
# test_agree = test_df['agreeableness']
# test_neur = test_df['neuroticism']


In [109]:
# np.reshape(test_targets[0], (-1, 5)).shape

In [110]:
# file_name

In [111]:
# def load_wav_for_map(filename, label):
#     return load_wav_16k_mono(filename), label
#     # return load_wav_16k_mono(filename), tf.reshape(label, (-1, 5))

    
def load_wav_for_map(filename, label1, label2, label3, label4, label5):
    return load_wav_16k_mono(filename), label1, label2, label3, label4, label5
    # return load_wav_16k_mono(filename), tf.reshape(label, (-1, 5))


def load_wav_for_map1(filename, label):
    return load_wav_16k_mono(filename), label

In [125]:
# create tf dataset
AUTO = tf.data.AUTOTUNE

train_ds = tf.data.Dataset.from_tensor_slices((file_name, targets))
train_ds = train_ds.map(load_wav_for_map1, AUTO)

# train_ds = tf.data.Dataset.from_tensor_slices((file_name, openness, cons, extr, agree, neur))
# train_ds = train_ds.map(load_wav_for_map, AUTO)


train_ds.element_spec

(TensorSpec(shape=<unknown>, dtype=tf.float32, name=None),
 TensorSpec(shape=(5,), dtype=tf.float64, name=None))

In [126]:
val_ds = tf.data.Dataset.from_tensor_slices((val_file_name, val_targets))
val_ds = val_ds.map(load_wav_for_map1, AUTO)

# val_ds = tf.data.Dataset.from_tensor_slices((val_file_name, val_openness, val_cons, val_extr, val_agree, val_neur))
# val_ds = val_ds.map(load_wav_for_map, AUTO)


val_ds.element_spec

(TensorSpec(shape=<unknown>, dtype=tf.float32, name=None),
 TensorSpec(shape=(5,), dtype=tf.float64, name=None))

In [127]:
test_ds = tf.data.Dataset.from_tensor_slices((test_file_name, test_targets))
test_ds = test_ds.map(load_wav_for_map1, AUTO)

# test_ds = tf.data.Dataset.from_tensor_slices((test_file_name, test_openness, test_cons, test_extr, test_agree, test_neur))
# test_ds = test_ds.map(load_wav_for_map, AUTO)


test_ds.element_spec

(TensorSpec(shape=<unknown>, dtype=tf.float32, name=None),
 TensorSpec(shape=(5,), dtype=tf.float64, name=None))

In [128]:
for ele in train_ds:
    # print(extract_embedding(ele[0].numpy(), 1))
    print(ele[0])
    break

tf.Tensor(
[-2.1225111e-08  6.2852862e-08 -2.7310557e-08 ...  3.0617358e-03
  5.4587917e-03  9.6694697e-03], shape=(244831,), dtype=float32)


In [129]:
train_batch_size = val_batch_size = 8


train_steps = len(train_df)//train_batch_size
val_steps = len(val_df)//val_batch_size

In [130]:
# tf.config.run_functions_eagerly(True)

In [131]:
# applies the embedding extraction model to a wav data
def extract_embedding(wav_data, label):
    ''' run YAMNet to extract embedding from the wav data '''
    scores, embeddings, spectrogram = yamnet_model(wav_data)
    print(embeddings)
    
    # embeddings = tf.math.reduce_mean(embeddings, axis=0)
    
    num_embeddings = tf.shape(embeddings)[0]
    # print(tf.shape(embeddings)[0])
    
    # use reduce mean to calculate mean of array along axis 1 (mean of column value)
    return tf.math.reduce_mean(embeddings, axis=0), label


In [132]:
# # applies the embedding extraction model to a wav data
# def extract_embedding1(wav_data, label1, label2, label3, label4, label5):
#     ''' run YAMNet to extract embedding from the wav data '''
#     scores, embeddings, spectrogram = yamnet_model(wav_data)
#     print(embeddings)
    
#     embeddings = tf.math.reduce_mean(embeddings, axis=0)
#     num_embeddings = tf.shape(embeddings)[0]
#     print(tf.shape(embeddings)[0])
#     return (
#         embeddings,
#         # spectrogram,
#         # label
#         tf.repeat(label1, num_embeddings),
#         tf.repeat(label2, num_embeddings),
#         tf.repeat(label3, num_embeddings),
#         tf.repeat(label4, num_embeddings),
#         tf.repeat(label5, num_embeddings)
#            )


In [133]:
train_ds = train_ds.map(extract_embedding)
# train_ds = train_ds.map(extract_embedding1).unbatch()

train_ds.element_spec

Tensor("StatefulPartitionedCall:1", shape=(None, 1024), dtype=float32)


(TensorSpec(shape=(1024,), dtype=tf.float32, name=None),
 TensorSpec(shape=(5,), dtype=tf.float64, name=None))

In [134]:
val_ds = val_ds.map(extract_embedding)

# val_ds = val_ds.map(extract_embedding1).unbatch()
val_ds.element_spec

Tensor("StatefulPartitionedCall:1", shape=(None, 1024), dtype=float32)


(TensorSpec(shape=(1024,), dtype=tf.float32, name=None),
 TensorSpec(shape=(5,), dtype=tf.float64, name=None))

In [135]:
test_ds = test_ds.map(extract_embedding)

# test_ds = test_ds.map(extract_embedding1).unbatch()
test_ds.element_spec

Tensor("StatefulPartitionedCall:1", shape=(None, 1024), dtype=float32)


(TensorSpec(shape=(1024,), dtype=tf.float32, name=None),
 TensorSpec(shape=(5,), dtype=tf.float64, name=None))

In [136]:
# # stack labels again
# def concat_label(x, lb1, lb2, lb3, lb4, lb5):
#     return x, tf.stack([lb1, lb2, lb3, lb4, lb5])

In [137]:
# train_ds = train_ds.map(concat_label)
# val_ds = val_ds.map(concat_label)
# test_ds = test_ds.map(concat_label)

In [138]:
for ele in train_ds:
    print(ele[1])
    break

tf.Tensor([0.42222222 0.66990291 0.43925234 0.67032967 0.46875   ], shape=(5,), dtype=float64)


In [139]:
for ind,i in enumerate(train_ds):
    print(type(i[1]))
    break
    

<class 'tensorflow.python.framework.ops.EagerTensor'>


In [140]:
# # applies the embedding extraction model to a wav data
# def extract_embedding1(wav_data):
#     ''' run YAMNet to extract embedding from the wav data '''
#     scores, embeddings, spectrogram = yamnet_model(wav_data)
#     print(embeddings)
#     num_embeddings = tf.shape(embeddings)[0]
#     # print(tf.shape(embeddings)[0])
#     return (
#         embeddings,
#             # tf.repeat(label, num_embeddings)
#            )


In [141]:
# # split x and y and extract audio feactures
# train_ds1 = train_ds.map(lambda x,y: x)
# target_dataset = train_ds.map(lambda x,y: y)
# train_ds1 = train_ds1.map(extract_embedding1).unbatch()
# train_ds1 = tf.data.Dataset.zip((train_ds1, target_dataset))


# # val
# val_ds1 = val_ds.map(lambda x,y: x)
# val_target_dataset = val_ds.map(lambda x,y: y)

# # test
# test_ds1 = test_ds.map(lambda x,y: x)
# test_target_dataset = test_ds.map(lambda x,y: y)




In [142]:
# for ind, (x,y) in enumerate(train_ds1):
#     print(x[0])
#     print(f"y = {y[0]}")
#     if ind == 5:
#         break

In [143]:
# train_ds = train_ds.map(extract_embedding)
train_ds.element_spec

(TensorSpec(shape=(1024,), dtype=tf.float32, name=None),
 TensorSpec(shape=(5,), dtype=tf.float64, name=None))

In [144]:
# test_ds = test_ds.map(extract_embedding)
# val_ds = val_ds.map(extract_embedding)

In [145]:
# train_ds.filter(lambda x,y :x.shape[0])

In [146]:
# model.summary()

In [147]:
# train_ds1 = train_ds.unbatch().batch(train_batch_size)

In [148]:
# for j,i in enumerate(train_ds1):
#     print(i[0].shape)
#     print(i[0].numpy().shape)
#     # print(i[1].shape)
#     if j ==10:
#         break


In [149]:
train_ds = train_ds.shuffle(32).repeat().batch(train_batch_size).prefetch(AUTO)
val_ds = val_ds.batch(val_batch_size).prefetch(AUTO)
test_ds = test_ds.batch(val_batch_size).prefetch(AUTO)

In [150]:
# for i,j in train_ds.take(1):
#     print(i[0].shape)
#     # tf.keras.layers.GlobalAveragePooling2D(i[0])
#     break

In [151]:
base_model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(1024), dtype=tf.float32,
                          name='input_embedding'),
    # tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.Dense(5, activation='sigmoid')
], name='audio_personality')

# model.summary()





In [152]:
model = base_model

In [153]:
# class ReduceMeanLayer(tf.keras.layers.Layer):
#     def __init__(self, axis=0, **kwargs):
#         super(ReduceMeanLayer, self).__init__(**kwargs)
#         self.axis = axis
        
#     def call(self, input):
#         return tf.math.reduce_mean(input, axis=self.axis)

In [154]:
# input_segment = tf.keras.layers.Input(shape=(), dtype=tf.float32, name='audio')
# embedding_extraction_layer = hub.KerasLayer(yamnet_model_handle,
#                                             trainable=False, name='yamnet')
# _, embeddings_output, _ = embedding_extraction_layer(input_segment)
# serving_outputs = base_model(embeddings_output)
# serving_outputs = ReduceMeanLayer(axis=0, name='classifier')(serving_outputs)
# model = tf.keras.Model(input_segment, serving_outputs)

In [155]:
# for i,j in train_ds.take(1):
#     _, emb, _ = embedding_extraction_layer(i[0])
#     op=base_model(emb)
#     print(ReduceMeanLayer(axis=0, name='classifier')(op))
#     print(j[0])
#     break

# Custom callback to save model

In [156]:
def generate_output_dir(outdir, run_desc):
    prev_run_dirs = []
    if os.path.isdir(outdir):
        prev_run_dirs = [x for x in os.listdir(outdir) if os.path.isdir(\
            os.path.join(outdir, x))]
    prev_run_ids = [re.match(r'^\d+', x) for x in prev_run_dirs]
    prev_run_ids = [int(x.group()) for x in prev_run_ids if x is not None]
    cur_run_id = max(prev_run_ids, default=-1) + 1
    run_dir = os.path.join(outdir, f'{cur_run_id:05d}-{run_desc}')
    assert not os.path.exists(run_dir)
    os.makedirs(run_dir)
    return run_dir

In [157]:
import re
import os

outdir = "./checkpoint/audio_personality/"
if not os.path.isdir(outdir):
    os.makedirs(outdir)
    
run_desc = "audio_personality"

run_dir = generate_output_dir(outdir, run_desc)
print(f"Results saved to: {run_dir}")

Results saved to: ./checkpoint/audio_personality/00008-audio_personality


In [158]:
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
import pickle


class MyModelCheckpoint(ModelCheckpoint):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
    def on_epoch_end(self, epoch, logs=None):
        super().on_epoch_end(epoch,logs)

        # Also save the optimizer state
        filepath = self._get_file_path(epoch=epoch, 
                                       logs=logs
                                       # ,batch=None
                                      )

        filepath = filepath.rsplit( ".", 1 )[ 0 ] 
        filepath += ".pkl"

        with open(filepath, 'wb') as fp:
            pickle.dump(
            {
                'opt': model.optimizer.get_config(),
                'epoch': epoch+1,
                'lr': model.optimizer.learning_rate
                
             # Add additional keys if you need to store more values
            }, fp, protocol=pickle.HIGHEST_PROTOCOL)
        print('\nEpoch %05d: saving optimizaer to %s' % (epoch + 1, filepath))

In [159]:
from tensorflow.keras.callbacks import ReduceLROnPlateau


# reduce lr on plateau
reduce_lr = ReduceLROnPlateau(monitor='val_mean_acc', factor=0.5, patience=2, 
                                   verbose=1, mode='min', min_lr=0.0000000001)


checkpoint = MyModelCheckpoint(os.path.join(run_dir, 'audio-personality-model-{epoch:02d}-{val_loss:.2f}.h5'),
        monitor='val_loss',verbose=1, save_best_only=True, mode='auto')
                              
callbacks_list = [checkpoint, reduce_lr]

In [160]:
import tensorflow.keras.backend as k
def mean_acc(y_true, y_pred):
    diff = k.abs(y_true - y_pred)
    return k.mean(1-diff)

In [161]:
model.compile(
    loss = ['mae'],
    # loss = tf.keras.losses.BinaryCrossentropy(from_logits=True), 
    optimizer = tf.keras.optimizers.Adam(learning_rate = 0.0001), 
    metrics = [mean_acc]
)

In [162]:
model.summary()

Model: "audio_personality"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_2 (Dense)              (None, 512)               524800    
_________________________________________________________________
dense_3 (Dense)              (None, 5)                 2565      
Total params: 527,365
Trainable params: 527,365
Non-trainable params: 0
_________________________________________________________________


In [None]:
model.fit(x= train_ds, steps_per_epoch=train_steps, 
               validation_data=val_ds,validation_steps=val_steps,
               epochs=50, callbacks=callbacks_list)

# model.fit(x= train_batches, steps_per_epoch=train_steps, 
#                validation_data=val_batches,validation_steps=val_steps,
#                epochs=50, callbacks=callbacks_list)

Epoch 1/50


2023-02-02 13:02:19.032132: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublas.so.11


  1/675 [..............................] - ETA: 3:39:00 - loss: 0.1340 - mean_acc: 0.8660

2023-02-02 13:02:30.725627: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublasLt.so.11
2023-02-02 13:02:30.725742: I tensorflow/stream_executor/cuda/cuda_blas.cc:1838] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.



Epoch 00001: val_loss improved from inf to 0.11233, saving model to ./checkpoint/audio_personality/00008-audio_personality/audio-personality-model-01-0.11.h5

Epoch 00001: saving optimizaer to ./checkpoint/audio_personality/00008-audio_personality/audio-personality-model-01-0.11.pkl
Epoch 2/50

Epoch 00002: val_loss improved from 0.11233 to 0.10918, saving model to ./checkpoint/audio_personality/00008-audio_personality/audio-personality-model-02-0.11.h5

Epoch 00002: saving optimizaer to ./checkpoint/audio_personality/00008-audio_personality/audio-personality-model-02-0.11.pkl
Epoch 3/50

Epoch 00003: val_loss improved from 0.10918 to 0.10866, saving model to ./checkpoint/audio_personality/00008-audio_personality/audio-personality-model-03-0.11.h5

Epoch 00003: saving optimizaer to ./checkpoint/audio_personality/00008-audio_personality/audio-personality-model-03-0.11.pkl

Epoch 00003: ReduceLROnPlateau reducing learning rate to 4.999999873689376e-05.
Epoch 4/50

Epoch 00004: val_loss 

In [None]:
# compiling into single model

class ReduceMeanLayer(tf.keras.layers.Layer):
    def __init__(self, axis=0, **kwargs):
        super(ReduceMeanLayer, self).__init__(**kwargs)
        self.axis = axis
        
    def call(self, input):
        return tf.math.reduce_mean(input, axis=self.axis)

In [None]:
model.evaluate(test_ds)

In [None]:
# saved_model_path = './audio_personality_yamnet'

# input_segment = tf.keras.layers.Input(shape=(), dtype=tf.float32, name='audio')
# embedding_extraction_layer = hub.KerasLayer(yamnet_model_handle,
#                                             trainable=True, name='yamnet')
# _, embeddings_output, _ = embedding_extraction_layer(input_segment)
# serving_outputs = model(embeddings_output)
# serving_outputs = ReduceMeanLayer(axis=0, name='classifier')(serving_outputs)
# serving_model = tf.keras.Model(input_segment, serving_outputs)
# # serving_model.save(saved_model_path, include_optimizer=False)

In [None]:
# serving_model(load_wav_16k_mono(test_df.file_path[10]))

In [None]:
# test_df.neuroticism[10]

In [None]:
# serving_model.compile(
#     loss = ['mae'],
#     # loss = tf.keras.losses.BinaryCrossentropy(from_logits=True), 
#     optimizer = tf.keras.optimizers.Adam(learning_rate = 0.0001), 
#     metrics = [mean_acc]
# )

In [None]:
# tr_ds = tf.data.Dataset.from_tensor_slices(train_df.file_path.to_list())

In [None]:
# tr_ds = tr_ds.map(load_wav_16k_mono)

In [None]:
# serving_model.fit(x=tr_ds, steps_per_epoch=train_steps, epochs=1)