In [1]:
from sys import path
# Add the current working directory to the module search path
# (presumably so the project-local `repeat_src` package imported below resolves — confirm).
path.append(".")

# Intro

This notebook runs some simple tests on the "impossible to interpret" scripts provided by the authors. I focus especially on the data-related part, since the way they decided to handle the data is very unusual.

In [38]:
from glob import glob
from os.path import join as join_paths
from typing import Iterator
from typing import List

from keras.preprocessing import image
from keras_vggface.utils import preprocess_input
from numpy import expand_dims, array
from numpy import ndarray
from pandas import read_csv
from tqdm.notebook import tqdm

from repeat_src.utils.models import get_backbone_model

def load_images(file_list: List[str], batch_size: int) -> Iterator[ndarray]:
    """Lazily load, preprocess and batch the images listed in ``file_list``.

    Every file is read with ``image.load_img``, converted with
    ``image.img_to_array`` and passed through
    ``preprocess_input(..., version=2)``. Images are batched in the order
    given. Images that do not fill a whole batch at the end of the list are
    yielded as a final, smaller batch, so no input file is dropped.

    Args:
        file_list: Paths of the image files to load.
        batch_size: Maximum number of images per batch. Clamped to
            ``len(file_list)`` when larger.

    Yields:
        One array per batch with shape ``(n, *image_shape)``, where ``n`` is
        the number of images in that batch. All images of a batch are
        assumed to share the same shape.

    Raises:
        ValueError: If ``file_list`` is not empty and ``batch_size`` is
            smaller than 1.
    """
    if not file_list:
        # Nothing to load: behave like an empty generator.
        return
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # Never ask for more images per batch than there are files.
    batch_size = min(batch_size, len(file_list))

    for start in range(0, len(file_list), batch_size):
        frames: list = []
        for file_path in file_list[start:start + batch_size]:
            frame = image.img_to_array(image.load_img(file_path))
            # preprocess_input is given a leading batch axis of size 1.
            # NOTE(review): keras_vggface documents version=1 for VGG16 and
            # version=2 for ResNet50/SENet50; version=2 is kept unchanged
            # here — confirm it matches the backbone in use.
            frame = preprocess_input(expand_dims(frame, axis=0), version=2)
            frames.append(frame)

        # array(frames) has shape (n, 1, *image_shape); drop the singleton
        # axis without hard-coding the image size.
        stacked = array(frames)
        yield stacked.reshape((len(frames),) + stacked.shape[2:])
            

In [36]:
import tensorflow as tf

# List the GPU devices TensorFlow can see and report how many there are
# (0 means TensorFlow found no GPU).
visible_gpus = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(visible_gpus))

Num GPUs Available:  0


In [28]:
# Shape handed to the backbone builder below
# (presumably height, width, channels of one face image — confirm).
input_shape: tuple = (256, 256, 3)
# Number of frames per batch used by the image loader and the prediction loop.
batch_size: int = 16

In [4]:
# Build the feature-extraction backbone with the project helper; in the recorded
# run this call downloaded the VGG16 VGGFace "notop" weights.
model = get_backbone_model(input_shape=input_shape)

Downloading data from https://github.com/rcmalli/keras-vggface/releases/download/v2.0/rcmalli_vggface_tf_notop_vgg16.h5


In [21]:
# Read the validation ground truth and derive one "<video>/<utterance stem>"
# identifier per row (the utterance stem is everything before its first dot).
validation_ground_truth_df = read_csv("../omg_ValidationVideos.csv")

video_id_series = validation_ground_truth_df.apply(
    lambda x: f"{x['video']}/{x['utterance'].split('.')[0]}", axis=1
)
video_id_list: list = video_id_series.tolist()

# Each identifier is also the sub-folder, relative to the trimmed-faces root,
# from which PNG frames are globbed later on.
trimmed_faces_root = "/data/leonardo/OMGEmotionChallenge/Validation_Set/trimmed_faces/"
paths_to_faces = list(
    map(lambda video_id: join_paths(trimmed_faces_root, video_id), video_id_list)
)



In [33]:
# Keep only the first three video folders (presumably a debug shortcut to keep the extraction loop short).
# NOTE(review): this overwrites the full list; re-run the cell that builds `paths_to_faces` to restore it.
paths_to_faces = paths_to_faces[:3]

In [46]:
extracted_features: list = []
for current_video_path in tqdm(paths_to_faces, desc='Extracting features w/ VGG16-Face'):
    video_frames_paths = glob(join_paths(current_video_path, "*.png"))

    current_video_features = model.predict_generator(
        load_images(video_frames_paths, batch_size),
        (len(video_frames_paths) // batch_size) + 1,
        verbose=1,
    )
    extracted_features.append(current_video_features)
    # df_cnn=df_cnn[0:len(idict[k]),:]
    # df_cnn=pd.DataFrame(df_cnn)
    # df_cnn.to_csv(predict_path+k+'.csv',index=None)


ImportError: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html

# Checking feature extraction

In [1]:
import pandas as pd

In [7]:
# Load one previously saved feature file to inspect its layout.
example_features_extracted = pd.read_csv("./extracted_features/Validation_Set/0cf41d04d_1/utterance_28.csv")

In [8]:
# (rows, columns) of the loaded feature table.
example_features_extracted.shape

(304, 512)

The shape is `(frame_number, VGG_output_shape)`. However, the number of rows is slightly larger than the actual number of frames, since it has to be a multiple of the batch size (in this case 16; here 304 = 19 × 16). I believe that, when there are not enough frames to fill the last batch, the extra frames are created by repeating the last one.

# Audio feature extraction

In [1]:
from typing import List
from glob import glob
from opensmile import Smile, FeatureSet, FeatureLevel
from numpy import ndarray
from pandas import read_csv, DataFrame

def clean_index(current_idx: tuple) -> str:
    """Reduce an index entry to a ``"<parent folder>/<file stem>"`` key.

    Args:
        current_idx: Index entry whose first element is a ``/``-separated
            file path (presumably the per-file tuples of the frame returned
            by ``Smile.process_files`` — TODO confirm).

    Returns:
        The last two path components joined by ``/``, where the file name is
        cut at its first ``.`` (so ``"a/b/c.tar.gz"`` becomes ``"b/c"``). A
        path with a single component yields just that truncated name.
    """
    # Only the last two components matter, so split from the right.
    parent_and_file = current_idx[0].rsplit("/", 2)[-2:]
    # Drop everything from the first dot of the file name onwards.
    parent_and_file[-1] = parent_and_file[-1].partition(".")[0]
    return "/".join(parent_and_file)

train_labels_path: str = "../omg_TrainVideos.csv"
audio_files: List[str] = glob("../Train_Set/audio/*/*.wav")

# Extract ComParE_2016 functionals for every audio file, then re-key the rows
# with clean_index so they can be matched against the label table.
smile = Smile(
    feature_level=FeatureLevel.Functionals,
    feature_set=FeatureSet.ComParE_2016,
)
audio_features: DataFrame = smile.process_files(audio_files)
audio_features.index = audio_features.index.map(clean_index)

# Load the labels and key them by "<video>/<utterance stem>".
train_labels: DataFrame = read_csv(train_labels_path)
# NOTE(review): this is a pandas column despite the ndarray annotation, and it
# is not referenced again in the cells shown here.
emotion_labels: ndarray = train_labels["EmotionMaxVote"]
train_labels.index = train_labels.apply(
    lambda x: f"{x['video']}/{x['utterance'].split('.')[0]}", axis=1
)

# Sort both tables by key, then attach the label as the last column.
# merge defaults to an inner join, so rows without a match on either side are dropped.
audio_features = audio_features.sort_index()
train_labels = train_labels.sort_index()

audio_features_with_labels = audio_features.merge(
    train_labels['EmotionMaxVote'],
    left_index=True,
    right_index=True,
)

I am repeating what they did: train an XGBoost classifier to recognize emotions (this is stated in the paper), and from there extract the 256 most important features.

In [32]:
# The merge appended the label as the last column: everything before it is a
# feature, the last column is the target.
x_train: ndarray = audio_features_with_labels.to_numpy()[:, :-1]
y_train: ndarray = audio_features_with_labels.to_numpy()[:, -1]

In [6]:
from xgboost import XGBClassifier

# Fit an XGBoost classifier with default hyper-parameters on the audio features.
# NOTE(review): this rebinds `model`, the name an earlier cell uses for the face backbone.
model = XGBClassifier()
# Last expression of the cell, so the fitted estimator's repr is displayed.
model.fit(x_train, y_train)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=20, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [24]:
from numpy import arange, array
from pandas import Series

# Label every importance score with its column position, sort the scores from
# highest to lowest and keep the positions of the first 256.
feature_importante_idx = arange(len(model.feature_importances_))
most_important_features_idx = array(
    Series(model.feature_importances_, index=feature_importante_idx)
    .sort_values(ascending=False)
    .iloc[:256]
    .index
)

In [29]:
# Keep only the selected feature columns, in the order given by `most_important_features_idx`.
x_train_most_important_features = x_train[:,most_important_features_idx]

In [41]:
# Write the reduced training matrix to CSV, keeping the row index of the merged table.
# NOTE(review): no column names are passed, so columns are written as positional integers
# rather than feature names; the output folder presumably has to exist already — confirm.
DataFrame(x_train_most_important_features, index = audio_features_with_labels.index).to_csv("./features_extracted_audio/train_features.csv")


In [44]:
from numpy import save
# Store the selected column positions (presumably so the same columns can be picked from other data splits — confirm).
save("./features_extracted_audio/importante_idx.npy", most_important_features_idx)