pip install numpy  
pip install pandas  
pip install openpyxl  
pip install tqdm
conda install -c conda-forge ffmpeg  
pip install pydub  
conda install -c conda-forge librosa  
pip install -U praat-parselmouth

In [None]:
import pydub
import pandas as pd
from tqdm import tqdm
import os

# Path to the ffmpeg executable that pydub should use.
# Set the FFMPEG_BINARY environment variable to override it without editing
# this cell; otherwise the hard-coded default below is used.
pydub.AudioSegment.converter = os.environ.get(
    "FFMPEG_BINARY",
    r"C:/Users/Kinza/anaconda3/envs/pie/Library/bin/ffmpeg.exe",  # CHANGE THIS
)

# Data loading

In [None]:
def get_end_from_start(df_mail):
    '''Adds an 'end' column holding the 'start' value of the following row.
    Arguments:
        df_mail : DataFrame with a 'start' column. Rows are used in their
            current order. The frame is modified in place.
    Returns the same DataFrame. The last row has no following row, so its
    'end' is set to -1.
    '''
    # Shift by position rather than by index label: the previous label-based
    # append (end[len(end)+1] = -1) overwrote an existing value whenever that
    # label was already part of the index, producing a column that was one
    # element short. `.values` drops the index so the assignment is positional.
    df_mail['end'] = df_mail['start'].shift(-1, fill_value=-1).values
    return df_mail

def get_start_end_from_file(file):
    '''Reads the timing spreadsheet and adds start/end times to each row.
    Arguments:
        file : path (or file-like object) of an Excel sheet readable by
            pandas.read_excel. Must contain columns 'mail' and 'time'.
    Returns a DataFrame where 'time' is renamed to 'start' and multiplied by
    1000, and 'end' is the next 'start' within the same 'mail' group
    (-1 for the last row of each group).
    '''
    #Get start and end times
    df = pd.read_excel(file)
    df = df.rename(columns={'time':'start'})
    df['start'] = df['start']*1000 #in ms (presumably 'time' is in seconds)

    # group_keys=False keeps the original flat row index. Without it,
    # pandas >= 2.0 adds 'mail' as an extra index level while it is also
    # still a column, which makes later label-based operations ambiguous.
    df = df.groupby('mail', group_keys=False).apply(get_end_from_start)
    return df

def load_audios(video_folder,startend_file) :
    '''Collects the audios of every interview question into one flat list.
    Arguments:
        video_folder : str. Folder whose entries are each handed to load_audio.
        startend_file : str. File with the video information.
            Must contain columns 'mail' and 'time'.
    Returns a single list with the items produced by load_audio for all entries.
    '''
    # The directory listing is wrapped in tqdm so progress is shown while iterating.
    progress = tqdm(os.listdir(video_folder))
    timings = get_start_end_from_file(startend_file)

    collected = []
    for entry in progress:
        # load_audio yields several items per entry; append them one by one.
        collected.extend(load_audio(video_folder, timings, entry))
    return collected


In [None]:
from audio_feats_extract import load_audio

# Folder passed to load_audios as the video folder.
video_folder = 'videos/'
# Spreadsheet passed to load_audios as the start/end information file.
df_name = 'test.xlsx'

# Load the audios for every entry of video_folder into one flat list.
audios = load_audios(video_folder,df_name)

In [None]:
# Notebook cell output: display the list returned by load_audios.
audios

In [None]:
# Build one wide feature row per audio and stack all rows into `feats`,
# indexed by '<email>_<question>'.
feats = []
for audio in audios:
    # Work on a copy: the helper columns 'stat' and 'index' added below would
    # otherwise be written onto audio.spectral_features itself and show up in
    # any later use of that attribute.
    spectral_features          = audio.spectral_features.copy()
    spectral_features['stat']  = spectral_features.index
    spectral_features['index'] = 0
    # Flatten to a single row (index 0): one column per (feature, stat) pair.
    # columns[:-2] leaves out the two helper columns just added.
    spectral_features          = pd.pivot_table(spectral_features,index='index',columns='stat',values=spectral_features.columns[:-2],aggfunc='first')

    # NOTE(review): the column-wise concat aligns on the row index — presumably
    # prosodic_features is also a single row with index 0; confirm.
    features = pd.concat([audio.prosodic_features, spectral_features],axis=1)
    features['id']    = audio.email+'_'+str(audio.question)
    features = features.set_index('id')
    features['email'] = audio.email
    features['question'] = audio.question

    feats.append(features)

feats = pd.concat(feats,axis=0)

feats.head()

In [None]:
import ipywidgets as widgets
from ipywidgets import Layout

# Selector for the interview to highlight: one option per row of `feats`.
email = widgets.Select(
    options = feats.index.tolist(),
    description='Interview',
    disabled=False,
    layout = Layout(width='50%', height='80px', display='flex')
)


# Selector for the feature to plot: one option per column of `feats`.
# The label was previously 'Interview' (copied from the selector above),
# which made the two widgets indistinguishable.
variable = widgets.Select(
    options = feats.columns.tolist(),
    description='Variable',
    disabled=False,
    layout = Layout(width='50%', height='80px', display='flex')
)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

def print_(email,variable):
    '''Draws a boxplot of one feature and marks the selected interview on it.
    Arguments:
        email : index label of the row of `feats` to highlight.
        variable : name of the column of `feats` to plot.
    '''
    # Value of the chosen interview for the chosen feature, rounded for display.
    selected_value = round(feats.loc[email, variable], 3)

    sns.set_theme(style="whitegrid")
    axes = sns.boxplot(y=variable, data=feats)

    # Red horizontal line at the selected value.
    axes.axhline(selected_value, c='r')

    # Print the value next to the line, at 1.005 times its height.
    plt.text(
        -0.3,
        selected_value*1.005,
        selected_value,
        horizontalalignment='left',
        size='small',
        color='red',
        weight='normal',
    )


# Re-draw the plot whenever one of the two selectors changes.
widgets.interactive(print_, email=email, variable=variable)

# Load features

In [None]:
import pandas as pd

# Collect one pivoted feature table per audio, then stack them all.
all_features = []
for audio in audios:
    # Stack the three feature tables of this audio row-wise.
    features = pd.concat([audio.pauses_features,audio.spectral_features,audio.prosodic_features])
    features['email'] = audio.email
    features['question'] = audio.question

    features['temp'] = features.index
    # NOTE(review): after the three assignments above, the last three columns
    # are expected to be email, question, temp, so columns[-3] would select
    # 'email' as the pivot values. The pivot also requires a 'timestamp'
    # column in the concatenated tables. Confirm both are intended.
    features = features.pivot_table(index='timestamp', columns='temp', values=features.columns[-3], aggfunc='first')

    # list.append works in place and returns None. Re-binding its result (as
    # the code did before) replaced the list with None after the first audio.
    all_features.append(features)

all_features = pd.concat(all_features)

# PCA

In [None]:
from pca import pca

# Two-component PCA model from the third-party `pca` package.
model = pca(n_components=2)

# Fit transform
# NOTE(review): `features` is the variable left over from the last iteration of
# the feature-loading loop, not the stacked `all_features` — confirm which one
# is intended. A 'classes' column is also required here but is not created
# anywhere in the visible cells.
features_pca = model.fit_transform(features.drop('classes',axis=1))

print(model.compute_topfeat())

# Scatter first 2 PCs
fig, ax = model.scatter(legend=False)

# Make biplot with the number of features
# (fig and ax from the scatter call above are overwritten here)
fig, ax = model.biplot(n_feat=10,legend=False)

import matplotlib.pyplot as plt
# Manual scatter of the first two components, coloured by the 'classes' column.
plt.scatter(features_pca['PC']['PC1'],features_pca['PC']['PC2'],c=features['classes'])