# Extraction of prosody features from audio files

Compute prosody features from continuous speech.

103 features are computed:

Num     Feature                                                          Description

--------------------------------------------------------------------------------------------------------------------------
                                Features based on F0
                                
---------------------------------------------------------------------------------------------------------------------------
1-6     F0-contour                                                       Avg., Std., Max., Min., Skewness, Kurtosis

7-12    Tilt of a linear estimation of F0 for each voiced segment        Avg., Std., Max., Min., Skewness, Kurtosis

13-18   MSE of a linear estimation of F0 for each voiced segment         Avg., Std., Max., Min., Skewness, Kurtosis

19-24   F0 on the first voiced segment                                   Avg., Std., Max., Min., Skewness, Kurtosis

25-30   F0 on the last voiced segment                                    Avg., Std., Max., Min., Skewness, Kurtosis

--------------------------------------------------------------------------------------------------------------------------
                                Features based on energy
                                
---------------------------------------------------------------------------------------------------------------------------
31-34   energy-contour for voiced segments                               Avg., Std., Skewness, Kurtosis

35-38   Tilt of a linear estimation of energy contour for V segments     Avg., Std., Skewness, Kurtosis

39-42   MSE of a linear estimation of energy contour for V segments      Avg., Std., Skewness, Kurtosis

43-48   energy on the first voiced segment                               Avg., Std., Max., Min., Skewness, Kurtosis

49-54   energy on the last voiced segment                                Avg., Std., Max., Min., Skewness, Kurtosis

55-58   energy-contour for unvoiced segments                             Avg., Std., Skewness, Kurtosis

59-62   Tilt of a linear estimation of energy contour for U segments     Avg., Std., Skewness, Kurtosis

63-66   MSE of a linear estimation of energy contour for U segments      Avg., Std., Skewness, Kurtosis

67-72   energy on the first unvoiced segment                             Avg., Std., Max., Min., Skewness, Kurtosis

73-78   energy on the last unvoiced segment                              Avg., Std., Max., Min., Skewness, Kurtosis

--------------------------------------------------------------------------------------------------------------------------
                                Features based on duration
                                
---------------------------------------------------------------------------------------------------------------------------
79      Voiced rate                                                      Number of voiced segments per second

80-85   Duration of Voiced                                               Avg., Std., Max., Min., Skewness, Kurtosis

86-91   Duration of Unvoiced                                             Avg., Std., Max., Min., Skewness, Kurtosis

92-97   Duration of Pauses                                               Avg., Std., Max., Min., Skewness, Kurtosis

98-103  Duration ratios                                                  Pause/(Voiced+Unvoiced), Pause/Unvoiced, Unvoiced/(Voiced+Unvoiced),
                                                                         Voiced/(Voiced+Unvoiced), Voiced/Pause, Unvoiced/Pause

---------------------------------------------------------------------------------------------------------------------------



In [1]:
import os
from tempfile import TemporaryDirectory

from disvoice import Prosody

################################################################################
###          (please add 'export KALDI_ROOT=<your_path>' in your $HOME/.profile)
###          (or run as: KALDI_ROOT=<your_path> python <your_script>.py)
################################################################################



In [2]:
# Path of the sample recording to analyse. PROJECT_DIR must be set in the
# environment (a KeyError is raised otherwise). os.path.join keeps the path
# well-formed whether or not PROJECT_DIR ends with a path separator.
audio_path = os.path.join(os.environ['PROJECT_DIR'], 'audios', 'OSR_us_000_0030_8k.wav')

In [3]:
import logging
import matplotlib.font_manager as fm

# Only let ERROR-level records through from matplotlib's font manager, which
# silences its font warnings.
font_logger = logging.getLogger('matplotlib.font_manager')
font_logger.setLevel(logging.ERROR)

# Keyword options shared by both extraction passes: no plots, DataFrame output.
shared_options = {"plots": False, "fmt": "dataframe"}

# The temporary directory is handed to Prosody via temp_dir (presumably as
# scratch space for intermediate files -- confirm against the DisVoice docs)
# and is removed automatically when the `with` block exits.
with TemporaryDirectory() as temp_dir:
    prosody = Prosody(temp_dir=temp_dir)
    # Same audio file, extracted once with static=True and once with static=False.
    features_static = prosody.extract_features_file(audio_path, static=True, **shared_options)
    features_dynamic = prosody.extract_features_file(audio_path, static=False, **shared_options)

  z = np.poly1d(np.polyfit(x, temp, self.P))
  z = np.poly1d(np.polyfit(x,temp,self.P))
  z = np.poly1d(np.polyfit(x, temp, self.P))
  z = np.poly1d(np.polyfit(x, temp, self.P))
  z = np.poly1d(np.polyfit(x, temp, self.P))
  z = np.poly1d(np.polyfit(x, temp, self.P))
  z = np.poly1d(np.polyfit(x, temp, self.P))


In [4]:
# Display the static feature DataFrame (a single row for this recording).
features_static

Unnamed: 0,F0avg,F0std,F0max,F0min,F0skew,F0kurt,F0tiltavg,F0mseavg,F0tiltstd,F0msestd,...,skwdurpause,kurtosisdurpause,maxdurpause,mindurpause,PVU,PU,UVU,VVU,VP,UP
0,122.108879,19.427275,208.656616,67.108002,0.214233,1.135424,-181.134477,48.453969,357.424769,51.807607,...,1.377165,0.635782,1.99,0.15,1.733401,8.182648,0.211839,0.788161,0.454691,0.12221


In [5]:
# Display the dynamic feature DataFrame: several rows, each holding F0 and
# energy coefficients plus a "Voiced duration" value.
features_dynamic

Unnamed: 0,f0coef0,f0coef1,f0coef2,f0coef3,f0coef4,f0coef5,Ecoef0,Ecoef1,Ecoef2,Ecoef3,Ecoef4,Ecoef5,Voiced duration
0,0.000514,-0.022341,0.333319,-2.090766,4.904121,148.443363,0.000289,-0.011983,0.161370,-0.891048,1.986971,-13.965110,0.19
1,0.224455,-3.219138,16.672700,-36.858273,26.925481,137.664886,0.006201,-0.000811,-0.056076,-0.142920,1.280333,-17.114412,0.05
2,-0.000205,0.005146,-0.085728,1.330220,-8.274472,153.992746,-0.000092,-0.001261,0.045973,-0.322942,1.469211,-15.200508,0.14
3,0.463005,-1.938367,-6.807973,39.900026,-61.640280,186.884216,-0.031852,-0.061666,-0.114587,-0.191070,-0.229642,-17.332361,0.04
4,0.000001,-0.000110,0.001322,0.167368,-5.090477,147.661255,0.000002,-0.000361,0.020001,-0.448267,3.370121,-22.617502,0.62
...,...,...,...,...,...,...,...,...,...,...,...,...,...
61,-0.101662,1.947374,-13.736980,43.438976,-62.240968,157.617035,0.004808,-0.042876,-0.107717,1.007631,-0.114260,-22.870469,0.06
62,0.001613,-0.019266,-0.338776,6.175901,-27.376580,155.284162,0.004116,-0.116427,1.214983,-5.245035,4.154780,-20.438142,0.11
63,-0.000497,-0.001733,0.176048,-0.999192,-1.458128,137.430051,-0.010313,0.242500,-1.933337,5.674683,-5.913997,-17.935622,0.11
64,0.008470,-0.213535,1.800361,-5.296870,-0.545311,132.148213,0.003815,-0.087560,0.692724,-2.146286,0.614116,-19.851226,0.10
