# Capstone Project - Speaker Diarisation
## Part 2 - VAD Application and Data Generation

In this part we now take a full audio recording and apply the VAD model to it.
We will use these flags to make chunks, but before we also need to make sure that these chunks are not longer than a natural human utterance or phrasing, as these chunks will be used to obtain the transcriptions.

In [1]:
import pandas as pd
import numpy as np
import scipy
import os
import joblib

import matplotlib.pyplot as plt
import seaborn as sns

import sktools
import importlib
import capstone

import IPython.display

import librosa
import librosa.display

import pydub
from pydub import AudioSegment
from pydub.utils import make_chunks 

In [2]:
#load model
model = joblib.load('pre_models/VAD.joblib')

In [3]:
#load audio, extract relevant features
path = 'data/VAD/2021-08-25_cut.wav'
audio = capstone.audio.feat_ext(path, sr=16000)

In [1]:
out_path = 'data/VAD/'
capstone.audio.spec_plotter(audio,'audio',sr=16000, save=True, out_path=out_path)

In [5]:
#obtaining predictions
preds = model.predict(audio['energy'].T)

In [2]:
preds_df = pd.DataFrame(preds, columns=['l'])
capstone.audio.mask_plotter(audio['energy'],preds_df, 'audio', save=True, wave=False, out_path=out_path, suffix='_masked')
IPython.display.Audio(audio['audio'], rate=16000)

In [7]:
#frames to time
target = preds.shape[0]
duration = librosa.frames_to_time(target, sr=16000, hop_length=512, n_fft=2048)
fR = target / duration
sR = librosa.frames_to_samples(target, hop_length=512, n_fft=2048) /target

In [8]:
#ratios and equivalences
phrase_min = 2.5 #min duration of a phrase (seconds)
phrase_max = 5 #max duration of a phrase
silence_sec = 0.5 #cut on silence
window=round(fR/3,0)

#phrase and silence durations, in frames
fpmin = int(phrase_min * fR) 
fpmax = int(phrase_max * fR)
silence_frame = int(silence_sec * fR) 

In [18]:
preds_df

Unnamed: 0,l
0,0
1,0
2,0
3,0
4,0
...,...
26372,1
26373,1
26374,1
26375,1


#### Condensing frame-level predictions into full vocal, human-phrases

In [19]:
df = preds_df.copy()
df['idx'] = preds_df.index
df['f'] = df['l'].shift()
df['cumsum'] = (df['f'] != df['l']).cumsum()

df2 = df.groupby('cumsum').agg(['min','max'])
df2 = df2[['l','idx']]

#first pass
df2 = capstone.audio.summer(df,idx=True)
#second pass
df2 = capstone.audio.summer(df2)
#cleanup small silences
df2 = df2[(df2['l'] == 0) & (df2['dur'] >silence_frame) | (df2['l'] == 1)]
#third pass
df2 = capstone.audio.summer(df2)
#2nd cleanup small phrases
df2 = df2[(df2['l'] == 1) & (df2['dur'] >fpmin) | (df2['l'] ==0)]
#fourth pass
df2 = capstone.audio.summer(df2)

# re-transform
df2['cumsum'] = (df2['dur'] > fpmin).cumsum()
df2 = df2.groupby('cumsum').agg(['min','max'])
df2

#cleanup
df3 = pd.DataFrame()
df3['l'] = df2['l']['min']+ df2['l']['max']
df3['min'] = df2['min']['min']
df3['max'] = df2['max']['max']
#re-calc deltas (durations)
df3['dur'] = df2['dur']['min'] + df2['dur']['max']
df3.reset_index(inplace=True)
df3
df2= df3.copy()
del df3
df2.drop('cumsum', axis=1, inplace=True)

# cleanup big silences to achieve pure audio
df2 = df2[(df2['l'] == 0) & (df2['dur'] <silence_frame) | (df2['l'] ==1) |(df2['l'] ==2)]

# DF formatting
df2.reset_index(inplace=True)
df2.drop('index', axis=1, inplace=True)
df2

Unnamed: 0,l,min,max,dur
0,1,35,292,256
1,1,293,610,316
2,1,611,982,370
3,1,983,1410,426
4,2,1411,1510,198
...,...,...,...,...
83,1,24635,25044,408
84,1,25045,25341,295
85,2,25342,25517,350
86,1,26045,26145,99


#### Transform Frame to time

In [10]:
df2['in'] = (df2['min'] /fR) * 1000
df2['out'] = (df2['max'] /fR) * 1000
df2

Unnamed: 0,l,min,max,dur,in,out
0,1,35,292,256,1120.084922,9344.708496
1,1,293,610,316,9376.710922,19521.480077
2,1,611,982,370,19553.482504,31426.382682
3,1,983,1410,426,31458.385108,45123.421162
4,2,1411,1510,198,45155.423589,48323.663798
...,...,...,...,...,...,...
83,1,24635,25044,408,788379.773287,801468.765667
84,1,25045,25341,295,801500.768093,810973.486295
85,2,25342,25517,350,811005.488721,816605.913334
86,1,26045,26145,99,833503.194450,836703.437085


### Export DF as timecode flags

In [14]:
name = path.split('/')[2].split('.')[0]
df2.to_csv('output/timecodes.csv')

### Chunk and export audio

In [12]:
capstone.audio.chunker(path,out_path,df2,fR)