-
Notifications
You must be signed in to change notification settings - Fork 1
/
split.py
138 lines (119 loc) · 6.93 KB
/
split.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
import argparse
import os
import pandas as pd
from tqdm import tqdm
from pydub import AudioSegment
import subprocess
import re
# (type, noise tolerance in dB, minimum duration in seconds) for the long,
# medium and short silence detection passes. A higher type means a longer
# and quieter silence, i.e. a better place to cut.
SILENCE_PARAMS = [(2, -50, .5), (1, -35, .3), (0, -25, .15)]

# Matches a "silence_start" log line immediately followed by its
# "silence_end" line in ffmpeg's silencedetect output. The logger tag
# ("@ 0x...") has a platform-dependent width, so it is matched loosely
# instead of with a fixed character count.
SILENCE_PATTERN = re.compile(
    r"\[silencedetect [^\]]*\] silence_start: (-?\d+(?:\.\d*)?)\n"
    r"\[silencedetect [^\]]*\] silence_end: (\d+(?:\.\d*)?)"
)


def parse_wav_args(wav_args):
    """Split a command-line style option string into an argument list.

    Shell-like tokenisation is used so that values containing ``-`` and
    flags without a value are preserved correctly.

    Args:
        wav_args: option string such as ``"-acodec pcm_s16le -ac 1"``.

    Returns:
        The list of individual tokens.
    """
    import shlex  # local import: only needed here

    return shlex.split(wav_args)


def parse_silences(ffmpeg_log, sil_type):
    """Extract silence intervals from ffmpeg ``silencedetect`` log output.

    Args:
        ffmpeg_log: text captured from ffmpeg's stderr.
        sil_type: integer tag attached to every interval found.

    Returns:
        A list of ``(start_seconds, end_seconds, sil_type)`` tuples, in the
        order in which they appear in the log.
    """
    return [
        (float(start), float(end), sil_type)
        for start, end in SILENCE_PATTERN.findall(ffmpeg_log)
    ]


def find_cut_offset(silences, i, min_duration, max_duration):
    """Choose the silence on which the clip starting at ``silences[i]`` ends.

    Only silences whose start time lies in
    ``[start + min_duration, start + max_duration)`` are candidates, where
    ``start`` is the start time of ``silences[i]``. The first long silence
    (type 2) in that window wins; otherwise the last medium silence (type 1);
    otherwise the last short silence (type 0).

    Args:
        silences: list of ``(start, end, type)`` tuples sorted by start time.
        i: index of the silence at which the clip begins.
        min_duration: minimum clip duration in seconds.
        max_duration: maximum clip duration in seconds.

    Returns:
        The offset ``j`` (>= 1) such that ``silences[i + j]`` ends the clip,
        or ``None`` when the window contains no candidate silence.
    """
    start_period = silences[i][0] + min_duration
    end_period = silences[i][0] + max_duration
    count = len(silences)

    # Skip the silences that would produce a clip that is too short.
    j = 1
    while i + j < count and silences[i + j][0] < start_period:
        j += 1

    last_med_silence, last_short_silence = None, None
    while i + j < count and silences[i + j][0] < end_period:
        sil_type = silences[i + j][2]
        if sil_type == 0:
            last_short_silence = j
        elif sil_type == 1:
            last_med_silence = j
        else:
            # A long silence is the best possible cut: stop searching.
            return j
        j += 1

    if last_med_silence is not None:
        return last_med_silence
    return last_short_silence


def main():
    """Split a long audio file into short clips cut on detected silences.

    Silences are detected by running ffmpeg's ``silencedetect`` filter with
    three sensitivity settings. Clips are then built greedily between
    silences, exported as wav files into ``--audio_folder``, and listed in
    ``--out_csv`` next to an empty ``sentence`` column. With
    ``--remove_bad_segments``, clips whose voice embedding similarity to the
    reference speaker sample is below a user-chosen threshold are skipped.
    """
    parser = argparse.ArgumentParser(description="""Extracts audio, and splits it into smaller files""")
    parser.add_argument("--input", help="big audio file", required=True)
    parser.add_argument("--audio_folder",
                        help="folder that will contain smaller the audio files", required=True)
    parser.add_argument("--out_csv", help="name of the output csv file, that will contain the name of each smaller file and that is destined to be filled with their transcript", required=True)
    parser.add_argument("--wav_args", help="list of arguments of the wav created files as string",
                        default="-acodec pcm_s16le -ac 1 -ar 16000")
    parser.add_argument("--max_duration", help="maximum duration (in seconds) a clip can last", default=7, type=int)
    parser.add_argument("--min_duration", help="minimum duration (in seconds) a clip can last", default=2, type=int)
    parser.add_argument("--remove_bad_segments", action="store_true", help="set this argument if you want to automatically remove the sentences that do not seem to be spoken by the speaker of interest (which need to be specified using the 'speaker_segment' argument")
    parser.add_argument("--speaker_segment", nargs=2, type=float,
                        help="start and end time of a sample spoken by a speaker (seconds)")
    args = parser.parse_args()

    # The speaker sample is indexed below, so fail early with a clear message.
    if args.remove_bad_segments and args.speaker_segment is None:
        parser.error("--remove_bad_segments requires --speaker_segment")

    os.makedirs(args.audio_folder, exist_ok=True)

    params_list = parse_wav_args(args.wav_args)
    file_extension = os.path.splitext(args.input)[1][1:]
    full_audio = AudioSegment.from_file(args.input, file_extension)

    # Find out long, medium and small silences.
    silences = []
    for sil_type, noise_tol, noise_dur in SILENCE_PARAMS:
        # -nostats keeps progress lines from being interleaved between a
        # silence_start line and its silence_end line in stderr.
        process = subprocess.run(
            ['ffmpeg', '-hide_banner', '-nostats', '-i', args.input,
             '-af', f'silencedetect=noise={noise_tol}dB:d={noise_dur}',
             '-f', 'null', '-'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True)
        silences.extend(parse_silences(process.stderr, sil_type))
    silences.sort(key=lambda x: x[0])

    # Loads necessary data if we want to remove bad speakers.
    if args.remove_bad_segments:
        from resemblyzer import normalize_volume, VoiceEncoder
        from resemblyzer.hparams import sampling_rate, audio_norm_target_dBFS
        from pydub.playback import play
        import matplotlib.pyplot as plt
        import librosa
        import numpy as np

        wav, source_sr = librosa.load(args.input, sr=None)
        # Keyword arguments: recent librosa releases reject positional rates.
        wav = librosa.resample(wav, orig_sr=source_sr, target_sr=sampling_rate)
        wav = normalize_volume(wav, audio_norm_target_dBFS, increase_only=True)
        speaker_wav = wav[int(args.speaker_segment[0] * sampling_rate):int(args.speaker_segment[1] * sampling_rate)]

        print("Playing the selected audio segment at given offsets to check it is alright")
        # Reuse the already decoded audio: the input is not necessarily a wav.
        play(full_audio[int(args.speaker_segment[0]*1000):int(args.speaker_segment[1]*1000)])
        if input("Is this correct? (y/n)\n") != "y":
            raise SystemExit(0)

        encoder = VoiceEncoder("cpu")
        speaker_embed = encoder.embed_utterance(speaker_wav)
        similarities = []

    # Walk through the silences and greedily build clips lasting between
    # min_duration and max_duration seconds, preferring to cut on the longest
    # kind of silence. Audio before the first and after the last silence is
    # never exported.
    sent_index, i, lost_seconds = 0, 0, 0
    to_save = []  # (audio, file_name)
    with tqdm(total=len(silences)) as prog_bar:
        while i < len(silences):
            j = find_cut_offset(silences, i, args.min_duration, args.max_duration)
            if j is None:
                # No usable cut: drop the audio up to the next silence.
                if i + 1 < len(silences):
                    lost_seconds += (silences[i+1][0]+silences[i+1][1])/2 - (silences[i][0]+silences[i][1])/2
                i += 1
                prog_bar.update(1)
                continue

            # Cut in the middle of each silence to keep a safety margin.
            sent_start = (silences[i][0] + silences[i][1]) / 2
            sent_end = (silences[i+j][0] + silences[i+j][1]) / 2
            if args.remove_bad_segments:
                sent_wav = wav[int(sent_start * sampling_rate):int(sent_end * sampling_rate)]
                sent_embed = encoder.embed_utterance(sent_wav, rate=16)
                similarities.append(sent_embed @ speaker_embed)
            # pydub slices are expressed in milliseconds.
            to_save.append((full_audio[sent_start*1000:sent_end*1000], f"sentence_{sent_index}.wav"))
            i += j
            sent_index += 1
            prog_bar.update(j)
    print(f"{lost_seconds : .2f} seconds of audio were cutted")

    # Selects the similarity threshold at which we will remove audio.
    if args.remove_bad_segments:
        print("Find a separation threshold on the histogram between speeches spoken by your speaker (closer to 1) and others (closer to 0). Then close the figure")
        plt.hist(similarities, bins=50)
        plt.title("histogram of the similarities (the higher the better)")
        plt.show()
        thr = -1
        # Keep asking until a float within [0, 1] is provided.
        while thr < 0 or thr > 1:
            str_thr = input("Please enter a valid threshold\n")
            try:
                thr = float(str_thr)
            except ValueError:
                print("Value provided was not a float!")

    # Saves the files.
    kept_files = []
    for idx, (audio, file_name) in enumerate(to_save):
        if args.remove_bad_segments and similarities[idx] < thr:
            continue
        kept_files.append(file_name)
        audio.export(os.path.join(args.audio_folder, file_name), format="wav", parameters=params_list)
    # The "sentence" column is left empty, to be filled with transcripts.
    csv_file = {"file": kept_files, "sentence": [""] * len(kept_files)}
    pd.DataFrame(csv_file).to_csv(args.out_csv, sep=";", index=False)


if __name__ == "__main__":
    main()