### prepare dataset

In [13]:
import pandas as pd
import os
from pydub import AudioSegment

In [2]:
female = os.listdir("data/VoxCeleb_gender/females")
male = os.listdir("data/VoxCeleb_gender/males")

In [3]:
df_female = pd.DataFrame(female, columns=["file_name"])
df_male = pd.DataFrame(male, columns=["file_name"])
df_female["label"] = 0
df_male["label"] = 1

### lets analyze the data

In [14]:
def get_duration(row, folder):
    audio_path = os.path.join("data/VoxCeleb_gender", folder, row["file_name"])
    if ".m4a" in audio_path:
        audio_segments = AudioSegment.from_file(audio_path, format="m4a")
        return audio_segments.duration_seconds
    else:
        return 0

df_female["duration"] = df_female.apply(get_duration, axis=1, folder="females")
df_male["duration"] = df_male.apply(get_duration, axis=1, folder="males")

In [15]:
df_female.head()

Unnamed: 0,file_name,label,duration
0,.DS_Store,female,0.0
1,0.m4a,female,11.264
2,1.m4a,female,4.608
3,10.m4a,female,4.096
4,100.m4a,female,6.784


In [16]:
print(f"female duration: {df_female['duration'].sum() / 3600} hours")
print(f"male duration: {df_male['duration'].sum() / 3600} hours")

female duration: 4.594773333333333 hours
male duration: 7.050862222222222 hours


In [18]:
print(f"number of female audio files: {len(df_female)}")
print(f"number of male audio files: {len(df_male)}")

number of female audio files: 2312
number of male audio files: 3683


In [19]:
df_female = df_female[df_female["duration"] > 0]
df_male = df_male[df_male["duration"] > 0]
print(f"number of female audio files: {len(df_female)}")
print(f"number of male audio files: {len(df_male)}")

number of female audio files: 2311
number of male audio files: 3682


In [22]:
df = pd.concat([df_female, df_male])
df = df.sample(frac=1).reset_index(drop=True)
df.head()

In [29]:
print(f"max audio length: {df['duration'].max()} seconds")

max audio length: 91.008 seconds


In [30]:
print(f"min audio length: {df['duration'].min()} seconds")

min audio length: 3.968 seconds


In [37]:
print(f"audio > 20 seconds: {len(df[df['duration'] > 20])}")

audio > 20 seconds: 124


In [53]:
female_path = "/home/mahmoud/data/Projects/cyshield/gender_classifier/data/VoxCeleb_gender/females/"
male_path = "/home/mahmoud/data/Projects/cyshield/gender_classifier/data/VoxCeleb_gender/males/"

def add_audio_path(row):
    if row["label"] == 0:
        return os.path.join(female_path, row["file_name"])
    else:
        return os.path.join(male_path, row["file_name"])

df["audio_filepath"] = df.apply(add_audio_path, axis=1)
df.head()

Unnamed: 0,file_name,label,duration,audio_filepath
0,2674.wav,1,7.488,/home/mahmoud/data/Projects/cyshield/gender_cl...
1,516.wav,0,5.696,/home/mahmoud/data/Projects/cyshield/gender_cl...
2,2275.wav,1,4.16,/home/mahmoud/data/Projects/cyshield/gender_cl...
3,2264.wav,1,4.608,/home/mahmoud/data/Projects/cyshield/gender_cl...
4,1112.wav,0,5.568,/home/mahmoud/data/Projects/cyshield/gender_cl...


### conclusion

- the data is multilangual -> so i will use multilangual encoder
- the data may contain long silence
- number of female audio files: 2311
- number of male audio files: 3682
- female duration: 4.59 hours
- male duration: 7.05 hours

### data cleaning
- i will remove long silence from training data if exists
- Why ?
- because this what the model will see on production => "audios without long silence" 

In [49]:
import torch
import numpy as np
from pydub import AudioSegment
import os


vad_model, utils = torch.hub.load(
    repo_or_dir='snakers4/silero-vad',
    model='silero_vad',
    force_reload=False,
    onnx=False
)
(get_speech_timestamps, save_audio, read_audio, VADIterator, collect_chunks) = utils


os.makedirs("data/VoxCeleb_gender/females", exist_ok=True)
os.makedirs("data/VoxCeleb_gender/males", exist_ok=True)

def detect_speech_segments(row):
    
    audio = AudioSegment.from_file(row["audio_filepath"])
    audio = audio.set_frame_rate(16000).set_channels(1).set_sample_width(2)
    name = row["file_name"].split(".")[0]
    
    samples = np.array(audio.get_array_of_samples(), dtype=np.float32)
    if audio.sample_width == 2:  
        samples = samples / 32768.0
    
    
    wav_tensor = torch.from_numpy(samples)
    
    
    speech_timestamps = get_speech_timestamps(
        wav_tensor,
        vad_model,
        threshold=0.5,
        sampling_rate=16000,
        return_seconds=True,
        min_speech_duration_ms=250,        
        min_silence_duration_ms=300,
        speech_pad_ms=50
    )
    
    merged_segments = []
    
    for segment in speech_timestamps:
        start = segment['start']
        end = segment['end']
        
        start_sample = int(start * 16000)
        end_sample = int(end * 16000)
        segment_tensor = wav_tensor[start_sample:end_sample]
        
        merged_segments.append(segment_tensor)
    
    if merged_segments:
        merged_tensor = torch.cat(merged_segments, dim=0)
        
        if row["label"] == 0:
            save_audio(f"data/VoxCeleb_gender/females/{name}.wav", 
                      merged_tensor, 16000)
        else:
            save_audio(f"data/VoxCeleb_gender/males/{name}.wav", 
                      merged_tensor, 16000)

df.apply(detect_speech_segments, axis=1)

Using cache found in /home/mahmoud/.cache/torch/hub/snakers4_silero-vad_master


0       None
1       None
2       None
3       None
4       None
        ... 
5988    None
5989    None
5990    None
5991    None
5992    None
Length: 5993, dtype: object

### Create the full ddataset

In [50]:
df.head()

Unnamed: 0,file_name,label,duration,audio_filepath
0,2674.m4a,1,7.488,/home/mahmoud/data/Projects/cyshield/gender_cl...
1,516.m4a,0,5.696,/home/mahmoud/data/Projects/cyshield/gender_cl...
2,2275.m4a,1,4.16,/home/mahmoud/data/Projects/cyshield/gender_cl...
3,2264.m4a,1,4.608,/home/mahmoud/data/Projects/cyshield/gender_cl...
4,1112.m4a,0,5.568,/home/mahmoud/data/Projects/cyshield/gender_cl...


In [51]:
def replace_file_name_with_wav(row):
    return row["file_name"].replace("m4a", "wav")
    
def replace_audio_filepath(row):
    return row["audio_filepath"].replace("w4a", "wav")

df["file_name"] = df.apply(replace_file_name_with_wav, axis=1)
df["audio_filepath"] = df.apply(replace_audio_filepath, axis=1)

In [54]:
df.head()

Unnamed: 0,file_name,label,duration,audio_filepath
0,2674.wav,1,7.488,/home/mahmoud/data/Projects/cyshield/gender_cl...
1,516.wav,0,5.696,/home/mahmoud/data/Projects/cyshield/gender_cl...
2,2275.wav,1,4.16,/home/mahmoud/data/Projects/cyshield/gender_cl...
3,2264.wav,1,4.608,/home/mahmoud/data/Projects/cyshield/gender_cl...
4,1112.wav,0,5.568,/home/mahmoud/data/Projects/cyshield/gender_cl...


In [55]:
df.to_csv("data/VoxCeleb_gender_dataset.csv", index=False)

In [2]:
from IPython.display import Audio

Audio("/home/mahmoud/data/Projects/cyshield/gender_classifier/data/VoxCeleb_gender/females/1266.wav")

In [3]:
Audio("/home/mahmoud/data/Projects/cyshield/gender_classifier/data/VoxCeleb_gender/females/1266.m4a")