In [1]:
import os
import librosa
import numpy as np
import csv
import torch
import random

min_frames = 63 # calculated minimum number of frames in audio files 
n_mfcc = 39
csv_file_save = './csv/features_cnn.csv'

calculate_csv = None

if not os.path.exists('./csv'):
    os.makedirs('./csv')

if os.path.exists(csv_file_save):
    print ("File " + csv_file_save + " exists")
    print ("Features won't be recalculated")
    calculate_csv = False
else:
    print ("File " + csv_file_save + " doesn't exists")
    print ("Features will be calculated")
    calculate_csv = True

File ./csv/features_cnn.csv doesn't exists
Features will be calculated


In [2]:
# extract info speakers

file_speaker = "./LibriSpeech/SPEAKERS.TXT"
speakers = []

f = open(file_speaker, "r")
for line in f:
    if line[0] == ";":
        continue
    else:
        speakers.append( str.rstrip(line) ) # rstrip to remove \n at the end of the string

In [3]:
speakers[:10]

['14   | F | train-clean-360  | 25.03 | Kristin LeMoine',
 '16   | F | train-clean-360  | 25.11 | Alys AtteWater',
 '17   | M | train-clean-360  | 25.04 | Gord Mackenzie',
 '19   | F | train-clean-100  | 25.19 | Kara Shallenberg',
 '20   | F | train-other-500  | 30.07 | Gesine',
 '22   | F | train-clean-360  | 25.14 | Michelle Crandall',
 '23   | F | train-clean-360  | 25.23 | Anita Roy Dobbs',
 '25   | M | train-other-500  | 30.16 | John Gonzalez',
 '26   | M | train-clean-100  | 25.08 | Denny Sayers',
 '27   | M | train-clean-100  | 20.14 | Sean McKinley']

In [4]:
# extract speakers gender in dictionary

dict_speakers = {}
list_speakers = []
for speaker in speakers:
    speaker_split = speaker.split()
    speaker_split = [word for word in speaker_split if word != "|"]
    
    # indexes = 0 : id, 1 : gender, 2 : dataset
    if speaker_split[2] == "dev-clean":
        dict_speakers[speaker_split[0]] = speaker_split[1]
        list_speakers.append(speaker_split[0])
    else:
        continue

In [5]:
dict_speakers

{'84': 'F',
 '174': 'M',
 '251': 'M',
 '422': 'M',
 '652': 'M',
 '777': 'M',
 '1272': 'M',
 '1462': 'F',
 '1673': 'F',
 '1919': 'F',
 '1988': 'F',
 '1993': 'F',
 '2035': 'F',
 '2078': 'M',
 '2086': 'M',
 '2277': 'F',
 '2412': 'F',
 '2428': 'M',
 '2803': 'M',
 '2902': 'M',
 '3000': 'M',
 '3081': 'F',
 '3170': 'M',
 '3536': 'F',
 '3576': 'F',
 '3752': 'M',
 '3853': 'F',
 '5338': 'F',
 '5536': 'M',
 '5694': 'M',
 '5895': 'F',
 '6241': 'M',
 '6295': 'M',
 '6313': 'F',
 '6319': 'F',
 '6345': 'F',
 '7850': 'F',
 '7976': 'M',
 '8297': 'M',
 '8842': 'F'}

In [6]:
# get audio files info : name and path 

if calculate_csv:
    audio_files = []

    root = "./"
    path = os.path.join(root, "LibriSpeech")

    for (dirpath, dirnames, filenames) in os.walk(path):
        for file in filenames:
            if file[-5:] == ".flac":
                audio_files.append({"dirpath": dirpath, "filename": file})

In [7]:
if calculate_csv:
    print(len(audio_files))

2703


In [8]:
if calculate_csv:
    print(audio_files[:10])

[{'dirpath': './LibriSpeech\\dev-clean\\1272\\128104', 'filename': '1272-128104-0000.flac'}, {'dirpath': './LibriSpeech\\dev-clean\\1272\\128104', 'filename': '1272-128104-0001.flac'}, {'dirpath': './LibriSpeech\\dev-clean\\1272\\128104', 'filename': '1272-128104-0002.flac'}, {'dirpath': './LibriSpeech\\dev-clean\\1272\\128104', 'filename': '1272-128104-0003.flac'}, {'dirpath': './LibriSpeech\\dev-clean\\1272\\128104', 'filename': '1272-128104-0004.flac'}, {'dirpath': './LibriSpeech\\dev-clean\\1272\\128104', 'filename': '1272-128104-0005.flac'}, {'dirpath': './LibriSpeech\\dev-clean\\1272\\128104', 'filename': '1272-128104-0006.flac'}, {'dirpath': './LibriSpeech\\dev-clean\\1272\\128104', 'filename': '1272-128104-0007.flac'}, {'dirpath': './LibriSpeech\\dev-clean\\1272\\128104', 'filename': '1272-128104-0008.flac'}, {'dirpath': './LibriSpeech\\dev-clean\\1272\\128104', 'filename': '1272-128104-0009.flac'}]


In [9]:
# get labels genders (target) for each file 

if calculate_csv:
    gender_labels = []

    for audio_file in audio_files:
        file = audio_file["filename"]
        index = 0
        for char in file:
            if char != "-":
                index += 1
            else:
                break
        id_speaker = file[:index]
        audio_file["id_speaker"] = id_speaker

        gender_labels.append(dict_speakers[id_speaker])

In [10]:
if calculate_csv:
    print(len(gender_labels))

2703


In [11]:
# extract features

if calculate_csv:

    input_features = []

    counter = 0

    min_frames = 1000000

    for audio_file in audio_files:

        counter += 1
        if counter % 100 == 0:
            print("Extracting features file " + str(counter) + "/" + str(len(audio_files)))

        directory = audio_file["dirpath"]
        filename = audio_file["filename"]

        y, sr = librosa.load(directory + "/" + filename)
        hop_length = 512
        mfccs = librosa.feature.mfcc(y=y, sr=sr, hop_length=hop_length, n_mfcc=n_mfcc)

        length = len(mfccs[0])
        if length < min_frames:
            min_frames = length 

        input_features.append(mfccs)

    print('Min number of frames = ' + str(min_frames))
    print("End of extracting features")

Extracting features file 100/2703
Extracting features file 200/2703
Extracting features file 300/2703
Extracting features file 400/2703
Extracting features file 500/2703
Extracting features file 600/2703
Extracting features file 700/2703
Extracting features file 800/2703
Extracting features file 900/2703
Extracting features file 1000/2703
Extracting features file 1100/2703
Extracting features file 1200/2703
Extracting features file 1300/2703
Extracting features file 1400/2703
Extracting features file 1500/2703
Extracting features file 1600/2703
Extracting features file 1700/2703
Extracting features file 1800/2703
Extracting features file 1900/2703
Extracting features file 2000/2703
Extracting features file 2100/2703
Extracting features file 2200/2703
Extracting features file 2300/2703
Extracting features file 2400/2703
Extracting features file 2500/2703
Extracting features file 2600/2703
Extracting features file 2700/2703
Min number of frames = 63
End of extracting features


In [12]:
# save on csv datas and labels

if calculate_csv:
    with open(csv_file_save, 'w', newline='') as csvfile:
        feat_writer = csv.writer(csvfile, delimiter=',',
                                quotechar='|', quoting=csv.QUOTE_MINIMAL)

        index = 0

        row_to_write = ['label', 'label_value', 'id_speaker']
        for f in range(n_mfcc):
            for fr in range(min_frames):
                row_to_write.append('f' + str(f) + 'n' + str(fr))

        feat_writer.writerow(row_to_write)

        for input_feature in input_features:

            class_label = gender_labels[index]
            if class_label == "M":
                class_label = 0
            else:
                class_label = 1

            row_to_write = [class_label, gender_labels[index], audio_files[index]["id_speaker"]]

            index += 1

            if index % 500 == 0:
                print("Writing row " + str(index) + "/" + str(len(audio_files)))
            for features in input_feature:
                for frame in range(min_frames):
                    row_to_write.append(features[frame])

            feat_writer.writerow(row_to_write)
    print("End of writing csv")

Writing row 500/2703
Writing row 1000/2703
Writing row 1500/2703
Writing row 2000/2703
Writing row 2500/2703
End of writing csv


In [13]:
# choose speakers to put in each dataset
"""
random.shuffle(list_speakers)

speakers_train = []
speakers_val   = []
speakers_test  = []
count_males   = 0
count_females = 0 
n_speakers = len(list_speakers)
n_speakers_for_gender_train = (n_speakers / 2) * 0.7
n_speakers_for_gender_train_val = (n_speakers / 2) * 0.85

for speaker in list_speakers:
    gender = dict_speakers[speaker]
    if gender == 'M':
        if count_males < n_speakers_for_gender_train: 
            speakers_train.append(int(speaker))
        elif count_males < n_speakers_for_gender_train_val: 
            speakers_val.append(int(speaker))
        else:
            speakers_test.append(int(speaker))
        count_males += 1
    elif gender == 'F':
        if count_females < n_speakers_for_gender_train: 
            speakers_train.append(int(speaker))
        elif count_females < n_speakers_for_gender_train_val: 
            speakers_val.append(int(speaker))
        else:
            speakers_test.append(int(speaker))
        count_females += 1

"""
# to set group of speakers as the other notebook 
speakers_train = [1272, 1673, 3752, 7850, 3000, 8842, 2078, 84, 2277, 3081, 422, 5338, 1462, 7976, 251, 6319, 652, 3170, 6313, 2428, 6345, 777, 8297, 6295, 5536, 3853, 3576, 2412]
speakers_val = [6241, 2803, 2902, 3536, 1988, 2035]
speakers_test = [174, 2086, 1993, 5895, 5694, 1919]

print(speakers_train)
print(speakers_val)
print(speakers_test)

[1272, 1673, 3752, 7850, 3000, 8842, 2078, 84, 2277, 3081, 422, 5338, 1462, 7976, 251, 6319, 652, 3170, 6313, 2428, 6345, 777, 8297, 6295, 5536, 3853, 3576, 2412]
[6241, 2803, 2902, 3536, 1988, 2035]
[174, 2086, 1993, 5895, 5694, 1919]


In [14]:
# read csv

my_data = np.genfromtxt(csv_file_save, delimiter=',', skip_header=1)

labels_train = []
data_train   = []
labels_test  = []
data_test    = []
labels_val   = []
data_val     = []

for row in my_data:
    features_to_append = []
    features = row[3:]
    
    for index in range(n_mfcc):
        features_to_append.append(features[min_frames * index : min_frames * (index + 1)])
    
    if row[2] in speakers_train:
        labels_train.append([row[0]])
        data_train.append(features_to_append)
        
    elif row[2] in speakers_val:
        labels_val.append([row[0]])
        data_val.append(features_to_append)
    
    else:
        labels_test.append([row[0]])
        data_test.append(features_to_append)
    

print(np.array(labels_train).shape)
print(np.array(data_train).shape)
print(np.array(labels_val).shape)
print(np.array(data_val).shape)
print(np.array(labels_test).shape)
print(np.array(data_test).shape)
    
data_train = list(zip(labels_train, data_train))
random.shuffle(data_train)
labels_train, data_train = zip(*data_train)

data_val = list(zip(labels_val, data_val))
random.shuffle(data_val)
labels_val, data_val = zip(*data_val)

data_test = list(zip(labels_test, data_test))
random.shuffle(data_test)
labels_test, data_test = zip(*data_test)
        
labels_train = np.array(labels_train)
data_train   = np.array(data_train)
labels_val   = np.array(labels_val)
data_val     = np.array(data_val)
labels_test  = np.array(labels_test)
data_test    = np.array(data_test)

print(labels_train.shape)
print(data_train.shape)
print(labels_val.shape)
print(data_val.shape)
print(labels_test.shape)
print(data_test.shape)

(1915, 1)
(1915, 39, 63)
(389, 1)
(389, 39, 63)
(399, 1)
(399, 39, 63)
(1915, 1)
(1915, 39, 63)
(389, 1)
(389, 39, 63)
(399, 1)
(399, 39, 63)


In [15]:
# cast data to tensor

data_train_tensor   = torch.from_numpy(data_train).float()
labels_train_tensor = torch.from_numpy(labels_train).float()
data_val_tensor     = torch.from_numpy(data_val).float()
labels_val_tensor   = torch.from_numpy(labels_val).float()
data_test_tensor    = torch.from_numpy(data_test).float()
labels_test_tensor  = torch.from_numpy(labels_test).float()

print(data_train_tensor.shape, labels_train_tensor.shape)
print(data_val_tensor.shape, labels_val_tensor.shape)
print(data_test_tensor.shape, labels_test_tensor.shape)

torch.Size([1915, 39, 63]) torch.Size([1915, 1])
torch.Size([389, 39, 63]) torch.Size([389, 1])
torch.Size([399, 39, 63]) torch.Size([399, 1])


In [16]:
# Model implementation and results

hidden_neurons = 64
n_outputs = 1
num_epochs = 10000
filters = 16
path_model = 'model_cnn.pth.tar'

class Flatten(torch.nn.Module):
    def forward(self, x):
        return x.view(x.size()[0], -1)

model = torch.nn.Sequential(
    torch.nn.Conv1d(n_mfcc, n_mfcc, 3, stride=2),
    torch.nn.MaxPool1d(3, stride=2),
    torch.nn.Conv1d(n_mfcc, n_mfcc, 3, stride=2),
    torch.nn.MaxPool1d(3, stride=2),
    Flatten(),
    torch.nn.Linear(117, hidden_neurons),
    torch.nn.ReLU(),
    torch.nn.Dropout(p = 0.2),
    torch.nn.Linear(hidden_neurons, hidden_neurons),
    torch.nn.ReLU(),
    torch.nn.Dropout(p = 0.2),
    torch.nn.Linear(hidden_neurons, n_outputs),
    torch.nn.Sigmoid()
)


criterion = torch.nn.BCELoss(reduction='sum')

learning_rate = 7e-5
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

minimum = 0
best_model = None

no_improve = 0
early_stopping_steps = 29

for epoch in range(num_epochs):
    y_pred = model(data_train_tensor)

    loss = criterion(y_pred, labels_train_tensor)
    
    if epoch % 10 == 9:
        y_pred_val = model(data_val_tensor)
        
        loss_val = criterion(y_pred_val, labels_val_tensor)
        
        #Accuracy 
        output       = (y_pred > 0.5).float()
        correct      = (output == labels_train_tensor).float().sum()
        output_val   = (y_pred_val > 0.5).float()
        correct_val  = (output_val == labels_val_tensor).float().sum()
        accuracy_val = correct_val/labels_val_tensor.shape[0]
        
        if accuracy_val > minimum:
            minimum = accuracy_val
            torch.save({'state_dict':model.state_dict(), 'optimizer': optimizer.state_dict()}, path_model)          
            no_improve = 0
        else:
            no_improve += 1
        
        print("Epoch {}/{}, Loss: {:.3f}, Accuracy: {:.3f}, Loss_Val: {:.3f}, Accuracy_Val: {:.3f}".format(epoch+1,
                    num_epochs, loss, correct/labels_train_tensor.shape[0], loss_val, correct_val/labels_val_tensor.shape[0]))  

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    # early stopping
    if no_improve > early_stopping_steps:
        break
        
checkpoint = torch.load(path_model)         

model.load_state_dict(checkpoint['state_dict'])
optimizer.load_state_dict(checkpoint['optimizer'])

# to disactivate dropout layers
model.eval()

y_pred_val  = model(data_val_tensor)
loss_val    = criterion(y_pred_val, labels_val_tensor)
output_val  = (y_pred_val > 0.5).float()
correct_val = (output_val == labels_val_tensor).float().sum()
        
print("Final Model, Loss_Val: {:.3f}, Accuracy_Val: {:.3f}".format(loss_val, correct_val/labels_val_tensor.shape[0])) 

y_pred_test  = model(data_test_tensor)
loss_test    = criterion(y_pred_test, labels_test_tensor)
output_test  = (y_pred_test > 0.5).float()
correct_test = (output_test == labels_test_tensor).float().sum()
        
print("Final Model, Loss_Test: {:.3f}, Accuracy_Test: {:.3f}".format(loss_test, correct_test/labels_test_tensor.shape[0])) 

Epoch 10/10000, Loss: 1762.641, Accuracy: 0.492, Loss_Val: 319.628, Accuracy_Val: 0.550
Epoch 20/10000, Loss: 1576.793, Accuracy: 0.524, Loss_Val: 330.761, Accuracy_Val: 0.530
Epoch 30/10000, Loss: 1433.908, Accuracy: 0.558, Loss_Val: 289.170, Accuracy_Val: 0.589
Epoch 40/10000, Loss: 1306.000, Accuracy: 0.596, Loss_Val: 274.105, Accuracy_Val: 0.594
Epoch 50/10000, Loss: 1227.546, Accuracy: 0.635, Loss_Val: 271.142, Accuracy_Val: 0.584
Epoch 60/10000, Loss: 1172.211, Accuracy: 0.670, Loss_Val: 241.103, Accuracy_Val: 0.668
Epoch 70/10000, Loss: 1088.417, Accuracy: 0.710, Loss_Val: 242.251, Accuracy_Val: 0.650
Epoch 80/10000, Loss: 1030.778, Accuracy: 0.734, Loss_Val: 221.377, Accuracy_Val: 0.689
Epoch 90/10000, Loss: 957.441, Accuracy: 0.774, Loss_Val: 209.535, Accuracy_Val: 0.743
Epoch 100/10000, Loss: 894.053, Accuracy: 0.794, Loss_Val: 202.181, Accuracy_Val: 0.735
Epoch 110/10000, Loss: 837.953, Accuracy: 0.804, Loss_Val: 206.004, Accuracy_Val: 0.740
Epoch 120/10000, Loss: 785.890, A