In [1]:
import torch
import torchaudio as ta
import torch.nn as nn
import torch.optim as optim

import numpy as np
from sklearn.model_selection import train_test_split

ta.set_audio_backend('soundfile')

In [2]:
melify = ta.transforms.MelSpectrogram(n_mels = 64)

SEQSIZE = 800

In [3]:
def loadSound(index):
    if index < 3682:
        audio, _ = ta.load('VoxCeleb_gender/males/' + str(index) + '.flac')
    else:
        index -= 3682
        audio, _ = ta.load('VoxCeleb_gender/females/' + str(index) + '.flac')
    audio = melify(audio)
    return audio

In [4]:
def batching(size, isTrain):
    xb = torch.zeros(size, 2, 64, SEQSIZE)
    yb = torch.zeros(size, 1, dtype = torch.float32)
    bindices = []
    if isTrain:
        for i in range(size):
            bindices.append(xtr[np.random.randint(0, 4795)][0])
    else:
        for i in range(size):
            bindices.append(xte[np.random.randint(0, 1199)][0])
    for i in range(len(bindices)):
        sound = loadSound(int(bindices[i]))
        start = np.random.randint(0, sound.shape[-1] - SEQSIZE)
        xb[i] = sound[:, :, start:start + SEQSIZE]
        yb[i] = y[int(bindices[i])]
    return xb, yb

In [5]:
#24182 max
#953 min
#4795 tr 1199 te
y = torch.cat((torch.ones(3682, 1, dtype = torch.float32), torch.zeros(2312, 1, dtype = torch.float32)), dim = 0)
x = torch.tensor(np.arange(5994)).unsqueeze(-1).long().numpy()
xtr, xte, ytr, yte = train_test_split(x, y, test_size = 0.2)

In [6]:
class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        self.network = nn.Sequential(
            nn.Conv2d(2, 8, kernel_size = 3),
            nn.ReLU(),
            nn.Conv2d(8, 16, kernel_size = 3),
            nn.ReLU(),
            nn.MaxPool2d(4),
            
            nn.Conv2d(16, 32, kernel_size = 3),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size = 3),
            nn.ReLU(),
            nn.MaxPool2d(4),
            
            nn.Flatten(),
            nn.Linear(6144, 1000),
            nn.ReLU(),
            nn.Linear(1000, 100),
            nn.ReLU(),
            nn.Linear(100, 10),
            nn.ReLU(),
            nn.Linear(10, 1),
            nn.Sigmoid()
        )
    def forward(self, x):
        return self.network(x)
model = Model()
maxacc = 0

In [1]:
EPOCHS = 90
BATCH_SIZE = 32
LR = 0.0007
TEST_FREQ = 5
TEST_SIZE = 200

criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr = LR)


for epoch in range(EPOCHS):
    xbatch, ybatch = batching(BATCH_SIZE, True)
    preds = model(xbatch)
    loss = criterion(preds, ybatch)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    if epoch % TEST_FREQ == TEST_FREQ - 1:
        acc = 0
        with torch.no_grad():
            xbatch, ybatch = batching(TEST_SIZE, False)
            preds = torch.round(model(xbatch))
            ybatch = ybatch.squeeze()
            for i in range(TEST_SIZE):
                if preds[i] == ybatch[i]:
                    acc += 1
        print('Epoch: {} Loss: {} Acc: {}%'.format(epoch + 1, loss.item(), acc * 100.0 / TEST_SIZE))
        if acc > maxacc:
            torch.save(model.state_dict(), 'MVCBaseline.pt')
            maxacc = acc
            print('New Record')
acc = 0
model.load_state_dict(torch.load('MVCBaseline.pt'))
with torch.no_grad():
    xbatch, ybatch = batching(1000, False)
    preds = torch.round(model(xbatch))
    ybatch = ybatch.squeeze()
    for i in range(1000):
        if preds[i] == ybatch[i]:
            acc += 1
print('Final Val: {}%'.format(acc * 100.0 / 1000))


KeyboardInterrupt



In [75]:
torch.save(model.state_dict(), 'MVC89-8.pt') #89.8%

In [9]:
# 0 = feminine 1 = masculine
audio, _ = ta.load('insert filepath here')
print(audio.shape)
start = 100
audio = melify(audio)
MVC = Model()
MVC.load_state_dict(torch.load('MVC89-8.pt'))
audio = audio[:, :, start:start + SEQSIZE]
print(model(audio.unsqueeze(0)))

torch.Size([2, 441600])
tensor([[0.4717]], grad_fn=<SigmoidBackward0>)
