In [9]:
import os
import numpy as np
from tqdm import tqdm
from pyAudioAnalysis import MidTermFeatures as aFm
from pyAudioAnalysis import audioBasicIO as aIO
import matplotlib.pyplot as plt
# import random

from pydub import AudioSegment
import moviepy.editor as mp

import warnings
warnings.filterwarnings('ignore')
from pydub import AudioSegment

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [3]:
REBUILD_DATA = False

In [4]:
class SirenVSChildren():
    SIRENS = "bipsmono"
    CHILDREN = "non_bipsmono"
    LABELS = {SIRENS : 0, CHILDREN : 1}
    training_data = []
    sirencount = 0
    childrencount = 0
    
    def make_training_data(self):
        for label in self.LABELS:
            print(label)
            for f in tqdm(os.listdir(label)):
                try:
                    path = os.path.join(label,f)
                    fs, s_ref = aIO.read_audio_file(path)
                    duration = len(s_ref) / float(fs)
                    win, step = 0.05, 0.05
                    win_mid, step_mid = duration, 10
                    mt_ref, st_ref, mt_n_ref = aFm.mid_feature_extraction(s_ref, fs, win_mid * fs, step_mid * fs, win * fs, step * fs)
                    self.training_data.append([np.array(mt_ref), np.eye(2)[self.LABELS[label]]])
                
                    if label == self.SIRENS:
                        self.sirencount += 1
                
                    elif label == self.CHILDREN:
                        self.childrencount += 1
                except Exception as e:
                    pass
                
        np.random.shuffle(self.training_data)
        np.save("training_data_1D.npy", self.training_data)
        print("nb 1 =",self.sirencount)
        print("nb 2 =",self.childrencount)
        

In [5]:
if REBUILD_DATA:
    sirenvchildren = SirenVSChildren()
    sirenvchildren.make_training_data()
    

In [14]:
training_data = np.load("training_data_1D_bip.npy", allow_pickle=True)

FileNotFoundError: [Errno 2] No such file or directory: 'training_data_1D_bip.npy'

In [7]:
class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv1d(1, 24, 5)
        self.conv2 = nn.Conv1d(24, 48, 5)        
        self.conv3 = nn.Conv1d(48, 48, 5)

        self.fc1 = nn.Linear(1296, 64)
        self.fc2 = nn.Linear(64, 2)        
        

    
    def forward(self,x):
        x = F.max_pool1d(F.relu(self.conv1(x)), (2))
        x = F.max_pool1d(F.relu(self.conv2(x)), (2))
        x = F.relu(self.conv3(x))
        x = x.view(-1,1296)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return F.softmax(x, dim=1)

In [8]:
net = Net()

In [10]:
optimizer = optim.Adam(net.parameters(), lr = 0.001)
loss_function = nn.MSELoss()

In [13]:
X = torch.Tensor([i[0] for i in training_data]).view(-1, 136)
y = torch.Tensor([i[1] for i in training_data])

NameError: name 'training_data' is not defined

In [15]:
VAL_PCT = 0.2
val_size = int(len(X)*VAL_PCT)

NameError: name 'X' is not defined

In [None]:
 train_X = X[:-val_size]
train_y = y[:-val_size]

test_X = X[-val_size:]
test_y = y[-val_size:]

In [None]:
BATCH_SIZE = 20
EPOCHS = 100

In [None]:
total_loss = []
total_accuracy = []
for epoch in range(EPOCHS):
    for i in tqdm(range(0, len(train_X), BATCH_SIZE)):
        batch_X = train_X[i:i+BATCH_SIZE].view(-1,1,136)
        for j in range(len(batch_X)):
            batch_X[j] = batch_X[j]
        batch_y = train_y[i:i+BATCH_SIZE]
        
        net.zero_grad()
        outputs = net(batch_X)
        loss = loss_function(outputs, batch_y)
        loss.backward()
        optimizer.step()
    total_loss.append(loss.item())
    correct = 0
    total = 0
    with torch.no_grad():
        for i in tqdm(range(len(test_X))):
            real_class = torch.argmax(test_y[i])
            net_out = net(test_X[i].view(-1,1,136))[0]
            predicted_class = torch.argmax(net_out)
            if predicted_class == real_class:
                correct +=1
            total +=1
    total_accuracy.append(round(correct/total,3))

In [None]:
def audio_classification(audio_path):
    fs, s_ref = aIO.read_audio_file(audio_path)
    duration = len(s_ref) / float(fs)
    win, step = 0.05, 0.05
    win_mid, step_mid = duration, 10
    mt_ref, st_ref, mt_n_ref = aFm.mid_feature_extraction(s_ref, fs, win_mid * fs, step_mid * fs, win * fs, step * fs)
    with torch.no_grad():
        net_out = net(torch.Tensor(mt_ref).view(-1,1,136))
    return net_out


In [None]:
def start_detection(video_path):
    clip = mp.VideoFileClip(video_path)
    duration = clip.duration
    start_time = []
    for i in range (int(duration*4)-1):
        subclip = clip.subclip(i/4,i/4+0.5)
        fs = 44100
        s_long = subclip.audio.to_soundarray(fps=fs)
        s_long = s_long[:, 0]
        duration_long = len(s_long) / float(fs)
        win, step = 0.05, 0.05
        win_mid, step_mid = duration_long, 0.5
        mt_long, st_long, mt_n_long = aFm.mid_feature_extraction(s_long, fs, win_mid * fs, step_mid * fs,
                                                             win * fs, step * fs)
        with torch.no_grad():
            net_out = net(torch.Tensor(mt_long).view(-1,1,136))
        if torch.argmax(net_out) == 0:
            start_time.append(i/4)
    return start_time