In [13]:
import torch
from torch.utils.data import Dataset, DataLoader
from scipy.io import wavfile
from scipy import signal
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim


In [2]:
class CustomDataset(Dataset):
    def __init__(self, df, input_time_steps,right_trim_time_steps):
        self.df = df
        self.input_time_steps = input_time_steps
        self.right_trim_time_steps = right_trim_time_steps
        
    def __len__(self):
        return len(self.df) #works
    
    def __getitem__(self, index):
        file_name = self.df.loc[index, 'file_name']
        label = self.df.loc[index, 'label'] #this is binary right now.  
        sample_rate, samples = wavfile.read(file_name)
        _, _, spectrogram = signal.spectrogram(samples, sample_rate)
        spectrogram = spectrogram[:,:-self.right_trim_time_steps] #this is how many to shave off the end. 
        data = torch.from_numpy(spectrogram[:,-self.input_time_steps:]) #the value for time steps is the last x.  20 time steps means last 20
        return data.unsqueeze(0), label
    
def get_dataloader(df, batch_size,input_time_steps,right_trim_time_steps):
    dataset = CustomDataset(df,input_time_steps,right_trim_time_steps)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    return dataloader

In [6]:
import torch
import torch.nn as nn

class AudioClassifier(nn.Module):
    def __init__(self):
        super().__init__()

        self.conv1 = nn.Conv2d(1, 8, kernel_size=(5, 5), stride=(2, 2), padding=(2, 2))
        self.relu1 = nn.ReLU()
        self.bn1 = nn.BatchNorm2d(8)

        self.conv2 = nn.Conv2d(8, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
        self.relu2 = nn.ReLU()
        self.bn2 = nn.BatchNorm2d(16)

        self.conv3 = nn.Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
        self.relu3 = nn.ReLU()
        self.bn3 = nn.BatchNorm2d(32)

        self.conv4 = nn.Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
        self.relu4 = nn.ReLU()
        self.bn4 = nn.BatchNorm2d(64)

        # Linear classifier
        self.avg_pool = nn.AdaptiveAvgPool2d(output_size = 1)
        self.fc = nn.Linear(64, 1)

    def forward(self, x):
        x = self.conv1(x)
        x = self.relu1(x)
        x = self.bn1(x)

        x = self.conv2(x)
        x = self.relu2(x)
        x = self.bn2(x)

        x = self.conv3(x)
        x = self.relu3(x)
        x = self.bn3(x)

        x = self.conv4(x)
        x = self.relu4(x)
        x = self.bn4(x)

        x = self.avg_pool(x)
        x = x.reshape(x.shape[0], -1)
        x = self.fc(x)

        return x


In [15]:
import torch
   



epochs =  10
batch_size = 2
lr = 0.001
momentum = 0.9
log_interval = 10
weight_decay = 0.0001

save_model_dir ='models'
save_model_name = 'model.pt'

input_time_steps = 100 
right_trim_time_steps = 1






#parser.add_argument('--save_model_interval', type=int, default=10, help='Save model interval')

    
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
   

In [54]:
#simple rnn Architecture for wake work detection

import torch
import torch.nn as nn
import torch.nn.functional as F


class RNN(nn.Module):
    def __init__(self, input_size, dropout_rate=0.5):
        super().__init__()
        self.rnn = nn.LSTM(input_size, 256, batch_first=True)
        self.classifier = nn.Sequential(
            nn.Linear(256, 128),
            nn.LeakyReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(128, 64),
            nn.LeakyReLU(),
            nn.Linear(64,1)
        )
        self.dropout = nn.Dropout(dropout_rate)
        self.layer_norm = nn.LayerNorm(input_size)

    def forward(self, x):
        x = x.squeeze(1)
        print(x.shape)
        x = self.layer_norm(x)
        x, (hidden_last, cell_last) = self.rnn(x)
        hidden_last = self.dropout(hidden_last)

        hidden_last = hidden_last.squeeze(0)
        x = self.classifier(hidden_last)
        return x

In [56]:
file_list = ['en-US.wav','en-US.wav','en-US.wav','en-US.wav']
file_id = [0,1,2,3] #corresponds to the file list id.  this is redundancy babay
file_label = [True,True, False, True] #ditto
df_loader = pd.DataFrame(list(zip(file_list,file_label)), index = file_id, columns = ['file_name','label'])

trainloader = get_dataloader(df_loader, batch_size, input_time_steps, right_trim_time_steps)

input_size = 100
model = RNN(input_size).to(device)

In [61]:
model

RNN(
  (rnn): LSTM(100, 256, batch_first=True)
  (classifier): Sequential(
    (0): Linear(in_features=256, out_features=128, bias=True)
    (1): LeakyReLU(negative_slope=0.01)
    (2): Dropout(p=0.5, inplace=False)
    (3): Linear(in_features=128, out_features=64, bias=True)
    (4): LeakyReLU(negative_slope=0.01)
    (5): Linear(in_features=64, out_features=1, bias=True)
  )
  (dropout): Dropout(p=0.5, inplace=False)
  (layer_norm): LayerNorm((100,), eps=1e-05, elementwise_affine=True)
)

In [57]:


criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr= lr, weight_decay= weight_decay)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=5, verbose=True)

for epoch in range(epochs):
    running_loss = 0.0
    correct_predictions = 0
    total_predictions = 0

    for i, data in enumerate(trainloader, 0):
        # get the inputs and labels and put them on the GPU
        inputs, labels = data[0].to(device), data[1].to(device)

        # ANY Prepocessing of input and labels goes here
        labels = labels.float().unsqueeze(1)

        output = model(inputs)
        print(output, labels.shape)
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        running_loss += loss.item()


(129, 168)
(129, 168)
torch.Size([2, 129, 100])
tensor([[0.0106],
        [0.0198]], grad_fn=<AddmmBackward0>) torch.Size([2, 1])
(129, 168)
(129, 168)
torch.Size([2, 129, 100])
tensor([[0.0244],
        [0.0358]], grad_fn=<AddmmBackward0>) torch.Size([2, 1])
(129, 168)
(129, 168)
torch.Size([2, 129, 100])
tensor([[0.0210],
        [0.0365]], grad_fn=<AddmmBackward0>) torch.Size([2, 1])
(129, 168)
(129, 168)
torch.Size([2, 129, 100])
tensor([[0.0904],
        [0.0750]], grad_fn=<AddmmBackward0>) torch.Size([2, 1])
(129, 168)
(129, 168)
torch.Size([2, 129, 100])
tensor([[0.0626],
        [0.2043]], grad_fn=<AddmmBackward0>) torch.Size([2, 1])
(129, 168)
(129, 168)
torch.Size([2, 129, 100])
tensor([[0.0959],
        [0.1211]], grad_fn=<AddmmBackward0>) torch.Size([2, 1])
(129, 168)
(129, 168)
torch.Size([2, 129, 100])
tensor([[0.1542],
        [0.1910]], grad_fn=<AddmmBackward0>) torch.Size([2, 1])
(129, 168)
(129, 168)
torch.Size([2, 129, 100])
tensor([[0.2184],
        [0.1618]], grad_