In [2]:
import os
import librosa
import numpy as np
import torch as th
import torch.nn.functional as F
import IPython.display as ipd
from collections import deque
import matplotlib.pyplot as plt
from src.utils import get_mel_image_from_float_normalized
AUDIO_FILE_PATH = os.path.join(os.getcwd(),"./data/Models/KeywordModel/Training/Sequences/Audio")
AUDIO_LABELS_PATH = os.path.join(os.getcwd(),"labels.txt")
n_mfcc = 28
max_len = 40

th.manual_seed(42)

  from .autonotebook import tqdm as notebook_tqdm


<torch._C.Generator at 0x1cceae19cd0>

In [3]:
# Load in Labeled Data
def load_data():
    labels_dict = {}
    with open(AUDIO_LABELS_PATH, 'r') as file:
        for line in file:
            audio_file, label = line.strip().split('\t')
            if audio_file not in labels_dict:
                labels_dict[audio_file] = []
            labels_dict[audio_file].append(int(label))
    return labels_dict
labels = load_data()
labels

{'0.wav': [0, 0, 0, 0, 0, 0, 0],
 '1.wav': [0,
  0,
  0,
  0,
  0,
  0,
  1,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  1,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 '10.wav': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  1,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 '100.wav': [1,
  1,
  1,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  1,
  1,
  1,
  0],
 '101.wav': [0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 '102.wav': [0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0],
 '103.wav': [1,
  1,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  1,
  1,
  

In [4]:
#Load in Spliced Audio Data
def get_audio_sequences(audio_file_path):
    sequences_dict = {}
    dir_files = os.listdir(audio_file_path)
    for audio_file in dir_files:
        audio_sequence, sr = librosa.load(os.path.join(audio_file_path, audio_file), sr=None, mono=True)
        audio_sequence_length = librosa.get_duration(audio_sequence, sr=sr)
        audio_clip_length = 0.25
        audio_slices = []
        for i in range(0, int(audio_sequence_length/audio_clip_length)):
            audio_slices.append(audio_sequence[int(i*sr*audio_clip_length):int((i+1)*sr*audio_clip_length)])
        sequences_dict[audio_file] = (audio_slices, sr)
    return sequences_dict

slice_audio_sequences = get_audio_sequences(AUDIO_FILE_PATH)
slice_audio_sequences

  audio_sequence_length = librosa.get_duration(audio_sequence, sr=sr)
  0.00112915] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  audio_sequence_length = librosa.get_duration(audio_sequence, sr=sr)
  0.00018311] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  audio_sequence_length = librosa.get_duration(audio_sequence, sr=sr)
 -0.00741577] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  audio_sequence_length = librosa.get_duration(audio_sequence, sr=sr)
  audio_sequence_length = librosa.get_duration(audio_sequence, sr=sr)
 -0.03814697] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  audio_sequence_length = librosa.get_duration(audio_sequence, sr=sr)
  audio_sequence_length = librosa.get_duration(audio_sequence, sr=sr)
  0.18829346] as keyword args. From version 0.10 passing these as 

{'0.wav': ([array([ 0.00067139,  0.00085449,  0.00073242, ..., -0.00088501,
          -0.00088501, -0.00088501], dtype=float32),
   array([-0.00091553, -0.00085449, -0.0007019 , ..., -0.00222778,
          -0.00201416, -0.00186157], dtype=float32),
   array([-0.00180054, -0.00177002, -0.00180054, ...,  0.2539978 ,
           0.25585938,  0.25665283], dtype=float32),
   array([ 2.5582886e-01,  2.5363159e-01,  2.5122070e-01, ...,
          -3.6621094e-04, -2.4414062e-04, -1.2207031e-04], dtype=float32),
   array([ 0.        ,  0.00018311,  0.00024414, ..., -0.00109863,
          -0.00094604, -0.00073242], dtype=float32),
   array([-0.0005188 , -0.00015259,  0.00021362, ..., -0.00186157,
          -0.0017395 , -0.00167847], dtype=float32),
   array([-0.00183105, -0.00201416, -0.00210571, ..., -0.00598145,
          -0.00592041, -0.00585938], dtype=float32)],
  44100),
 '1.wav': ([array([-0.00332642,  0.0111084 ,  0.01870728, ...,  0.00100708,
           0.00091553,  0.00100708], dtype=flo

In [5]:
# Taking our Slices we must now compute the buffer images.
# The Research paper specifies that the buffers are sequential, and each snapshot of time the model sees is a deque of the time snippet of images,
#concatonated into a single image; t_1 -> [t_0, t_-1, t_-2,...t_-n] -> t_(-n-1)
def normalize_mfcc(mfcc, min_val=-1, max_val=1):
    mfcc_min, mfcc_max = mfcc.min(), mfcc.max()
    return ((mfcc - mfcc_min) * (max_val - min_val) / (mfcc_max - mfcc_min) + min_val).astype(np.float32)
       
def get_sequence_image_data(sequence_spliced_audio_data):
    image_data = {}
    for file, (audio_seq, sr) in sequence_spliced_audio_data.items():
        image_data[file] = []
        audio_deq = deque()
        for _ in range(4):
            audio_deq.append(np.zeros((1, n_mfcc, max_len)))
        for audio in audio_seq:
            audio_deq.popleft()
            image = np.array(get_mel_image_from_float_normalized(audio, sound_rate=sr)).reshape((1, n_mfcc, max_len))
            audio_deq.append(image)
            image_data[file].append(normalize_mfcc(np.hstack(audio_deq).copy()))
    return image_data

image_data = get_sequence_image_data(slice_audio_sequences)



In [6]:
plt.imshow(image_data[0], cmap='gray')

KeyError: 0

In [None]:
# convert files to indexes
def get_file_indexes(files):
    idx_dict = {}
    for i, f in enumerate(files):
        idx_dict[f] = int(f.split('.')[0])
    return idx_dict
idxs = get_file_indexes(os.listdir(AUDIO_FILE_PATH))

In [None]:
def convert_images_to_tensor(idxs, image_dict):
    tensor_data = [None] * len(idxs)
    for file, images in image_dict.items():
        tensor_data[idxs[file]] = []
        for image in images:
            tensor_data[idxs[file]].append(th.tensor(image))
        tensor_data[idxs[file]] = th.stack(tensor_data[idxs[file]])
    return tensor_data

In [None]:
def convert_labels_to_tensor(idxs, labels):
    label_data = [None] * len(idxs)
    for file, label in labels.items():
        label_data[idxs[file]] = F.one_hot(th.tensor(label), num_classes=2)
    return label_data

In [None]:
image_data_tnsr = convert_images_to_tensor(idxs, image_data)
image_label_tnsr = convert_labels_to_tensor(idxs, labels)

for i in range(len(image_data_tnsr)):
    if image_data_tnsr[i].shape[0] != image_label_tnsr[i].shape[0]:
        print(image_data_tnsr[i].shape)
        print(image_label_tnsr[i].shape)
        print('Error at index'+ str(i))
    continue

In [None]:
print(image_label_tnsr[2].shape)
print(image_data_tnsr[2].shape)

In [None]:
## Create the DataSet
from torch.utils.data import Dataset, random_split
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

class AudioDataset(Dataset):
    def __init__(self, data):
        self.data = sorted(data, key=lambda x: x[0].size(0))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]
    
def collate_fn(batch):
    # Separate data and labels from the batch
    data, labels = zip(*batch)

    padded_data = pad_sequence(data, batch_first=True, padding_value=0)
    max_seq_len = max([label.size(0) for label in labels])
    padded_outputs = []
    for label in labels:
        padding = max_seq_len - label.size(0)
        padding_tnsr = th.tensor([[1,0]] * padding)
        padding_lbls = th.cat([label, padding_tnsr], dim=0)
        padded_outputs.append(padding_lbls)
    return padded_data, th.stack(padded_outputs)

data = list(zip(image_data_tnsr, image_label_tnsr))
audio_dataset = AudioDataset(data)


# Calculate the split sizes
train_size = int(0.8 * len(audio_dataset))
validation_size = len(audio_dataset) - train_size

# Split the dataset into training and validation sets
train_dataset, validation_dataset = random_split(audio_dataset, [train_size, validation_size])

train_data_loader = DataLoader(train_dataset, batch_size=16, collate_fn=collate_fn)
validation_data_loader = DataLoader(validation_dataset, batch_size=16, collate_fn=collate_fn)

In [None]:
start_lr= 0.003
end_lr = 0.0001
factor = 0.999
def lr_schedule():
    global start_lr, end_lr, factor
    ret_lr = start_lr
    start_lr *= factor
    return ret_lr

In [None]:
# Training the Keyword Model!!
from torch.utils.data.dataset import Dataset, random_split
from torch.nn.utils.rnn import pad_sequence
import torch.nn as nn
import torch.optim as optim
import os
def train_model(
    model, train_loader, val_loader,
    batch_size = 16, 
    epochs = 1000, 
    learning_rate = 5e-3, 
    log_interval = 50, 
    no_cuda = False, 
    seed = 1, 
    patience = 10):

  use_cuda = not no_cuda and th.cuda.is_available()
  device = th.device("cuda" if use_cuda else "cpu")
  print(device)
  kwargs = {}
  criterion = nn.CrossEntropyLoss()
  noise_level = 0.001  

  def train(model, device, train_loader, optimizer):
    model.train()
    model.to(device)
    total_loss = 0
    i = 0
    for data, target in train_loader:
        i+=1
        data = data.to(device).float()
        target = target.to(device)
        optimizer.zero_grad()
        output_target = target.float()
        output_prediction = model(data)
        _, target_indices = output_target.max(dim=2)
        loss = 0
        for t in range(output_prediction.size(1)):
            loss += criterion(output_prediction[:, t], target_indices[:, t]) * batch_size
        loss /= output_prediction.size(1)
        loss.backward()
        optimizer.step()
        lr_schedule()
        total_loss+=loss.item()
        if i % log_interval == 0:
            try:
                print(f'Avg Loss: {(total_loss/i+1)}%')
            except:
                pass
    return total_loss / len(train_loader.dataset)

  def validation(model, device, val_loader):
    model.eval()
    loss_total = 0
    with th.no_grad():
      for data, target in val_loader:
        data = data.to(device).float()
        target = target.to(device)
        output_target = target.float()
        output_prediction = model(data)
        _, target_indices = output_target.max(dim=2)
        val_loss = 0
        for t in range(output_prediction.size(1)):
            val_loss += criterion(output_prediction[:, t], target_indices[:, t]) * batch_size
        val_loss /= output_prediction.size(1)
        loss_total += val_loss.item()

    val_loss = loss_total / len(val_loader.dataset)
    print('Validation_loss:', val_loss)
    return val_loss

  model.to(device)

  optimizer = optim.Adam(model.parameters(), lr = start_lr,
        eps=1e-7,
        weight_decay=0.005,
        # momentum=0.92,
        # centered=True
    )
  print('Training...')
  for epoch in range(1, epochs+1):
    train_loss = train(model, device, train_loader, optimizer)
    if epoch % 10 == 0 :
        val_loss = validation(model, device, val_loader)
    if epoch % 50 == 0:
        model.save_checkpoint(os.path.join(os.getcwd(), 'data', 'Models', 'KeywordModel', 'Training', 'Checkpoints', f'KeywordCheckpoint_{epoch}.zip'))


In [None]:
# Train :D
from src.Gwen.AISystem.Networks import KeywordAudioModel
model = KeywordAudioModel()
# pytorch_total_params = sum (p.numel () for p in model.parameters ())
print(model(th.zeros((1, 1, 1, 112, 40), device='cuda'), ).size())

train_model(
    log_interval=3, learning_rate= 0.003, model=model, train_loader=train_data_loader, val_loader=validation_data_loader, epochs=1000,batch_size=32,
)

KeyboardInterrupt: 