# Import Requirements

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import torchaudio
import torchvision.transforms as transforms
import os
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import torchvision.models as models
import torch.nn.functional as F
import time
import librosa
from skopt import gp_minimize
from skopt.space import Real, Categorical, Integer
from skopt.utils import use_named_args
from torch.quantization import QuantStub, DeQuantStub

# Pick the compute device once for the whole notebook: the CUDA GPU when
# PyTorch reports one as available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
import torch
import torch.nn as nn
from collections import defaultdict


class Add(nn.Module):
    """Element-wise sum of exactly two tensors, with an optional activation.

    Args:
        activation: Optional callable applied to the summed tensor. When
            ``None`` (the default) the plain sum is returned.

    Attributes:
        digital: Initialised to ``True``. ``model_summary`` reads this flag
            to decide whether the module is counted under a ``CART_`` key.
    """

    def __init__(self, activation=None):
        super().__init__()
        self.activation = activation
        self.digital = True

    def forward(self, x):
        """Return the sum of the two tensors held in ``x``.

        Args:
            x: Sequence of exactly two tensors. They are combined with
                ``torch.stack``, so their shapes must match.

        Returns:
            The element-wise sum, passed through ``activation`` when one was
            supplied.

        Raises:
            ValueError: If ``x`` does not contain exactly two tensors.
        """
        if len(x) != 2:
            # An explicit exception replaces the former print + bare `raise`,
            # which surfaced as an unrelated "no active exception" error.
            raise ValueError(f'Add expects exactly 2 tensors, got {len(x)}')

        total = torch.stack(x, dim=0).sum(dim=0)
        if self.activation is not None:
            return self.activation(total)
        return total
        
def model_summary(M, pt_191=False):
    """
    Tally how often each module class occurs in ``M`` and report its size.

    Every module yielded by ``M.named_modules()`` is counted once:

    * ``nn.Conv2d`` modules are counted as ``CART_Conv2d`` when the module
      addressed by their name with the last dotted component removed (the
      module itself for names without a dot) has the class name ``CART``;
      otherwise as ``Conv2d``.
    * ``nn.ReLU`` / ``Add`` modules that carry a ``digital`` attribute are
      counted under their class name when the flag is truthy and under a
      ``CART_``-prefixed key when it is falsy.
    * Everything else is counted under its plain class name.

    ``pt_191`` is accepted for call compatibility; this function never reads it.

    Returns a plain dict of class name -> usage count, plus a ``'Parameters'``
    entry holding the total parameter count in millions as a string ending
    in ``'M'``.
    """
    tally = defaultdict(int)

    for name, module in M.named_modules(remove_duplicate=True):
        cls_name = module.__class__.__name__

        if isinstance(module, nn.Conv2d):
            owner_path = name.rsplit('.', 1)[0]
            owner_cls = M.get_submodule(owner_path).__class__.__name__
            key = 'CART_' + cls_name if owner_cls == 'CART' else cls_name
        elif isinstance(module, (nn.ReLU, Add)) and hasattr(module, 'digital'):
            key = cls_name if module.digital else 'CART_' + cls_name
        else:
            key = cls_name

        tally[key] += 1

    n_weights = sum(p.shape.numel() for p in M.parameters())
    tally['Parameters'] = str(n_weights / 1e6) + 'M'

    return dict(tally)

# Class AudioDataset

In [None]:
class AudioDataset(Dataset):
    """Folder-per-class dataset that yields fixed-length waveforms.

    ``directory`` is scanned for sub-directories; each one is a class and
    every regular file inside it is treated as an audio clip of that class.
    Class indices follow the sorted order of the sub-directory names.

    Args:
        directory: Root folder holding one sub-directory per class.
        desired_duration: Clip length in seconds that every item is padded
            or truncated to.
        sample_rate: Target sampling rate in Hz (default 44100).
    """

    def __init__(self, directory, desired_duration, sample_rate=44100):
        self.directory = directory
        self.desired_duration = desired_duration
        self.sample_rate = sample_rate

        # Only visible directories count as classes: stray files or hidden
        # folders would otherwise shift the label indices or make the
        # per-class listing below fail.
        self.classes = sorted(
            name for name in os.listdir(directory)
            if not name.startswith('.')
            and os.path.isdir(os.path.join(directory, name))
        )

        self.audio_files = []
        for label, class_name in enumerate(self.classes):
            class_path = os.path.join(directory, class_name)
            # Sorted so the index -> file mapping is reproducible across runs.
            for audio_file in sorted(os.listdir(class_path)):
                file_path = os.path.join(class_path, audio_file)
                if audio_file.startswith('.') or not os.path.isfile(file_path):
                    continue
                self.audio_files.append((file_path, label))

    def __len__(self):
        """Return the number of audio clips found under ``directory``."""
        return len(self.audio_files)

    def __getitem__(self, idx):
        """Load clip ``idx`` and return ``(waveform_tensor, class_index)``."""
        audio_file, label = self.audio_files[idx]
        # sr=None keeps the file's native rate so that the resampling step in
        # _process_waveform is driven by the real source rate.
        waveform, native_sr = librosa.load(audio_file, sr=None)
        waveform = self._process_waveform(waveform, orig_sr=native_sr)
        return torch.as_tensor(waveform), label

    def _process_waveform(self, waveform, orig_sr=None):
        """Resample, then zero-pad or truncate, to the target clip length.

        Args:
            waveform: 1-D array of audio samples.
            orig_sr: Sampling rate of ``waveform`` in Hz. When ``None`` the
                samples are assumed to already be at ``self.sample_rate``
                and no resampling is done.

        Returns:
            A 1-D ``numpy`` array of exactly
            ``desired_duration * sample_rate`` samples.
        """
        target_len = int(round(self.desired_duration * self.sample_rate))
        waveform = np.asarray(waveform)

        # Resample with the true source rate. Using the sample count as the
        # source rate would squash every clip to one second of audio.
        if orig_sr is not None and orig_sr != self.sample_rate:
            waveform = librosa.resample(
                waveform, orig_sr=orig_sr, target_sr=self.sample_rate
            )

        n_samples = len(waveform)
        if n_samples < target_len:
            # Zero-pad at the end up to the target length.
            waveform = np.pad(waveform, (0, target_len - n_samples))
        elif n_samples > target_len:
            waveform = waveform[:target_len]

        return waveform

## Define data directories

In [None]:
# Root folder of each dataset split, all located under ``data/``.
train_dir, validation_dir, test_dir = (
    'data/' + split for split in ('train', 'validate', 'test')
)

## Load datasets

In [None]:
# Every clip is padded or truncated to this many seconds.
desired_duration = 6

# Training split: shuffled, batches of 32.
train_dataset = AudioDataset(directory=train_dir, desired_duration=desired_duration)
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)

# Validation split: fixed order, batches of 8.
validation_dataset = AudioDataset(directory=validation_dir, desired_duration=desired_duration)
validation_loader = DataLoader(dataset=validation_dataset, batch_size=8, shuffle=False)

# Test split: fixed order, one clip per batch.
test_dataset = AudioDataset(directory=test_dir, desired_duration=desired_duration)
test_loader = DataLoader(dataset=test_dataset, batch_size=1, shuffle=False)


# Class ElephantCallerNet

In [None]:
def get_tfeb_pool_size_component(length):
    """Return six pooling factors that progressively reduce ``length``.

    For each of the first five stages the factor is 2 (and ``length`` is
    floor-halved) while ``length`` is at least 2; the sixth stage takes
    whatever ``length`` remains. Once ``length`` drops below 2, every
    remaining stage gets a factor of 1.
    """
    n_stages = 6
    factors = []
    for stage in range(1, n_stages + 1):
        if length < 2:
            factors.append(1)
        elif stage == n_stages:
            # Last stage pools over everything that is left.
            factors.append(length)
        else:
            factors.append(2)
            length //= 2
    return factors

def get_tfeb_pool_sizes(conv2_ch, width):
    """Return per-stage ``(height, width)`` pooling sizes as a list of tuples.

    The height factors are derived from ``conv2_ch`` and the width factors
    from ``width``, each via ``get_tfeb_pool_size_component``; the two lists
    are then paired stage by stage.
    """
    heights = get_tfeb_pool_size_component(conv2_ch)
    widths = get_tfeb_pool_size_component(width)
    return list(zip(heights, widths))

class ElephantCallerNet(nn.Module):
    """Raw-waveform CNN classifier made of two convolutional stages.

    ``sfeb`` applies two strided ``(1, k)`` convolutions followed by a
    max-pool along the last axis. ``tfeb`` then applies a stack of 2-D
    convolutions with interleaved max-pooling, dropout, a 1x1 convolution to
    ``ch_config[-1]`` channels, average pooling and a linear layer producing
    ``n_class`` outputs.

    Args:
        input_length: Number of samples in each input waveform.
        n_class: Number of output classes.
        sr: Sampling rate in Hz; used to size the pooling windows.
        k_size: Kernel size (int or 2-sequence) of the ``tfeb`` convolutions
            conv3..conv11. Defaults to 3.
        ch_conf: Optional list of 12 channel widths, one per convolution.
            When ``None`` a default configuration ending in ``n_class`` is
            used.
        quantize: When true, the input is passed through a ``QuantStub`` and
            the output through a ``DeQuantStub``.
    """

    def __init__(self, input_length, n_class, sr, k_size=3, ch_conf=None, quantize=False):
        super().__init__()
        self.input_length = input_length
        self.ch_config = ch_conf
        self.quantize = quantize

        stride1 = 2
        stride2 = 2
        channels = 8
        n_frames = (sr / 1000) * 10  # samples in 10 ms of audio

        sfeb_pool_size = int(n_frames / (stride1 * stride2))
        if self.ch_config is None:
            self.ch_config = [channels, channels * 8, channels * 4, channels * 8, channels * 8,
                              channels * 16, channels * 16, channels * 32, channels * 32, channels * 64,
                              channels * 64, n_class]

        fcn_no_of_inputs = self.ch_config[-1]

        # Padding follows the kernel size so odd kernels keep the feature-map
        # size. A fixed padding of 1 only does that for 3x3 kernels; larger
        # kernels then shrink the maps until a convolution no longer fits.
        pad = self._same_padding(k_size)

        self.conv1, self.bn1 = self.make_layers(1, self.ch_config[0], (1, 9), (1, stride1))
        self.conv2, self.bn2 = self.make_layers(self.ch_config[0], self.ch_config[1], (1, 5), (1, stride2))
        self.conv3, self.bn3 = self.make_layers(1, self.ch_config[2], k_size, padding=pad)
        self.conv4, self.bn4 = self.make_layers(self.ch_config[2], self.ch_config[3], k_size, padding=pad)
        self.conv5, self.bn5 = self.make_layers(self.ch_config[3], self.ch_config[4], k_size, padding=pad)
        self.conv6, self.bn6 = self.make_layers(self.ch_config[4], self.ch_config[5], k_size, padding=pad)
        self.conv7, self.bn7 = self.make_layers(self.ch_config[5], self.ch_config[6], k_size, padding=pad)
        self.conv8, self.bn8 = self.make_layers(self.ch_config[6], self.ch_config[7], k_size, padding=pad)
        self.conv9, self.bn9 = self.make_layers(self.ch_config[7], self.ch_config[8], k_size, padding=pad)
        self.conv10, self.bn10 = self.make_layers(self.ch_config[8], self.ch_config[9], k_size, padding=pad)
        self.conv11, self.bn11 = self.make_layers(self.ch_config[9], self.ch_config[10], k_size, padding=pad)
        self.conv12, self.bn12 = self.make_layers(self.ch_config[10], self.ch_config[11], (1, 1))

        self.fcn = nn.Linear(fcn_no_of_inputs, n_class)
        nn.init.kaiming_normal_(self.fcn.weight, nonlinearity='sigmoid')

        self.sfeb = nn.Sequential(
            self.conv1, self.bn1, nn.ReLU(),
            self.conv2, self.bn2, nn.ReLU(),
            nn.MaxPool2d(kernel_size=(1, sfeb_pool_size))
        )

        # Number of 10 ms frames in a clip of input_length samples.
        self.tfeb_width = int(((self.input_length / sr) * 1000) / 10)
        tfeb_pool_sizes = get_tfeb_pool_sizes(self.ch_config[1], self.tfeb_width)

        tfeb_modules = []
        # conv3 stands alone; 4/5, 6/7, 8/9 and 10/11 are applied as pairs,
        # each group followed by its pooling stage.
        for pool_index, i in enumerate([3, 4, 6, 8, 10]):
            tfeb_modules.extend([getattr(self, f'conv{i}'), getattr(self, f'bn{i}'), nn.ReLU()])

            if i != 3:
                tfeb_modules.extend(
                    [getattr(self, f'conv{i + 1}'), getattr(self, f'bn{i + 1}'), nn.ReLU()]
                )

            h, w = tfeb_pool_sizes[pool_index]
            if h > 1 or w > 1:
                tfeb_modules.append(nn.MaxPool2d(kernel_size=(max(1, h), max(1, w))))

        tfeb_modules.append(nn.Dropout(0.2))
        tfeb_modules.extend([self.conv12, self.bn12, nn.ReLU()])
        h, w = tfeb_pool_sizes[-1]
        if h > 1 or w > 1:
            tfeb_modules.append(nn.AvgPool2d(kernel_size=(max(1, h), max(1, w))))
        tfeb_modules.extend([nn.Flatten(), self.fcn])

        self.tfeb = nn.Sequential(*tfeb_modules)

        self.output = nn.Sequential(
            nn.Softmax(dim=1)
        )

        if self.quantize:
            self.quant = QuantStub()
            self.dequant = DeQuantStub()

    @staticmethod
    def _same_padding(k_size):
        """Return ``k // 2`` per kernel dimension.

        For odd kernels at stride 1 this padding keeps the output the same
        size as the input; for a kernel of 3 it equals 1.
        """
        if isinstance(k_size, (tuple, list)):
            return tuple(int(k) // 2 for k in k_size)
        return int(k_size) // 2

    def forward(self, x):
        """Classify ``x`` and return softmax scores over dim 1.

        Two leading singleton dimensions are added to ``x`` before ``sfeb``.
        Assumes ``x`` is a ``(batch, samples)`` tensor, so that the permute
        below moves the batch axis back to the front — TODO confirm against
        callers.
        """
        x = x.unsqueeze(0).unsqueeze(0)
        if self.quantize:
            x = self.quant(x)

        x = self.sfeb(x)
        x = x.permute((2, 0, 1, 3))
        x = self.tfeb(x)

        if self.quantize:
            x = self.dequant(x)
        # Softmax is applied exactly once; a second pass over the already
        # normalised scores would flatten them towards a uniform distribution.
        return self.output(x)

    def make_layers(self, in_channels, out_channels, kernel_size, stride=(1, 1), padding=0, bias=False):
        """Build a Kaiming-initialised ``Conv2d`` and a matching ``BatchNorm2d``.

        Returns:
            Tuple ``(conv, bn)``.
        """
        conv = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, stride=stride,
                         padding=padding, bias=bias)
        nn.init.kaiming_normal_(conv.weight, nonlinearity='relu')
        bn = nn.BatchNorm2d(out_channels)
        return conv, bn

In [None]:
def count_parameters(model):
    """Return the total element count of ``model``'s trainable parameters.

    Only parameters whose ``requires_grad`` flag is set are included.
    """
    trainable = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable += param.numel()
    return trainable

# Train Function

In [None]:
def _run_epoch(model, loader, criterion, optimizer=None):
    """Run one pass over ``loader`` and return ``(mean_loss, accuracy)``.

    The pass is a training pass (gradients and optimizer steps) when
    ``optimizer`` is given, and an evaluation pass without gradients
    otherwise. Batches are moved to the module-level ``device``.
    """
    training = optimizer is not None
    model.train(training)
    loss_sum = 0.0
    n_correct = 0
    n_seen = 0

    with torch.set_grad_enabled(training):
        for inputs, labels in loader:
            inputs, labels = inputs.to(device), labels.to(device)
            if training:
                optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            if training:
                loss.backward()
                optimizer.step()
            # Weight the batch-mean loss by batch size so that the epoch loss
            # is a per-sample average.
            loss_sum += loss.item() * inputs.size(0)
            _, predicted = torch.max(outputs, 1)
            n_seen += labels.size(0)
            n_correct += (predicted == labels).sum().item()

    # max(..., 1) keeps an empty loader from raising ZeroDivisionError.
    mean_loss = loss_sum / max(len(loader.dataset), 1)
    accuracy = n_correct / max(n_seen, 1)
    return mean_loss, accuracy


def _plot_training_history(train_loss, val_loss, train_acc, val_acc):
    """Draw the four per-epoch curves as a 2x2 grid in a single figure."""
    epochs = range(1, len(train_loss) + 1)
    panels = [
        (train_loss, 'Train Loss', 'Loss', 'Training Loss'),
        (val_loss, 'Validation Loss', 'Loss', 'Validation Loss'),
        (train_acc, 'Train Accuracy', 'Accuracy', 'Training Accuracy'),
        (val_acc, 'Validation Accuracy', 'Accuracy', 'Validation Accuracy'),
    ]

    # One figure for all four panels; opening a second figure part-way
    # through would strand the first panel on its own.
    plt.figure(figsize=(10, 5))
    for position, (series, label, ylabel, title) in enumerate(panels, start=1):
        plt.subplot(2, 2, position)
        plt.plot(epochs, series, label=label)
        plt.xlabel('Epoch')
        plt.ylabel(ylabel)
        plt.legend()
        plt.title(title)
    plt.tight_layout()
    plt.show()


def train_model(model, train_loader,val_loader, criterion, optimizer, num_epochs=10):
    """Train ``model``, validate after every epoch, plot and save the result.

    Args:
        model: Network to train; expected to already live on ``device``.
        train_loader: Loader yielding ``(inputs, labels)`` training batches.
        val_loader: Loader yielding ``(inputs, labels)`` validation batches.
        criterion: Loss callable taking ``(outputs, labels)``.
        optimizer: Optimizer bound to ``model``'s parameters.
        num_epochs: Number of passes over ``train_loader`` (default 10).

    Returns:
        The trained ``model``.

    Side effects:
        Prints per-epoch metrics, shows the loss/accuracy curves, and writes
        ``adc_optim.pth`` (state dict) and ``adc_optim.pt`` (whole model) to
        the working directory, overwriting earlier files.
    """
    train_loss_history = []
    train_acc_history = []
    val_loss_history = []
    val_acc_history = []

    start_time = time.time()

    for epoch in range(num_epochs):
        print(f"Epoch {epoch+1}/{num_epochs}")

        epoch_train_loss, epoch_train_acc = _run_epoch(model, train_loader, criterion, optimizer)
        train_loss_history.append(epoch_train_loss)
        train_acc_history.append(epoch_train_acc)
        print(f"Train Loss: {epoch_train_loss:.4f}, Train Accuracy: {epoch_train_acc:.4f}")

        epoch_val_loss, epoch_val_acc = _run_epoch(model, val_loader, criterion)
        val_loss_history.append(epoch_val_loss)
        val_acc_history.append(epoch_val_acc)
        print(f"Validation Loss: {epoch_val_loss:.4f}, Validation Accuracy: {epoch_val_acc:.4f}")

    training_time = time.time() - start_time
    print(f"Training Time: {training_time:.2f} seconds")

    _plot_training_history(train_loss_history, val_loss_history,
                           train_acc_history, val_acc_history)

    # Size in megabytes is bytes / 2**20, so each parameter contributes
    # numel * bytes-per-element rather than just its element count.
    model_size_mb = sum(p.numel() * p.element_size() for p in model.parameters()) / (1024 * 1024)
    num_parameters = count_parameters(model)
    print(f"Model Size: {model_size_mb:.2f} MB")
    print(f"Number of Parameters: {num_parameters}")

    # Save the trained model, both as a state dict and as a whole object.
    torch.save(model.state_dict(), 'adc_optim.pth')
    torch.save(model, "adc_optim.pt")

    return model

def evaluate_model(model, test_loader):
    """Evaluate ``model`` on ``test_loader`` and report the results.

    Prints the wall-clock time of the evaluation loop (data loading
    included), the accuracy, the confusion matrix and the classification
    report, then shows the matrix and the report as heatmaps.

    Args:
        model: Trained network, expected to live on ``device``.
        test_loader: Loader yielding ``(inputs, labels)`` batches. Class
            names are read from ``test_loader.dataset.classes`` when that
            attribute exists, otherwise from the module-level
            ``test_dataset``.
    """
    model.eval()
    y_true = []
    y_pred = []
    inference_start_time = time.time()

    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            _, predicted = torch.max(outputs, 1)
            y_true.extend(labels.cpu().tolist())
            y_pred.extend(predicted.cpu().tolist())

    inference_time = time.time() - inference_start_time
    print(f"Inference Time: {inference_time:.4f} seconds")

    total = len(y_true)
    correct = sum(1 for truth, guess in zip(y_true, y_pred) if truth == guess)
    # Guard against an empty loader instead of dividing by zero.
    test_accuracy = correct / total if total else 0.0
    print('Test Accuracy:', test_accuracy)

    # Use the class names of the data actually being evaluated.
    class_names = getattr(test_loader.dataset, 'classes', None)
    if class_names is None:
        class_names = test_dataset.classes
    # Passing the full label list keeps the matrix and the report aligned
    # with class_names even when a class never occurs in y_true or y_pred.
    class_ids = list(range(len(class_names)))

    # Confusion matrix
    conf_matrix = confusion_matrix(y_true, y_pred, labels=class_ids)
    print("Confusion Matrix:")
    print(conf_matrix)

    # Classification report
    print("Classification Report:")
    print(classification_report(y_true, y_pred, labels=class_ids, target_names=class_names))

    # Plot confusion matrix
    plt.figure(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_names, yticklabels=class_names)
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title('Confusion Matrix (Test)')
    plt.show()

    # Plot classification report
    report = classification_report(y_true, y_pred, labels=class_ids,
                                   target_names=class_names, output_dict=True)
    # Transposed so rows are classes and columns are metrics, which is what
    # the axis labels below say.
    report_frame = pd.DataFrame.from_dict(report).T
    plt.figure(figsize=(8, 6))
    sns.heatmap(report_frame, annot=True, cmap='Blues')
    plt.xlabel('Metrics')
    plt.ylabel('Classes')
    plt.title('Classification Report (Test)')
    plt.show()

# Bayesian optimization

In [None]:
def validate_model(model, val_loader):
    """Return the fraction of samples in ``val_loader`` classified correctly.

    The model is switched to eval mode, gradients are disabled, and every
    batch is moved to the module-level ``device`` before the forward pass.
    """
    model.eval()
    n_correct, n_seen = 0, 0

    with torch.no_grad():
        for batch, targets in val_loader:
            batch = batch.to(device)
            targets = targets.to(device)
            scores = model(batch)
            predictions = torch.max(scores, 1)[1]
            n_seen += targets.size(0)
            n_correct += (predictions == targets).sum().item()

    return n_correct / n_seen

In [None]:
# Hyperparameter space explored by the Bayesian optimisation.
search_space = [
    # The learning-rate range covers four orders of magnitude, so it is
    # sampled on a log scale; a uniform prior over [1e-6, 1e-2] would almost
    # never propose values below 1e-4.
    Real(1e-6, 1e-2, prior='log-uniform', name='learning_rate'),
    Integer(3, 7, name='kernel_size'),
]
sample_rate = 44100

# Text file to which every evaluated configuration is appended.
results_file = "adc_optim.txt"

# Objective function evaluated by the Bayesian optimiser.
@use_named_args(search_space)
def optimize_model(learning_rate, kernel_size):
    """Train one model with the proposed hyperparameters and score it.

    Builds an ``ElephantCallerNet`` with ``kernel_size``, trains it for 50
    epochs with Adam at ``learning_rate``, prints the resulting validation
    accuracy and appends it to ``results_file``.

    Returns:
        The negated validation accuracy, so that minimising this function
        maximises accuracy.
    """
    n_samples = desired_duration * sample_rate
    candidate = ElephantCallerNet(
        input_length=n_samples,
        n_class=len(train_dataset.classes),
        sr=sample_rate,
        k_size=kernel_size,
    )
    candidate = candidate.to(device)

    loss_fn = torch.nn.CrossEntropyLoss()
    adam = torch.optim.Adam(candidate.parameters(), lr=learning_rate)

    fitted = train_model(candidate, train_loader, validation_loader, loss_fn, adam, num_epochs=50)
    val_accuracy = validate_model(fitted, validation_loader)

    print(f"Learning Rate: {learning_rate},kernel_size:{kernel_size}, Validation Accuracy: {val_accuracy}")

    # Append, so results from earlier evaluations are kept.
    with open(results_file, 'a') as log:
        log.write(f"Learning Rate: {learning_rate},kernel_size: {kernel_size}, Validation Accuracy: {val_accuracy}\n")

    return -val_accuracy

# Run 30 evaluations of the objective with a fixed seed.
res_gp = gp_minimize(optimize_model, search_space, n_calls=30, random_state=42)

# Pair each tuned value with the name of its hyperparameter.
best_params = {
    name: value
    for name, value in zip(('learning_rate', 'kernel_size'), res_gp.x)
}
print("Best hyperparameters:", best_params)

# The objective returns negated accuracy, so flip the sign back.
best_accuracy = -res_gp.fun
print("Best accuracy:", best_accuracy)

# Create the model

In [None]:
# Sampling rate in Hz used to size the model input.
sample_rate = 44_100

In [None]:
# Build the final model. k_size is given explicitly so the call does not
# depend on a constructor default, the class count comes from the training
# data, and the model is placed on the selected device straight away.
model = ElephantCallerNet(
    input_length=desired_duration * sample_rate,
    n_class=len(train_dataset.classes),
    sr=sample_rate,
    k_size=3,
    quantize=False,
).to(device)
model

In [None]:
# model = MobileNetV2RawAudio(num_classes=len(train_dataset.classes),num_samples=desired_duration*sample_rate, dropout_rate = 0.4 ).to(device)
# print(model)

In [None]:
# Show the per-class module counts and the parameter total for the model.
model_summary(model)

# Initialize model, criterion, and optimizer

In [None]:
# Cross-entropy loss for the training run below.
# NOTE(review): nn.CrossEntropyLoss is documented to expect unnormalised
# logits, while ElephantCallerNet.forward in this file returns softmax
# outputs — confirm that this stacking is intended.
criterion = nn.CrossEntropyLoss()
# Adam with a hard-coded learning rate (not read from best_params).
optimizer = optim.Adam(model.parameters(), lr=0.0005520881241955914)


# Train the model

In [None]:
# Move the model to the device selected at the top of the notebook; an
# unconditional .cuda() would fail on machines without a GPU.
train_model(model.to(device), train_loader, validation_loader, criterion, optimizer, num_epochs=50)

# Evaluate the model

In [None]:
# Report accuracy, confusion matrix and classification report on the test split.
evaluate_model(model, test_loader)