In [None]:

# --- run identification ------------------------------------------------------
# Prefix prepended to the log file names and to the checkpoint folder name.
train_prefix = "multiclass_resnet-indep_"

# --- data locations ----------------------------------------------------------
# Folders that are scanned for audio files (one sub-folder per class).
train_folder = "./train-val-independent/train/"
valid_folder = "./train-val-independent/validation/"
# CSV index files that are generated from the folders above.
train_csv = "./csv_resnet_indep/trainData.csv"
valid_csv = "./csv_resnet_indep/valiData.csv"

# Not read by any of the visible cells -- presumably a leftover switch.
b_selfattention = True

# --- optimisation ------------------------------------------------------------
# An earlier setting (batch_size = 4, learning_rate = 1e-4) used to be assigned
# first and was immediately overwritten; only the values below take effect.
batch_size, learning_rate = 16, 1e-3
epochs = 500

# Not read by any of the visible cells; None presumably means "train from scratch".
start_model_path = None


In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import time
import os



# neural network

Reference paper: "Acoustic Identification of Ae. aegypti Mosquitoes using Smartphone Apps and Residual Convolutional Neural Networks"


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class ResidualBlock(nn.Module):
    """Two conv + batch-norm layers wrapped by an identity skip connection.

    The convolutions use "same" padding derived from ``kernel_size`` so the
    spatial size of the output always equals that of the input, which is what
    the element-wise addition with the skip path requires. (A fixed padding
    of 1 is only correct for 3x3 kernels.)

    Args:
        in_channels: Number of channels of the input feature map.
        out_channels: Number of channels produced by both convolutions.
            No projection is applied on the skip path, so the input must be
            addable to a tensor with ``out_channels`` channels.
        kernel_size: Odd int, or pair of odd ints, giving the kernel size.

    Raises:
        ValueError: If any kernel dimension is even, because a symmetric
            padding cannot preserve the spatial size in that case.
    """

    def __init__(self, in_channels, out_channels, kernel_size=(3, 3)):
        super(ResidualBlock, self).__init__()
        if isinstance(kernel_size, int):
            kernel_size = (kernel_size, kernel_size)
        kernel_size = tuple(kernel_size)
        if any(k % 2 == 0 for k in kernel_size):
            raise ValueError(
                f"ResidualBlock needs odd kernel sizes to keep the spatial size, got {kernel_size}"
            )
        # k // 2 on each side keeps H and W unchanged for stride-1 convolutions.
        padding = tuple(k // 2 for k in kernel_size)

        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size, padding=padding)
        self.norm1 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size, padding=padding)
        self.norm2 = nn.BatchNorm2d(out_channels)

    def forward(self, x):
        """Return ``relu(norm2(conv2(relu(norm1(conv1(x))))) + x)``."""
        residual = x
        x = F.relu(self.norm1(self.conv1(x)))
        x = self.norm2(self.conv2(x))
        return F.relu(x + residual)

class ClassificationBlock(nn.Module):
    """Classifier head: flatten -> Linear(in_features, 256) -> ReLU -> Linear(256, cls_num).

    Args:
        in_features: Size of the flattened input vector per sample.
        cls_num: Number of output logits (also stored as ``self.cls_num``).
    """

    def __init__(self, in_features, cls_num):
        super().__init__()
        self.cls_num = cls_num
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(in_features, 256)
        self.fc2 = nn.Linear(256, cls_num)

    def forward(self, x):
        """Return the raw (un-normalised) class scores for ``x``."""
        hidden = F.relu(self.fc1(self.flatten(x)))
        return self.fc2(hidden)

class ProposedModel(nn.Module):
    """Convolutional classifier built from a stack of residual stages.

    Layout: initial convolution + ReLU, then ``num_residual_blocks`` stages of
    (ResidualBlock -> 2x2 max-pool -> dropout 0.2), a global average pool to
    1x1 and finally a ClassificationBlock.

    Args:
        input_channels: Channels of the input image.
        num_residual_blocks: Number of residual stages.
        base_filters: Channel width used by every layer after the input.
        kernel_size: Kernel of the *initial* convolution only; the residual
            blocks are created with ResidualBlock's own default kernel. The
            initial convolution always uses a padding of 1.
        cls_num: Number of output classes.
    """

    def __init__(self, input_channels, num_residual_blocks=3, base_filters=64, kernel_size=(3, 3), cls_num=2):
        super().__init__()
        self.initial_conv = nn.Conv2d(input_channels, base_filters, kernel_size=kernel_size, padding=1)
        stages = [
            nn.Sequential(
                ResidualBlock(base_filters, base_filters),
                nn.MaxPool2d((2, 2)),
                nn.Dropout(0.2),
            )
            for _ in range(num_residual_blocks)
        ]
        self.arrangements = nn.ModuleList(stages)
        # Collapses any remaining spatial extent to 1x1, so the head always
        # receives exactly ``base_filters`` features per sample.
        self.final_pool = nn.AdaptiveAvgPool2d((1, 1))
        self.classification_block = ClassificationBlock(base_filters, cls_num)

    def forward(self, x):
        """Return the class logits for a batch of input images."""
        features = F.relu(self.initial_conv(x))
        for stage in self.arrangements:
            features = stage(features)
        pooled = self.final_pool(features)  # (N, base_filters, 1, 1)
        return self.classification_block(pooled)



# data - spectrogram

In [None]:
import pandas as pd
import librosa
import numpy as np
import librosa.display
import torch
from torch.utils.data import Dataset
import torchvision.transforms as transforms
import random


# standardise a spectrogram to zero mean and (approximately) unit standard deviation
def spec_normalization(spec, err=1e-6):
    """Return ``(spec - mean) / (std + err)`` computed over the whole array.

    ``err`` is added to the standard deviation so that a constant input does
    not cause a division by zero.
    """
    centered = spec - spec.mean()
    return centered / (spec.std() + err)


# convert a spectrogram into a uint8 image with one leading (channel) axis
def spec_img(spec):
    """Normalise ``spec``, rescale it to 0..255 and prepend a channel axis.

    The spectrogram is standardised with ``spec_normalization``, min-max
    scaled to the range [0, 255], cast to ``uint8`` and returned with a new
    leading axis of length 1 (so a 2-D input becomes ``(1, H, W)``).

    A spectrogram without any dynamic range (for example a constant array)
    would make the min-max scaling divide by zero; such input is mapped to an
    all-zero image instead.
    """
    spec = spec_normalization(spec)
    spec_min, spec_max = spec.min(), spec.max()
    value_range = spec_max - spec_min
    if value_range > 0:
        spec = 255 * (spec - spec_min) / value_range
    else:
        # No contrast to stretch: avoid 0/0 -> NaN -> undefined uint8 cast.
        spec = np.zeros_like(spec)
    spec = spec.astype(np.uint8)
    return spec[np.newaxis, ...]


# data augmentation: mask random bands along axis 1 (frequency) and axis 2 (time) of a 3-D spectrogram image
def specaug(mel_spectrogram, frequency_masking_para=1,
            time_masking_para=1, frequency_mask_num=1, time_mask_num=1):
    """
        Modified from SpecAugment
        Author: Demis TaeKyu Eom and Evangelos Kazakos
        License: https://github.com/DemisEom/SpecAugment/blob/master/LICENSE
        Code URL: https://github.com/DemisEom/SpecAugment/blob/master/SpecAugment/spec_augment_pytorch.py

        Zero out random frequency bands and time spans of a spectrogram.

        The input is never modified: the masks are applied to a copy, which is
        returned. (Masking in place would permanently erase parts of any array
        the caller keeps, e.g. a dataset cache.)

        Args:
            mel_spectrogram: 3-D array (or tensor) indexed as
                (channel, frequency, time).
            frequency_masking_para: Exclusive upper bound of the width of one
                frequency mask. The width is drawn uniformly from [0, para)
                and truncated to an int, so the default of 1 always yields a
                width of 0, i.e. no masking.
            time_masking_para: Same as above for the time axis.
            frequency_mask_num: Number of frequency masks to apply.
            time_mask_num: Number of time masks to apply.

        Returns:
            The masked copy, with the same shape as the input.
    """
    # Work on a copy (torch tensors expose clone(), numpy arrays copy()).
    if hasattr(mel_spectrogram, "clone"):
        mel_spectrogram = mel_spectrogram.clone()
    else:
        mel_spectrogram = mel_spectrogram.copy()

    v = mel_spectrogram.shape[1]
    tau = mel_spectrogram.shape[2]

    # Frequency masking
    for _ in range(frequency_mask_num):
        f = int(np.random.uniform(low=0.0, high=frequency_masking_para))
        # A mask cannot be wider than the axis; clamping also keeps the
        # randint range below valid.
        f = max(0, min(f, v))
        f0 = random.randint(0, v - f)
        mel_spectrogram[:, f0:f0 + f, :] = 0

    # Time masking
    for _ in range(time_mask_num):
        t = int(np.random.uniform(low=0.0, high=time_masking_para))
        t = max(0, min(t, tau))
        t0 = random.randint(0, tau - t)
        mel_spectrogram[:, :, t0:t0 + t] = 0
    return mel_spectrogram


# Dataset that pre-computes and caches one spectrogram image per CSV row
class ListDataset(Dataset):
    """In-memory dataset of spectrogram images built from a CSV index.

    Every file listed in the CSV is converted with ``get_melspec`` and
    ``spec_img`` when the dataset is constructed; the results are kept in
    ``self.specs``.

    Args:
        label_file: Path of a CSV file with at least the columns ``Fname``
            (audio file path) and ``Species`` (class name).
        label_list: Mapping from class name to integer class index.
        d_type: ``"train"`` enables augmentation in ``__getitem__``; any
            other value returns the cached image unchanged.

    Raises:
        KeyError: If a ``Species`` value is missing from ``label_list``.
    """

    def __init__(self, label_file, label_list, d_type=None):
        self.label_file = pd.read_csv(label_file)
        self.label_list = label_list
        # Not used by this class itself; kept so the attribute stays available.
        self.transform = transforms.ToTensor()
        self.d_type = d_type

        self.specs = []
        self.labels = []
        self.fns = []

        # Walk both columns together instead of doing two iloc lookups per row.
        for audio_path, label_class in zip(self.label_file['Fname'], self.label_file['Species']):
            self.fns.append(audio_path)
            self.specs.append(spec_img(get_melspec(audio_path)))
            self.labels.append(torch.tensor(self.label_list[label_class]))

    def __getitem__(self, index):
        """Return ``(spectrogram image, label tensor, file name)`` for ``index``."""
        cur_spec = self.specs[index]
        if self.d_type == "train":
            # Augment a copy: the augmentation zeroes regions of the array it
            # is given, and the cached original must not lose data a little
            # more every epoch.
            cur_spec = specaug(cur_spec.copy())

        return cur_spec, self.labels[index], self.fns[index]

    def __len__(self):
        """Return the number of samples in the dataset."""
        return len(self.labels)

    
# transform raw audio into a fixed-length log-mel spectrogram
def get_melspec(file_path, sr=8000, top_db=80, duration=2.0):
    """Load an audio file and return its mel spectrogram in decibels.

    The waveform is brought to roughly ``duration`` seconds before the
    spectrogram is computed:

    * shorter clips are reflect-padded symmetrically on both sides;
    * longer clips are centre-cropped. (A negative pad width is rejected by
      ``np.pad``, so without this branch longer clips cannot be processed.)

    Args:
        file_path: Path of the audio file, passed to ``librosa.load``.
        sr: Sample rate requested from ``librosa.load``.
        top_db: Dynamic-range limit passed to ``librosa.power_to_db``.
        duration: Target clip length in seconds (default 2.0).

    Returns:
        The dB-scaled mel spectrogram computed with ``n_fft=256`` and
        ``hop_length=64``.
    """
    wav, sr = librosa.load(file_path, sr=sr)

    target_len = int(round(duration * sr))
    missing = target_len - wav.shape[0]
    if missing > 0:
        # Same amount on both sides; an odd deficit yields one extra sample.
        wav = np.pad(wav, int(np.ceil(missing / 2)), mode='reflect')
    elif missing < 0:
        start = (-missing) // 2
        wav = wav[start:start + target_len]

    spec = librosa.feature.melspectrogram(y=wav, sr=sr, n_fft=256, hop_length=64)
    spec = librosa.power_to_db(spec, top_db=top_db)
    return spec






# training

In [None]:
# create csv-s for classification folders
# Fname,Genera,Species

import os
import csv

# Create the folders that will hold the CSV files if they do not exist yet.
# os.path.dirname() returns "" for a bare file name, and os.makedirs("") raises,
# so fall back to the current directory in that case.
os.makedirs(os.path.dirname(train_csv) or ".", exist_ok=True)
os.makedirs(os.path.dirname(valid_csv) or ".", exist_ok=True)


def generate_csv(folder_path, output_csv):
    """
    Index a ``<folder>/<species>/<file>`` tree into a CSV file.

    Every sub-directory of ``folder_path`` is treated as one class. Each
    regular file directly inside it becomes a row with the columns ``Fname``
    (path with forward slashes), ``Genera`` (always empty) and ``Species``
    (the sub-directory name).

    Directory entries are processed in sorted order and only real directories
    count as classes. This keeps the returned class indices identical from run
    to run (set iteration order of strings is not stable across interpreter
    sessions) and keeps stray files in ``folder_path`` from becoming labels.

    Args:
        folder_path: Root folder containing one sub-folder per class.
        output_csv: Path of the CSV file to write (overwritten if present).

    Returns:
        dict mapping each class name to its index in sorted order.
    """
    rows = []
    species_names = []

    for species in sorted(os.listdir(folder_path)):
        species_path = os.path.join(folder_path, species)
        if not os.path.isdir(species_path):
            continue
        species_names.append(species)

        for fname in sorted(os.listdir(species_path)):
            file_path = os.path.join(species_path, fname)
            if os.path.isfile(file_path):
                rows.append({
                    "Fname": file_path.replace("\\", "/"),
                    "Genera": "",
                    "Species": species
                })

    # write out CSV
    with open(output_csv, mode='w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=["Fname", "Genera", "Species"])
        writer.writeheader()
        writer.writerows(rows)

    return {name: idx for idx, name in enumerate(species_names)}

# Index the training folder; the class -> index mapping it returns is the one
# used by the rest of the notebook.
label_list = generate_csv(folder_path=train_folder, output_csv=train_csv)
print(f"Train CSV is created: {train_csv}")

# Index the validation folder (the mapping returned here is not kept).
generate_csv(folder_path=valid_folder, output_csv=valid_csv)
print(f"Validation CSV is created: {valid_csv}")

# Show the class -> index mapping.
print(label_list)


In [None]:
def train(model, device, train_loader, valid_loader, epochs, lf, optimizer):
    """Train ``model`` and validate it after every epoch.

    Per epoch the function runs one pass over ``train_loader`` with gradient
    updates, steps a cosine-annealing LR schedule, evaluates on
    ``valid_loader`` and appends loss, accuracy, classification report,
    confusion matrix and wall time to ``<train_prefix>output_big.txt``.

    Checkpoints written to ``./<train_prefix>checkpoints/``:

    * ``best_loss_<epoch>.pth`` whenever the validation loss improves,
    * ``big_<epoch>.pth`` whenever the validation accuracy improves,
    * ``last_model.pth`` after every epoch.

    After the last epoch a summary is appended to ``<train_prefix>output.txt``.

    Args:
        model: Network to train; called as ``model(x)`` and expected to
            return one row of class scores per sample.
        device: Device the batches are moved to.
        train_loader: Iterable of batches whose first two items are inputs
            and integer class labels.
        valid_loader: Same structure, used for evaluation only.
        epochs: Number of epochs to run.
        lf: Loss function called as ``lf(predictions, targets)``.
        optimizer: Optimizer already bound to ``model``'s parameters.

    Raises:
        ValueError: If either loader yields no batches.
    """
    if len(train_loader) == 0 or len(valid_loader) == 0:
        raise ValueError("train_loader and valid_loader must each contain at least one batch")

    log_path = train_prefix + 'output_big.txt'
    checkpoint_dir = f'./{train_prefix}checkpoints'
    # torch.save does not create missing folders.
    os.makedirs(checkpoint_dir, exist_ok=True)

    # Cosine schedule with T_max=200 and eta_min=0; note that T_max is fixed
    # and independent of ``epochs``.
    sched = optim.lr_scheduler.CosineAnnealingLR(optimizer, 200, 0)
    best_valid_acc = 0
    best_loss_val = float('inf')
    best_model_report = ''
    best_model_path = None

    with open(log_path, 'w+') as output_file:
        output_file.write('start...')
        output_file.write('\n')
    print('start...')

    for epoch in range(1, epochs + 1):
        start_time = time.time()

        # ---- training procedure ----
        model.train()
        loss_val = 0
        true_running = 0
        total_running = 0
        for data in train_loader:
            x, gt = data[0].to(device, dtype=torch.float32), data[1].to(device, dtype=torch.long)
            optimizer.zero_grad()
            predicted = model(x)
            loss = lf(predicted, gt)

            _, predicted_class = torch.max(predicted, 1)
            true_running += (predicted_class == gt).sum()
            total_running += predicted_class.shape[0]

            loss.backward()
            optimizer.step()

            loss_val += loss.item()

        train_loss = loss_val / len(train_loader)
        accuracy = torch.true_divide(true_running, total_running)
        print(f'Epoch - {epoch} Train - Loss : {train_loss} Accuracy : {accuracy}')
        with open(log_path, 'a') as output_file:
            output_file.write(f'Epoch {epoch}/{epochs} - Train')
            output_file.write(f'loss: {train_loss}')
            output_file.write('\n')
            output_file.write(f'accuracy: {accuracy}')
            output_file.write('\n')

        sched.step()

        # ---- validating procedure ----
        model.eval()
        valid_loss_val = 0
        valid_true_running = 0
        valid_total_running = 0
        # Float arrays on purpose: this keeps the label formatting of the
        # report / confusion matrix the same as in earlier logs.
        y_pred = np.array([])
        y_test = np.array([])
        # No gradients are needed for evaluation; skipping the autograd graph
        # saves memory and time.
        with torch.no_grad():
            for data in valid_loader:
                x, gt = data[0].to(device, dtype=torch.float32), data[1].to(device, dtype=torch.long)
                predicted = model(x)
                loss = lf(predicted, gt)

                _, predicted_class = torch.max(predicted, 1)
                valid_true_running += (predicted_class == gt).sum()
                valid_total_running += predicted_class.shape[0]

                valid_loss_val += loss.item()

                y_pred = np.append(y_pred, predicted_class.cpu().numpy())
                y_test = np.append(y_test, gt.cpu().numpy())

        # calculating measurements
        # Average over the validation batches (not the training batches).
        valid_loss = valid_loss_val / len(valid_loader)
        accuracy = torch.true_divide(valid_true_running, valid_total_running)
        print(f'Epoch - {epoch} Validation - Loss : {valid_loss} Accuracy : {accuracy}')

        report = classification_report(y_test, y_pred, zero_division=0)
        conf_matrix = confusion_matrix(y_test, y_pred)
        usage_time = time.time() - start_time

        with open(log_path, 'a') as output_file:
            # accuracy and loss
            output_file.write(f'Epoch {epoch}/{epochs} - Validation')
            output_file.write(f'loss: {valid_loss}')
            output_file.write('\n')
            output_file.write(f'accuracy: {accuracy}')
            output_file.write('\n')

            # precision, recall, f1-score
            output_file.write('\nClassification Report\n')
            output_file.write(report)
            output_file.write('\n')

            # confusion matrix
            output_file.write('\nConfusion Matrix\n')
            output_file.write(str(conf_matrix))
            output_file.write('\n')

            # time usage for each epoch
            output_file.write(f'Time usage: {usage_time} secs')
            output_file.write('\n')
            output_file.write('\n')

        # Save best loss model (the first epoch is always saved)
        if epoch == 1 or valid_loss < best_loss_val:
            best_loss_val = valid_loss
            torch.save(model.state_dict(), f'{checkpoint_dir}/best_loss_{epoch}.pth')

        # Save best accuracy model
        if accuracy > best_valid_acc:
            best_valid_acc = accuracy
            best_model_report = report
            best_model_path = f'{checkpoint_dir}/big_{epoch}.pth'
            torch.save(model.state_dict(), best_model_path)

        # Save the last model
        torch.save(model.state_dict(), f'{checkpoint_dir}/last_model.pth')

    # report the best training model (only if at least one epoch ran)
    if epochs >= 1:
        with open(train_prefix+'output.txt', 'a') as output_file:
            output_file.write('End Training Overall Report')
            output_file.write('\n')
            output_file.write(f'Best Validation Accuracy: {best_valid_acc}')
            output_file.write('\n')
            output_file.write(f'Classification Report: {best_model_report}')
            output_file.write('\n')
            if best_model_path is None:
                output_file.write('No best-accuracy checkpoint was saved')
            else:
                output_file.write(f'The best model is saved under {best_model_path}')


In [None]:
# Prefer the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

In [None]:
# Make sure the checkpoint folder exists. Testing the path itself (instead of
# looking the name up in os.listdir("./")) also works when train_prefix
# contains a directory component, and makedirs creates missing parents.
if os.path.isdir(f"./{train_prefix}checkpoints"):
    print("Checkpoints folder is exist.")
else:
    os.makedirs(f"./{train_prefix}checkpoints", exist_ok=True)


In [None]:

# Number of classes = size of the class -> index mapping.
cls_num = len(label_list)

# Constructing a ListDataset converts every listed audio file right away,
# so these two lines are where the data is actually read.
train_data = ListDataset(label_file=train_csv, label_list=label_list, d_type="train")
vali_data = ListDataset(label_file=valid_csv, label_list=label_list, d_type="validation")

# Only the training batches are shuffled.
print('loading training data')
train_loader = DataLoader(dataset=train_data, batch_size=batch_size, shuffle=True)
print('loading validation data')
vali_loader = DataLoader(dataset=vali_data, batch_size=batch_size, shuffle=False)


In [None]:
# Network hyper-parameters
input_channels = 1        # mono input: one image channel
num_residual_blocks = 5   # B = 5
base_filters = 32         # P = 32
kernel_size = (5, 5)      # K = (5, 5); ProposedModel applies it to its initial convolution

# Pick the device, then build the model and move it there in one go.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = ProposedModel(
    input_channels=input_channels,
    num_residual_blocks=num_residual_blocks,
    base_filters=base_filters,
    kernel_size=kernel_size,
    cls_num=cls_num,
).to(device)

# Display the layer structure as the cell output.
model


In [None]:

# Set up the loss function and the optimizer.
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=learning_rate)

# Run the training loop.
train(
    model=model,
    device=device,
    train_loader=train_loader,
    valid_loader=vali_loader,
    epochs=epochs,
    lf=loss_function,
    optimizer=optimizer,
)

