In [1]:
import pandas as pd
from pathlib import Path

In [2]:
Catholic_file = 'breathdata/augment_train.csv'
df = pd.read_csv(Catholic_file)
df

Unnamed: 0,filename,category,class
0,0001-1.wav,wheezing,1
1,0001-2.wav,healthy,0
2,0001-3.wav,wheezing,1
3,0001-4.wav,wheezing,1
4,0002-1.wav,healthy,0
...,...,...,...
1465,minus_0614-3.wav,wheezing,1
1466,minus_0614-4.wav,wheezing,1
1467,minus_0615-1.wav,wheezing,1
1468,minus_0615-2.wav,wheezing,1


In [3]:
df['relative_path'] = '/' + df['filename'].astype(str)
df = df[['relative_path', 'class']]
df.head()

Unnamed: 0,relative_path,class
0,/0001-1.wav,1
1,/0001-2.wav,0
2,/0001-3.wav,1
3,/0001-4.wav,1
4,/0002-1.wav,0


In [4]:
data_path = 'breathdata/train'

In [5]:
import math, random
import torch
import torchaudio
from torchaudio import transforms
from IPython.display import Audio

class Breath_sound_Util():
  
  def open(audio_file):
    sig, sr = torchaudio.load(audio_file)

    return (sig, sr)

  def resample(aud, newsr):
    sig, sr = aud
    
    if (sr == newsr):
     return aud

    num_channels = sig.shape[0]
    resig = torchaudio.transforms.Resample(sr, newsr)(sig[:1,:])
    if (num_channels > 1):
      retwo = torchaudio.transforms.Resample(sr, newsr)(sig[1:,:])
      resig = torch.cat([resig, retwo])

    return ((resig, newsr))
  

  def pad(aud, max_ms):
    sig, sr = aud
    num_rows, sig_len = sig.shape
    max_len = sr//1000 * max_ms
   
    if (sig_len > max_len):
      sig = sig[:,:max_len]
 
    elif (sig_len < max_len):

      repeated = []
      repeated.append(sig)
      required_len = max_len - sig_len

      while required_len > sig_len : 
        repeated.append(sig)
        require_len -= sig_len
      repeated.append(sig[:, :required_len])
 
      sig = torch.cat(repeated, 1)

    return (sig, sr)


  def spectro_gram(aud, n_mels=64, n_fft=1024, hop_len=None):
    sig,sr = aud
    top_db = 80
    
    spec = transforms.MelSpectrogram(sr, n_fft=n_fft, hop_length=hop_len, n_mels=n_mels)(sig)
    spec = transforms.AmplitudeToDB(top_db=top_db)(spec)
    return (spec)

  def spectro_augment(spec, max_mask_pct=0.1, n_freq_masks=1, n_time_masks=1):
    _, n_mels, n_steps = spec.shape
    mask_value = spec.mean()
    aug_spec = spec

    freq_mask_param = max_mask_pct * n_mels
    for _ in range(n_freq_masks):
      aug_spec = transforms.FrequencyMasking(freq_mask_param)(aug_spec, mask_value)

    time_mask_param = max_mask_pct * n_steps
    for _ in range(n_time_masks):
      aug_spec = transforms.TimeMasking(time_mask_param)(aug_spec, mask_value)

    return aug_spec
    print(aug_spec)

In [6]:
from torch.utils.data import DataLoader, Dataset, random_split
import torchaudio
import torchvision

class breathDS(Dataset):
    
  def __init__(self, df, data_path):
    self.df = df
    self.data_path = str(data_path)
    self.duration = 4000
    self.sr = 44100
            
  def __len__(self):
    return len(self.df)    
    
  def __getitem__(self, idx):
    audio_file = self.data_path + self.df.loc[idx, 'relative_path']
    class_id = self.df.loc[idx, 'class']
    aud = Breath_sound_Util.open(audio_file)
    reaud = Breath_sound_Util.resample(aud, self.sr)
    dur_aud = Breath_sound_Util.pad(reaud, self.duration)
    sgram = Breath_sound_Util.spectro_gram(dur_aud, n_mels=64, n_fft=1024, hop_len=None)
    aug_sgram = Breath_sound_Util.spectro_augment(sgram, max_mask_pct=0.1, n_freq_masks=2, n_time_masks=2)
    #aug_sgram = aug_sgram.view([-1, 64])
    
    
    return aug_sgram, class_id

In [7]:
from torch.utils.data import random_split

brds = breathDS(df, data_path)

In [8]:
train_dl = torch.utils.data.DataLoader(brds, batch_size=16, shuffle=True)

In [9]:
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import init

In [10]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from pytorch_utils import do_mixup, interpolate, pad_framewise_output

In [11]:
import torch
from torch import nn
import torch.nn.functional as F

PRETRAINED_CHECKPOINT_PATH="/home/nextgen/Desktop/braeth/Cnn14_mAP=0.431.pth"   # Trained by a later code version, achieves higher mAP than the paper.
MODEL_TYPE="Cnn14"

def init_layer(layer):
    """Initialize a Linear or Convolutional layer. """
    nn.init.xavier_uniform_(layer.weight)

    if hasattr(layer, 'bias'):
        if layer.bias is not None:
            layer.bias.data.fill_(0.)
            
def init_bn(bn):
    """Initialize a Batchnorm layer. """
    bn.bias.data.fill_(0.)
    bn.weight.data.fill_(1.)
    
class ConvBlock(nn.Module):
    def __init__(self, in_channels, out_channels):

        super(ConvBlock, self).__init__()

        self.conv1 = nn.Conv2d(in_channels=in_channels,
                               out_channels=out_channels,
                               kernel_size=(3, 3), stride=(1, 1),
                               padding=(1, 1), bias=False)

        self.conv2 = nn.Conv2d(in_channels=out_channels,
                               out_channels=out_channels,
                               kernel_size=(3, 3), stride=(1, 1),
                               padding=(1, 1), bias=False)

        self.bn1 = nn.BatchNorm2d(out_channels)
        self.bn2 = nn.BatchNorm2d(out_channels)

        self.init_weight()

    def init_weight(self):
        init_layer(self.conv1)
        init_layer(self.conv2)
        init_bn(self.bn1)
        init_bn(self.bn2)

    def forward(self, input, pool_size=(2, 2), pool_type='avg'):
        x = input
        x = F.relu_(self.bn1(self.conv1(x)))
        x = F.relu_(self.bn2(self.conv2(x)))
        if pool_type == 'max':
            x = F.max_pool2d(x, kernel_size=pool_size)
        elif pool_type == 'avg':
            x = F.avg_pool2d(x, kernel_size=pool_size)
        elif pool_type == 'avg+max':
            x1 = F.avg_pool2d(x, kernel_size=pool_size)
            x2 = F.max_pool2d(x, kernel_size=pool_size)
            x = x1 + x2
        else:
            raise Exception('Incorrect argument!')

        return x

In [12]:
class Cnn14(nn.Module):
    def __init__(self, classes_num):
        
        super(Cnn14, self).__init__()

        self.bn0 = nn.BatchNorm2d(64)

        self.conv_block1 = ConvBlock(in_channels=1, out_channels=64)
        self.conv_block2 = ConvBlock(in_channels=64, out_channels=128)
        self.conv_block3 = ConvBlock(in_channels=128, out_channels=256)
        self.conv_block4 = ConvBlock(in_channels=256, out_channels=512)
        self.conv_block5 = ConvBlock(in_channels=512, out_channels=1024)
        self.conv_block6 = ConvBlock(in_channels=1024, out_channels=2048)

        self.fc1 = nn.Linear(2048, 2048, bias=True)
        self.fc_audioset = nn.Linear(2048, classes_num, bias=True)
        
        self.init_weight()

    def init_weight(self):
        init_bn(self.bn0)
        init_layer(self.fc1)
        init_layer(self.fc_audioset)
 
    def forward(self, input, mixup_lambda=None):
        """
        Input: (batch_size, data_length)"""

        #x = self.spectrogram_extractor(input)   # (batch_size, 1, time_steps, freq_bins)
        #x = self.logmel_extractor(x)    # (batch_size, 1, time_steps, mel_bins)
        
        #x = input.unsqueeze(0)
        x = input.transpose(1, 3)
        x = x.transpose(1, 2)
        x = self.bn0(x)
        x = x.transpose(1, 3)


        # Mixup on spectrogram
        if self.training and mixup_lambda is not None:
            x = do_mixup(x, mixup_lambda)

        x = self.conv_block1(x, pool_size=(2, 2), pool_type='avg')
        x = F.dropout(x, p=0.2, training=self.training)
        x = self.conv_block2(x, pool_size=(2, 2), pool_type='avg')
        x = F.dropout(x, p=0.2, training=self.training)
        x = self.conv_block3(x, pool_size=(2, 2), pool_type='avg')
        x = F.dropout(x, p=0.2, training=self.training)
        x = self.conv_block4(x, pool_size=(2, 2), pool_type='avg')
        x = F.dropout(x, p=0.2, training=self.training)
        x = self.conv_block5(x, pool_size=(2, 2), pool_type='avg')
        x = F.dropout(x, p=0.2, training=self.training)
        x = self.conv_block6(x, pool_size=(1, 1), pool_type='avg')
        x = F.dropout(x, p=0.2, training=self.training)
        x = torch.mean(x, dim=3)
        
        (x1, _) = torch.max(x, dim=2)
        x2 = torch.mean(x, dim=2)
        x = x1 + x2
        x = F.dropout(x, p=0.5, training=self.training)
        x = F.relu_(self.fc1(x))
        embedding = F.dropout(x, p=0.5, training=self.training)
        clipwise_output = torch.sigmoid(self.fc_audioset(x))
        
        output_dict = {'clipwise_output': clipwise_output, 'embedding': embedding}

        return output_dict


class Transfer_Cnn14(nn.Module):
    def __init__(self,classes_num, pretrain_checkpoint=None):
        super(Transfer_Cnn14, self).__init__()
        classes_num = 2
        audioset_classes_num = 527
        self.base = Cnn14(audioset_classes_num)

        #self.init_weights()

        if pretrain_checkpoint:
            self.load_from_pretrain(pretrain_checkpoint)
            
        self.base.fc_audioset = nn.Linear(2048, classes_num, bias=True)
        # 안쓰이는이유는 multi-class clasification을 생략하기 때문,


    #def init_weights(self):
        #init_layer(self.fc_transfer)

    def load_from_pretrain(self, pretrained_checkpoint):
        pretrained_checkpoint="Cnn14_mAP=0.431.pth"
        checkpoint = torch.load(pretrained_checkpoint)
        checkpoint['model'].pop('spectrogram_extractor.stft.conv_imag.weight')
        checkpoint['model'].pop('spectrogram_extractor.stft.conv_real.weight')
        checkpoint['model'].pop('logmel_extractor.melW')
        self.base.load_state_dict(checkpoint['model'])
    
    def forward(self, input, mixup_lambda=None):
        """Input: (batch_size, data_length)
        """
        output_dict = self.base(input, mixup_lambda)
        embedding = output_dict['embedding']

        clipwise_output =  torch.log_softmax(self.base.fc_audioset(embedding), dim=-1)
        #output_dict['clipwise_output'] = clipwise_output
 
        return clipwise_output

In [13]:
Transfer_Cnn14(classes_num=2,pretrain_checkpoint="Cnn14_mAP=0.431.pth")

Transfer_Cnn14(
  (base): Cnn14(
    (bn0): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (conv_block1): ConvBlock(
      (conv1): Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (conv_block2): ConvBlock(
      (conv1): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (conv_block3): ConvBlock(
      (conv1): Conv2d(128,

In [14]:
Model1=Transfer_Cnn14(classes_num=2,pretrain_checkpoint="Cnn14_mAP=0.431.pth")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
Model1 = Model1.to(device)
next(Model1.parameters()).device

device(type='cuda', index=0)

In [15]:
from sklearn.metrics import confusion_matrix
import numpy as np
import sklearn.metrics as metrics
from tqdm import tqdm

def training(model, train_dl, num_epochs):
  criterion = nn.CrossEntropyLoss()
  optimizer = torch.optim.Adam(model.parameters(),lr=0.001)
  scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=0.001,
                                                steps_per_epoch=int(len(train_dl)),
                                                epochs=num_epochs,
                                                anneal_strategy='linear')

  for epoch in tqdm(range(num_epochs)):
    running_loss = 0.0
    correct_prediction = 0
    total_prediction = 0

    for i, data in enumerate(train_dl):
        inputs, labels = data[0].to(device), data[1].to(device)
        inputs_m, inputs_s = inputs.mean(), inputs.std()
        inputs = (inputs - inputs_m) / inputs_s
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()
        running_loss += loss.item()
        _, prediction = torch.max(outputs,1)
        correct_prediction += (prediction == labels).sum().item()
        total_prediction += prediction.shape[0]
        

    num_batches = len(train_dl)
    avg_loss = running_loss / num_batches
    acc = correct_prediction/total_prediction
    print(f'Epoch: {epoch}, Loss: {avg_loss:.4f}, Accuracy: {acc:.4f}')

  print('Finished Training')
    
num_epochs=100
training(Model1, train_dl, num_epochs)

  1%|          | 1/100 [00:19<31:51, 19.31s/it]

Epoch: 0, Loss: 0.6177, Accuracy: 0.6864


  2%|▏         | 2/100 [00:38<31:15, 19.14s/it]

Epoch: 1, Loss: 0.4151, Accuracy: 0.8327


  3%|▎         | 3/100 [00:57<30:49, 19.06s/it]

Epoch: 2, Loss: 0.3203, Accuracy: 0.8912


  4%|▍         | 4/100 [01:16<30:23, 18.99s/it]

Epoch: 3, Loss: 0.2634, Accuracy: 0.9075


  5%|▌         | 5/100 [01:34<29:39, 18.73s/it]

Epoch: 4, Loss: 0.2126, Accuracy: 0.9252


  6%|▌         | 6/100 [01:52<29:06, 18.58s/it]

Epoch: 5, Loss: 0.2093, Accuracy: 0.9218


  7%|▋         | 7/100 [02:11<28:44, 18.54s/it]

Epoch: 6, Loss: 0.1584, Accuracy: 0.9395


  8%|▊         | 8/100 [02:29<28:23, 18.51s/it]

Epoch: 7, Loss: 0.1405, Accuracy: 0.9449


  9%|▉         | 9/100 [02:48<28:02, 18.49s/it]

Epoch: 8, Loss: 0.1317, Accuracy: 0.9551


 10%|█         | 10/100 [03:06<27:38, 18.43s/it]

Epoch: 9, Loss: 0.1107, Accuracy: 0.9612


 11%|█         | 11/100 [03:24<27:17, 18.40s/it]

Epoch: 10, Loss: 0.1056, Accuracy: 0.9680


 12%|█▏        | 12/100 [03:43<26:58, 18.39s/it]

Epoch: 11, Loss: 0.0793, Accuracy: 0.9728


 13%|█▎        | 13/100 [04:01<26:38, 18.38s/it]

Epoch: 12, Loss: 0.0879, Accuracy: 0.9673


 14%|█▍        | 14/100 [04:19<26:18, 18.36s/it]

Epoch: 13, Loss: 0.1025, Accuracy: 0.9667


 15%|█▌        | 15/100 [04:38<26:05, 18.42s/it]

Epoch: 14, Loss: 0.0791, Accuracy: 0.9701


 16%|█▌        | 16/100 [04:56<25:45, 18.40s/it]

Epoch: 15, Loss: 0.1146, Accuracy: 0.9646


 17%|█▋        | 17/100 [05:15<25:27, 18.40s/it]

Epoch: 16, Loss: 0.0888, Accuracy: 0.9762


 18%|█▊        | 18/100 [05:33<25:08, 18.39s/it]

Epoch: 17, Loss: 0.1183, Accuracy: 0.9612


 19%|█▉        | 19/100 [05:51<24:49, 18.39s/it]

Epoch: 18, Loss: 0.0946, Accuracy: 0.9694


 20%|██        | 20/100 [06:10<24:29, 18.37s/it]

Epoch: 19, Loss: 0.0963, Accuracy: 0.9653


 21%|██        | 21/100 [06:28<24:10, 18.36s/it]

Epoch: 20, Loss: 0.0946, Accuracy: 0.9694


 22%|██▏       | 22/100 [06:46<23:52, 18.36s/it]

Epoch: 21, Loss: 0.1210, Accuracy: 0.9680


 23%|██▎       | 23/100 [07:05<23:32, 18.34s/it]

Epoch: 22, Loss: 0.1089, Accuracy: 0.9701


 24%|██▍       | 24/100 [07:23<23:15, 18.36s/it]

Epoch: 23, Loss: 0.1133, Accuracy: 0.9660


 25%|██▌       | 25/100 [07:42<22:59, 18.40s/it]

Epoch: 24, Loss: 0.1109, Accuracy: 0.9810


 26%|██▌       | 26/100 [08:00<22:40, 18.38s/it]

Epoch: 25, Loss: 0.1132, Accuracy: 0.9680


 27%|██▋       | 27/100 [08:18<22:23, 18.41s/it]

Epoch: 26, Loss: 0.0991, Accuracy: 0.9707


 28%|██▊       | 28/100 [08:37<22:05, 18.41s/it]

Epoch: 27, Loss: 0.1093, Accuracy: 0.9721


 29%|██▉       | 29/100 [08:55<21:48, 18.42s/it]

Epoch: 28, Loss: 0.1141, Accuracy: 0.9605


 30%|███       | 30/100 [09:14<21:31, 18.45s/it]

Epoch: 29, Loss: 0.4387, Accuracy: 0.9000


 31%|███       | 31/100 [09:32<21:13, 18.46s/it]

Epoch: 30, Loss: 0.2544, Accuracy: 0.9313


 32%|███▏      | 32/100 [09:51<20:54, 18.44s/it]

Epoch: 31, Loss: 0.1301, Accuracy: 0.9646


 33%|███▎      | 33/100 [10:09<20:34, 18.43s/it]

Epoch: 32, Loss: 0.0953, Accuracy: 0.9646


 34%|███▍      | 34/100 [10:27<20:17, 18.45s/it]

Epoch: 33, Loss: 0.0410, Accuracy: 0.9830


 35%|███▌      | 35/100 [10:46<19:58, 18.44s/it]

Epoch: 34, Loss: 0.1155, Accuracy: 0.9755


 36%|███▌      | 36/100 [11:04<19:39, 18.43s/it]

Epoch: 35, Loss: 0.1369, Accuracy: 0.9537


 37%|███▋      | 37/100 [11:23<19:21, 18.44s/it]

Epoch: 36, Loss: 0.1226, Accuracy: 0.9748


 38%|███▊      | 38/100 [11:41<19:02, 18.43s/it]

Epoch: 37, Loss: 0.0537, Accuracy: 0.9830


 39%|███▉      | 39/100 [12:00<18:44, 18.44s/it]

Epoch: 38, Loss: 0.0686, Accuracy: 0.9850


 40%|████      | 40/100 [12:18<18:23, 18.40s/it]

Epoch: 39, Loss: 0.0396, Accuracy: 0.9878


 41%|████      | 41/100 [12:36<18:04, 18.38s/it]

Epoch: 40, Loss: 0.1790, Accuracy: 0.9721


 42%|████▏     | 42/100 [12:55<17:47, 18.40s/it]

Epoch: 41, Loss: 0.0554, Accuracy: 0.9857


 43%|████▎     | 43/100 [13:13<17:31, 18.46s/it]

Epoch: 42, Loss: 0.0371, Accuracy: 0.9850


 44%|████▍     | 44/100 [13:32<17:13, 18.46s/it]

Epoch: 43, Loss: 0.0158, Accuracy: 0.9939


 45%|████▌     | 45/100 [13:50<16:54, 18.44s/it]

Epoch: 44, Loss: 0.0311, Accuracy: 0.9905


 46%|████▌     | 46/100 [14:09<16:36, 18.46s/it]

Epoch: 45, Loss: 0.0120, Accuracy: 0.9973


 47%|████▋     | 47/100 [14:27<16:17, 18.44s/it]

Epoch: 46, Loss: 0.0157, Accuracy: 0.9925


 48%|████▊     | 48/100 [14:45<15:57, 18.42s/it]

Epoch: 47, Loss: 0.0279, Accuracy: 0.9946


 49%|████▉     | 49/100 [15:04<15:39, 18.43s/it]

Epoch: 48, Loss: 0.0142, Accuracy: 0.9959


 50%|█████     | 50/100 [15:22<15:21, 18.42s/it]

Epoch: 49, Loss: 0.0310, Accuracy: 0.9912


 51%|█████     | 51/100 [15:41<15:02, 18.42s/it]

Epoch: 50, Loss: 0.0177, Accuracy: 0.9946


 52%|█████▏    | 52/100 [15:59<14:45, 18.44s/it]

Epoch: 51, Loss: 0.0243, Accuracy: 0.9932


 53%|█████▎    | 53/100 [16:18<14:26, 18.45s/it]

Epoch: 52, Loss: 0.0382, Accuracy: 0.9918


 54%|█████▍    | 54/100 [16:36<14:10, 18.49s/it]

Epoch: 53, Loss: 0.0103, Accuracy: 0.9952


 55%|█████▌    | 55/100 [16:55<13:52, 18.49s/it]

Epoch: 54, Loss: 0.0176, Accuracy: 0.9932


 56%|█████▌    | 56/100 [17:13<13:32, 18.46s/it]

Epoch: 55, Loss: 0.0145, Accuracy: 0.9959


 57%|█████▋    | 57/100 [17:32<13:12, 18.44s/it]

Epoch: 56, Loss: 0.0217, Accuracy: 0.9925


 58%|█████▊    | 58/100 [17:50<12:53, 18.42s/it]

Epoch: 57, Loss: 0.0113, Accuracy: 0.9966


 59%|█████▉    | 59/100 [18:08<12:34, 18.41s/it]

Epoch: 58, Loss: 0.0069, Accuracy: 0.9980


 60%|██████    | 60/100 [18:27<12:17, 18.43s/it]

Epoch: 59, Loss: 0.0155, Accuracy: 0.9952


 61%|██████    | 61/100 [18:45<11:58, 18.43s/it]

Epoch: 60, Loss: 0.0140, Accuracy: 0.9952


 62%|██████▏   | 62/100 [19:04<11:39, 18.40s/it]

Epoch: 61, Loss: 0.0153, Accuracy: 0.9973


 63%|██████▎   | 63/100 [19:22<11:25, 18.53s/it]

Epoch: 62, Loss: 0.0378, Accuracy: 0.9918


 64%|██████▍   | 64/100 [19:41<11:06, 18.51s/it]

Epoch: 63, Loss: 0.0099, Accuracy: 0.9966


 65%|██████▌   | 65/100 [20:00<10:50, 18.57s/it]

Epoch: 64, Loss: 0.0112, Accuracy: 0.9966


 66%|██████▌   | 66/100 [20:18<10:29, 18.51s/it]

Epoch: 65, Loss: 0.0170, Accuracy: 0.9932


 67%|██████▋   | 67/100 [20:36<10:09, 18.46s/it]

Epoch: 66, Loss: 0.0136, Accuracy: 0.9939


 68%|██████▊   | 68/100 [20:55<09:50, 18.44s/it]

Epoch: 67, Loss: 0.0205, Accuracy: 0.9952


 69%|██████▉   | 69/100 [21:13<09:32, 18.48s/it]

Epoch: 68, Loss: 0.0064, Accuracy: 0.9966


 70%|███████   | 70/100 [21:32<09:18, 18.61s/it]

Epoch: 69, Loss: 0.0081, Accuracy: 0.9973


 71%|███████   | 71/100 [21:51<09:00, 18.63s/it]

Epoch: 70, Loss: 0.0044, Accuracy: 0.9973


 72%|███████▏  | 72/100 [22:10<08:42, 18.67s/it]

Epoch: 71, Loss: 0.0148, Accuracy: 0.9966


 73%|███████▎  | 73/100 [22:29<08:27, 18.79s/it]

Epoch: 72, Loss: 0.0080, Accuracy: 0.9973


 74%|███████▍  | 74/100 [22:48<08:12, 18.95s/it]

Epoch: 73, Loss: 0.0044, Accuracy: 0.9980


 75%|███████▌  | 75/100 [23:07<07:55, 19.01s/it]

Epoch: 74, Loss: 0.0033, Accuracy: 0.9986


 76%|███████▌  | 76/100 [23:26<07:36, 19.02s/it]

Epoch: 75, Loss: 0.0031, Accuracy: 0.9986


 77%|███████▋  | 77/100 [23:45<07:15, 18.95s/it]

Epoch: 76, Loss: 0.0050, Accuracy: 0.9980


 78%|███████▊  | 78/100 [24:03<06:53, 18.81s/it]

Epoch: 77, Loss: 0.0008, Accuracy: 1.0000


 79%|███████▉  | 79/100 [24:22<06:34, 18.77s/it]

Epoch: 78, Loss: 0.0061, Accuracy: 0.9973


 80%|████████  | 80/100 [24:41<06:14, 18.72s/it]

Epoch: 79, Loss: 0.0043, Accuracy: 0.9973


 81%|████████  | 81/100 [24:59<05:54, 18.65s/it]

Epoch: 80, Loss: 0.0045, Accuracy: 0.9966


 82%|████████▏ | 82/100 [25:18<05:38, 18.79s/it]

Epoch: 81, Loss: 0.0068, Accuracy: 0.9966


 83%|████████▎ | 83/100 [25:37<05:19, 18.82s/it]

Epoch: 82, Loss: 0.0081, Accuracy: 0.9973


 84%|████████▍ | 84/100 [25:56<05:00, 18.80s/it]

Epoch: 83, Loss: 0.0066, Accuracy: 0.9986


 85%|████████▌ | 85/100 [26:14<04:40, 18.69s/it]

Epoch: 84, Loss: 0.0018, Accuracy: 0.9993


 86%|████████▌ | 86/100 [26:33<04:21, 18.66s/it]

Epoch: 85, Loss: 0.0039, Accuracy: 0.9993


 87%|████████▋ | 87/100 [26:52<04:02, 18.66s/it]

Epoch: 86, Loss: 0.0007, Accuracy: 1.0000


 88%|████████▊ | 88/100 [27:10<03:44, 18.69s/it]

Epoch: 87, Loss: 0.0007, Accuracy: 1.0000


 89%|████████▉ | 89/100 [27:29<03:24, 18.60s/it]

Epoch: 88, Loss: 0.0037, Accuracy: 0.9986


 90%|█████████ | 90/100 [27:48<03:06, 18.69s/it]

Epoch: 89, Loss: 0.0016, Accuracy: 0.9993


 91%|█████████ | 91/100 [28:07<02:49, 18.89s/it]

Epoch: 90, Loss: 0.0017, Accuracy: 0.9993


 92%|█████████▏| 92/100 [28:26<02:31, 18.95s/it]

Epoch: 91, Loss: 0.0021, Accuracy: 0.9993


 93%|█████████▎| 93/100 [28:45<02:12, 18.88s/it]

Epoch: 92, Loss: 0.0002, Accuracy: 1.0000


 94%|█████████▍| 94/100 [29:03<01:52, 18.77s/it]

Epoch: 93, Loss: 0.0004, Accuracy: 1.0000


 95%|█████████▌| 95/100 [29:22<01:33, 18.72s/it]

Epoch: 94, Loss: 0.0020, Accuracy: 0.9993


 96%|█████████▌| 96/100 [29:40<01:14, 18.65s/it]

Epoch: 95, Loss: 0.0021, Accuracy: 0.9986


 97%|█████████▋| 97/100 [29:59<00:55, 18.65s/it]

Epoch: 96, Loss: 0.0027, Accuracy: 0.9993


 98%|█████████▊| 98/100 [30:18<00:37, 18.61s/it]

Epoch: 97, Loss: 0.0019, Accuracy: 0.9986


 99%|█████████▉| 99/100 [30:36<00:18, 18.62s/it]

Epoch: 98, Loss: 0.0017, Accuracy: 0.9993


100%|██████████| 100/100 [30:55<00:00, 18.55s/it]

Epoch: 99, Loss: 0.0011, Accuracy: 0.9993
Finished Training





In [16]:
Catholic_file1 = 'breathdata/test.csv'
df1 = pd.read_csv(Catholic_file1)
df1

Unnamed: 0,filename,category,class
0,0003-1.wav,healthy,0
1,0004-1.wav,healthy,0
2,0007-1.wav,healthy,0
3,0009-1.wav,healthy,0
4,0009-4.wav,healthy,0
5,0012-2.wav,healthy,0
6,0013-2.wav,healthy,0
7,0015-1.wav,healthy,0
8,0020-1.wav,wheezing,1
9,0020-3.wav,wheezing,1


In [17]:
df1['relative_path'] = '/' + df1['filename'].astype(str)
df1 = df1[['relative_path', 'class']]
df1.head()

Unnamed: 0,relative_path,class
0,/0003-1.wav,0
1,/0004-1.wav,0
2,/0007-1.wav,0
3,/0009-1.wav,0
4,/0009-4.wav,0


In [18]:
data_path = 'breathdata/test'

In [19]:
class Breath_sound_Util1():
  
  def open(audio_file):
    sig, sr = torchaudio.load(audio_file)
    
    return (sig, sr)

  def resample(aud, newsr):
    sig, sr = aud
    
    if (sr == newsr):
     return aud

    num_channels = sig.shape[0]
    resig = torchaudio.transforms.Resample(sr, newsr)(sig[:1,:])
    if (num_channels > 1):
      retwo = torchaudio.transforms.Resample(sr, newsr)(sig[1:,:])
      resig = torch.cat([resig, retwo])

    return ((resig, newsr))
  

  def pad(aud, max_ms):
    sig, sr = aud
    num_rows, sig_len = sig.shape
    max_len = sr//1000 * max_ms
   
    if (sig_len > max_len):
      sig = sig[:,:max_len]
 
    elif (sig_len < max_len):

      repeated = []
      repeated.append(sig)
      required_len = max_len - sig_len

      while required_len > sig_len : 
        repeated.append(sig)
        require_len -= sig_len
      repeated.append(sig[:, :required_len])
 
      sig = torch.cat(repeated, 1)

    return (sig, sr)

  def spectro_gram(aud, n_mels=64, n_fft=1024, hop_len=None):
    sig,sr = aud
    top_db = 80
    
    spec = transforms.MelSpectrogram(sr, n_fft=n_fft, hop_length=hop_len, n_mels=n_mels)(sig)
    spec = transforms.AmplitudeToDB(top_db=top_db)(spec)
    return (spec)

In [20]:
class breathDS1(Dataset):
    
  def __init__(self, df1, data_path):
    self.df1 = df1
    self.data_path = str(data_path)
    self.duration = 4000
    self.sr = 44100
            
  def __len__(self):
    return len(self.df1)    
    
  def __getitem__(self, idx):
    audio_file = self.data_path + self.df1.loc[idx, 'relative_path']
    class_id = self.df1.loc[idx, 'class']
    aud = Breath_sound_Util1.open(audio_file)
    reaud = Breath_sound_Util1.resample(aud, self.sr)
    dur_aud = Breath_sound_Util1.pad(reaud, self.duration)
    sgram = Breath_sound_Util1.spectro_gram(dur_aud, n_mels=64, n_fft=1024, hop_len=None)

    
    return sgram, class_id

In [21]:
brds1 = breathDS1(df1, data_path)

In [22]:
val_dl = torch.utils.data.DataLoader(brds1, batch_size=16, shuffle=True)

In [23]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.metrics import roc_curve, roc_auc_score

def inference (Model1, val_dl):
  correct_prediction = 0
  total_prediction = 0
  TP = 0 
  TN = 0
  FN = 0
  FP = 0
  a = []
  b = []
  with torch.no_grad():
    for i, data in enumerate(val_dl):
      
      inputs, labels = data[0].to(device), data[1].to(device)
      cpu_labels = data[1]
      inputs_m, inputs_s = inputs.mean(), inputs.std()
      inputs = (inputs - inputs_m) / inputs_s
      outputs = Model1(inputs)
 
      _, prediction = torch.max(outputs,1)
      
      cpu_prediction = prediction.to('cpu')
      correct_prediction += (prediction == labels).sum().item()
      total_prediction += prediction.shape[0]
          

      TP += ((prediction == 1) & (labels == 1)).sum()
# TN predict and label are both 0
      TN += ((prediction == 0) & (labels == 0)).sum()
# FN    predict 0 label 1
      FN += ((prediction == 0) & (labels == 1)).sum()
# FP    predict 1 label 0
      FP += ((prediction == 1) & (labels == 0)).sum()

      c = cpu_labels.tolist()
      a.extend(c)

      d = cpu_prediction.tolist()
      b.extend(d)
    
    # TP : 휘징인데 휘징이라고 판단(잘함)
    # FP : 헬시인데 휘징이라고 판단(못함)
    # TN : 헬시인데 헬시라고 판단(잘함)
    # FN : 휘징인데 헬시라고 판단(못함)
    
    # POSITIVE : 1(휘징)이라고 판단
    # NEGATIVE : 0(헬시)라고 판단
    # TRUE : 정답값 1(휘징)
    # FALSE : 정답값 0(헬시)
    
    # PRECISION : 휘징이라고 판단한 것 중 진짜 휘징
    # RECALL : 휘징인데 휘징이라고 잘 판단
    

  precision = TP / (TP + FP)
  recall = TP / (TP + FN)
  F1 = 2 * recall * precision / (recall + precision)

  acc = correct_prediction/total_prediction
  print(f'Accuracy: {acc:.4f}, Total items: {total_prediction}')
  print(f'precision: {precision:.4f}, F1: {F1:.4f}')
  print(f'recall: {recall:.4f}')
  target_names = ['healthy', 'wheezing']
  print(classification_report(a, b, target_names = target_names))
  print("AUC:{}".format(roc_auc_score(a, b)))

inference(Model1, val_dl)

Accuracy: 0.7925, Total items: 53
precision: 0.7143, F1: 0.6452
recall: 0.5882
              precision    recall  f1-score   support

     healthy       0.82      0.89      0.85        36
    wheezing       0.71      0.59      0.65        17

    accuracy                           0.79        53
   macro avg       0.77      0.74      0.75        53
weighted avg       0.79      0.79      0.79        53

AUC:0.738562091503268
