In [1]:
import pandas as pd
from pathlib import Path

## Training DataSet

In [2]:
Catholic_file = 'breathdata/augment_train.csv'
df = pd.read_csv(Catholic_file)
df

Unnamed: 0,filename,category,class
0,0001-1.wav,wheezing,1
1,0001-2.wav,healthy,0
2,0001-3.wav,wheezing,1
3,0001-4.wav,wheezing,1
4,0002-1.wav,healthy,0
...,...,...,...
1465,minus_0614-3.wav,wheezing,1
1466,minus_0614-4.wav,wheezing,1
1467,minus_0615-1.wav,wheezing,1
1468,minus_0615-2.wav,wheezing,1


In [3]:
df['relative_path'] = '/' + df['filename'].astype(str)
df = df[['relative_path', 'class']]
df.head()

Unnamed: 0,relative_path,class
0,/0001-1.wav,1
1,/0001-2.wav,0
2,/0001-3.wav,1
3,/0001-4.wav,1
4,/0002-1.wav,0


In [4]:
data_path = 'breathdata/train'

In [5]:
import math, random
import torch
import torchaudio
from torchaudio import transforms
from IPython.display import Audio

class Breath_sound_Util():
  
  def open(audio_file):
    sig, sr = torchaudio.load(audio_file)
    sig = torchaudio.functional.gain(sig, gain_db=5.0)
    sig = torchaudio.functional.lowpass_biquad(sig, sr, cutoff_freq=3000)
    sig = torchaudio.functional.highpass_biquad(sig, sr, cutoff_freq=2000)
    
    return (sig, sr)

  def resample(aud, newsr):
    sig, sr = aud
    
    if (sr == newsr):
     return aud

    num_channels = sig.shape[0]
    resig = torchaudio.transforms.Resample(sr, newsr)(sig[:1,:])
    if (num_channels > 1):
      retwo = torchaudio.transforms.Resample(sr, newsr)(sig[1:,:])
      resig = torch.cat([resig, retwo])

    return ((resig, newsr))
  

  def pad(aud, max_ms):
    sig, sr = aud
    num_rows, sig_len = sig.shape
    max_len = sr//1000 * max_ms
   
    if (sig_len > max_len):
      sig = sig[:,:max_len]
 
    elif (sig_len < max_len):

      repeated = []
      repeated.append(sig)
      required_len = max_len - sig_len

      while required_len > sig_len : 
        repeated.append(sig)
        require_len -= sig_len
      repeated.append(sig[:, :required_len])
 
      sig = torch.cat(repeated, 1)

    return (sig, sr)


  def spectro_gram(aud, n_mels=64, n_fft=1024, hop_len=None):
    sig,sr = aud
    top_db = 80
    
    spec = transforms.MelSpectrogram(sr, n_fft=n_fft, hop_length=hop_len, n_mels=n_mels)(sig)
    spec = transforms.AmplitudeToDB(top_db=top_db)(spec)
    return (spec)

  def spectro_augment(spec, max_mask_pct=0.1, n_freq_masks=1, n_time_masks=1):
    _, n_mels, n_steps = spec.shape
    mask_value = spec.mean()
    aug_spec = spec

    freq_mask_param = max_mask_pct * n_mels
    for _ in range(n_freq_masks):
      aug_spec = transforms.FrequencyMasking(freq_mask_param)(aug_spec, mask_value)

    time_mask_param = max_mask_pct * n_steps
    for _ in range(n_time_masks):
      aug_spec = transforms.TimeMasking(time_mask_param)(aug_spec, mask_value)

    return aug_spec
    print(aug_spec)

In [6]:
from torch.utils.data import DataLoader, Dataset, random_split
import torchaudio
import torchvision

class breathDS(Dataset):
    
  def __init__(self, df, data_path):
    self.df = df
    self.data_path = str(data_path)
    self.duration = 4000
    self.sr = 48000
            
  def __len__(self):
    return len(self.df)    
    
  def __getitem__(self, idx):
    audio_file = self.data_path + self.df.loc[idx, 'relative_path']
    class_id = self.df.loc[idx, 'class']
    aud = Breath_sound_Util.open(audio_file)
    reaud = Breath_sound_Util.resample(aud, self.sr)
    dur_aud = Breath_sound_Util.pad(reaud, self.duration)
    sgram = Breath_sound_Util.spectro_gram(dur_aud, n_mels=64, n_fft=1024, hop_len=None)
    aug_sgram = Breath_sound_Util.spectro_augment(sgram, max_mask_pct=0.1, n_freq_masks=2, n_time_masks=2)

    
    return aug_sgram, class_id

In [7]:
from torch.utils.data import random_split

brds = breathDS(df, data_path)

In [8]:
train_dl = torch.utils.data.DataLoader(brds, batch_size=32, shuffle=True)

In [9]:
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import init

# --------------------------------------------------------------
# 호흡음의 Healthy, Wheezing을 판단하는 Binary Classification Model
# --------------------------------------------------------------

class WhoWheezing(nn.Module):

    def __init__(self):
        super().__init__()
        conv_layers = []
        self.conv1 = nn.Conv2d(1, 8, kernel_size=(3, 3), stride=(2, 2), padding=(2, 2))
        self.relu1 = nn.ReLU()
        self.bn1 = nn.BatchNorm2d(8)
        init.kaiming_normal_(self.conv1.weight, a=0.2)
        self.conv1.bias.data.zero_()
        conv_layers += [self.conv1, self.relu1, self.bn1]

        self.conv2 = nn.Conv2d(8, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
        self.relu2 = nn.ReLU()
        self.bn2 = nn.BatchNorm2d(16)
        init.kaiming_normal_(self.conv1.weight, a=0.2)
        self.conv2.bias.data.zero_()
        conv_layers += [self.conv2, self.relu2, self.bn2]

        self.conv3 = nn.Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
        self.relu3 = nn.ReLU()
        self.bn3 = nn.BatchNorm2d(32)
        init.kaiming_normal_(self.conv1.weight, a=0.2)
        self.conv3.bias.data.zero_()
        conv_layers += [self.conv3, self.relu3, self.bn3]

        self.conv4 = nn.Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
        self.relu4 = nn.ReLU()
        self.bn4 = nn.BatchNorm2d(64)
        init.kaiming_normal_(self.conv1.weight, a=0.2)
        self.conv4.bias.data.zero_()
        conv_layers += [self.conv4, self.relu4, self.bn4]

        self.ap = nn.AdaptiveAvgPool2d(output_size=1)
        self.dropout = nn.Dropout(0.4)
        self.lin = nn.Linear(in_features=64, out_features=2)
        self.dropout = nn.Dropout(0.4)
        self.sigmoid = nn.Sigmoid()

        self.conv = nn.Sequential(*conv_layers)
 
    def forward(self, x):
        x = self.conv(x)
        x = self.ap(x)
        x = x.view(x.shape[0], -1)
        x = self.lin(x)
        x = self.dropout(x)
        x = self.sigmoid(x)
        return x

Model1 = WhoWheezing()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
Model1 = Model1.to(device)
next(Model1.parameters()).device

device(type='cuda', index=0)

In [10]:
from sklearn.metrics import confusion_matrix
import numpy as np
import sklearn.metrics as metrics
from tqdm import tqdm

def training(model, train_dl, num_epochs):
  criterion = nn.CrossEntropyLoss()
  optimizer = torch.optim.Adam(model.parameters(),lr=0.01)
  scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=0.01,
                                                steps_per_epoch=int(len(train_dl)),
                                                epochs=num_epochs,
                                                anneal_strategy='linear')

  for epoch in tqdm(range(num_epochs)):
    running_loss = 0.0
    correct_prediction = 0
    total_prediction = 0

    for i, data in enumerate(train_dl):
        inputs, labels = data[0].to(device), data[1].to(device)
        inputs_m, inputs_s = inputs.mean(), inputs.std()
        inputs = (inputs - inputs_m) / inputs_s
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()
        running_loss += loss.item()
        _, prediction = torch.max(outputs,1)
        correct_prediction += (prediction == labels).sum().item()
        total_prediction += prediction.shape[0]
        

    num_batches = len(train_dl)
    avg_loss = running_loss / num_batches
    acc = correct_prediction/total_prediction
    print(f'Epoch: {epoch}, Loss: {avg_loss:.4f}, Accuracy: {acc:.4f}')

  print('Finished Training')
    
num_epochs=52
training(Model1, train_dl, num_epochs)

  2%|▏         | 1/52 [01:11<1:01:01, 71.79s/it]

Epoch: 0, Loss: 0.5873, Accuracy: 0.7830


  4%|▍         | 2/52 [02:26<1:01:22, 73.65s/it]

Epoch: 1, Loss: 0.5356, Accuracy: 0.8463


  6%|▌         | 3/52 [03:44<1:01:37, 75.46s/it]

Epoch: 2, Loss: 0.5198, Accuracy: 0.8578


  8%|▊         | 4/52 [04:56<59:22, 74.22s/it]  

Epoch: 3, Loss: 0.5045, Accuracy: 0.8714


 10%|▉         | 5/52 [06:05<56:35, 72.25s/it]

Epoch: 4, Loss: 0.5133, Accuracy: 0.8626


 12%|█▏        | 6/52 [07:20<56:13, 73.34s/it]

Epoch: 5, Loss: 0.5099, Accuracy: 0.8646


 13%|█▎        | 7/52 [08:32<54:30, 72.68s/it]

Epoch: 6, Loss: 0.5082, Accuracy: 0.8680


 15%|█▌        | 8/52 [09:39<52:08, 71.10s/it]

Epoch: 7, Loss: 0.5016, Accuracy: 0.8803


 17%|█▋        | 9/52 [10:48<50:22, 70.29s/it]

Epoch: 8, Loss: 0.5089, Accuracy: 0.8585


 19%|█▉        | 10/52 [12:05<50:36, 72.30s/it]

Epoch: 9, Loss: 0.5122, Accuracy: 0.8619


 21%|██        | 11/52 [13:13<48:38, 71.18s/it]

Epoch: 10, Loss: 0.5109, Accuracy: 0.8673


 23%|██▎       | 12/52 [14:29<48:24, 72.62s/it]

Epoch: 11, Loss: 0.5099, Accuracy: 0.8660


 25%|██▌       | 13/52 [15:40<46:46, 71.95s/it]

Epoch: 12, Loss: 0.5063, Accuracy: 0.8660


 27%|██▋       | 14/52 [17:01<47:17, 74.67s/it]

Epoch: 13, Loss: 0.5028, Accuracy: 0.8741


 29%|██▉       | 15/52 [18:17<46:24, 75.25s/it]

Epoch: 14, Loss: 0.5139, Accuracy: 0.8633


 31%|███       | 16/52 [19:33<45:11, 75.33s/it]

Epoch: 15, Loss: 0.5009, Accuracy: 0.8741


 33%|███▎      | 17/52 [20:45<43:25, 74.44s/it]

Epoch: 16, Loss: 0.5074, Accuracy: 0.8660


 35%|███▍      | 18/52 [21:58<41:50, 73.85s/it]

Epoch: 17, Loss: 0.5119, Accuracy: 0.8599


 37%|███▋      | 19/52 [23:10<40:24, 73.46s/it]

Epoch: 18, Loss: 0.4985, Accuracy: 0.8741


 38%|███▊      | 20/52 [24:23<39:01, 73.17s/it]

Epoch: 19, Loss: 0.5070, Accuracy: 0.8714


 40%|████      | 21/52 [25:35<37:41, 72.94s/it]

Epoch: 20, Loss: 0.5024, Accuracy: 0.8728


 42%|████▏     | 22/52 [26:47<36:16, 72.56s/it]

Epoch: 21, Loss: 0.5041, Accuracy: 0.8707


 44%|████▍     | 23/52 [27:49<33:32, 69.40s/it]

Epoch: 22, Loss: 0.5013, Accuracy: 0.8769


 46%|████▌     | 24/52 [28:51<31:21, 67.21s/it]

Epoch: 23, Loss: 0.5051, Accuracy: 0.8673


 48%|████▊     | 25/52 [29:51<29:16, 65.06s/it]

Epoch: 24, Loss: 0.5068, Accuracy: 0.8721


 50%|█████     | 26/52 [30:52<27:37, 63.77s/it]

Epoch: 25, Loss: 0.5097, Accuracy: 0.8687


 52%|█████▏    | 27/52 [31:54<26:22, 63.30s/it]

Epoch: 26, Loss: 0.5071, Accuracy: 0.8728


 54%|█████▍    | 28/52 [32:51<24:32, 61.36s/it]

Epoch: 27, Loss: 0.5119, Accuracy: 0.8694


 56%|█████▌    | 29/52 [33:46<22:52, 59.66s/it]

Epoch: 28, Loss: 0.5088, Accuracy: 0.8701


 58%|█████▊    | 30/52 [34:47<21:56, 59.82s/it]

Epoch: 29, Loss: 0.5008, Accuracy: 0.8769


 60%|█████▉    | 31/52 [35:45<20:44, 59.25s/it]

Epoch: 30, Loss: 0.5088, Accuracy: 0.8762


 62%|██████▏   | 32/52 [36:46<19:59, 59.99s/it]

Epoch: 31, Loss: 0.5061, Accuracy: 0.8667


 63%|██████▎   | 33/52 [37:45<18:50, 59.52s/it]

Epoch: 32, Loss: 0.5090, Accuracy: 0.8714


 65%|██████▌   | 34/52 [38:44<17:51, 59.55s/it]

Epoch: 33, Loss: 0.5049, Accuracy: 0.8687


 67%|██████▋   | 35/52 [39:47<17:08, 60.52s/it]

Epoch: 34, Loss: 0.5070, Accuracy: 0.8823


 69%|██████▉   | 36/52 [40:37<15:15, 57.23s/it]

Epoch: 35, Loss: 0.5024, Accuracy: 0.8667


 71%|███████   | 37/52 [41:34<14:20, 57.35s/it]

Epoch: 36, Loss: 0.5080, Accuracy: 0.8646


 73%|███████▎  | 38/52 [42:25<12:56, 55.44s/it]

Epoch: 37, Loss: 0.5022, Accuracy: 0.8735


 75%|███████▌  | 39/52 [43:16<11:44, 54.17s/it]

Epoch: 38, Loss: 0.5015, Accuracy: 0.8707


 77%|███████▋  | 40/52 [44:14<11:02, 55.24s/it]

Epoch: 39, Loss: 0.5013, Accuracy: 0.8701


 79%|███████▉  | 41/52 [44:59<09:34, 52.21s/it]

Epoch: 40, Loss: 0.5012, Accuracy: 0.8748


 81%|████████  | 42/52 [45:55<08:51, 53.12s/it]

Epoch: 41, Loss: 0.4971, Accuracy: 0.8782


 83%|████████▎ | 43/52 [46:44<07:49, 52.15s/it]

Epoch: 42, Loss: 0.5006, Accuracy: 0.8769


 85%|████████▍ | 44/52 [47:31<06:44, 50.55s/it]

Epoch: 43, Loss: 0.4980, Accuracy: 0.8816


 87%|████████▋ | 45/52 [48:23<05:56, 50.99s/it]

Epoch: 44, Loss: 0.4946, Accuracy: 0.8891


 88%|████████▊ | 46/52 [49:10<04:58, 49.78s/it]

Epoch: 45, Loss: 0.5080, Accuracy: 0.8680


 90%|█████████ | 47/52 [50:05<04:15, 51.16s/it]

Epoch: 46, Loss: 0.4980, Accuracy: 0.8850


 92%|█████████▏| 48/52 [50:52<03:19, 49.90s/it]

Epoch: 47, Loss: 0.5032, Accuracy: 0.8776


 94%|█████████▍| 49/52 [51:46<02:33, 51.31s/it]

Epoch: 48, Loss: 0.5062, Accuracy: 0.8741


 96%|█████████▌| 50/52 [52:35<01:41, 50.61s/it]

Epoch: 49, Loss: 0.4973, Accuracy: 0.8823


 98%|█████████▊| 51/52 [53:30<00:51, 51.95s/it]

Epoch: 50, Loss: 0.4992, Accuracy: 0.8789


100%|██████████| 52/52 [54:18<00:00, 62.67s/it]

Epoch: 51, Loss: 0.5001, Accuracy: 0.8748
Finished Training





In [54]:
torch.save(Model1, 'breath/model_cv1.pth')
# 학습한 모델 저장

## Test DataSet

In [12]:
Catholic_file1 = 'breathdata/test.csv'
df1 = pd.read_csv(Catholic_file1)
df1

Unnamed: 0,filename,category,class
0,0003-1.wav,healthy,0
1,0004-1.wav,healthy,0
2,0007-1.wav,healthy,0
3,0009-1.wav,healthy,0
4,0009-4.wav,healthy,0
5,0012-2.wav,healthy,0
6,0013-2.wav,healthy,0
7,0015-1.wav,healthy,0
8,0020-1.wav,wheezing,1
9,0020-3.wav,wheezing,1


In [13]:
df1['relative_path'] = '/' + df1['filename'].astype(str)
df1 = df1[['relative_path', 'class']]
df1.head()

Unnamed: 0,relative_path,class
0,/0003-1.wav,0
1,/0004-1.wav,0
2,/0007-1.wav,0
3,/0009-1.wav,0
4,/0009-4.wav,0


In [14]:
data_path = 'breathdata/test'

In [15]:
class Breath_sound_Util1():
  
  def open(audio_file):
    sig, sr = torchaudio.load(audio_file)
    
    return (sig, sr)

  def resample(aud, newsr):
    sig, sr = aud
    
    if (sr == newsr):
     return aud

    num_channels = sig.shape[0]
    resig = torchaudio.transforms.Resample(sr, newsr)(sig[:1,:])
    if (num_channels > 1):
      retwo = torchaudio.transforms.Resample(sr, newsr)(sig[1:,:])
      resig = torch.cat([resig, retwo])

    return ((resig, newsr))
  

  def pad(aud, max_ms):
    sig, sr = aud
    num_rows, sig_len = sig.shape
    max_len = sr//1000 * max_ms
   
    if (sig_len > max_len):
      sig = sig[:,:max_len]
 
    elif (sig_len < max_len):

      repeated = []
      repeated.append(sig)
      required_len = max_len - sig_len

      while required_len > sig_len : 
        repeated.append(sig)
        require_len -= sig_len
      repeated.append(sig[:, :required_len])
 
      sig = torch.cat(repeated, 1)

    return (sig, sr)

  def spectro_gram(aud, n_mels=64, n_fft=1024, hop_len=None):
    sig,sr = aud
    aud = torchaudio.transforms.TimeStretch(hop_length=2048, fixed_rate=5000000)
    top_db = 80
    
    spec = transforms.MelSpectrogram(sr, n_fft=n_fft, hop_length=hop_len, n_mels=n_mels)(sig)
    spec = transforms.AmplitudeToDB(top_db=top_db)(spec)
    return (spec)

#이상한애들 타임시프트랑 / 마스크랑 / 이상한애들 다뺌

In [16]:
class breathDS1(Dataset):
    
  def __init__(self, df1, data_path):
    self.df1 = df1
    self.data_path = str(data_path)
    self.duration = 4000
    self.sr = 48000
            
  def __len__(self):
    return len(self.df1)    
    
  def __getitem__(self, idx):
    audio_file = self.data_path + self.df1.loc[idx, 'relative_path']
    class_id = self.df1.loc[idx, 'class']
    aud = Breath_sound_Util1.open(audio_file)
    reaud = Breath_sound_Util1.resample(aud, self.sr)
    dur_aud = Breath_sound_Util1.pad(reaud, self.duration)
    sgram = Breath_sound_Util1.spectro_gram(dur_aud, n_mels=64, n_fft=1024, hop_len=None)

    
    return sgram, class_id

In [17]:
brds1 = breathDS1(df1, data_path)

In [18]:
val_dl = torch.utils.data.DataLoader(brds1, batch_size=32, shuffle=True)

In [51]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.metrics import roc_curve, roc_auc_score

def inference (model, val_dl):
  correct_prediction = 0
  total_prediction = 0
  TP = 0 
  TN = 0
  FN = 0
  FP = 0
  a = []
  b = []
  with torch.no_grad():
    for i, data in enumerate(val_dl):
      
      inputs, labels = data[0].to(device), data[1].to(device)
      cpu_labels = data[1]
      inputs_m, inputs_s = inputs.mean(), inputs.std()
      inputs = (inputs - inputs_m) / inputs_s
      outputs = model(inputs)
 
      _, prediction = torch.max(outputs,1)
      
      cpu_prediction = prediction.to('cpu')
      correct_prediction += (prediction == labels).sum().item()
      total_prediction += prediction.shape[0]
          

      TP += ((prediction == 1) & (labels == 1)).sum()
# TN predict and label are both 0
      TN += ((prediction == 0) & (labels == 0)).sum()
# FN    predict 0 label 1
      FN += ((prediction == 0) & (labels == 1)).sum()
# FP    predict 1 label 0
      FP += ((prediction == 1) & (labels == 0)).sum()

      c = cpu_labels.tolist()
      a.extend(c)

      d = cpu_prediction.tolist()
      b.extend(d)
    
    # TP : 휘징인데 휘징이라고 판단(잘함)
    # FP : 헬시인데 휘징이라고 판단(못함)
    # TN : 헬시인데 헬시라고 판단(잘함)
    # FN : 휘징인데 헬시라고 판단(못함)
    
    # POSITIVE : 1(휘징)이라고 판단
    # NEGATIVE : 0(헬시)라고 판단
    # TRUE : 정답값 1(휘징)
    # FALSE : 정답값 0(헬시)
    
    # PRECISION : 휘징이라고 판단한 것 중 진짜 휘징
    # RECALL : 휘징인데 휘징이라고 잘 판단
    

  precision = TP / (TP + FP)
  recall = TP / (TP + FN)
  F1 = 2 * recall * precision / (recall + precision)

  acc = correct_prediction/total_prediction
  print(f'Accuracy: {acc:.4f}, Total items: {total_prediction}')
  print(f'precision: {precision:.4f}, F1: {F1:.4f}')
  print(f'recall: {recall:.4f}')
  target_names = ['healthy', 'wheezing']
  print(classification_report(a, b, target_names = target_names))
  print("AUC:{}".format(roc_auc_score(a, b)))

inference(Model1, val_dl)

Accuracy: 0.8868, Total items: 53
precision: 0.8667, F1: 0.8125
recall: 0.7647
              precision    recall  f1-score   support

     healthy       0.89      0.94      0.92        36
    wheezing       0.87      0.76      0.81        17

    accuracy                           0.89        53
   macro avg       0.88      0.85      0.87        53
weighted avg       0.89      0.89      0.88        53

AUC:0.8545751633986928
