In [1]:
import pandas as pd
from pathlib import Path

In [2]:
Catholic_file = 'aug_train_v2.2.csv'
df = pd.read_csv(Catholic_file)
df

Unnamed: 0,filename,category,class,sex,old
0,0002-1.wav,non-wheezing,0,1,7
1,0002-2.wav,non-wheezing,0,1,7
2,0002-3.wav,non-wheezing,0,1,7
3,0002-4.wav,non-wheezing,0,1,7
4,0003-2.wav,non-wheezing,0,0,1
...,...,...,...,...,...
1605,wn_0535-4.wav,wheezing,1,0,9
1606,wn_0588-1.wav,wheezing,1,0,1
1607,wn_0599-3.wav,wheezing,1,1,6
1608,wn_0602-2.wav,wheezing,1,1,8


In [3]:
mean = df.old.mean()
std = df.old.std()
df.old = (df.old - mean) / std

In [4]:
df

Unnamed: 0,filename,category,class,sex,old
0,0002-1.wav,non-wheezing,0,1,0.905217
1,0002-2.wav,non-wheezing,0,1,0.905217
2,0002-3.wav,non-wheezing,0,1,0.905217
3,0002-4.wav,non-wheezing,0,1,0.905217
4,0003-2.wav,non-wheezing,0,0,-0.948195
...,...,...,...,...,...
1605,wn_0535-4.wav,wheezing,1,0,1.523021
1606,wn_0588-1.wav,wheezing,1,0,-0.948195
1607,wn_0599-3.wav,wheezing,1,1,0.596315
1608,wn_0602-2.wav,wheezing,1,1,1.214119


In [5]:
df['relative_path'] = '/' + df['filename'].astype(str)
df = df[['relative_path', 'class', 'sex', 'old']]
df.head()

Unnamed: 0,relative_path,class,sex,old
0,/0002-1.wav,0,1,0.905217
1,/0002-2.wav,0,1,0.905217
2,/0002-3.wav,0,1,0.905217
3,/0002-4.wav,0,1,0.905217
4,/0003-2.wav,0,0,-0.948195


In [6]:
data_path = 'aug_train_v2.2'

In [7]:
import math, random
import torch
import torchaudio
from torchaudio import transforms
from IPython.display import Audio

class Breath_sound_Util():
  
  def open(audio_file):
    sig, sr = torchaudio.load(audio_file)
    
    return (sig, sr)

  def resample(aud, newsr):
    sig, sr = aud
    
    if (sr == newsr):
     return aud

    num_channels = sig.shape[0]
    resig = torchaudio.transforms.Resample(sr, newsr)(sig[:1,:])
    if (num_channels > 1):
      retwo = torchaudio.transforms.Resample(sr, newsr)(sig[1:,:])
      resig = torch.cat([resig, retwo])

    return ((resig, newsr))
  

  def pad(aud, max_ms):
    sig, sr = aud
    num_rows, sig_len = sig.shape
    max_len = sr//1000 * max_ms
   
    if (sig_len > max_len):
      sig = sig[:,:max_len]
 
    elif (sig_len < max_len):

      repeated = []
      repeated.append(sig)
      required_len = max_len - sig_len

      while required_len > sig_len : 
        repeated.append(sig)
        require_len -= sig_len
      repeated.append(sig[:, :required_len])
 
      sig = torch.cat(repeated, 1)

    return (sig, sr)


  def spectro_gram(aud, n_mels=64, n_fft=1024, hop_len=None):
    sig,sr = aud
    top_db = 80
    
    spec = transforms.MelSpectrogram(sr, n_fft=n_fft, hop_length=hop_len, n_mels=n_mels)(sig)
    spec = transforms.AmplitudeToDB(top_db=top_db)(spec)
    return (spec)

  def spectro_augment(spec, max_mask_pct=0.1, n_freq_masks=1, n_time_masks=1):
    _, n_mels, n_steps = spec.shape
    mask_value = spec.mean()
    aug_spec = spec

    freq_mask_param = max_mask_pct * n_mels
    for _ in range(n_freq_masks):
      aug_spec = transforms.FrequencyMasking(freq_mask_param)(aug_spec, mask_value)

    time_mask_param = max_mask_pct * n_steps
    for _ in range(n_time_masks):
      aug_spec = transforms.TimeMasking(time_mask_param)(aug_spec, mask_value)

    return aug_spec
    print(aug_spec)

In [8]:
from torch.utils.data import DataLoader, Dataset, random_split
import torchaudio
import torchvision

class breathDS(Dataset):
    
  def __init__(self, df, data_path):
    self.df = df
    self.data_path = str(data_path)
    self.duration = 4000
    self.sr = 48000
            
  def __len__(self):
    return len(self.df)    
    
  def __getitem__(self, idx):
    audio_file = self.data_path + self.df.loc[idx, 'relative_path']
    class_id = self.df.loc[idx, 'class']
    
    aud = Breath_sound_Util.open(audio_file)
    reaud = Breath_sound_Util.resample(aud, self.sr)
    dur_aud = Breath_sound_Util.pad(reaud, self.duration)
    sgram = Breath_sound_Util.spectro_gram(dur_aud, n_mels=64, n_fft=1024, hop_len=None)
    aug_sgram = Breath_sound_Util.spectro_augment(sgram, max_mask_pct=0.1, n_freq_masks=2, n_time_masks=2)
    #여기부터 나이, 성별변수를 집어넣는 과정임
    x = self.df.loc[idx, 'old']
    y = self.df.loc[idx, 'sex']
    x = torch.from_numpy(np.asarray(x).reshape((1,)))
    y = torch.from_numpy(np.asarray(y).reshape((1,)))
    tabular = torch.cat((x, y), 0)
    tabular = tabular.float()
    #print(tabular)
    #x1 = ["sex", "old"]
    #x2 = x2.
    #x2 = x2.iloc[idx].values
    
    
    return aug_sgram, tabular, class_id

In [9]:
brds = breathDS(df, data_path)

In [10]:
train_dl = DataLoader(brds, batch_size=16, shuffle=True)

In [11]:
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import init

# --------------------------------------------------------------
# 호흡음의 Healthy, Wheezing을 판단하는 Binary Classification Model
# --------------------------------------------------------------


class WhoWheezing(nn.Module):
    

    def __init__(self):
        super().__init__()
        conv_layers = []
        self.conv1 = nn.Conv2d(1, 8, kernel_size=(3, 3), stride=(2, 2), padding=(2, 2))
        self.relu1 = nn.ReLU()
        self.bn1 = nn.BatchNorm2d(8)
        init.kaiming_normal_(self.conv1.weight, a=0.2)
        self.conv1.bias.data.zero_()
        conv_layers += [self.conv1, self.relu1, self.bn1]

        self.conv2 = nn.Conv2d(8, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
        self.relu2 = nn.ReLU()
        self.bn2 = nn.BatchNorm2d(16)
        init.kaiming_normal_(self.conv1.weight, a=0.2)
        self.conv2.bias.data.zero_()
        conv_layers += [self.conv2, self.relu2, self.bn2]

        self.conv3 = nn.Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
        self.relu3 = nn.ReLU()
        self.bn3 = nn.BatchNorm2d(32)
        init.kaiming_normal_(self.conv1.weight, a=0.2)
        self.conv3.bias.data.zero_()
        conv_layers += [self.conv3, self.relu3, self.bn3]

        self.conv4 = nn.Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
        self.relu4 = nn.ReLU()
        self.bn4 = nn.BatchNorm2d(64)
        init.kaiming_normal_(self.conv1.weight, a=0.2)
        self.conv4.bias.data.zero_()
        conv_layers += [self.conv4, self.relu4, self.bn4]

        self.ap = nn.AdaptiveAvgPool2d(output_size=1)
        self.dropout = nn.Dropout(0.4)
        self.lin = nn.Linear(in_features=72, out_features=2)
        self.dropout = nn.Dropout(0.4)
        self.sigmoid = nn.Sigmoid()

        self.conv = nn.Sequential(*conv_layers)
 
        self.fc1 = nn.Linear(2, 10)
        self.fc2 = nn.Linear(10, 8)

    def forward(self, inputs, inputs2):
        inputs = self.conv(inputs)
        inputs = self.ap(inputs)
        inputs = inputs.view(inputs.shape[0], -1)
        inputs2 = self.fc1(inputs2)
        inputs2 = self.fc2(inputs2)
        #inputs가 원래있던 멜스펙트로그램이고
        #inputs2가 정형데이터임
        x = torch.cat((inputs, inputs2), 1)
        x = self.lin(x)
        x = self.dropout(x)
        inputs = self.sigmoid(x)
        return x       

    '''def forward(self, x):
        x = self.conv(x)
        x = self.ap(x)
        x = x.view(x.shape[0], -1)
        #print(x.size())
        #x2 = category
        #x2 = x2(self.fc1(x2))
        #x2 = x2(self.fc2(x2))
        #x = x1 + x2
        x = self.lin(x)
        x = self.dropout(x)
        x = self.sigmoid(x)
        return x'''
    
Model1 = WhoWheezing()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
Model1 = Model1.to(device)
next(Model1.parameters()).device

device(type='cuda', index=0)

In [12]:
from sklearn.metrics import confusion_matrix
import numpy as np
import sklearn.metrics as metrics


def training(model, train_dl, num_epochs):
  criterion = nn.CrossEntropyLoss()
  optimizer = torch.optim.Adam(model.parameters(),lr=0.001)
  scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=0.001,
                                                steps_per_epoch=int(len(train_dl)),
                                                epochs=num_epochs,
                                                anneal_strategy='linear')

  for epoch in range(num_epochs):
    running_loss = 0.0
    correct_prediction = 0
    total_prediction = 0

    for i, data in enumerate(train_dl):
        inputs, inputs2, labels = data[0].to(device), data[1].to(device), data[2].to(device)
        #여기에도 inputs2를 추가시켜줫음. 이제 data[1]이 가리키는건 breathDS의 tablular임
        inputs_m, inputs_s = inputs.mean(), inputs.std()
        inputs = (inputs - inputs_m) / inputs_s
        optimizer.zero_grad()
        outputs = model(inputs, inputs2)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()
        running_loss += loss.item()
        _, prediction = torch.max(outputs,1)
        correct_prediction += (prediction == labels).sum().item()
        total_prediction += prediction.shape[0]

    num_batches = len(train_dl)
    avg_loss = running_loss / num_batches
    acc = correct_prediction/total_prediction
    print(f'Epoch: {epoch}, Loss: {avg_loss:.4f}, Accuracy: {acc:.4f}')

  print('Finished Training')
    
num_epochs=52
training(Model1, train_dl, num_epochs)

Epoch: 0, Loss: 0.5802, Accuracy: 0.7205
Epoch: 1, Loss: 0.5208, Accuracy: 0.7807
Epoch: 2, Loss: 0.4951, Accuracy: 0.8112
Epoch: 3, Loss: 0.4838, Accuracy: 0.8099
Epoch: 4, Loss: 0.4611, Accuracy: 0.8093
Epoch: 5, Loss: 0.4683, Accuracy: 0.8025
Epoch: 6, Loss: 0.4626, Accuracy: 0.8112
Epoch: 7, Loss: 0.4468, Accuracy: 0.8248
Epoch: 8, Loss: 0.4485, Accuracy: 0.8155
Epoch: 9, Loss: 0.4504, Accuracy: 0.8168
Epoch: 10, Loss: 0.4427, Accuracy: 0.8161
Epoch: 11, Loss: 0.4410, Accuracy: 0.8149
Epoch: 12, Loss: 0.4293, Accuracy: 0.8224
Epoch: 13, Loss: 0.4704, Accuracy: 0.8118
Epoch: 14, Loss: 0.4514, Accuracy: 0.8230
Epoch: 15, Loss: 0.4452, Accuracy: 0.8130
Epoch: 16, Loss: 0.4385, Accuracy: 0.8174
Epoch: 17, Loss: 0.4334, Accuracy: 0.8298
Epoch: 18, Loss: 0.4255, Accuracy: 0.8186
Epoch: 19, Loss: 0.4172, Accuracy: 0.8317
Epoch: 20, Loss: 0.4225, Accuracy: 0.8236
Epoch: 21, Loss: 0.4154, Accuracy: 0.8242
Epoch: 22, Loss: 0.4046, Accuracy: 0.8373
Epoch: 23, Loss: 0.4163, Accuracy: 0.8248
Ep

In [52]:
Catholic_file1 = 'test_v2.2.csv'
df1 = pd.read_csv(Catholic_file1)
df1

Unnamed: 0,filename,category,class,sex,old
0,0001-2.wav,non-wheezing,0,0,0
1,0003-1.wav,non-wheezing,0,0,1
2,0004-1.wav,non-wheezing,0,0,1
3,0004-4.wav,non-wheezing,0,0,1
4,0006-1.wav,non-wheezing,0,1,1
5,0007-1.wav,non-wheezing,0,0,7
6,0008-1.wav,non-wheezing,0,0,2
7,0009-1.wav,non-wheezing,0,0,3
8,0010-3.wav,non-wheezing,0,1,3
9,0012-1.wav,non-wheezing,0,0,2


In [53]:
mean = df1.old.mean()
std = df1.old.std()
df1.old = (df1.old - mean) / std

In [54]:
df1['relative_path'] = '/' + df1['filename'].astype(str)
df1 = df1[['relative_path', 'class', 'sex', 'old']]
df1.head()

Unnamed: 0,relative_path,class,sex,old
0,/0001-2.wav,0,0,-1.131772
1,/0003-1.wav,0,0,-0.809217
2,/0004-1.wav,0,0,-0.809217
3,/0004-4.wav,0,0,-0.809217
4,/0006-1.wav,0,1,-0.809217


In [55]:
data_path = 'test_v2.2'

In [56]:
class Breath_sound_Util1():
  
  def open(audio_file):
    sig, sr = torchaudio.load(audio_file)
    
    return (sig, sr)

  def resample(aud, newsr):
    sig, sr = aud
    
    if (sr == newsr):
     return aud

    num_channels = sig.shape[0]
    resig = torchaudio.transforms.Resample(sr, newsr)(sig[:1,:])
    if (num_channels > 1):
      retwo = torchaudio.transforms.Resample(sr, newsr)(sig[1:,:])
      resig = torch.cat([resig, retwo])

    return ((resig, newsr))
  

  def pad(aud, max_ms):
    sig, sr = aud
    num_rows, sig_len = sig.shape
    max_len = sr//1000 * max_ms
   
    if (sig_len > max_len):
      sig = sig[:,:max_len]
 
    elif (sig_len < max_len):

      repeated = []
      repeated.append(sig)
      required_len = max_len - sig_len

      while required_len > sig_len : 
        repeated.append(sig)
        require_len -= sig_len
      repeated.append(sig[:, :required_len])
 
      sig = torch.cat(repeated, 1)

    return (sig, sr)

  def spectro_gram(aud, n_mels=64, n_fft=1024, hop_len=None):
    sig,sr = aud
    top_db = 80
    
    spec = transforms.MelSpectrogram(sr, n_fft=n_fft, hop_length=hop_len, n_mels=n_mels)(sig)
    spec = transforms.AmplitudeToDB(top_db=top_db)(spec)
    return (spec)

In [61]:
class breathDS1(Dataset):
    
  def __init__(self, df1, data_path):
    self.df1 = df1
    self.data_path = str(data_path)
    self.duration = 4000
    self.sr = 48000
            
  def __len__(self):
    return len(self.df1)    
    
  def __getitem__(self, idx):
    audio_file = self.data_path + self.df1.loc[idx, 'relative_path']
    class_id = self.df1.loc[idx, 'class']
    
    aud = Breath_sound_Util.open(audio_file)
    reaud = Breath_sound_Util.resample(aud, self.sr)
    dur_aud = Breath_sound_Util.pad(reaud, self.duration)
    sgram = Breath_sound_Util.spectro_gram(dur_aud, n_mels=64, n_fft=1024, hop_len=None)
    aug_sgram = Breath_sound_Util.spectro_augment(sgram, max_mask_pct=0.1, n_freq_masks=2, n_time_masks=2)
    #여기부터 나이, 성별변수를 집어넣는 과정임
    x = self.df1.loc[idx, 'old']
    y = self.df1.loc[idx, 'sex']
    x = torch.from_numpy(np.asarray(x).reshape((1,)))
    y = torch.from_numpy(np.asarray(y).reshape((1,)))
    tabular = torch.cat((x, y), 0)
    tabular = tabular.float()
    #print(tabular)
    #x1 = ["sex", "old"]
    #x2 = x2.
    #x2 = x2.iloc[idx].values
    
    
    return aug_sgram, tabular, class_id

In [62]:
brds1 = breathDS1(df1, data_path)

In [63]:
val_dl = torch.utils.data.DataLoader(brds1, batch_size=16, shuffle=True)

In [78]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.metrics import roc_curve, roc_auc_score

def inference (model, val_dl):
  correct_prediction = 0
  total_prediction = 0
  TP = 0 
  TN = 0
  FN = 0
  FP = 0
  a = []
  b = []
  with torch.no_grad():
    for i, data in enumerate(val_dl):
      
      inputs, inputs2, labels = data[0].to(device), data[1].to(device), data[2].to(device)
      cpu_labels = data[2]
      inputs_m, inputs_s = inputs.mean(), inputs.std()
      inputs = (inputs - inputs_m) / inputs_s
      outputs = model(inputs, inputs2)
 
      _, prediction = torch.max(outputs,1)
      
      cpu_prediction = prediction.to('cpu')
      correct_prediction += (prediction == labels).sum().item()
      total_prediction += prediction.shape[0]
          

      TP += ((prediction == 1) & (labels == 1)).sum()
# TN predict and label are both 0
      TN += ((prediction == 0) & (labels == 0)).sum()
# FN    predict 0 label 1
      FN += ((prediction == 0) & (labels == 1)).sum()
# FP    predict 1 label 0
      FP += ((prediction == 1) & (labels == 0)).sum()

      c = cpu_labels.tolist()
      a.extend(c)

      d = cpu_prediction.tolist()
      b.extend(d)
    
    # TP : 휘징인데 휘징이라고 판단(잘함)
    # FP : 헬시인데 휘징이라고 판단(못함)
    # TN : 헬시인데 헬시라고 판단(잘함)
    # FN : 휘징인데 헬시라고 판단(못함)
    
    # POSITIVE : 1(휘징)이라고 판단
    # NEGATIVE : 0(헬시)라고 판단
    # TRUE : 정답값 1(휘징)
    # FALSE : 정답값 0(헬시)
    
    # PRECISION : 휘징이라고 판단한 것 중 진짜 휘징
    # RECALL : 휘징인데 휘징이라고 잘 판단
    

  precision = TP / (TP + FP)
  recall = TP / (TP + FN)
  F1 = 2 * recall * precision / (recall + precision)

  acc = correct_prediction/total_prediction
  print(f'Accuracy: {acc:.4f}, Total items: {total_prediction}')
  print(f'precision: {precision:.4f}, F1: {F1:.4f}')
  print(f'recall: {recall:.4f}')
  target_names = ['healthy', 'wheezing']
  print(classification_report(a, b, target_names = target_names))
  print("AUC:{}".format(roc_auc_score(a, b)))

inference(Model1, val_dl)

Accuracy: 0.8596, Total items: 57
precision: 0.8421, F1: 0.8000
recall: 0.7619
              precision    recall  f1-score   support

     healthy       0.87      0.92      0.89        36
    wheezing       0.84      0.76      0.80        21

    accuracy                           0.86        57
   macro avg       0.86      0.84      0.85        57
weighted avg       0.86      0.86      0.86        57

AUC:0.8392857142857142
