In [39]:
from os import listdir
from os.path import isfile, join
import os
import re
import pandas as pd
import librosa
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from timeit import default_timer as timer
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sn
import matplotlib.pyplot as plt
import kagglehub

from tensorflow.keras.models import Sequential, Model
from tensorflow.keras import regularizers, optimizers
from tensorflow.keras.layers import Dense, Conv1D, Flatten, Activation, MaxPooling1D, Dropout
from tensorflow.keras.utils import plot_model,to_categorical

# CLEANING: Mendeley Dataset

In [None]:
# extract labels for Mendeley dataset
path_mendeley = "audioDataMendeley/Audio files"

# Abbreviations that are always written fully upper-case.
keep_upper = {"COPD", "BRON"}


def format_diagnosis_from_filename(filename):
    """Return ``filename`` with its comma-separated metadata fields normalised.

    Everything after the first underscore of the stem is split on commas and
    each field is stripped and then rewritten:

    * fields whose upper-case form is in ``keep_upper`` become upper-case,
    * a field that is exactly ``"N"`` becomes ``"Healthy"``,
    * every other field is title-cased.

    The part before the first underscore and the extension are kept as they
    are. A name without any underscore is returned unchanged.
    """
    stem, extension = os.path.splitext(filename)
    prefix, underscore, metadata = stem.partition("_")

    # No underscore means there is no metadata segment to rewrite.
    if not underscore:
        return stem + extension

    def _normalise(field):
        token = field.strip()
        shouted = token.upper()
        if shouted in keep_upper:
            return shouted
        return "Healthy" if token == "N" else token.title()

    normalised = ",".join(map(_normalise, metadata.split(",")))
    return f"{prefix}_{normalised}" + extension

# go through directory and only keep necessary files
# WARNING: this cell is destructive. It deletes and renames files inside
# path_mendeley in place, so run it against a copy of the raw download.
mendeley_labels = []
data_mendeley = []
patient_id_pattern = r"\d+"

# os.listdir returns entries in arbitrary order; sort so the exported CSV is
# reproducible. The listing is materialised before any file is touched.
for file in sorted(os.listdir(path_mendeley)):
    if not file.endswith(".wav"):
        continue
    old_path = os.path.join(path_mendeley, file)

    # Drop heart-failure recordings and recordings with a combined diagnosis.
    # NOTE(review): "and" is a plain substring test, so any name that merely
    # contains those letters is deleted as well -- confirm this is intended.
    if "heart failure" in file.lower() or "and" in file.lower():
        os.remove(old_path)
        continue

    new_filename = format_diagnosis_from_filename(file)
    new_path = os.path.join(path_mendeley, new_filename)

    # Rename the file (skipped when the name is already in its final form)
    if new_path != old_path:
        os.rename(old_path, new_path)

    # make patient diagnosis csv
    # Read the record from the *formatted* name so the CSV holds the same
    # labels as the files on disk ("Healthy" rather than "N", the whole
    # "Lung Fibrosis" rather than only its first word).
    prefix, underscore, metadata = os.path.splitext(new_filename)[0].partition("_")
    match_id = re.search(patient_id_pattern, prefix)
    patient_diagnosis_mendeley = metadata.split(",", 1)[0].strip()

    # Skip names that do not carry both an id and a diagnosis instead of
    # silently reusing the values left over from the previous file.
    if not (underscore and match_id and patient_diagnosis_mendeley):
        print(f"Skipping {file!r}: cannot extract patient id and diagnosis")
        continue

    patient_id_mendeley = int(match_id.group())
    data_mendeley.append([patient_id_mendeley, patient_diagnosis_mendeley])

# export csv
patient_diagnosis_df_mendeley = pd.DataFrame(data_mendeley)
patient_diagnosis_df_mendeley.to_csv('audioDataMendeley/patient_diagnosis.csv', index=False)

# MAPPING DATASETS

In [80]:
# Lookup from a raw diagnosis label to the unified label set.
# Most labels map to themselves; the Bronchiectasis / Bronchiolitis / BRON
# group is collapsed into the single "BRON" class.
label_map = dict([
    ("URTI", "URTI"),
    ("Healthy", "Healthy"),
    ("Asthma", "Asthma"),
    ("COPD", "COPD"),
    ("LRTI", "LRTI"),
    ("Pneumonia", "Pneumonia"),
    ("Bronchiectasis", "BRON"),
    ("Bronchiolitis", "BRON"),
    ("BRON", "BRON"),
    ("Lung Fibrosis", "Lung Fibrosis"),
    ("Plueral Effusion", "Plueral Effusion"),
])

In [68]:
class Diagnosis:
    """Plain record tying one sample to its diagnosis label.

    Attributes:
        id: identifier supplied by the caller.
        diagnosis: diagnosis label for the sample.
        image_path: path supplied by the caller for the sample's file.
    """

    def __init__(self, id, diagnosis, image_path):
        # Store the three constructor arguments unchanged.
        self.id, self.diagnosis, self.image_path = id, diagnosis, image_path

# PARSING DATASET: Mendeley

In [88]:
# Patient diagnosis CSV produced by the Mendeley cleaning cell. This used to
# repeat the audio directory, which cannot be read as a diagnosis table.
diagnosis_path_mendeley = "audioDataMendeley/patient_diagnosis.csv"

# NOTE(review): the cleaning cell spells this directory "Audio files" (lower
# case f); the two only refer to the same folder on a case-insensitive
# filesystem -- confirm the spelling on disk.
audio_path_mendeley = "audioDataMendeley/Audio Files"

# os.listdir returns entries in arbitrary order, so sort the listing itself:
# anything derived from files_mendeley is then reproducible across machines.
files_mendeley = sorted(
    name for name in os.listdir(audio_path_mendeley) if name.endswith(".wav")
)

# Independent sorted copy, kept under the generic name used by the notebook.
files = list(files_mendeley)

In [95]:
# Print the second underscore-separated piece of every Mendeley recording
# name as a quick sanity check of the filename format.
patient_id_pattern = r"\d+"
for file in os.listdir(audio_path_mendeley):
    # Ignore anything that is not a recording (for example hidden OS files).
    if not file.endswith(".wav"):
        continue
    filename_parts = file.split("_")
    # A name without an underscore has no second piece to show.
    if len(filename_parts) < 2:
        continue
    print(filename_parts[1])

COPD,E W,P R L,42,M.wav
Asthma,E W,P R L,21,F.wav
Asthma,E W,P R M,43,F.wav
Healthy,Healthy,P L L,24,M.wav
COPD,E W,P R L,63,M.wav
COPD,E W,P R L,58,F.wav
Lung Fibrosis,Crep,P L L,76,F.wav
Healthy,Healthy,P R M,21,M.wav
Asthma,E W,P L U,41,F.wav
Pneumonia,Crep,P R M,51,M.wav
Asthma,E W,P R U,49,F.wav
Lung Fibrosis,Crep,P L L,76,F.wav
Healthy,Healthy,P L U,33,M.wav
Healthy,Healthy,P L M,70,M.wav
Asthma And Lung Fibrosis,C,A R M,90,M.wav
Asthma,E W,A R U,46,M.wav
Healthy,Healthy,P R U,68,F.wav
Healthy,Healthy,P L L,53,M.wav
Asthma,E W,P L M,60,M.wav
BRON,Crep,P R L,20,M.wav
Healthy,Healthy,P L M,30,M.wav
Asthma,E W,P R M,53,F.wav
Healthy,Healthy,P R U,36,M.wav
Pneumonia,C,P R U,57,M.wav
Asthma,E W,P L U,45,F.wav
Healthy,Healthy,P L U,52,F.wav
BRON,Crep,P L U,68,F.wav
Healthy,Healthy,P L L,32,M.wav
Healthy,Healthy,P R M,74,M.wav
Healthy,Healthy,P L M,26,M.wav
Asthma,E W,P L M,40,M.wav
Healthy,Healthy,P L L,41,M.wav
Healthy,Healthy,P L U,73,F.wav
Healthy,Healthy,P L M,18,M.wav
Asthma,E W,P

# PARSING DATASET: Kaggle

In [56]:
# Fetch the latest version of the respiratory sound database via kagglehub
# and keep the location it reports in `path`.
KAGGLE_DATASET_HANDLE = "vbookshelf/respiratory-sound-database"
path = kagglehub.dataset_download(KAGGLE_DATASET_HANDLE)

print("Path to dataset files:", path)

Path to dataset files: /Users/katherinebeaty/.cache/kagglehub/datasets/vbookshelf/respiratory-sound-database/versions/2


In [61]:
# extract labels for kaggle dataset
# Path to patient diagnosis CSV
diagnosis_path_kaggle = os.path.join(path, "Respiratory_Sound_Database/Respiratory_Sound_Database/patient_diagnosis.csv")

# Directory scanned for the .wav recordings.
audio_path_kaggle = os.path.join(path, "Respiratory_Sound_Database/Respiratory_Sound_Database/audio_and_txt_files/")

# os.listdir returns entries in arbitrary order, so sort the listing itself:
# anything that enumerates files_kaggle then sees the same order on every
# machine and every run.
files_kaggle = sorted(
    name for name in os.listdir(audio_path_kaggle) if name.endswith(".wav")
)

# Independent sorted copy, kept under the generic name used by the notebook.
files = list(files_kaggle)

In [81]:
# Build the patient-id -> raw diagnosis lookup from the Kaggle CSV.
# The previous version seeded the dict with {101: "URTI"} by hand, which
# suggests the file has no header row and read_csv was consuming the first
# record as column names (presumably -- confirm against the file). Reading
# with header=None keeps every record, and dropping rows whose first column
# is not numeric keeps this correct even if a header row is present.
diagnosis_kaggle = pd.read_csv(diagnosis_path_kaggle, header=None)
patient_ids_kaggle = pd.to_numeric(diagnosis_kaggle.iloc[:, 0], errors="coerce")
has_numeric_id = patient_ids_kaggle.notna()
diag_dict_kaggle = dict(
    zip(
        patient_ids_kaggle[has_numeric_id].astype(int).tolist(),
        diagnosis_kaggle.loc[has_numeric_id].iloc[:, 1].tolist(),
    )
)

# One Diagnosis record per recording, in files_kaggle order. The patient id
# is the number before the first underscore of the file name; ids missing
# from the CSV and labels missing from label_map both become "Unknown".
diagnosis_list_kaggle = []
for c, f in enumerate(files_kaggle):
    patient_id = int(f.split('_')[0])
    raw_label = diag_dict_kaggle.get(patient_id, "Unknown")
    diagnosis_list_kaggle.append(Diagnosis(
        id=c,
        diagnosis=label_map.get(raw_label, "Unknown"),
        image_path=os.path.join(audio_path_kaggle, f)
    ))