In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file available under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Patient-level diagnosis table. Column names are supplied explicitly, so the
# first line of the CSV is read as data rather than as a header.
patient_data = pd.read_csv(
    '/kaggle/input/respiratory-sound-database/Respiratory_Sound_Database/Respiratory_Sound_Database/patient_diagnosis.csv',
    names=['pid', 'disease'],
)

In [None]:
# Preview the first five rows of the diagnosis table.
patient_data.head(n=5)

In [None]:
# Preview one annotation file. These files are read without a header row
# elsewhere in this notebook, so the column names are supplied here as well;
# otherwise pandas would consume the first annotation row as the header.
df = pd.read_csv(
    '/kaggle/input/respiratory-sound-database/Respiratory_Sound_Database/Respiratory_Sound_Database/audio_and_txt_files/160_1b3_Al_mc_AKGC417L.txt',
    sep='\t',
    names=['start', 'end', 'crackles', 'weezels'],
)
df.head()

In [None]:
import os
# Directory that holds the .txt annotation files and the matching .wav recordings.
path='/kaggle/input/respiratory-sound-database/Respiratory_Sound_Database/Respiratory_Sound_Database/audio_and_txt_files/'

# File stems (name without extension) of every annotation file.
# - os.listdir returns entries in arbitrary order; sorting keeps every later
#   order-dependent step (e.g. taking the first N rows of a group) reproducible.
# - endswith/splitext only match and strip the real '.txt' extension.
files = sorted(
    os.path.splitext(s)[0]
    for s in os.listdir(path)
    if s.endswith('.txt')
)
files[:5]

In [None]:
def getFilenameInfo(file):
    """Split a file stem into its underscore-separated fields.

    Parameters
    ----------
    file : str
        File name without extension, e.g. ``'160_1b3_Al_mc_AKGC417L'``.

    Returns
    -------
    list of str
        The fields of ``file`` in order, split on ``'_'``.
    """
    fields = file.split('_')
    return fields

In [None]:
# Example: show the fields extracted from one recording's file stem.
getFilenameInfo('160_1b3_Al_mc_AKGC417L')

In [None]:
# Build one table holding the annotation rows of every recording.
files_data = []
for file in files:
    # Annotation files are tab-separated and have no header row.
    annotations = pd.read_csv(
        path + file + '.txt',
        sep='\t',
        names=['start', 'end', 'crackles', 'weezels'],
    )
    name_data = getFilenameInfo(file)
    annotations['pid'] = name_data[0]     # first field of the file stem
    annotations['mode'] = name_data[-2]   # second-to-last field of the file stem
    annotations['filename'] = file        # kept so the matching .wav can be located later
    files_data.append(annotations)

# ignore_index=True gives the combined frame a fresh 0..n-1 index; a bare
# reset_index() call returns a new frame and has no effect unless assigned.
files_df = pd.concat(files_data, ignore_index=True)
files_df.head()

In [None]:
# Inspect column dtypes and non-null counts of the diagnosis table.
patient_data.info()

In [None]:
# Inspect column dtypes and non-null counts of the combined annotation table.
files_df.info()

In [None]:
# Give both tables the same integer dtype for 'pid' so they can be joined on it.
patient_data['pid'] = patient_data['pid'].astype('int32')
files_df['pid'] = files_df['pid'].astype('int32')

In [None]:
# Attach each patient's diagnosis to every annotation row (inner join on 'pid').
data = files_df.merge(patient_data, on='pid')
data.head()

In [None]:
# Print the number of annotation rows available for each disease.
groups = data.groupby(['disease'])

for disease_key, disease_rows in groups:
    print(disease_key)
    print(len(disease_rows))
    print('\n')

In [None]:
# Exclude the 'Asthma' and 'LRTI' rows, then print the per-disease row counts again.
excluded_diseases = ['Asthma', 'LRTI']
data = data[~data['disease'].isin(excluded_diseases)]

groups = data.groupby(['disease'])

for disease_key, disease_rows in groups:
    print(disease_key)
    print(len(disease_rows))
    print('\n')


In [None]:
# Row count for every (crackles, weezels, disease) combination.
label_columns = ['crackles', 'weezels', 'disease']
groups = data.groupby(label_columns)
for label_key, label_rows in groups:
    print(label_key)
    print(len(label_rows))

In [None]:
# Balance the (crackles, weezels, disease) combinations:
#   * combinations with fewer than 20 rows are dropped,
#   * every remaining combination keeps at most its first 500 rows.
groups = data.groupby(['crackles', 'weezels', 'disease'])

# head(500) returns the whole group when it has fewer than 500 rows, so one
# expression covers both the "keep all" and the "cap at 500" cases.
data_clean = [
    rows.head(500)
    for _, rows in groups
    if rows.shape[0] >= 20
]

data = pd.concat(data_clean)

# Report the row count of every combination after balancing.
groups = data.groupby(['crackles', 'weezels', 'disease'])
for name, group in groups:
    print(name)
    print(len(group))

# Move the old row labels into an 'index' column and renumber rows from 0.
data = data.reset_index()

In [None]:
# Remove the 'index' column left behind by reset_index.
# errors='ignore' keeps this cell re-runnable: a second execution finds no
# 'index' column and would otherwise raise KeyError.
data = data.drop(columns=['index'], errors='ignore')
data.head()

In [None]:
# Total number of annotation rows kept after balancing.
len(data)

In [None]:
# Persist the annotation table; index = True also writes the row index as the first CSV column.
data.to_csv("/kaggle/working/data_info_formatted.csv", index = True)

In [None]:
# Average annotated segment duration (end - start); used as the common clip length.
segment_durations = data['end'] - data['start']
mean_length = segment_durations.mean()
mean_length

In [None]:
import librosa

# Load every annotated segment, resampled to `sr` Hz, and force each clip to a
# common length of int(mean_length * sr) samples.
time_domain_audio = []

sr = 2048                               # target sampling rate passed to librosa.load
target_length = int(mean_length * sr)   # loop-invariant clip length in samples

for index, row in data.iterrows():
    audio, sr = librosa.load(
        path=path + row['filename'] + '.wav',
        sr=sr,
        offset=row['start'],
        duration=row['end'] - row['start'],
    )

    if len(audio) >= target_length:
        # Longer than the target: keep only the leading samples.
        audio = audio[:target_length]
    else:
        # Shorter than the target: pad symmetrically around the clip.
        # `size` is passed by keyword because it is keyword-only in librosa >= 0.10.
        audio = librosa.util.pad_center(audio, size=target_length)

    time_domain_audio.append(audio)
    print(index)
    
        

In [None]:
# Stack the equal-length clips into a single 2-D array (one clip per row).
time_domain_audio = np.asarray(time_domain_audio)
time_domain_audio.shape

In [None]:
import h5py

# Write the time-domain clips to HDF5. The context manager closes (and thereby
# flushes) the file as soon as the dataset is written, so it is not left open
# in write mode while other cells read it.
with h5py.File('/kaggle/working/time_domain_data.hdf5', 'w') as f1:
    td_dataset = f1.create_dataset('data', dtype='float64', data=time_domain_audio)

In [None]:
# Read the dataset back and check that it matches the in-memory array.
# The file is closed on leaving the `with` block; `[:]` copies the data into
# memory first, so the array remains usable afterwards.
with h5py.File('/kaggle/working/time_domain_data.hdf5', 'r') as f2:
    time_domain_data_rest = f2['data'][:]
np.array_equal(time_domain_audio, time_domain_data_rest)

In [None]:
import matplotlib.pyplot as plt

# Plot one clip in the time domain.
sample = time_domain_audio[343, :]
# Sample k is taken at k / sr seconds, so the axis runs from 0 to (N - 1) / sr.
t = np.arange(sample.size) / sr

plt.figure(figsize=(20, 5))
plt.plot(t, sample)
plt.xlabel('Time [s]')
plt.ylabel('Amplitude')
plt.title('Time-domain signal')
plt.show()

In [None]:
import scipy as sc

# Two-sided magnitude spectrum of `sample`, restricted to |f| <= sr/4.
# FFT bin k corresponds to k * sr / N Hz (N = sample.size), so the frequency
# axis is taken from fftfreq rather than assuming one bin per hertz.
f = sc.fft.fftshift(sc.fft.fftfreq(sample.size, d=1 / sr))
z = sc.fft.fftshift(np.abs(sc.fft.fft(sample)))

# Keep only the band shown on the plot.
band = np.abs(f) <= sr / 4
f = f[band]
z = z[band]

plt.figure(figsize=(20, 5))
# Scale by 2/N (single-sided amplitude convention).
plt.plot(f, z * 2 / sample.size)
plt.xlabel('Frequency [Hz]')
plt.ylabel('Magnitude')
plt.show()

In [None]:
# Log-magnitude STFT of the example clip (256-sample window, 128-sample hop).
magnitude = np.abs(librosa.stft(sample, n_fft=256, hop_length=128, win_length=256))
# The small offset avoids log(0).
# NOTE(review): np.log is the natural logarithm; a decibel scale would use
# np.log10 - confirm which is intended.
stft = 20 * np.log(magnitude + 1e-10)
stft.shape

In [None]:
from matplotlib.colors import BoundaryNorm
from matplotlib.ticker import MaxNLocator

# Draw the log-magnitude STFT of the example clip as a colour mesh.
n_freq_bins, n_frames = stft.shape
t = np.linspace(0, mean_length, n_frames)   # time axis spanning the clip length
f = np.linspace(0, sr/2, n_freq_bins)       # frequency axis from 0 to sr/2

# Discrete colour levels spanning the value range of the spectrogram.
levels = MaxNLocator(nbins=15).tick_values(stft.min(), stft.max())
cmap = plt.get_cmap('YlOrRd')
norm = BoundaryNorm(levels, ncolors=cmap.N, clip=True)

fig, ax = plt.subplots(nrows=1, figsize=(20, 5))
im = ax.pcolormesh(t, f, stft, cmap=cmap, norm=norm)
fig.colorbar(im, ax=ax)
ax.set_title('STFT Spectrogram')
plt.show()

In [None]:
# Compute the log-magnitude STFT of every clip with the same window and hop settings.
stft_dataset = []

for sample_audio in time_domain_audio:
    magnitude = np.abs(librosa.stft(sample_audio, n_fft=256, hop_length=128, win_length=256))
    # The small offset avoids log(0).
    # NOTE(review): natural log is used here; log10 would give decibels - confirm intent.
    stft = 20 * np.log(magnitude + 1e-10)
    stft_dataset.append(stft)
    
    

In [None]:
# Stack the per-clip spectrograms into one array (one spectrogram per leading index).
stft_dataset = np.asarray(stft_dataset)
stft_dataset.shape

In [None]:
# Write the STFT dataset to HDF5; the context manager closes (and thereby
# flushes) the file once the dataset has been created.
with h5py.File('/kaggle/working/stft_dataset.hdf5', 'w') as f3:
    stft_data = f3.create_dataset('data_stft', dtype='float64', data=stft_dataset)

In [None]:
# Also store the time-domain clips in NumPy's .npy format.
np.save('/kaggle/working/time_domain.npy', time_domain_audio)

In [None]:
# Also store the STFT dataset in NumPy's .npy format.
np.save('/kaggle/working/stft_dataset.npy', stft_dataset)