# Prepare Data for Training

In [1]:
# system imports
import os

# additional imports
import pandas as pd
from tqdm.auto import tqdm

# internal imports
from utils import preproces

## Download and Extract Dataset

In [2]:
# Prerequisite: fetch the archive from Kaggle by hand and unpack it into ./data/
# https://www.kaggle.com/himanshu007121/coughclassifier-trial/download

# Index file that is read into df_dataset; later cells use its
# "file_properties" and "class" columns.
fn_dataset = "data/cough_trial_extended.csv"
df_dataset = pd.read_csv(filepath_or_buffer=fn_dataset)

# Preview the first five rows as the cell output.
df_dataset.head(n=5)

Unnamed: 0,file_properties,class
0,0v8MGxNetjg_ 10.000_ 20.000.wav,not_covid
1,1j1duoxdxBg_ 70.000_ 80.000.wav,not_covid
2,1MSYO4wgiag_ 120.000_ 130.000.wav,not_covid
3,1PajbAKd8Kg_ 0.000_ 10.000.wav,not_covid
4,cov1.wav,covid


## Feature Extraction

In [3]:
# Column layout of the prepared feature table: the file name, six named
# feature columns, mfcc1..mfcc20, and finally the class label.
df_features_cols = ["filename", "chroma_stft", "rmse", "spectral_centroid", "spectral_bandwidth", "rolloff", "zero_crossing_rate"]
df_features_cols.extend(f'mfcc{i}' for i in range(1, 21))
df_features_cols.append('label')

# Collect one record per audio file and build the DataFrame once afterwards.
# The previous per-row DataFrame.append call was deprecated in pandas 1.4 and
# removed in pandas 2.0, and it re-copied the whole frame on every iteration.
feature_rows = []
for row_index, row in tqdm(df_dataset.iterrows(), total=len(df_dataset)):
    fn_wav = os.path.join("data/trial_covid/", row["file_properties"])
    # NOTE(review): assumes preproces returns a mapping-like object (dict or
    # pandas Series) keyed by feature name - confirm against utils.preproces.
    # dict(...) takes a copy so the returned object itself is not modified.
    feature_row = dict(preproces(fn_wav))
    feature_row["filename"] = row["file_properties"]
    feature_row["label"] = row["class"]
    feature_rows.append(feature_row)

# columns= fixes the column order and yields an empty, correctly-labelled frame
# when df_dataset has no rows; keys missing from a record become NaN.
df_features = pd.DataFrame(feature_rows, columns=df_features_cols)

# Persist the table (without the index) for the training step.
df_features.to_csv("data/prepared_data.csv", index=False, columns=df_features_cols)

100%|██████████| 170/170 [00:31<00:00,  5.38it/s]


In [4]:
# Show the first five rows of df_features as the cell output.
df_features.head(n=5)

Unnamed: 0,filename,chroma_stft,rmse,spectral_centroid,spectral_bandwidth,rolloff,zero_crossing_rate,mfcc1,mfcc2,mfcc3,...,mfcc12,mfcc13,mfcc14,mfcc15,mfcc16,mfcc17,mfcc18,mfcc19,mfcc20,label
0,0v8MGxNetjg_ 10.000_ 20.000.wav,0.519951,0.045853,1612.895795,1411.838677,2907.580566,0.107019,-376.876007,111.017372,-31.904015,...,-7.439712,-1.03458,-0.203084,-3.513495,-1.745705,-3.011878,-2.878482,-2.106427,-4.026825,not_covid
1,1j1duoxdxBg_ 70.000_ 80.000.wav,0.535472,0.001771,2892.087076,2467.408141,5072.664388,0.148584,-519.158447,60.781284,-13.722886,...,-0.909972,7.216461,-1.71963,3.903021,3.653039,3.043882,2.439957,2.781968,2.195162,not_covid
2,1MSYO4wgiag_ 120.000_ 130.000.wav,0.496666,0.033657,3429.061935,2788.634413,6886.288452,0.225315,-282.297913,48.58168,-15.522366,...,-6.066336,-4.16764,1.017302,-0.523806,0.538693,-8.855953,-2.927977,-1.118562,-5.906228,not_covid
3,1PajbAKd8Kg_ 0.000_ 10.000.wav,0.407549,0.013452,2710.811637,2664.28755,5778.474935,0.142076,-346.8573,75.765617,-7.648193,...,5.053118,-0.291308,0.987186,-2.447526,3.692367,2.312328,-2.059656,-4.772599,-0.503851,not_covid
4,cov1.wav,0.412697,0.059004,1555.648634,1418.599932,2870.737092,0.133998,-340.588013,104.1567,-32.228443,...,-8.247169,0.940006,-5.701087,-6.32663,-1.08004,-1.812609,-2.518986,-3.684266,-3.564146,covid
