# Prepare Data for Training

In [14]:
# system imports
import os
import glob

# additional imports
import pandas as pd
from tqdm.auto import tqdm
from sklearn.utils import resample

# internal imports
from utils import preproces

## Kaggle Dataset - Exploration

In [6]:
# The Kaggle dataset has to be downloaded manually and unpacked into ./data/kaggle:
# https://www.kaggle.com/himanshu007121/coughclassifier-trial/download

fn_dataset = 'data/kaggle/cough_trial_extended.csv'
df_dataset = pd.read_csv(fn_dataset)

# Count the rows per label found in the 'class' column.
kaggle_labels = df_dataset['class']
n_covid = int((kaggle_labels == 'covid').sum())
n_not_covid = int((kaggle_labels == 'not_covid').sum())

print('Total number of examples:', len(df_dataset))
print('Number of positive examples:', n_covid)
print('Number of negative examples:', n_not_covid)

df_dataset.head()

Total number of examples: 170
Number of positive examples: 19
Number of negative examples: 151


Unnamed: 0,file_properties,class
0,0v8MGxNetjg_ 10.000_ 20.000.wav,not_covid
1,1j1duoxdxBg_ 70.000_ 80.000.wav,not_covid
2,1MSYO4wgiag_ 120.000_ 130.000.wav,not_covid
3,1PajbAKd8Kg_ 0.000_ 10.000.wav,not_covid
4,cov1.wav,covid


## Kaggle Dataset - Feature Extraction

In [9]:
# Column layout of the prepared CSV: file name, six summary features,
# the MFCC columns mfcc1..mfcc20, and finally the class label.
df_features_cols = ['filename', 'chroma_stft', 'rmse', 'spectral_centroid', 'spectral_bandwidth', 'rolloff', 'zero_crossing_rate']
df_features_cols += [f'mfcc{i}' for i in range(1, 21)]
df_features_cols.append('label')

# Collect one record per audio file and build the frame once afterwards.
# Growing the frame with DataFrame.append inside the loop copies the whole
# frame on every iteration, and the method no longer exists in pandas >= 2.0.
feature_records = []
for row_index, row in tqdm(df_dataset.iterrows(), total=len(df_dataset)):
    fn_wav = os.path.join('data/kaggle/trial_covid/', row['file_properties'])
    # NOTE(review): preproces is assumed to return a dict-like mapping
    # (dict or Series) of feature name -> value; confirm against utils.
    feature_row = dict(preproces(fn_wav))
    feature_row['filename'] = row['file_properties']
    feature_row['label'] = row['class']
    feature_records.append(feature_row)

df_features = pd.DataFrame(feature_records, columns=df_features_cols)

df_features.to_csv('data/prepared_data_kaggle.csv', index=False, columns=df_features_cols)

df_features.head()

100%|██████████| 170/170 [00:17<00:00,  9.82it/s]


Unnamed: 0,filename,chroma_stft,rmse,spectral_centroid,spectral_bandwidth,rolloff,zero_crossing_rate,mfcc1,mfcc2,mfcc3,...,mfcc12,mfcc13,mfcc14,mfcc15,mfcc16,mfcc17,mfcc18,mfcc19,mfcc20,label
0,0v8MGxNetjg_ 10.000_ 20.000.wav,0.519951,0.045853,1612.895795,1411.838677,2907.580566,0.107019,-376.876007,111.017372,-31.904015,...,-7.439712,-1.03458,-0.203084,-3.513495,-1.745705,-3.011878,-2.878482,-2.106427,-4.026825,not_covid
1,1j1duoxdxBg_ 70.000_ 80.000.wav,0.535472,0.001771,2892.087076,2467.408141,5072.664388,0.148584,-519.158447,60.781284,-13.722886,...,-0.909972,7.216461,-1.71963,3.903021,3.653039,3.043882,2.439957,2.781968,2.195162,not_covid
2,1MSYO4wgiag_ 120.000_ 130.000.wav,0.496666,0.033657,3429.061935,2788.634413,6886.288452,0.225315,-282.297913,48.58168,-15.522366,...,-6.066336,-4.16764,1.017302,-0.523806,0.538693,-8.855953,-2.927977,-1.118562,-5.906228,not_covid
3,1PajbAKd8Kg_ 0.000_ 10.000.wav,0.407549,0.013452,2710.811637,2664.28755,5778.474935,0.142076,-346.8573,75.765617,-7.648193,...,5.053118,-0.291308,0.987186,-2.447526,3.692367,2.312328,-2.059656,-4.772599,-0.503851,not_covid
4,cov1.wav,0.412697,0.059004,1555.648634,1418.599932,2870.737092,0.133998,-340.588013,104.1567,-32.228443,...,-8.247169,0.940006,-5.701087,-6.32663,-1.08004,-1.812609,-2.518986,-3.684266,-3.564146,covid


## Virufy Dataset - Exploration

In [10]:
# download and extract the segmented folder of the virufy dataset into the ./data/virufy folder!
# https://github.com/virufy/virufy_data/tree/main/clinical/segmented

# Build the index from the folder layout: files under pos/ are labelled
# 'covid', files under neg/ are labelled 'not_covid'.
# The glob results are sorted because glob.glob returns files in arbitrary
# order, which would make the row order differ between runs and machines.
# The records are collected first and the frame is built once, since
# DataFrame.append is not available in pandas >= 2.0.
virufy_records = [
    {'file_properties': fn, 'class': 'covid'}
    for fn in sorted(glob.glob('data/virufy/pos/*.mp3'))
]
virufy_records += [
    {'file_properties': fn, 'class': 'not_covid'}
    for fn in sorted(glob.glob('data/virufy/neg/*.mp3'))
]
# Passing the columns explicitly keeps both columns present even when no files are found.
df_dataset = pd.DataFrame(virufy_records, columns=['file_properties', 'class'])

print('Total number of examples:', len(df_dataset))
print('Number of positive examples:', len(df_dataset[df_dataset['class'] == 'covid']))
print('Number of negative examples:', len(df_dataset[df_dataset['class'] == 'not_covid']))

df_dataset.head()

Total number of examples: 121
Number of positive examples: 48
Number of negative examples: 73


Unnamed: 0,file_properties,class
0,data/virufy/pos\pos-0421-084-cough-m-50-0.mp3,covid
1,data/virufy/pos\pos-0421-084-cough-m-50-1.mp3,covid
2,data/virufy/pos\pos-0421-084-cough-m-50-2.mp3,covid
3,data/virufy/pos\pos-0421-084-cough-m-50-3.mp3,covid
4,data/virufy/pos\pos-0421-084-cough-m-50-4.mp3,covid


## Virufy Dataset - Feature Extraction

In [11]:
# Column layout of the prepared CSV: file name, six summary features,
# the MFCC columns mfcc1..mfcc20, and finally the class label.
df_features_cols = ['filename', 'chroma_stft', 'rmse', 'spectral_centroid', 'spectral_bandwidth', 'rolloff', 'zero_crossing_rate']
df_features_cols += [f'mfcc{i}' for i in range(1, 21)]
df_features_cols.append('label')

# Collect one record per audio file and build the frame once afterwards.
# Growing the frame with DataFrame.append inside the loop copies the whole
# frame on every iteration, and the method no longer exists in pandas >= 2.0.
feature_records = []
for row_index, row in tqdm(df_dataset.iterrows(), total=len(df_dataset)):
    # 'file_properties' already holds the full path returned by glob here.
    fn_wav = row['file_properties']
    # NOTE(review): preproces is assumed to return a dict-like mapping
    # (dict or Series) of feature name -> value; confirm against utils.
    feature_row = dict(preproces(fn_wav))
    feature_row['filename'] = row['file_properties']
    feature_row['label'] = row['class']
    feature_records.append(feature_row)

df_features = pd.DataFrame(feature_records, columns=df_features_cols)

df_features.to_csv('data/prepared_data_virufy.csv', index=False, columns=df_features_cols)

df_features.head()

100%|██████████| 121/121 [00:30<00:00,  4.00it/s]


Unnamed: 0,filename,chroma_stft,rmse,spectral_centroid,spectral_bandwidth,rolloff,zero_crossing_rate,mfcc1,mfcc2,mfcc3,...,mfcc12,mfcc13,mfcc14,mfcc15,mfcc16,mfcc17,mfcc18,mfcc19,mfcc20,label
0,data/virufy/pos\pos-0421-084-cough-m-50-0.mp3,0.248572,0.03816,928.803303,985.3944,1928.469981,0.049996,-452.175018,51.507637,-2.926436,...,-8.539044,-4.274072,-6.053506,4.326995,-2.804232,-8.568246,-4.783424,-4.805704,-6.343561,covid
1,data/virufy/pos\pos-0421-084-cough-m-50-1.mp3,0.27289,0.042899,1198.073413,1228.456213,2544.662874,0.058636,-411.897034,50.288296,-10.50984,...,-10.116323,-2.845035,-6.256668,1.597882,-6.182134,-7.652954,-4.851939,-4.44581,-5.718525,covid
2,data/virufy/pos\pos-0421-084-cough-m-50-2.mp3,0.211356,0.045217,779.386249,813.789555,1617.642875,0.039905,-419.055237,54.78249,-12.589089,...,-10.108368,-1.575916,-6.721029,0.88909,-6.61612,-5.551765,-3.358707,-4.357968,-5.224887,covid
3,data/virufy/pos\pos-0421-084-cough-m-50-3.mp3,0.274176,0.050415,1166.375454,1128.14004,2349.459706,0.062196,-381.292816,61.291149,-18.051516,...,-12.777524,-0.523028,-8.939261,1.442178,-4.029651,-8.327115,-5.125196,-6.032467,-5.149289,covid
4,data/virufy/pos\pos-0421-084-cough-m-50-4.mp3,0.259742,0.045859,1104.65973,1145.806281,2281.427267,0.054963,-399.710876,61.209465,-15.755272,...,-11.284485,-0.295882,-8.218568,2.517593,-1.999434,-7.744737,-3.366359,-5.900706,-7.301231,covid


## Combine Datasets

In [16]:
# Reload both prepared feature tables from disk so this cell does not need
# the (slow) feature-extraction cells to have run in the current kernel.
df_features_kaggle = pd.read_csv('data/prepared_data_kaggle.csv')
df_features_virufy = pd.read_csv('data/prepared_data_virufy.csv')

# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# both inputs contribute their own 0-based labels and the index has duplicates.
df_features = pd.concat([df_features_kaggle, df_features_virufy], ignore_index=True)

# Take the column order from the loaded Kaggle CSV instead of relying on a
# variable that only exists if an extraction cell was executed earlier.
df_features.to_csv('data/prepared_data.csv', index=False, columns=list(df_features_kaggle.columns))

print('Total number of examples:', len(df_features))
print('Number of positive examples:', len(df_features[df_features['label'] == 'covid']))
print('Number of negative examples:', len(df_features[df_features['label'] == 'not_covid']))

Total number of examples: 291
Number of positive examples: 67
Number of negative examples: 224


## Balanced Dataset

In [19]:
df_features = pd.read_csv('data/prepared_data.csv')

# Split the rows by label: 'not_covid' is treated as the majority class,
# 'covid' as the minority class.
is_not_covid = df_features['label'] == 'not_covid'
is_covid = df_features['label'] == 'covid'
df_majority = df_features.loc[is_not_covid]
df_minority = df_features.loc[is_covid]

# Draw (without replacement, fixed seed) as many majority rows as there are minority rows.
n_minority = len(df_minority)
df_majority_balanced = resample(df_majority, replace=False, n_samples=n_minority, random_state=42)

# Stack the downsampled majority rows on top of all minority rows and save.
df_balanced = pd.concat([df_majority_balanced, df_minority])
df_balanced.to_csv('data/prepared_data_balanced.csv', index=False)

balanced_labels = df_balanced['label']
print('Total number of examples:', len(df_balanced))
print('Number of positive examples:', int((balanced_labels == 'covid').sum()))
print('Number of negative examples:', int((balanced_labels == 'not_covid').sum()))

Total number of examples: 134
Number of positive examples: 67
Number of negative examples: 67
