In [None]:
# Utils
import pandas as pd
import numpy as np
import json
import os
import subprocess
import zipfile
import IPython.display as ipd
from pathlib import Path
from collections import Counter

# Library for audio processing
import librosa
import soundfile as sf

# Visualizations and plots
import matplotlib.pyplot as plt

# Display / warning options
pd.options.display.max_columns = None  # show every column of wide frames
pd.set_option("mode.chained_assignment", None)  # turn off the chained-assignment warning

# Colab: mount Google Drive, where the project folders used below are stored
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


# Data Loading

The dataset is distributed as a compressed file. We extract it and save the audio tracks and metadata in the **Dataset** folder. In particular, the data is contained in the subfolder ***coughvid_20211012***.

In [None]:
# Extraction of the dataset archive into the Dataset folder.
# NOTE(review): presumably left commented out so that re-running the notebook
# does not unzip the archive a second time — confirm before re-enabling.
# with zipfile.ZipFile('/content/gdrive/MyDrive/Colab Notebooks/DSIM Project/public_dataset_v3.zip', 'r') as zip_ref:
#     zip_ref.extractall('/content/gdrive/MyDrive/Colab Notebooks/DSIM Project/Dataset')

In [None]:
# Sanity check: print how many entries the extracted dataset folder contains
print(
    len(list(
        Path('/content/gdrive/MyDrive/Colab Notebooks/DSIM Project/Dataset/coughvid_20211012').iterdir()
    ))
)

68869


The folder contains the correct number of files. The extraction process has completed without any issues. The folder contains both the audio tracks (with .wav, .webm and .ogg extensions) and the metadata (a .json file per audio track).

The files are then split into multiple subfolders by the `0_MovingFiles.ipynb` notebook.



# Dataset Cleaning

The dataset contains 34434 audio tracks. However, some of these are not cough tracks. The metadata dataframe has an attribute, `cough_detected`, which indicates the probability that the corresponding audio track contains coughs. This attribute was calculated by the authors of the dataset using a specific classifier. As suggested by the authors themselves, it is appropriate to consider only tracks with cough_detected > 0.8, which should leave fewer than 5% non-cough tracks.

In [None]:
# Load the original metadata table and discard the 'Unnamed: 0' column
# that comes along with the CSV.
metadata = (
    pd.read_csv('/content/gdrive/MyDrive/Colab Notebooks/DSIM Project/Metadata/metadata_original.csv')
    .drop(columns=['Unnamed: 0'])
)
metadata.head()

Unnamed: 0,uuid,datetime,cough_detected,latitude,longitude,age,gender,respiratory_condition,fever_muscle_pain,status,status_SSL,quality_1,cough_type_1,dyspnea_1,wheezing_1,stridor_1,choking_1,congestion_1,nothing_1,diagnosis_1,severity_1,quality_2,cough_type_2,dyspnea_2,wheezing_2,stridor_2,choking_2,congestion_2,nothing_2,diagnosis_2,severity_2,quality_3,cough_type_3,dyspnea_3,wheezing_3,stridor_3,choking_3,congestion_3,nothing_3,diagnosis_3,severity_3,quality_4,cough_type_4,dyspnea_4,wheezing_4,stridor_4,choking_4,congestion_4,nothing_4,diagnosis_4,severity_4
0,00014dcc-0f06-4c27-8c7b-737b18a2cf4c,2020-11-25T18:58:50.488301+00:00,0.0155,48.9,2.4,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,00039425-7f3a-42aa-ac13-834aaa2b6b92,2020-04-13T21:30:59.801831+00:00,0.9609,31.3,34.8,15.0,male,False,False,healthy,healthy,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,0007c6f1-5441-40e6-9aaf-a761d8f2da3b,2020-10-18T15:38:38.205870+00:00,0.1643,,,46.0,female,False,False,healthy,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,00098cdb-4da1-4aa7-825a-4f1b9abc214b,2021-01-22T22:08:06.742577+00:00,0.1133,47.4,9.4,66.0,female,False,False,healthy,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,0009eb28-d8be-4dc1-92bb-907e53bc5c7a,2020-04-12T04:02:18.159383+00:00,0.9301,40.0,-75.1,34.0,male,True,False,healthy,healthy,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [None]:
# Show the table dimensions followed by the column names, one per line of output
print(metadata.shape, metadata.columns, sep='\n')

(34434, 51)
Index(['uuid', 'datetime', 'cough_detected', 'latitude', 'longitude', 'age',
       'gender', 'respiratory_condition', 'fever_muscle_pain', 'status',
       'status_SSL', 'quality_1', 'cough_type_1', 'dyspnea_1', 'wheezing_1',
       'stridor_1', 'choking_1', 'congestion_1', 'nothing_1', 'diagnosis_1',
       'severity_1', 'quality_2', 'cough_type_2', 'dyspnea_2', 'wheezing_2',
       'stridor_2', 'choking_2', 'congestion_2', 'nothing_2', 'diagnosis_2',
       'severity_2', 'quality_3', 'cough_type_3', 'dyspnea_3', 'wheezing_3',
       'stridor_3', 'choking_3', 'congestion_3', 'nothing_3', 'diagnosis_3',
       'severity_3', 'quality_4', 'cough_type_4', 'dyspnea_4', 'wheezing_4',
       'stridor_4', 'choking_4', 'congestion_4', 'nothing_4', 'diagnosis_4',
       'severity_4'],
      dtype='object')


Some of the attributes in the dataset are related to annotations made by experts regarding possible pathologies or features deducible from the audio track. Only 1000 audio tracks have been annotated, so we decide not to consider such attributes. Among the other attributes, we are interested in `status` and `cough_detected`.

In [None]:
# Keep only the attributes used from here on. This discards 'datetime',
# 'latitude', 'longitude', 'status_SSL' and all the expert-annotation columns
# (the ones suffixed _1 ... _4).
# Selecting the columns to keep, rather than dropping the 44 unwanted ones,
# gives the same frame (same columns, same order) and lets the cell be
# re-run safely: a second `drop` would raise KeyError because the columns
# are already gone.
metadata = metadata[['uuid', 'cough_detected', 'age', 'gender',
                     'respiratory_condition', 'fever_muscle_pain', 'status']]

In [None]:
# Number of missing values in each column
metadata.isnull().sum(axis=0)

uuid                         0
cough_detected               0
age                      15038
gender                   13770
respiratory_condition    13770
fever_muscle_pain        13770
status                   13770
dtype: int64

We can notice the presence of tracks with an unknown value for the `status` attribute. Since our goal is to perform a supervised learning task, we are going to ignore such records.

In [None]:
# Keep only the labelled records, i.e. rows whose `status` is known
metadata = metadata[metadata['status'].notna()]
metadata.shape

(20664, 7)

We also decide to remove the records with missing values for the age attribute, in case we want to use it for classification.

In [None]:
# Keep only the rows whose `age` is known
metadata = metadata[metadata['age'].notna()]
metadata.shape

(19396, 7)

At this point, as mentioned before, we are going to remove the records with cough_detected < 0.8.

In [None]:
# Retain the recordings whose cough-detection score is at least 0.8
cough_tracks = metadata.loc[metadata['cough_detected'].ge(0.8)]
cough_tracks.shape

(12887, 7)

Only 12887 audio tracks are kept; according to the creators of the dataset, over 95% of them contain cough sounds.

The audio tracks are distributed in this way:

In [None]:
# Number of retained tracks for each value of `status`
cough_tracks.groupby(by='status').size()

status
COVID-19        634
healthy        9631
symptomatic    2622
dtype: int64

In [None]:
# Saving the dataframe.
# The path and encoding are handed straight to pandas instead of passing a
# handle from open(..., 'w'): a text-mode handle must be opened with
# newline='' for to_csv, otherwise line endings are doubled on platforms
# with universal-newline translation. Letting pandas open the file avoids that.
# The index is still written (to_csv default), so the file layout is unchanged.
cough_tracks.to_csv(
    '/content/gdrive/MyDrive/Colab Notebooks/DSIM Project/Metadata/metadata_coughtracks.csv',
    encoding='utf-8-sig',
)