In [1]:
import os
import pandas as pd
import pickle
import numpy as np
from IPython.display import clear_output

### adding to PATH environment variable
To get librosa and other audio libraries to run. dont know why but jupyter does not load the necessary environment pth variables by itself

In [2]:
path = os.environ.get("PATH")
additional_path = 'C:\\Users\\Michi\\Anaconda3\\envs\\python_v3-8;C:\\Users\\Michi\\Anaconda3\\envs\\python_v3-8\\Library\\mingw-w64\\bin;C:\\Users\\Michi\\Anaconda3\\envs\\python_v3-8\\Library\\usr\\bin;C:\\Users\\Michi\\Anaconda3\\envs\\python_v3-8\\Library\\bin;C:\\Users\\Michi\\Anaconda3\\envs\\python_v3-8\\Scripts;C:\\Users\\Michi\\Anaconda3\\envs\\python_v3-8\\bin;C:\\Users\\Michi\\Anaconda3\\condabin'
min_additional_path = "C:\\Users\\Michi\\Anaconda3\\envs\\python_v3-8\\Library\\bin;C:\\Users\\micha\\anaconda3\\envs\\ai38\\Library\\bin;"
combined_path = min_additional_path + path
os.environ["PATH"] = combined_path
import librosa

In [3]:
def rename_columns(metadata_df):
    meta_data_labels = {
        "id": "user_id",
        "a": "age",
        "covid_status": "covid_health_status",
        "record_date": "record_date",
        "ep": "english_proficiency",
        "g": "gender",
        "l_c": "country",
        "l_l": "local_region",
        "l_s": "state",
        "rU": "returning_user",
        "asthma": "asthma",
        "cough": "cough",
        "smoker": "smoker",
        "test_status": "covid_test_result",
        "ht": "hypertension",
        "cold": "cold",
        "diabetes": "diabetes",
        "diarrhoea": "diarrheoa",
        "um": "was_using_mask",
        "ihd": "ischemic_heart_disease",
        "bd": "breathing_difficulties",
        "st": "sore_throat",
        "fever": "fever",
        "ftg": "fatigue",
        "mp": "muscle_pain",
        "loss_of_smell": "loss_of_smell",
        "cld": "chronic_lung_disease",
        "pneumonia": "pneumonia",
        "ctScan": "has_taken_ct_scan",
        "testType": "type_of_covid_test",
        "test_date": "covid_test_date",
        "vacc": "vaccination_status",  # (y->both doses, p->one dose(partially vaccinated), n->no doses)
        "ctDate": "date_of_ct_scan",
        "ctScore": "ct_score",
        "others_resp": "other_respiratory_illness",
        "others_preexist": "other_preexisting_condition"
    }
    metadata_df.rename(meta_data_labels, axis="columns", inplace=True)


In [4]:
def create_labels(metadata_df, verbose=False):
    # 0 meaning healthy, 1 meaning covid-infection
    NEGATIVE_LABELS = ["healthy", "resp_illness_not_identified", "no_resp_illness_exposed"]
    POSITIVE_LABELS = ["positive_mild", "positive_moderate", "positive_asymp"]
    UNKNOWN_LABELS =  ["under_validation", "recovered_full"]

    metadata_df["covid_label"] = np.nan
    negative_idx = metadata_df["covid_health_status"].str.contains("|".join(NEGATIVE_LABELS))
    positive_idx = metadata_df["covid_health_status"].str.contains("|".join(POSITIVE_LABELS))
    metadata_df.loc[negative_idx, "covid_label"] = 0
    metadata_df.loc[positive_idx, "covid_label"] = 1
    if verbose:
        print(metadata_df["covid_label"].value_counts())

In [5]:
def get_audio_quality_annotations():
    PATH = "data/Coswara-Data/annotations/"
    recording_quality_files = {"audio_quality_deep_breathing":  PATH + "breathing-deep_labels_pravinm.csv",
                               "audio_quality_shallow_breathing": PATH + "breathing-shallow_labels_pravinm.csv",
                               "audio_quality_heavy_cough": PATH + "cough-heavy_labels_debottam.csv",
                               "audio_quality_shallow_cough": PATH + "cough-shallow_labels_debarpan.csv",
                               "audio_quality_counting_fast": PATH + "counting-fast_labels_pravinm.csv",
                               "audio_quality_counting_normal": PATH + "counting-normal_labels_pravinm.csv",
                               "audio_quality_vowel_a": PATH + "vowel-a_labels_debarpan.csv",
                               "audio_quality_vowel_e": PATH + "vowel-e_labels_debottam.csv",
                               "audio_quality_vowel_o": PATH + "vowel-o_labels_updated_neeraj.csv"}

    df = pd.DataFrame({"user_id":participant_ids})
    for (feature_name, file_path) in recording_quality_files.items():
        audio_quality_annotations = pd.read_csv(file_path)

        rename_dict = {"FILENAME": "user_id",
                       " QUALITY": feature_name}
        
        audio_quality_annotations.rename(rename_dict, axis="columns", inplace=True)
        audio_quality_annotations.user_id = audio_quality_annotations.user_id.str.split("_").str[0]
        df = pd.merge(df, audio_quality_annotations, on="user_id", how="outer")
    return df

In [6]:
def get_recording_durations(id_participant, recordings_path="data/Coswara_processed/Recordings"):
    # returns original duration and duration after trimming trailing/leading silence for each recording type as a dict
    # returns 0 for both if an error occured/the audio is invalid
    recording_types = ["cough-heavy", "cough-shallow", "breathing-deep", "breathing-shallow", "counting-fast",
                       "counting-normal", "vowel-a", "vowel-e", "vowel-o"]
    path = os.path.join(recordings_path, id_participant)
    duration_original = {}
    duration_trimmed = {}
    
    for rec_type in recording_types:
        file_path = f"{os.path.join(path, rec_type)}.wav"
        try:
            audio, sample_rate = librosa.load(file_path, sr=None)
        except FileNotFoundError:
            duration_original[rec_type], duration_trimmed[rec_type] = 0, 0
        if len(audio) == 0:            
            duration_original[rec_type], duration_trimmed[rec_type] = 0, 0
        elif max(audio) == 0:
            duration_original[rec_type], duration_trimmed[rec_type] = 0, 0
        else:
            duration_original[rec_type] = len(audio)/sample_rate
            audio, _ = librosa.effects.trim(audio, top_db=54)
            duration_trimmed[rec_type] = len(audio)/sample_rate
            
        
    return duration_original, duration_trimmed

In [7]:
def get_recording_duration_df(participant_ids, LOAD_FROM_DISC=True):
    from time import sleep
    recording_types = ["cough-heavy", "cough-shallow", "breathing-deep", "breathing-shallow", "counting-fast",
                       "counting-normal", "vowel-a", "vowel-e", "vowel-o"]
    if LOAD_FROM_DISC:
        try:
            audio_recording_metadata = pd.read_csv("data/Coswara_processed/duration_df.csv")
            print("loaded audio_recording_metadata dataframe from disk!")
        except FileNotFoundError: 
            LOAD_FROM_DISC=False
            print("File not found, computing durations anew")
            sleep(3)
            
    if not LOAD_FROM_DISC:
        durations_original, durations_trimmed = [], []
        for idx, participant in enumerate(participant_ids):
            clear_output(wait=True)
            print(f"{idx+1} / {len(participant_ids)}")
            original, trimmed = get_recording_durations(participant)
            durations_original.append(original), durations_trimmed.append(trimmed)

        all_durations_orig = {}
        all_durations_trim = {}
        for rec_type in recording_types:
            all_durations_orig[f"duration_original_{rec_type}"] = [round(recording[rec_type],3) for recording in durations_original]
            all_durations_trim[f"duration_trimmed_{rec_type}"] = [round(recording[rec_type],3) for recording in durations_trimmed]

        duration_dict = {"user_id":participant_ids}
        duration_dict.update(all_durations_orig)
        duration_dict.update(all_durations_trim)
        audio_recording_metadata = pd.DataFrame(duration_dict)

        audio_recording_metadata.to_csv("data/Coswara_processed/duration_df.csv", index=False)
    return audio_recording_metadata

In [8]:
def get_invalid_recordings(audio_recording_metadata, recording_types_used=
                           ["cough-heavy", "cough-shallow", "breathing-deep", "breathing-shallow", 
                            "counting-fast","counting-normal", "vowel-a", "vowel-e", "vowel-o"]):
    
    recording_is_invalid=[]
    for rec_type in recording_types_used:
        recording_is_invalid.append(audio_recording_metadata[f"duration_original_{rec_type}"] == 0)

    n_invalid_recordings = np.array(recording_is_invalid).sum(axis=0)
    contains_invlaid_recording = n_invalid_recordings > 0
    contains_invalid_recording_df = pd.DataFrame({"user_id": participant_ids, 
                                                  "recording_invalid": contains_invlaid_recording, 
                                                  "n_invalid_recordings": n_invalid_recordings})
    return contains_invalid_recording_df

### get into the correct root directory

In [9]:
root_dir = "python"
_, current_folder = os.path.split(os.getcwd())
if current_folder != root_dir:
    os.chdir("../")

### Load original meta data file
Make sure that the "combined matadata" file was copied in this directory from the original coswara folder and renamed tto "original_metadata.csv"

In [10]:
metadata = pd.read_csv("data/Coswara_processed/original_metadata.csv")
participant_ids = os.listdir("data/Coswara_processed/Recordings/")
metadata.head(3)

Unnamed: 0,id,a,covid_status,record_date,ep,g,l_c,l_l,l_s,rU,...,vacc,bd,others_resp,ftg,st,ihd,asthma,others_preexist,cld,pneumonia
0,iV3Db6t1T8b7c5HQY2TwxIhjbzD3,28,healthy,2020-04-23,y,male,India,Anantapur,Andhra Pradesh,n,...,,,,,,,,,,
1,AxuYWBN0jFVLINCBqIW5aZmGCdu1,25,healthy,2020-04-20,y,male,India,BENGALURU URBAN,Karnataka,n,...,,,,,,,,,,
2,C5eIsssb9GSkaAgIfsHMHeR6fSh1,28,healthy,2020-04-24,y,female,United States,Pittsburgh,Pennsylvania,n,...,,,,,,,,,,


### Rename the columns of the Dataframe
The previous names were uninterpretable without additional explanations

In [11]:
rename_columns(metadata)
metadata.head(3)

Unnamed: 0,user_id,age,covid_health_status,record_date,english_proficiency,gender,country,local_region,state,returning_user,...,vaccination_status,breathing_difficulties,other_respiratory_illness,fatigue,sore_throat,ischemic_heart_disease,asthma,other_preexisting_condition,chronic_lung_disease,pneumonia
0,iV3Db6t1T8b7c5HQY2TwxIhjbzD3,28,healthy,2020-04-23,y,male,India,Anantapur,Andhra Pradesh,n,...,,,,,,,,,,
1,AxuYWBN0jFVLINCBqIW5aZmGCdu1,25,healthy,2020-04-20,y,male,India,BENGALURU URBAN,Karnataka,n,...,,,,,,,,,,
2,C5eIsssb9GSkaAgIfsHMHeR6fSh1,28,healthy,2020-04-24,y,female,United States,Pittsburgh,Pennsylvania,n,...,,,,,,,,,,


### Adding Audio Quality (0|1|2) Columns for each Type of Recording
2(excellent), 1(good), 0(bad)

In [12]:
audio_quality_annotations = get_audio_quality_annotations()
audio_quality_annotations.head(3)

Unnamed: 0,user_id,audio_quality_deep_breathing,audio_quality_shallow_breathing,audio_quality_heavy_cough,audio_quality_shallow_cough,audio_quality_counting_fast,audio_quality_counting_normal,audio_quality_vowel_a,audio_quality_vowel_e,audio_quality_vowel_o
0,00xKcQMmcAhX8CODgBBLOe7Dm0T2,2,2,2.0,2.0,2.0,2.0,2.0,2,2.0
1,01GtHP1FUbXKdWEUwApFdusuO773,0,0,,,,,,1,
2,01n0u9YSkXQSkmN45J65eigyCMC3,0,1,2.0,1.0,2.0,2.0,2.0,2,2.0


In [13]:
# qudio quality distribution of two columns
audio_quality_annotations.audio_quality_heavy_cough.value_counts(), audio_quality_annotations.audio_quality_shallow_breathing.value_counts()

(2.0    1927
 1.0     167
 0.0     139
 Name: audio_quality_heavy_cough, dtype: int64,
 2    1906
 0     472
 1     368
 Name: audio_quality_shallow_breathing, dtype: int64)

# Create binary values from various subcategories
A participant is allowed to choose only one of the below labels.

['covid_status']=='positive_asymp': Infected with COVID-19 and has no Covid like symptoms

['covid_status']=='positive_mild': Infected with COVID-19 and has mild Covid like symptoms

['covid_status']=='positive_moderate': Infected with COVID-19 and has moderate Covid like symptoms

['covid_status']=='recovered_full': Was infected with COVID-19 and has recovered during data collection.

['covid_status']=='resp_illness_not_identified': Not infected by COVID-19 but has some other respiratory illness

['covid_status']=='no_resp_illness_exposed': Not infected by COVID-19, has no respiratory illness, but has been exposed to SARS-CoV-2 virus. Example, a healthcare worker or somebody staying with a COVID positive patient.

['covid_status']=='healthy': Not infected by COVID-19 and none of the above.

['covid_status']=='under validation ': It is for our internal use related to clinical validation of the Coswara web application.

In [14]:
metadata.covid_health_status.value_counts()

healthy                        1433
positive_mild                   426
no_resp_illness_exposed         248
positive_moderate               165
resp_illness_not_identified     157
recovered_full                  146
positive_asymp                   90
under_validation                 81
Name: covid_health_status, dtype: int64

In [15]:
create_labels(metadata, verbose=True)

0.0    1838
1.0     681
Name: covid_label, dtype: int64


In [16]:
recording_metadata_df = get_recording_duration_df(participant_ids, True)
recording_metadata_df.head()

loaded audio_recording_metadata dataframe from disk!


Unnamed: 0.1,Unnamed: 0,user_id,duration_original_cough-heavy,duration_original_cough-shallow,duration_original_breathing-deep,duration_original_breathing-shallow,duration_original_counting-fast,duration_original_counting-normal,duration_original_vowel-a,duration_original_vowel-e,duration_original_vowel-o,duration_trimmed_cough-heavy,duration_trimmed_cough-shallow,duration_trimmed_breathing-deep,duration_trimmed_breathing-shallow,duration_trimmed_counting-fast,duration_trimmed_counting-normal,duration_trimmed_vowel-a,duration_trimmed_vowel-e,duration_trimmed_vowel-o
0,0,00xKcQMmcAhX8CODgBBLOe7Dm0T2,8.192,2.816,19.968,8.448,8.704,14.251,11.093,14.677,12.971,8.171,2.752,19.968,8.448,8.683,14.251,10.997,14.635,12.949
1,1,01GtHP1FUbXKdWEUwApFdusuO773,3.669,2.56,14.763,8.533,5.291,11.605,2.475,2.56,1.536,3.669,2.56,14.763,8.533,5.291,11.605,2.475,2.56,1.536
2,2,01n0u9YSkXQSkmN45J65eigyCMC3,4.523,3.413,13.056,15.019,7.339,24.149,6.315,4.352,3.669,4.235,3.221,13.056,15.019,7.04,24.149,5.568,4.107,3.584
3,3,01OCEf1yB4czsq8ygRoT51s96Ba2,6.229,5.632,13.739,8.619,9.387,15.019,9.131,11.435,12.629,4.576,3.573,12.096,7.051,9.205,14.976,9.12,11.435,11.733
4,4,03TmwzsdEBVEh35MRMbC9d0NnfI3,3.157,3.243,15.701,3.499,3.328,8.021,1.451,1.024,1.536,2.421,2.837,15.616,3.36,3.296,7.435,0.587,0.491,1.099


# Get invalid recordings from durations
All recordings that were set to duration = 0  are invalid
output contains a bool column that is True if at least one recording was invalid and a column that has the count of invalid recordings of the participant
You can specify the recordings, that you are going to use. If you only use "cough" recordings for example, the participants that have valid cough recordings but invalid breathing recordings are classified as "valid"

In [17]:
invalid_recordings_df = get_invalid_recordings(recording_metadata_df)
print(invalid_recordings_df.recording_invalid.value_counts())
print(invalid_recordings_df.n_invalid_recordings.value_counts())
invalid_recordings_df.head()

False    2582
True      164
Name: recording_invalid, dtype: int64
0    2582
9      84
1      45
2      11
8       8
4       5
5       4
7       3
3       3
6       1
Name: n_invalid_recordings, dtype: int64


Unnamed: 0,user_id,recording_invalid,n_invalid_recordings
0,00xKcQMmcAhX8CODgBBLOe7Dm0T2,False,0
1,01GtHP1FUbXKdWEUwApFdusuO773,False,0
2,01n0u9YSkXQSkmN45J65eigyCMC3,False,0
3,01OCEf1yB4czsq8ygRoT51s96Ba2,False,0
4,03TmwzsdEBVEh35MRMbC9d0NnfI3,False,0


# Merge all partial meta data dataframes

In [18]:
full_meta_data = pd.merge(metadata, audio_quality_annotations, on="user_id", how="outer")
full_meta_data = pd.merge(full_meta_data, recording_metadata_df, on="user_id", how="outer")
full_meta_data = pd.merge(full_meta_data, invalid_recordings_df, on="user_id", how="outer")

full_meta_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2746 entries, 0 to 2745
Data columns (total 67 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   user_id                              2746 non-null   object 
 1   age                                  2746 non-null   int64  
 2   covid_health_status                  2746 non-null   object 
 3   record_date                          2746 non-null   object 
 4   english_proficiency                  2746 non-null   object 
 5   gender                               2746 non-null   object 
 6   country                              2746 non-null   object 
 7   local_region                         2425 non-null   object 
 8   state                                2746 non-null   object 
 9   returning_user                       2066 non-null   object 
 10  smoker                               1086 non-null   object 
 11  cold                          

In [23]:
full_meta_data.to_csv("data/Coswara_processed/full_meta_data.csv", index=False)
metadata.to_csv("data/Coswara_processed/reformatted_metadata.csv", index=False)
invalid_recordings_df.to_csv("data/Coswara_processed/invalid_recordings.csv", index=False)
recording_metadata_df.to_csv("data/Coswara_processed/duration_df.csv", index=False)
audio_quality_annotations.to_csv("data/Coswara_processed/audio_quality_annotations_df.csv", index=False)
