In [1]:
!unzip -qq /content/drive/MyDrive/Data/covid_cough_database.zip

In [2]:
import os
import glob

import pandas as pd
from tqdm.auto import tqdm
from sklearn.utils import resample
import numpy as np

import librosa

In [3]:
# referenced
# https://www.kaggle.com/ashishpatel26/feature-extraction-from-audio

def preproces(fn_wav):
    y, sr = librosa.load(fn_wav, mono = True, duration = 5)
    chroma_stft = librosa.feature.chroma_stft(y = y, sr = sr)
    rmse = librosa.feature.rms(y = y)
    spectral_centroid = librosa.feature.spectral_centroid(y = y, sr = sr)
    spectral_bandwidth = librosa.feature.spectral_bandwidth(y = y, sr = sr)
    rolloff = librosa.feature.spectral_rolloff(y = y, sr = sr)
    zero_crossing_rate = librosa.feature.zero_crossing_rate(y)
    mfcc = librosa.feature.mfcc(y = y, sr = sr)
    
    feature_row = {        
        'chroma_stft': np.mean(chroma_stft),
        'rmse': np.mean(rmse),
        'spectral_centroid': np.mean(spectral_centroid),
        'spectral_bandwidth': np.mean(spectral_bandwidth),
        'rolloff': np.mean(rolloff),
        'zero_crossing_rate': np.mean(zero_crossing_rate),        
    }
    for i, c in enumerate(mfcc):
        feature_row[f'mfcc{i+1}'] = np.mean(c)
    return feature_row

# Kaggle CCR

In [4]:
df_dataset = pd.read_csv('/content/cough_trial_extended.csv')

features_cols = ['filename', 'chroma_stft', 'rmse', 'spectral_centroid', 'spectral_bandwidth', 'rolloff', 'zero_crossing_rate']
for i in range(1, 21):
    features_cols.append(f'mfcc{i}')
features_cols.append('label')

df_features = pd.DataFrame(columns=features_cols)

for row_index, row in tqdm(df_dataset.iterrows(), total=len(df_dataset)):
    filename = os.path.join('/content/trial_covid/', row['file_properties'])
    feature_row = preproces(filename)
    feature_row['filename'] = row['file_properties']
    feature_row['label'] = row['class']
    df_features = df_features.append(feature_row, ignore_index=True)

df_features.to_csv('kaggle_ccr.csv', index=False, columns=features_cols)

df_features.head()

HBox(children=(FloatProgress(value=0.0, max=170.0), HTML(value='')))




Unnamed: 0,filename,chroma_stft,rmse,spectral_centroid,spectral_bandwidth,rolloff,zero_crossing_rate,mfcc1,mfcc2,mfcc3,mfcc4,mfcc5,mfcc6,mfcc7,mfcc8,mfcc9,mfcc10,mfcc11,mfcc12,mfcc13,mfcc14,mfcc15,mfcc16,mfcc17,mfcc18,mfcc19,mfcc20,label
0,0v8MGxNetjg_ 10.000_ 20.000.wav,0.519951,0.045853,1612.895795,1411.838677,2907.580566,0.107019,-376.876007,111.017372,-31.904015,6.622255,-0.816813,-4.456445,-0.198702,-3.759593,-5.181573,-0.665209,-0.656475,-7.439712,-1.03458,-0.203084,-3.513495,-1.745705,-3.011878,-2.878482,-2.106427,-4.026825,not_covid
1,1j1duoxdxBg_ 70.000_ 80.000.wav,0.535472,0.001771,2892.087076,2467.408141,5072.664388,0.148584,-519.158447,60.781284,-13.722886,52.145428,0.05105,-2.53491,16.103621,-1.49458,4.123252,11.030884,-0.156307,-0.909973,7.216461,-1.71963,3.903021,3.653039,3.043882,2.439957,2.781968,2.195162,not_covid
2,1MSYO4wgiag_ 120.000_ 130.000.wav,0.496666,0.033657,3429.061935,2788.634413,6886.288452,0.225315,-282.297913,48.58168,-15.522366,12.710723,4.19998,-7.577727,-19.324192,-12.037647,-16.901482,13.693965,0.829615,-6.066336,-4.16764,1.017302,-0.523806,0.538693,-8.855953,-2.927977,-1.118562,-5.906226,not_covid
3,1PajbAKd8Kg_ 0.000_ 10.000.wav,0.407549,0.013452,2710.811637,2664.28755,5778.474935,0.142076,-346.8573,75.765617,-7.648194,11.362121,11.365475,1.842426,-7.957006,-4.264208,-8.28397,3.105164,-2.838681,5.053118,-0.291308,0.987186,-2.447526,3.692367,2.312328,-2.059656,-4.772599,-0.503851,not_covid
4,cov1.wav,0.412697,0.059004,1555.648634,1418.599932,2870.737092,0.133998,-340.588013,104.1567,-32.228443,-13.615362,-3.029664,0.51209,-21.811838,-17.781813,-9.270074,-5.505614,-5.385945,-8.247169,0.940006,-5.701087,-6.32663,-1.08004,-1.812609,-2.518986,-3.684266,-3.564146,covid


# Virufy

In [5]:
df_dataset = pd.DataFrame(columns=['file_properties', 'class'])
for filename in glob.glob('/content/drive/MyDrive/Data/virufy-data-main/virufy-data-main/clinical/segmented/pos/*.mp3'):
    df_dataset = df_dataset.append({'file_properties': filename, 'class': 'covid'}, ignore_index=True)
for filename in glob.glob('/content/drive/MyDrive/Data/virufy-data-main/virufy-data-main/clinical/segmented/neg/*.mp3'):
    df_dataset = df_dataset.append({'file_properties': filename, 'class': 'not_covid'}, ignore_index=True)

len(df_dataset)

121

In [6]:
features_cols = ['filename', 'chroma_stft', 'rmse', 'spectral_centroid', 'spectral_bandwidth', 'rolloff', 'zero_crossing_rate']
for i in range(1, 21):
    features_cols.append(f'mfcc{i}')
features_cols.append('label')

df_features = pd.DataFrame(columns=features_cols)

for row_index, row in tqdm(df_dataset.iterrows(), total=len(df_dataset)):
    filename = row['file_properties']
    feature_row = preproces(filename)
    feature_row['filename'] = row['file_properties']
    feature_row['label'] = row['class']
    df_features = df_features.append(feature_row, ignore_index=True)

df_features.to_csv('virufy.csv', index=False, columns=features_cols)

df_features.head()

HBox(children=(FloatProgress(value=0.0, max=121.0), HTML(value='')))






Unnamed: 0,filename,chroma_stft,rmse,spectral_centroid,spectral_bandwidth,rolloff,zero_crossing_rate,mfcc1,mfcc2,mfcc3,mfcc4,mfcc5,mfcc6,mfcc7,mfcc8,mfcc9,mfcc10,mfcc11,mfcc12,mfcc13,mfcc14,mfcc15,mfcc16,mfcc17,mfcc18,mfcc19,mfcc20,label
0,/content/drive/MyDrive/Data/virufy-data-main/v...,0.258495,0.053727,1278.808455,1113.288004,2424.669879,0.075379,-383.16571,61.049183,-27.013086,19.257313,-10.895571,-5.258912,-20.324905,-3.844947,-9.479165,-2.150488,-5.740175,-16.365189,2.426744,-3.453335,-0.386767,-10.834597,-7.11694,-4.170108,-9.392747,-9.053766,covid
1,/content/drive/MyDrive/Data/virufy-data-main/v...,0.28645,0.042565,1631.026245,1226.046525,2987.185802,0.115418,-407.933411,41.559742,-15.610929,22.209257,-29.088467,2.527946,-7.549842,-3.350578,-6.783246,-6.552319,-8.756191,-7.504036,1.1555,-9.509875,-3.691084,-10.472419,-3.672274,-4.85911,-6.863937,-2.269574,covid
2,/content/drive/MyDrive/Data/virufy-data-main/v...,0.28486,0.046747,1389.359658,1125.533026,2703.665294,0.079986,-405.975616,52.124725,-20.87925,19.777031,-21.697788,5.96222,-17.441412,-7.268092,-2.737788,-0.126589,-5.40941,-10.03756,0.527219,-8.965367,-1.280596,-7.985561,-4.738081,-2.565717,2.260611,-1.890121,covid
3,/content/drive/MyDrive/Data/virufy-data-main/v...,0.193475,0.056363,848.713705,798.544002,1720.003609,0.045134,-382.513794,58.237743,-24.468456,16.174303,-14.976987,1.535332,-15.713202,-2.513941,-12.511707,0.627165,5.577144,-13.171568,0.291199,-8.729095,1.072134,-5.707059,-7.103181,-6.136421,-5.780864,-1.996612,covid
4,/content/drive/MyDrive/Data/virufy-data-main/v...,0.094829,0.042812,681.564432,586.640896,1343.952743,0.03476,-455.134338,30.373945,-15.498266,8.299889,-15.300827,4.632612,-16.490538,-1.874918,-0.671651,1.362566,-4.706092,-7.240338,2.862604,-6.772067,-2.223145,-6.514351,-1.842205,-2.371292,2.044255,0.577615,covid


#Coswara

In [7]:
import subprocess
import os

def run_cmd(cmd, stderr = subprocess.STDOUT):
    out = None
    try:
        out = subprocess.check_output([cmd], shell=True, stderr = subprocess.STDOUT, universal_newlines = True)
    except subprocess.CalledProcessError as e:
        print(f'ERROR {e.returncode}: {cmd}\n\t{e.output}', flush=True, file=sys.stderr)
        raise e
    return out

def clone_data(data_root):
    clone_uri = 'https://github.com/iiscleap/Coswara-Data'
    if os.path.exists(data_root):
        assert os.path.isdir(data_root), \
        f'{data_root} should be cloned from {clone_uri}'
    else:
        print('Cloning....', flush=True)
        run_cmd(f'git clone {clone_uri} {data_root}')

data_root = "./data"
clone_data(data_root)

Cloning....


In [8]:
!cd /content/data
!python /content/data/extract_data.py

Extraction process complete!


In [10]:
import os
import sys
import subprocess
import numpy as np
import glob
import json
import pandas as pd

'''
This script creates a folder "Extracted_data" inside which it extracts all the wav files in the directories date-wise
'''

coswara_data_dir = os.path.abspath('/content/data') # Local Path of iiscleap/Coswara-Data Repo
extracted_data_dir = os.path.join(coswara_data_dir, 'Extracted_data')  

if not os.path.exists(coswara_data_dir):
  raise("Check the Coswara dataset directory!")

if not os.path.exists(extracted_data_dir):
  os.makedirs(extracted_data_dir) # Creates the Extracted_data folder if it doesn't exist

dirs_extracted = set(map(os.path.basename,glob.glob('{}/202*'.format(extracted_data_dir))))
dirs_all = set(map(os.path.basename,glob.glob('{}/202*'.format(coswara_data_dir))))

dirs_to_extract = list(set(dirs_all) - dirs_extracted)

for d in dirs_to_extract:
  p = subprocess.Popen('cat {}/{}/*.tar.gz.* |tar -xvz -C {}/'.format(coswara_data_dir, d, extracted_data_dir), shell=True)
  p.wait()


print("Extraction process complete!")

Extraction process complete!


In [11]:
dirs = os.listdir('/content/data/Extracted_data')

In [12]:
heavy = []
shallow = []
pre_dir = []
for filename in dirs:
  subpath = os.path.join('/content/data/Extracted_data', filename)
  #print(subpath)
  for filename_2 in os.listdir(subpath):
    subpath_2 = os.path.join(subpath, filename_2)
    #print(subpath_2)
    pre_dir.append(subpath_2)

In [13]:
for voice in pre_dir:
  heavy.append(voice + '/cough-heavy.wav')
  shallow.append(voice + '/cough-shallow.wav')

In [14]:
import pandas as pd
import requests
import io
    
# Downloading the csv file from your GitHub account
url = "https://raw.githubusercontent.com/iiscleap/Coswara-Data/master/combined_data.csv" # Make sure the url is the raw version of the file on GitHub
download = requests.get(url).content
# Reading the downloaded content and turning it into a pandas dataframe
df = pd.read_csv(io.StringIO(download.decode('utf-8')))
# Printing out the first 5 rows of the dataframe
df.head()

Unnamed: 0,id,l_c,a,covid_status,ep,g,l_s,l_l,diabetes,asthma,smoker,ht,fever,rU,um,cold,cough,mp,loss_of_smell,st,ftg,bd,cld,ihd,pneumonia,test_status,diarrhoea
0,vK2bLRNzllXNeyOMudnNSL5cfpG2,India,24,healthy,y,male,Karnataka,,,,,,,,,,,,,,,,,,,,
1,bjA2KpSxneNskrLBeqi4bqoTDQl2,India,72,healthy,y,male,Maharashtra,Thane,,,,,,,,,,,,,,,,,,,
2,FSzobvJqOXf0rI6X05cHqOiU9Mu2,India,54,healthy,y,male,Maharashtra,Thane West,,,,,,,,,,,,,,,,,,,
3,EqDWckxbsETyHUeBLQ8jLtxlhir2,India,31,healthy,y,male,Karnataka,Bangalore,,,,,,,,,,,,,,,,,,,
4,FGRDO4IBbAejR0WHD5YbkXTCasg2,India,26,healthy,y,male,Haryana,gurgaon,,,,,,,,,,,,,,,,,,,


In [15]:
dir_id = pre_dir[0].split('/')[-1]
dir_id

'z2ormj7bF7Zkgibaf97vOCUqrm62'

In [16]:
id = []
for pre_dir_path in pre_dir:
  id.append(pre_dir_path.split('/')[-1])

In [17]:
df.rename(columns={'id':'original_id'}, inplace=True)
df['covid_status'].value_counts()

healthy                        1263
no_resp_illness_exposed         130
positive_mild                   103
resp_illness_not_identified      85
recovered_full                   30
positive_asymp                   21
positive_moderate                13
Name: covid_status, dtype: int64

In [18]:
df.replace({'healthy':'non-covid', 'no_resp_illness_exposed':'might be', 'positive_mild':'covid',
            'positive_moderate':'covid', 'positive_asymp':'covid', 'recovered_full':'migth be', 
            'resp_illness_not_identified':'asymptomatic'}, inplace=True)

In [19]:
df['covid_status'].value_counts()

non-covid       1263
covid            137
might be         130
asymptomatic      85
migth be          30
Name: covid_status, dtype: int64

In [20]:
data = df.loc[df['covid_status'] == 'covid', :]

cough = []
labels  = [0]
for filename in id:
  if filename in data['original_id'].tolist():
    for filename_2, comp_path in zip(filename, pre_dir):
      #print(filename, comp_path)
      cough.append(comp_path)

In [21]:
heavy = []
shallow = []
pre_dir2 = []
for filename in dirs:
  subpath = os.path.join('/content/data/Extracted_data', filename)
  #print(subpath)
  for filename_2 in os.listdir(subpath):
    if filename_2 in data['original_id'].tolist():
      subpath_2 = os.path.join(subpath, filename_2)
      #print(subpath_2)
      pre_dir2.append(subpath_2)

In [22]:
heavy_voice_data = pd.DataFrame()
heavy_voice_data['path'] = pre_dir2
heavy_voice_data['labels'] = 'covid'

shallow_voice_data = pd.DataFrame()
shallow_voice_data['path'] = pre_dir2
shallow_voice_data['labels'] = 'covid'

heavy_name = []
for name_h in heavy_voice_data['path']:
  heavy_name.append(name_h + '/cough-heavy.wav')
heavy_voice_data['path'] = heavy_name

shallow_name = []
for name_s in shallow_voice_data['path']:
  shallow_name.append(name_s + '/cough-shallow.wav')
shallow_voice_data['path'] = shallow_name

voice_data = pd.concat([heavy_voice_data, shallow_voice_data], axis=0)
print(voice_data.shape)

(274, 2)


In [23]:
healthy = df.loc[df['covid_status'] == 'non-covid', :]
healthy = healthy.sample(n=60)

healthy.head()

Unnamed: 0,original_id,l_c,a,covid_status,ep,g,l_s,l_l,diabetes,asthma,smoker,ht,fever,rU,um,cold,cough,mp,loss_of_smell,st,ftg,bd,cld,ihd,pneumonia,test_status,diarrhoea
1115,ikYIaaUJutT91dYfNqA7iQoAQKN2,India,12,non-covid,y,male,Karnataka,,,,,,,,,,,,,,,,,,,,
734,8nVQh7sxmWTMdHOSFZRgCSbwFCl2,India,23,non-covid,y,male,Karnataka,B.E.L colony Jalahallli Bangalore,,,,,,n,n,,,,,,,,,,,,
1247,Lz3Pu5AOFbQLTA8cIqRAmjPy8qw1,India,34,non-covid,y,male,Maharashtra,Mumbai,True,,,,,n,y,,,,,,,,,,,,
1208,6Or8DONUwObeo4MH2Sbtx7JTVNU2,India,27,non-covid,y,male,West Bengal,Purulia,,,,,,n,n,,,,,,,,,,,,
864,KmMFugVd4Pa4fZW9FqWFcHBVovf2,India,25,non-covid,y,male,Telangana,,,,,,,n,n,,,,,,,True,,,,,


In [24]:
heavy = []
shallow = []
pre_dir3 = []
for filename in dirs:
  subpath = os.path.join('/content/data/Extracted_data', filename)
  #print(subpath)
  for filename_2 in os.listdir(subpath):
    if filename_2 in healthy['original_id'].tolist():
      subpath_2 = os.path.join(subpath, filename_2)
      #print(subpath_2)
      pre_dir3.append(subpath_2)

heavy_voice_data_healthy = pd.DataFrame()
heavy_voice_data_healthy['path'] = pre_dir3
heavy_voice_data_healthy['labels'] = 'non-covid'


shallow_voice_data_healthy = pd.DataFrame()
shallow_voice_data_healthy['path'] = pre_dir3
shallow_voice_data_healthy['labels'] = 'non-covid'

heavy_name_healthy = []
for name_hh in heavy_voice_data_healthy['path']:
  heavy_name_healthy.append(name_hh + '/cough-heavy.wav')
heavy_voice_data_healthy['path'] = heavy_name_healthy

shallow_name_h = []
for name_sh in shallow_voice_data_healthy['path']:
  shallow_name_h.append(name_sh + '/cough-shallow.wav')
shallow_voice_data_healthy['path'] = shallow_name_h

healthy_voice_data = pd.concat([heavy_voice_data_healthy, shallow_voice_data_healthy], axis=0)
print(healthy_voice_data.shape)

(120, 2)


In [25]:
voice_data.reset_index(drop=True, inplace=True)
healthy_voice_data.reset_index(drop=True, inplace=True)
healthy_voice_data['path'][61]

'/content/data/Extracted_data/20200525/KVKLfU8zZ1MEywuVYflH2KYOHr52/cough-shallow.wav'

In [26]:
final_data = pd.concat([voice_data, healthy_voice_data], axis=0)
final_data.reset_index(drop=True, inplace=True)

final_data.shape

(394, 2)

In [27]:
final_data.to_csv('final_data.csv', index=False)
from google.colab import files
files.download('final_data.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Feature Extraction

In [28]:
df_dataset = pd.read_csv('/content/final_data.csv')
# removing corrupted data
df_dataset = df_dataset[df_dataset['path'] != '/content/data/Extracted_data/20200502/mNcNjQMsv8aZXFGuguWbLdmkOQk2/cough-heavy.wav']
df_dataset = df_dataset[df_dataset['path'] != '/content/data/Extracted_data/20200502/mNcNjQMsv8aZXFGuguWbLdmkOQk2/cough-shallow.wav']

In [29]:
def preprocess(fn_wav):
    y, sr = librosa.load(fn_wav, mono = True, duration = 5)
    chroma_stft = librosa.feature.chroma_stft(y = y, sr = sr)
    rmse = librosa.feature.rms(y = y)
    spectral_centroid = librosa.feature.spectral_centroid(y = y, sr = sr)
    spectral_bandwidth = librosa.feature.spectral_bandwidth(y = y, sr = sr)
    rolloff = librosa.feature.spectral_rolloff(y = y, sr = sr)
    zero_crossing_rate = librosa.feature.zero_crossing_rate(y)
    mfcc = librosa.feature.mfcc(y = y, sr = sr)
    
    feature_row = {        
        'chroma_stft': np.mean(chroma_stft),
        'rmse': np.mean(rmse),
        'spectral_centroid': np.mean(spectral_centroid),
        'spectral_bandwidth': np.mean(spectral_bandwidth),
        'rolloff': np.mean(rolloff),
        'zero_crossing_rate': np.mean(zero_crossing_rate),        
    }
    for i, c in enumerate(mfcc):
        feature_row[f'mfcc{i+1}'] = np.mean(c)
    return feature_row

features_cols = ['filename', 'chroma_stft', 'rmse', 'spectral_centroid', 'spectral_bandwidth', 'rolloff', 'zero_crossing_rate']
for i in range(1, 21):
    features_cols.append(f'mfcc{i}')
features_cols.append('label')

df_features = pd.DataFrame(columns=features_cols)

for row_index, row in tqdm(df_dataset.iterrows(), total=len(df_dataset)):
    filename = row['path']
    print(filename)
    feature_row = preprocess(filename)
    feature_row['filename'] = row['path']
    feature_row['label'] = row['labels']
    df_features = df_features.append(feature_row, ignore_index=True)

HBox(children=(FloatProgress(value=0.0, max=394.0), HTML(value='')))

/content/data/Extracted_data/20210406/BfkSf0P6WcOlQW1jfLuLKEPusF82/cough-heavy.wav
/content/data/Extracted_data/20210406/ZIzrdWNQK1Xhn6o6jrzSVBMdphp2/cough-heavy.wav
/content/data/Extracted_data/20210406/kAnUnZf9b6d4bkjJCawv0nCGJUI2/cough-heavy.wav
/content/data/Extracted_data/20210406/IimA0GnOm7ZR4nEr8k0kG4eaR7A3/cough-heavy.wav
/content/data/Extracted_data/20210406/6ZovArshO1MTe2tpxFWrHmORa2R2/cough-heavy.wav
/content/data/Extracted_data/20210406/Fh3hBZZRh2UXXicTNGMei8ikbSu2/cough-heavy.wav
/content/data/Extracted_data/20210406/pWFMPFBys1bBerYz5Si4Gb8brGn1/cough-heavy.wav
/content/data/Extracted_data/20210406/4sjgUTKdPXcgwHbPcqEtB6WA7XB3/cough-heavy.wav
/content/data/Extracted_data/20210406/0drfnqVwccT6UmgVLQEuyVglDC23/cough-heavy.wav
/content/data/Extracted_data/20210406/00xKcQMmcAhX8CODgBBLOe7Dm0T2/cough-heavy.wav
/content/data/Extracted_data/20210406/W42AKc6SZlNz1NOBAJrCpa3Io0E2/cough-heavy.wav
/content/data/Extracted_data/20210406/UnJVcWCaLzSZJS8B0YJmDwF9d143/cough-heavy.wav
/con

In [30]:
import tqdm
from tqdm import tqdm
import librosa
import numpy as np

df_features.to_csv('Coswara_processed.csv', index=False)
files.download('Coswara_processed.csv')

virufy = pd.read_csv('/content/virufy.csv')
kaggle_ccr = pd.read_csv('/content/kaggle_ccr.csv')

tabular_pro = pd.concat([df_features, virufy, kaggle_ccr], axis=0)
tabular_pro.shape

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

(685, 28)

In [31]:
tabular_pro['label'].replace({'not_covid':'non-covid'}, inplace=True)
tabular_pro['label'].value_counts()

non-covid    344
covid        341
Name: label, dtype: int64

In [33]:
tabular_pro.reset_index(drop=True, inplace=True)
tabular_pro.to_csv('Tabular_pro.csv', index = False)
files.download('Tabular_pro.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>