Create_Virufy_images.ipynb

Creation of images from audio files.

In [1]:
from google.colab import drive
# Mount Google Drive at /content/drive; every path used later in this notebook lives under it.
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os

# Project root and raw-audio directory on the mounted drive.
folder = '/content/drive/My Drive/Colab Notebooks/virufy'
data_folder = '/content/drive/My Drive/Colab Notebooks/virufy/public_dataset'

# Report (but do not stop on) any expected directory that is missing.
for required_dir in (folder, data_folder):
  if not os.path.exists(required_dir):
    print(required_dir + ' does not exist')

In [3]:
import pandas as pd
import os
import librosa
import librosa.display
import cv2
import numpy as np
import matplotlib.pyplot as plt
import warnings
# Ignore every warning category from here on; note that this also hides
# deprecation warnings raised by the libraries used below.
warnings.simplefilter("ignore")

Read in dataframe created previously.

In [4]:
# Load the metadata table saved by the previous stage (one row per recording,
# with the path to the audio in the audio_file column).
df_virufy = pd.read_csv(os.path.join(folder, 'df_virufy.csv'))
df_virufy.head(10)

Unnamed: 0.1,Unnamed: 0,id,age,gender,status,audio_file
0,0,5f5b48f12cf4d_1599817969,,,COVID-19,/content/drive/My Drive/Colab Notebooks/virufy...
1,1,5f5b4915b89b4_1599818005,,,healthy,/content/drive/My Drive/Colab Notebooks/virufy...
2,5,5f6071ac6905d_1600156076,,,COVID-19,/content/drive/My Drive/Colab Notebooks/virufy...
3,6,5f607226e921f_1600156198,,,COVID-19,/content/drive/My Drive/Colab Notebooks/virufy...
4,7,5f853d0a79c03_1602567434,,,COVID-19,/content/drive/My Drive/Colab Notebooks/virufy...
5,8,5f8d97335a3b4_1603114803,,,healthy,/content/drive/My Drive/Colab Notebooks/virufy...
6,9,5f94219c38f1a_1603543452,,,COVID-19,/content/drive/My Drive/Colab Notebooks/virufy...
7,10,5f9422344fcfc_1603543604,,,healthy,/content/drive/My Drive/Colab Notebooks/virufy...
8,11,5f944434c0ac9_1603552308,,,COVID-19,/content/drive/My Drive/Colab Notebooks/virufy...
9,12,5fa4cb64b37f9_16046354925fa4cb64b42c3,,,COVID-19,/content/drive/My Drive/Colab Notebooks/virufy...


In [5]:
# Spot-check two of the stored audio paths (rows 0 and 5).
print(df_virufy.audio_file[0])
print(df_virufy.audio_file[5])

/content/drive/My Drive/Colab Notebooks/virufy/public_dataset/5f5b48f12cf4d_1599817969.flac
/content/drive/My Drive/Colab Notebooks/virufy/public_dataset/5f8d97335a3b4_1603114803.flac


Use code from virufy.com to convert the audio files into image files.
(I have changed the code to use my variable names. Unlike the original, I have not yet split the data into train and test sets.)

In [6]:
# Functions to process audio files into images (adapted from code at virufy.com)
def trim_silence(x, *args):
    """Trim leading and trailing silence from an audio signal, keeping a margin.

    Args:
        x: 1-D audio signal.
        *args: four positional values, in this order:
            pad          - number of samples to keep on each side of the
                           non-silent interval (may be a float).
            db_max       - forwarded to ``librosa.effects.trim`` as ``top_db``.
            frame_length - required for call compatibility, not used (see note).
            hop_length   - required for call compatibility, not used (see note).

    Returns:
        The slice of ``x`` covering the non-silent interval plus padding, or
        ``None`` (after printing a usage message) when fewer than four extra
        arguments are supplied.
    """
    # Explicit length check instead of a bare ``except:`` around tuple indexing.
    if len(args) < 4:
        print('Please enter the following arguments: pad,db_max,frame_length,hop_length')
        return None
    pad, db_max = args[0], args[1]

    # NOTE(review): the analysis window is fixed at 256/64 samples and the
    # caller-supplied frame_length/hop_length are ignored. Kept as-is so the
    # generated images do not change; confirm whether this is intended.
    _, ints = librosa.effects.trim(x, top_db=db_max, frame_length=256, hop_length=64)

    # Widen the detected interval by ``pad`` samples, clamped to the signal.
    start = int(max(ints[0] - pad, 0))
    end = int(min(ints[1] + pad, len(x)))
    return x[start:end]

def process_cough_file(path,trim,*args):
    """Load one audio file, trim silence and pad/crop it to a fixed length.

    Args:
        path: audio file path, passed to ``librosa.load``.
        trim: callable invoked as ``trim(x, pad, db_max, win_length, hop_length)``
            that returns the trimmed signal.
        *args: optional ``sr, removeaudio, chunk, db_max``. All four must be
            given; otherwise the defaults ``48000, False, 3, 50`` are used.
            ``removeaudio=True`` deletes the source file from disk.

    Returns:
        ``-1`` if the file could not be loaded;
        ``(None, None)`` if the clip is shorter than 0.3 s or longer than 30 s;
        otherwise ``[x_pad, sr, hop_length, win_length]`` where ``x_pad`` holds
        exactly ``int(sr * chunk)`` samples (zero-padded or cropped).
        Callers tell these cases apart by type, so the shapes are kept as-is.
    """
    if len(args) >= 4:
        sr, removeaudio, chunk, db_max = args[0], args[1], args[2], args[3]
    else:
        sr, removeaudio, chunk, db_max = 48000, False, 3, 50

    # Best-effort load: any decoding/IO error marks the file as unusable, but
    # KeyboardInterrupt/SystemExit are no longer swallowed.
    try:
        x, sr = librosa.load(path, sr=sr)
    except Exception:
        return -1

    duration = len(x) / sr
    if duration < 0.3 or duration > 30:
        return None, None
    hop_length = np.floor(0.010*sr).astype(int) #10ms
    win_length = np.floor(0.020*sr).astype(int) #20ms

    if removeaudio:
        os.remove(path)

    # Keep a quarter of a second of context on each side of the detected sound.
    x = trim(x, 0.25*sr, db_max, win_length, hop_length)

    # Crop to the chunk length, then zero-pad if the clip is shorter.
    n_samples = int(sr * chunk)
    x = x[:n_samples]
    x_pad = np.zeros(n_samples)
    n_copy = min(len(x_pad), len(x))
    x_pad[:n_copy] = x[:n_copy]

    return [x_pad, sr, hop_length, win_length]

def get_melspec(sdir,audio,sr,name):
    """Render the mel spectrogram of ``audio`` and save it as ``<sdir>/<name>.png``.

    Args:
        sdir: existing directory the image is written to.
        audio: 1-D audio signal.
        sr: sample rate of ``audio``.
        name: file stem for the image (the recording id).

    Returns:
        The path of the written PNG file.
    """
    # Mel spectrogram
    plt.ioff()
    fig = plt.figure()
    try:
        melspec = librosa.feature.melspectrogram(y=audio, sr=sr)
        s_db = librosa.power_to_db(melspec, ref=np.max)
        librosa.display.specshow(s_db)
        fig.canvas.draw()
        # Copy the rendered canvas as an (height, width, 4) uint8 RGBA array.
        # buffer_rgba replaces the deprecated tostring_rgb + np.fromstring pair.
        rgba = np.array(fig.canvas.buffer_rgba())
    finally:
        # Always release the figure, even if rendering failed, so figures do
        # not pile up when many files are processed.
        plt.close(fig=fig)

    # cv2.imwrite expects BGR channel order; convert so the saved colours
    # match the rendered figure instead of being red/blue swapped.
    img = cv2.cvtColor(rgba, cv2.COLOR_RGBA2BGR)
    # Optional crop to the plot area (disabled): img = img[80:250, 80:300]

    savepath = os.path.join(sdir, f'{name}.png')
    cv2.imwrite(savepath, img)
    return savepath

def getlabel(key, dataframe, chosen):
    """Return the status value of the first row whose id column equals ``key``.

    ``chosen`` maps the roles ``'id'`` and ``'status'`` to the actual column
    names in ``dataframe``. Raises IndexError when no row matches.
    """
    id_column = chosen['id']
    status_column = chosen['status']
    matching_rows = dataframe.loc[dataframe[id_column] == key]
    labels = matching_rows[status_column].tolist()
    return labels[0]

def extract(df, chosen, savedir):
    """Convert every audio file listed in ``df`` into a mel-spectrogram image.

    Args:
        df: table with one row per recording.
        chosen: maps the roles ``'id'``, ``'status'`` and ``'path'`` to the
            matching column names in ``df``.
        savedir: directory the PNG files are written to (created if missing).

    Returns:
        ``{id: {'DIR': image path, 'label': status}}`` for each recording that
        could be loaded and passed the duration check. Files that failed are
        skipped; the result is an empty dict when none succeeded.
    """
    os.makedirs(savedir, exist_ok=True)

    keys = df[chosen['id']].tolist()
    paths = df[chosen['path']].tolist()

    data = {}
    for key, path in zip(keys, paths):
        result = process_cough_file(path, trim_silence)
        # process_cough_file signals failure with an int (-1) or a tuple
        # ((None, None)); only a list carries usable audio.
        if isinstance(result, (int, tuple)):
            continue
        # Unpack per file rather than stacking all results with np.array: the
        # rows mix an array with scalars, which NumPy >= 1.24 refuses to build.
        audio, sr = result[0], result[1]
        data[key] = {'DIR': get_melspec(savedir, audio, sr, key),
                     'label': getlabel(key, df, chosen)}
    return data

def filter_DF(df):
    """Pick the id, status and audio-path columns of ``df`` and drop incomplete rows.

    Columns are matched case-insensitively: a name containing ``status`` is the
    target, one containing ``audio_file`` is the path, and the id is the column
    named exactly ``id`` if present, otherwise the last remaining column whose
    name contains ``id``.

    Returns:
        ``(filtered, chosen)`` where ``filtered`` holds only the three selected
        columns with NaN rows removed and the old index kept as an ``index``
        column, and ``chosen`` maps ``'id'``/``'status'``/``'path'`` to the
        selected column names.

    Raises:
        KeyError: if any of the three columns cannot be found.
    """
    chosen = {}
    exact_id = None
    for name in df.columns:
        lowered = str(name).lower()
        if 'status' in lowered:
            chosen['status'] = name  # Choosing the target
        elif 'audio_file' in lowered:
            chosen['path'] = name
        elif lowered == 'id':
            exact_id = name
        elif 'id' in lowered:
            chosen['id'] = name
    # An exact "id" column wins over loose substring matches such as "video".
    if exact_id is not None:
        chosen['id'] = exact_id

    missing = [role for role in ('id', 'status', 'path') if role not in chosen]
    if missing:
        raise KeyError('No column found for: ' + ', '.join(missing))

    selected = df[[chosen['id'], chosen['status'], chosen['path']]]
    return selected.dropna().reset_index(), chosen

def create_images(df):
    """Generate spectrogram images for the recordings in ``df``.

    Selects the id/status/path columns, then writes one image per usable
    recording into the ``virufy_images`` directory under the notebook-level
    ``folder``. Returns the mapping produced by ``extract``.
    """
    filtered, column_roles = filter_DF(df)
    image_dir = folder + '/virufy_images/'
    return extract(filtered, column_roles, image_dir)

Processing the audio files into images.


In [7]:
# Process only the first five rows as a small trial batch.
features = create_images(df_virufy.iloc[:5])

In [8]:
# Show the returned mapping: id -> {'DIR': image path, 'label': status}.
print(features)

{'5f5b48f12cf4d_1599817969': {'DIR': '/content/drive/My Drive/Colab Notebooks/virufy/virufy_images/5f5b48f12cf4d_1599817969.png', 'label': 'COVID-19'}, '5f5b4915b89b4_1599818005': {'DIR': '/content/drive/My Drive/Colab Notebooks/virufy/virufy_images/5f5b4915b89b4_1599818005.png', 'label': 'healthy'}, '5f6071ac6905d_1600156076': {'DIR': '/content/drive/My Drive/Colab Notebooks/virufy/virufy_images/5f6071ac6905d_1600156076.png', 'label': 'COVID-19'}, '5f607226e921f_1600156198': {'DIR': '/content/drive/My Drive/Colab Notebooks/virufy/virufy_images/5f607226e921f_1600156198.png', 'label': 'COVID-19'}, '5f853d0a79c03_1602567434': {'DIR': '/content/drive/My Drive/Colab Notebooks/virufy/virufy_images/5f853d0a79c03_1602567434.png', 'label': 'COVID-19'}}


In [9]:
# Process the remaining rows (position 5 onward). This rebinds `features`, so it
# now holds only this batch; the images from both batches are on disk.
features = create_images(df_virufy.iloc[5:])

Create dataframe of new image files.

In [10]:
# List the generated image files.
fnames = os.listdir(folder + '/virufy_images/')

# Build all rows first and create the frame once: appending row by row is
# quadratic and DataFrame.append no longer exists in pandas 2.x.
# splitext strips only the final extension, so an id containing a dot is kept whole.
records = [{'id': os.path.splitext(fname)[0], 'image_file': fname} for fname in fnames]
df_fnames = pd.DataFrame(records, columns=['id', 'image_file'])

print(df_fnames.shape)
print(df_virufy.shape)
df_fnames.head()

(66, 2)
(68, 6)


Unnamed: 0,id,image_file
0,5f5b48f12cf4d_1599817969,5f5b48f12cf4d_1599817969.png
1,5f5b4915b89b4_1599818005,5f5b4915b89b4_1599818005.png
2,5f6071ac6905d_1600156076,5f6071ac6905d_1600156076.png
3,5f607226e921f_1600156198,5f607226e921f_1600156198.png
4,5f853d0a79c03_1602567434,5f853d0a79c03_1602567434.png


Add images to df_virufy dataframe.

In [11]:
# df_virufy.csv is overwritten at the end of this notebook with an image_file
# column included, so on a later Restart & Run All the loaded frame may already
# have one. Drop it first, otherwise the join fails on the overlapping column.
left = df_virufy.drop(columns=['image_file'], errors='ignore').set_index(['id'])
right = df_fnames.set_index(['id'])
# Left join on id: every metadata row is kept; rows without an image get NaN.
df_virufy = left.join(right)
print(df_virufy.shape)
df_virufy.head()

(68, 6)


Unnamed: 0_level_0,Unnamed: 0,age,gender,status,audio_file,image_file
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
5f5b48f12cf4d_1599817969,0,,,COVID-19,/content/drive/My Drive/Colab Notebooks/virufy...,5f5b48f12cf4d_1599817969.png
5f5b4915b89b4_1599818005,1,,,healthy,/content/drive/My Drive/Colab Notebooks/virufy...,5f5b4915b89b4_1599818005.png
5f6071ac6905d_1600156076,5,,,COVID-19,/content/drive/My Drive/Colab Notebooks/virufy...,5f6071ac6905d_1600156076.png
5f607226e921f_1600156198,6,,,COVID-19,/content/drive/My Drive/Colab Notebooks/virufy...,5f607226e921f_1600156198.png
5f853d0a79c03_1602567434,7,,,COVID-19,/content/drive/My Drive/Colab Notebooks/virufy...,5f853d0a79c03_1602567434.png


Remove rows where image_file does not exist.

In [12]:
# Count rows that did not get an image file from the join.
print(df_virufy.image_file.isnull().sum())

1


In [13]:
# Keep only the rows that have an image file.
df_virufy = df_virufy.dropna(subset=['image_file'])
print(df_virufy.shape)
df_virufy.head()

(67, 6)


Unnamed: 0_level_0,Unnamed: 0,age,gender,status,audio_file,image_file
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
5f5b48f12cf4d_1599817969,0,,,COVID-19,/content/drive/My Drive/Colab Notebooks/virufy...,5f5b48f12cf4d_1599817969.png
5f5b4915b89b4_1599818005,1,,,healthy,/content/drive/My Drive/Colab Notebooks/virufy...,5f5b4915b89b4_1599818005.png
5f6071ac6905d_1600156076,5,,,COVID-19,/content/drive/My Drive/Colab Notebooks/virufy...,5f6071ac6905d_1600156076.png
5f607226e921f_1600156198,6,,,COVID-19,/content/drive/My Drive/Colab Notebooks/virufy...,5f607226e921f_1600156198.png
5f853d0a79c03_1602567434,7,,,COVID-19,/content/drive/My Drive/Colab Notebooks/virufy...,5f853d0a79c03_1602567434.png


There are more rows than image files, so check for and remove duplicates.

In [16]:
# Keep the first row for each image_file value and drop the repeats.
df_virufy = df_virufy.drop_duplicates(subset=['image_file'])
print(df_virufy.shape)
df_virufy.head()

(66, 6)


Unnamed: 0_level_0,Unnamed: 0,age,gender,status,audio_file,image_file
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
5f5b48f12cf4d_1599817969,0,,,COVID-19,/content/drive/My Drive/Colab Notebooks/virufy...,5f5b48f12cf4d_1599817969.png
5f5b4915b89b4_1599818005,1,,,healthy,/content/drive/My Drive/Colab Notebooks/virufy...,5f5b4915b89b4_1599818005.png
5f6071ac6905d_1600156076,5,,,COVID-19,/content/drive/My Drive/Colab Notebooks/virufy...,5f6071ac6905d_1600156076.png
5f607226e921f_1600156198,6,,,COVID-19,/content/drive/My Drive/Colab Notebooks/virufy...,5f607226e921f_1600156198.png
5f853d0a79c03_1602567434,7,,,COVID-19,/content/drive/My Drive/Colab Notebooks/virufy...,5f853d0a79c03_1602567434.png


Save the df_virufy dataframe ready to use in the next stage.

In [17]:
folder = '/content/drive/My Drive/Colab Notebooks/virufy'
# Note: this writes to the same df_virufy.csv that was read at the start of the
# notebook, replacing it. The index (id) is written as the first column.
df_virufy.to_csv(os.path.join(folder, 'df_virufy.csv'))

Check file has saved correctly.

In [18]:
# Reload the file just written to confirm it reads back as expected.
df_virufy = pd.read_csv(os.path.join(folder, 'df_virufy.csv'))
df_virufy.head(10)

Unnamed: 0.1,id,Unnamed: 0,age,gender,status,audio_file,image_file
0,5f5b48f12cf4d_1599817969,0,,,COVID-19,/content/drive/My Drive/Colab Notebooks/virufy...,5f5b48f12cf4d_1599817969.png
1,5f5b4915b89b4_1599818005,1,,,healthy,/content/drive/My Drive/Colab Notebooks/virufy...,5f5b4915b89b4_1599818005.png
2,5f6071ac6905d_1600156076,5,,,COVID-19,/content/drive/My Drive/Colab Notebooks/virufy...,5f6071ac6905d_1600156076.png
3,5f607226e921f_1600156198,6,,,COVID-19,/content/drive/My Drive/Colab Notebooks/virufy...,5f607226e921f_1600156198.png
4,5f853d0a79c03_1602567434,7,,,COVID-19,/content/drive/My Drive/Colab Notebooks/virufy...,5f853d0a79c03_1602567434.png
5,5f8d97335a3b4_1603114803,8,,,healthy,/content/drive/My Drive/Colab Notebooks/virufy...,5f8d97335a3b4_1603114803.png
6,5f94219c38f1a_1603543452,9,,,COVID-19,/content/drive/My Drive/Colab Notebooks/virufy...,5f94219c38f1a_1603543452.png
7,5f9422344fcfc_1603543604,10,,,healthy,/content/drive/My Drive/Colab Notebooks/virufy...,5f9422344fcfc_1603543604.png
8,5f944434c0ac9_1603552308,11,,,COVID-19,/content/drive/My Drive/Colab Notebooks/virufy...,5f944434c0ac9_1603552308.png
9,5fa4cb64b37f9_16046354925fa4cb64b42c3,12,,,COVID-19,/content/drive/My Drive/Colab Notebooks/virufy...,5fa4cb64b37f9_16046354925fa4cb64b42c3.png


In [19]:
# Row/column count of the reloaded table.
df_virufy.shape

(66, 7)