In [None]:
!mkdir -p ../../audio_set

In [None]:
# Download the segment list from http://storage.googleapis.com/us_audioset/youtube_corpus/v1/csv/unbalanced_train_segments.csv
# and put it in ../../data/audio_set (the ROOT_DIR used by the cells below).

In [1]:
import io
import pandas as pd
from collections import deque

In [2]:
# All segment lists live side by side under one data directory.
ROOT_DIR = "../../data/audio_set"
RAW_FILE = "/".join((ROOT_DIR, "unbalanced_train_segments.csv"))
CLEANED_FILE = "/".join((ROOT_DIR, "cleaned_train_segments.csv"))

# Header-prefixed copy of the cleaned file; this is the one pandas reads.
PANDAS_IN_FILE = "/".join((ROOT_DIR, "cough_in_train_segments.csv"))

In [3]:
# Copy a file while dropping its first `dropfirst` lines (the 3 header comment lines in our case)
def efficient_dropfirst(f, nf, dropfirst=1, buffersize=3):
    """Copy ``f`` into ``nf`` while skipping the first ``dropfirst`` lines.

    Parameters
    ----------
    f : text file object
        Source, opened for reading. It is rewound to the start before copying.
    nf : text file object
        Destination, opened for writing. It is truncated at the final write
        position once the copy is finished.
    dropfirst : int
        Number of leading lines of ``f`` that are not copied.
    buffersize : int
        Number of lines held back in a small look-ahead buffer while copying.
        It has no effect on the result; it is kept so existing callers that
        pass it keep working.

    Notes
    -----
    Every line after the dropped ones is written exactly once and in order.
    Files with fewer than ``dropfirst + buffersize`` lines are handled: reading
    past the end yields empty strings, and writing those is a no-op.
    """
    f.seek(0)

    # Skip the unwanted leading lines.
    for _ in range(dropfirst):
        f.readline()

    # Prime the look-ahead buffer.
    buffer = deque()
    for _ in range(buffersize):
        buffer.append(f.readline())

    # Steady state: one line in, one line out.
    # The source is never repositioned here: character counts returned by
    # nf.write() are not valid seek offsets for a text-mode file.
    line = f.readline()
    while line:
        buffer.append(line)
        nf.write(buffer.popleft())
        line = f.readline()

    # Flush what is still buffered, one line per iteration, so that no line
    # is skipped and an odd-sized remainder cannot raise IndexError.
    while buffer:
        nf.write(buffer.popleft())
    nf.truncate()

In [37]:
# Peek at the top of the raw CSV: three '#' comment lines precede the data rows.
!head  $RAW_FILE

# Segments csv created Sun Mar  5 10:56:58 2017
# num_ytids=2041789, num_segs=2041789, num_unique_labels=527, num_positive_labels=4020212
# YTID, start_seconds, end_seconds, positive_labels
---1_cCGK4M, 0.000, 10.000, "/m/01g50p,/m/0284vy3,/m/06d_3,/m/07jdr,/m/07rwm0c"
---2_BBVHAA, 30.000, 40.000, "/m/09x0r"
---B_v8ZoBY, 30.000, 40.000, "/m/04rlf"
---EDNidJUA, 30.000, 40.000, "/m/02qldy,/m/02zsn,/m/05zppz,/m/09x0r"
---N4cFAE1A, 21.000, 31.000, "/m/04rlf,/m/09x0r"
---fcVQUf3E, 30.000, 40.000, "/m/019jd,/m/07yv9"
---g9OGAhwc, 30.000, 40.000, "/m/04rlf,/m/0c1dj"


In [46]:
# Write a copy of the raw CSV that omits its three leading comment lines.
with open(RAW_FILE) as f, open(CLEANED_FILE, "w") as nf:
    efficient_dropfirst(f, nf, dropfirst=3, buffersize=10)

In [40]:
# Sanity check: the cleaned file should now start directly with data rows.
!head -n 2 $CLEANED_FILE

---1_cCGK4M, 0.000, 10.000, "/m/01g50p,/m/0284vy3,/m/06d_3,/m/07jdr,/m/07rwm0c"
---2_BBVHAA, 30.000, 40.000, "/m/09x0r"


In [60]:
!rm $PANDAS_IN_FILE

In [70]:
# Clean up for pandas: add a header row and re-join the comma-separated label
# list with spaces, so every record has exactly four comma-separated fields.
with open(PANDAS_IN_FILE, "w+") as pin:
    pin.write("""youtubeid,start,end,tags\n""")
    with open(CLEANED_FILE, "r") as cf:
        # Stream record by record instead of collecting every line in memory.
        for line in cf:
            # Drop the line terminator so it is written back exactly once
            # (keeping it and joining with "\n" left a blank line after every record).
            line = line.rstrip("\n")
            if not line.strip():
                # Nothing to convert on an empty line.
                continue
            cols = line.split(",")
            # Fields 0-2 are id/start/end; everything after belongs to the label list.
            pin.write(",".join([cols[0], cols[1], cols[2], " ".join(cols[3:])]) + "\n")
            
            
            
            

In [71]:
# Sanity check: header row followed by the first record, labels now space-separated.
!head -n 2 $PANDAS_IN_FILE

youtubeid,start,end,tags
---1_cCGK4M, 0.000, 10.000, "/m/01g50p /m/0284vy3 /m/06d_3 /m/07jdr /m/07rwm0c"


In [28]:
# Load the header-prefixed segment list (columns: youtubeid, start, end, tags).
candidate_df = pd.read_csv(PANDAS_IN_FILE)

In [39]:
# Rows whose label list mentions /m/01b_21 (presumably the "Cough" label id; confirm against the ontology).
coughs_df = candidate_df.loc[candidate_df["tags"].str.contains("/m/01b_21")]

In [95]:
# Complement of the cough selection: rows whose label list does not mention /m/01b_21.
not_coughs_df = candidate_df.loc[~candidate_df["tags"].str.contains("/m/01b_21")]

In [126]:
# Down-sample the negatives to as many rows as there are cough rows, so both
# classes end up the same size; the fixed random_state makes the draw repeatable.
# Note that this rebinds not_coughs_df, replacing the full negative set.
not_coughs_df = not_coughs_df.sample(n = len(coughs_df), random_state = 2312) 

In [0]:
# NEXT STEPS
# Download the cough (and non-cough) segments and convert them to PCM WAV using FFmpeg

In [35]:
def intToTime(time):
    """Render a number of seconds as ``"<minutes>:<seconds>"``.

    Both parts are truncated to integers and are not zero-padded,
    so 65 becomes ``"1:5"``.
    """
    whole_minutes = int(time / 60)
    leftover_seconds = int(time - 60 * whole_minutes)
    return "%d:%d" % (whole_minutes, leftover_seconds)

In [194]:
def genCmd(cough, name):
    """Build the shell command that downloads one labelled segment as a WAV file.

    Parameters
    ----------
    cough : row-like object
        Must expose ``youtubeid`` (str) plus ``start`` and ``end``, the segment
        bounds in seconds.
    name : str
        Sub-directory of ``downloaded/`` that receives the file.

    Returns
    -------
    str
        A single shell command. ``youtube-dl -g`` prints the stream URLs,
        ``sed -n '2p'`` keeps the second one (presumably the audio stream;
        confirm), and the last ``sed`` turns it into ``-ss <start> -i <url>``
        for ffmpeg.
    """
    # -t sits after the input, so ffmpeg reads it as the *duration* to write,
    # not as an end position. Pass end - start so only the labelled segment
    # is kept rather than `end` seconds of audio.
    duration = float(cough.end) - float(cough.start)

    cmd = "ffmpeg -y -hide_banner -loglevel panic $(youtube-dl -g 'https://youtube.com/watch?v=" + cough.youtubeid
    cmd = cmd + "'"
    cmd = cmd + """ | sed -n '2p' | sed "s/.*/-ss {} -i &/") -t {:.3f} downloaded/{}/{}.wav  """.format(cough.start, duration, name, cough.youtubeid)

    return cmd

In [195]:
# One download command per selected row; the extra positional argument is the
# sub-directory of downloaded/ that genCmd targets.
cough_scripts = [command for command in coughs_df.apply(genCmd, axis=1, args=("cough",))]

not_cough_scripts = [command for command in not_coughs_df.apply(genCmd, axis=1, args=("not_cough",))]

In [196]:
def gen_script(scripts, script_name):
    """Write ``./<script_name>.sh`` containing every command in ``scripts``.

    Each command is followed by an ``echo`` line reporting how many commands
    have been issued so far out of the total.
    """
    with open("./{}.sh".format(script_name), "w+") as out:
        total = len(scripts)
        for position, command in enumerate(scripts, start=1):
            out.write(command + " \n")
            out.write("echo 'downloaded {}/{}' \n".format(position, total))

In [117]:
# Target directory for the cough download commands (downloaded/<name>/<id>.wav).
!mkdir -p downloaded/cough

In [197]:
# Target directory for the non-cough download commands.
!mkdir -p downloaded/not_cough

In [121]:
# Writes ./cough_downloads.sh
gen_script(cough_scripts, "cough_downloads")

In [198]:
# Writes ./not_cough_downloads.sh
gen_script(not_cough_scripts, "not_cough_downloads")

In [None]:
# Run the generated download script for the cough segments (needs network access, youtube-dl and ffmpeg).
!sh cough_downloads.sh

In [199]:
# Run the generated download script for the non-cough segments. The progress line is
# echoed after every command, including ones where youtube-dl reported an error.
!sh not_cough_downloads.sh

downloaded 1/751
downloaded 2/751
downloaded 3/751
downloaded 4/751
downloaded 5/751
downloaded 6/751
downloaded 7/751
downloaded 8/751
If the owner of this video has granted you access, please sign in.
       This video is private.
downloaded 9/751
downloaded 10/751
If the owner of this video has granted you access, please sign in.
       This video is private.
downloaded 11/751
downloaded 12/751
downloaded 13/751
downloaded 14/751
downloaded 15/751
downloaded 16/751
downloaded 17/751
downloaded 18/751
downloaded 19/751
downloaded 20/751
downloaded 21/751
downloaded 22/751
downloaded 23/751
downloaded 24/751
downloaded 25/751
downloaded 26/751
downloaded 27/751
downloaded 28/751
downloaded 29/751
downloaded 30/751
downloaded 31/751
downloaded 32/751
downloaded 33/751
downloaded 34/751
downloaded 35/751
downloaded 36/751
downloaded 37/751
downloaded 38/751
downloaded 39/751
downloaded 40/751
[0;31mERROR:[0m qk6QUuBIFa0: YouTube said: This video is unavailable.
Sorry about that.
downl

downloaded 222/751
downloaded 223/751
downloaded 224/751
downloaded 225/751
downloaded 226/751
downloaded 227/751
[0;31mERROR:[0m N4EEFZ5dyY8: YouTube said: This video is unavailable.
Sorry about that.
downloaded 228/751
downloaded 229/751
downloaded 230/751
downloaded 231/751
downloaded 232/751
downloaded 233/751
downloaded 234/751
downloaded 235/751
downloaded 236/751
downloaded 237/751
downloaded 238/751
downloaded 239/751
downloaded 240/751
downloaded 241/751
downloaded 242/751
downloaded 243/751
downloaded 244/751
[0;31mERROR:[0m W3BFRCmDdWQ: YouTube said: This video contains content from Disney, who has blocked it on copyright grounds.
Sorry about that.
downloaded 245/751
downloaded 246/751
downloaded 247/751
downloaded 248/751
downloaded 249/751
downloaded 250/751
downloaded 251/751
downloaded 252/751
downloaded 253/751
[0;31mERROR:[0m fslMZNQMAPU: YouTube said: This video is unavailable.
Sorry about that.
downloaded 254/751
[0;31mERROR:[0m 9t5-6ukWISQ: YouTube said: Thi

downloaded 449/751
[0;31mERROR:[0m wKfnrkETEJM: YouTube said: This video is unavailable.
Sorry about that.
downloaded 450/751
downloaded 451/751
downloaded 452/751
[0;31mERROR:[0m mG3e9_Tw4Ns: YouTube said: This video is unavailable.
Sorry about that.
downloaded 453/751
downloaded 454/751
downloaded 455/751
downloaded 456/751
downloaded 457/751
downloaded 458/751
[0;31mERROR:[0m quO_wHYaowY: YouTube said: This video is unavailable.
Sorry about that.
downloaded 459/751
downloaded 460/751
downloaded 461/751
downloaded 462/751
[0;31mERROR:[0m FjPJOkXXvvA: YouTube said: This video contains content from TMSanime, who has blocked it in your country on copyright grounds.
Sorry about that.
downloaded 463/751
downloaded 464/751
downloaded 465/751
[0;31mERROR:[0m IVzguNqubd0: YouTube said: This video is unavailable.
Sorry about that.
downloaded 466/751
downloaded 467/751
downloaded 468/751
downloaded 469/751
downloaded 470/751
downloaded 471/751
downloaded 472/751
downloaded 473/751
do

downloaded 705/751
downloaded 706/751
downloaded 707/751
downloaded 708/751
downloaded 709/751
downloaded 710/751
downloaded 711/751
downloaded 712/751
downloaded 713/751
downloaded 714/751
downloaded 715/751
downloaded 716/751
downloaded 717/751
downloaded 718/751
downloaded 719/751
downloaded 720/751
downloaded 721/751
[0;31mERROR:[0m IO88jldhpAA: YouTube said: This video is unavailable.
Sorry about that.
downloaded 722/751
downloaded 723/751
[0;31mERROR:[0m SbFouxkBAxw: YouTube said: This video is unavailable.
Sorry about that.
downloaded 724/751
downloaded 725/751
downloaded 726/751
downloaded 727/751
downloaded 728/751
downloaded 729/751
downloaded 730/751
downloaded 731/751
downloaded 732/751
downloaded 733/751
downloaded 734/751
downloaded 735/751
downloaded 736/751
downloaded 737/751
downloaded 738/751
downloaded 739/751
downloaded 740/751
downloaded 741/751
downloaded 742/751
downloaded 743/751
downloaded 744/751
downloaded 745/751
downloaded 746/751
downloaded 747/751
dow

In [134]:
# Pick one downloaded file to inspect.
!ls downloaded/cough | head -n 1

-5dCv5_nvU8.wav


In [135]:
!ffmpeg downloaded/cough/-5dCv5_nvU8.wav

ffmpeg version 4.2.2 Copyright (c) 2000-2019 the FFmpeg developers
  built with Apple clang version 11.0.0 (clang-1100.0.33.17)
  configuration: --prefix=/usr/local/Cellar/ffmpeg/4.2.2_2 --enable-shared --enable-pthreads --enable-version3 --enable-avresample --cc=clang --host-cflags= --host-ldflags= --enable-ffplay --enable-gnutls --enable-gpl --enable-libaom --enable-libbluray --enable-libmp3lame --enable-libopus --enable-librubberband --enable-libsnappy --enable-libtesseract --enable-libtheora --enable-libvidstab --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx264 --enable-libx265 --enable-libxvid --enable-lzma --enable-libfontconfig --enable-libfreetype --enable-frei0r --enable-libass --enable-libopencore-amrnb --enable-libopencore-amrwb --enable-libopenjpeg --enable-librtmp --enable-libspeex --enable-libsoxr --enable-videotoolbox --disable-libjack --disable-indev=jack
  libavutil      56. 31.100 / 56. 31.100
  libavcodec     58. 54.100 / 58. 54.100
  libavform

In [150]:
import glob
import os

In [189]:
# Both class directories should be present.
!ls downloaded

[1m[36mcough[m[m     [1m[36mnot_cough[m[m


In [183]:
!rm process_coughs_from_downloads.sh

In [191]:

def generated_process_file(dir_name):
    """Write ``process_<dir_name>_from_downloads.sh`` for one class directory.

    For every ``downloaded/<dir_name>/*.wav`` file the script holds an ffmpeg
    command that re-encodes it as 16 kHz, mono, signed 16-bit PCM into
    ``processed/<dir_name>/`` under the same file name, followed by an
    ``echo`` progress line. Files appear in whatever order ``glob`` returns them.
    """
    wav_paths = glob.glob('downloaded/{}/*.wav'.format(dir_name))
    total = len(wav_paths)

    pieces = []
    for position, wav_path in enumerate(wav_paths, start=1):
        pieces.append(
            "\n ffmpeg -y -hide_banner -loglevel panic -i {} -acodec pcm_s16le -ac 1 -ar 16000 processed/{}/{}".format(
                wav_path, dir_name, os.path.basename(wav_path)
            )
        )
        pieces.append("\n echo 'done {}/{}' ".format(position, total))

    with open("process_{}_from_downloads.sh".format(dir_name), "w+") as f:
        f.write("".join(pieces))

In [193]:
# Writes process_cough_from_downloads.sh
generated_process_file("cough")

In [201]:
# Writes process_not_cough_from_downloads.sh
generated_process_file("not_cough")

In [188]:
# Output directories the generated conversion scripts write into.
!mkdir -p processed/cough
!mkdir -p processed/not_cough

In [187]:
!sh process_coughs_from_downloads.sh

done 1/591
done 2/591
done 3/591
done 4/591
done 5/591
done 6/591
done 7/591
done 8/591
done 9/591
done 10/591
done 11/591
done 12/591
done 13/591
done 14/591
done 15/591
done 16/591
done 17/591
done 18/591
done 19/591
done 20/591
done 21/591
done 22/591
done 23/591
done 24/591
done 25/591
done 26/591
done 27/591
done 28/591
done 29/591
done 30/591
done 31/591
done 32/591
done 33/591
done 34/591
done 35/591
done 36/591
done 37/591
done 38/591
done 39/591
done 40/591
done 41/591
done 42/591
done 43/591
done 44/591
done 45/591
done 46/591
done 47/591
done 48/591
done 49/591
done 50/591
done 51/591
done 52/591
done 53/591
done 54/591
done 55/591
done 56/591
done 57/591
done 58/591
done 59/591
done 60/591
done 61/591
done 62/591
done 63/591
done 64/591
done 65/591
done 66/591
done 67/591
done 68/591
done 69/591
done 70/591
done 71/591
done 72/591
done 73/591
done 74/591
done 75/591
done 76/591
done 77/591
done 78/591
done 79/591
done 80/591
done 81/591
done 82/591
done 83/591
done 84/591
d

In [202]:
# Run the script written by generated_process_file("not_cough").
!sh process_not_cough_from_downloads.sh

done 1/587
done 2/587
done 3/587
done 4/587
done 5/587
done 6/587
done 7/587
done 8/587
done 9/587
done 10/587
done 11/587
done 12/587
done 13/587
done 14/587
done 15/587
done 16/587
done 17/587
done 18/587
done 19/587
done 20/587
done 21/587
done 22/587
done 23/587
done 24/587
done 25/587
done 26/587
done 27/587
done 28/587
done 29/587
done 30/587
done 31/587
done 32/587
done 33/587
done 34/587
done 35/587
done 36/587
done 37/587
done 38/587
done 39/587
done 40/587
done 41/587
done 42/587
done 43/587
done 44/587
done 45/587
done 46/587
done 47/587
done 48/587
done 49/587
done 50/587
done 51/587
done 52/587
done 53/587
done 54/587
done 55/587
done 56/587
done 57/587
done 58/587
done 59/587
done 60/587
done 61/587
done 62/587
done 63/587
done 64/587
done 65/587
done 66/587
done 67/587
done 68/587
done 69/587
done 70/587
done 71/587
done 72/587
done 73/587
done 74/587
done 75/587
done 76/587
done 77/587
done 78/587
done 79/587
done 80/587
done 81/587
done 82/587
done 83/587
done 84/587
d

In [180]:
# Manual check of the conversion settings on a single clip: 16 kHz, mono, signed 16-bit PCM.
!ffmpeg -y -i "downloaded/cough/-5dCv5_nvU8.wav" -acodec pcm_s16le -ac 1 -ar 16000 out.wav

ffmpeg version 4.2.2 Copyright (c) 2000-2019 the FFmpeg developers
  built with Apple clang version 11.0.0 (clang-1100.0.33.17)
  configuration: --prefix=/usr/local/Cellar/ffmpeg/4.2.2_2 --enable-shared --enable-pthreads --enable-version3 --enable-avresample --cc=clang --host-cflags= --host-ldflags= --enable-ffplay --enable-gnutls --enable-gpl --enable-libaom --enable-libbluray --enable-libmp3lame --enable-libopus --enable-librubberband --enable-libsnappy --enable-libtesseract --enable-libtheora --enable-libvidstab --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx264 --enable-libx265 --enable-libxvid --enable-lzma --enable-libfontconfig --enable-libfreetype --enable-frei0r --enable-libass --enable-libopencore-amrnb --enable-libopencore-amrwb --enable-libopenjpeg --enable-librtmp --enable-libspeex --enable-libsoxr --enable-videotoolbox --disable-libjack --disable-indev=jack
  libavutil      56. 31.100 / 56. 31.100
  libavcodec     58. 54.100 / 58. 54.100
  libavformat   

In [137]:
# List the working directory to confirm out.wav was written.
!ls

AudioSetDownload.ipynb [1m[36mdownloaded[m[m             not_cough_downloads.sh
cough_downloads.sh     dwl_script.sh          out.wav


In [139]:
import IPython
# Build an inline audio player for the converted clip. The object is only
# assigned here, so this cell renders nothing unless `wid` is displayed afterwards.
wid = IPython.display.Audio("out.wav")