In [None]:
!mkdir -p ../../audio_set

In [None]:
# Download the segment list from http://storage.googleapis.com/us_audioset/youtube_corpus/v1/csv/unbalanced_train_segments.csv
# and put it in ../../data/audio_set (the ROOT_DIR used below).

In [1]:
import io
import pandas as pd
from collections import deque

In [2]:
# Directory that holds the segment list and every file derived from it.
ROOT_DIR = "../../data/audio_set"

# Segment list that is read as input, and the copy written from it.
RAW_FILE = "/".join([ROOT_DIR, "unbalanced_train_segments.csv"])
CLEANED_FILE = "/".join([ROOT_DIR, "cleaned_train_segments.csv"])

# CSV with a header row that is later loaded with pandas.
PANDAS_IN_FILE = "/".join([ROOT_DIR, "cough_in_train_segments.csv"])

In [3]:
# Remove the first 3 lines of file
def efficient_dropfirst(f, nf, dropfirst=1, buffersize=3):
    """Copy ``f`` to ``nf`` while skipping the first ``dropfirst`` lines.

    Lines travel through a look-ahead buffer of up to ``buffersize`` lines.
    Before every write the read handle is moved to the current write offset
    and afterwards restored to where reading stopped.
    NOTE(review): that seek dance suggests the routine was written so that
    ``f`` and ``nf`` may be the same handle (in-place rewrite) -- confirm
    before simplifying it.

    Args:
        f: Readable, seekable file object; it is rewound to the start first.
        nf: Writable file object receiving the kept lines. It is truncated
            at its position after the last write.
        dropfirst: Number of leading lines to discard.
        buffersize: Maximum number of lines held back before writing starts.
    """
    f.seek(0)
    buffer = deque()
    tail_pos = 0
    # Discard the unwanted leading lines.
    for _ in range(dropfirst):
        f.readline()
    # Pre-fill the look-ahead buffer; stop early if the input is shorter
    # than the buffer so no empty end-of-file markers are queued.
    for _ in range(buffersize):
        line = f.readline()
        if not line:
            break
        buffer.append(line)
    line = f.readline()
    while line:
        buffer.append(line)
        head_pos = f.tell()
        f.seek(tail_pos)
        # write() returns the amount written, which advances the write offset.
        tail_pos += nf.write(buffer.popleft())
        f.seek(head_pos)
        line = f.readline()
    f.seek(tail_pos)
    # Finally, flush what is still buffered. Each line is popped exactly
    # once: popping twice per iteration would drop every other line and
    # fail with IndexError on an odd number of buffered lines.
    while buffer:
        nf.write(buffer.popleft())
    nf.truncate()

In [37]:
# Peek at the first lines of the raw segment list to see its layout.
!head  $RAW_FILE

# Segments csv created Sun Mar  5 10:56:58 2017
# num_ytids=2041789, num_segs=2041789, num_unique_labels=527, num_positive_labels=4020212
# YTID, start_seconds, end_seconds, positive_labels
---1_cCGK4M, 0.000, 10.000, "/m/01g50p,/m/0284vy3,/m/06d_3,/m/07jdr,/m/07rwm0c"
---2_BBVHAA, 30.000, 40.000, "/m/09x0r"
---B_v8ZoBY, 30.000, 40.000, "/m/04rlf"
---EDNidJUA, 30.000, 40.000, "/m/02qldy,/m/02zsn,/m/05zppz,/m/09x0r"
---N4cFAE1A, 21.000, 31.000, "/m/04rlf,/m/09x0r"
---fcVQUf3E, 30.000, 40.000, "/m/019jd,/m/07yv9"
---g9OGAhwc, 30.000, 40.000, "/m/04rlf,/m/0c1dj"


In [46]:
# Ask efficient_dropfirst to copy RAW_FILE into CLEANED_FILE, skipping the
# first 3 lines and using a 10-line look-ahead buffer.
with open(RAW_FILE) as raw, open(CLEANED_FILE, "w") as cleaned:
    efficient_dropfirst(raw, cleaned, dropfirst=3, buffersize=10)

In [40]:
# Show the first two lines of the cleaned file.
!head -n 2 $CLEANED_FILE

---1_cCGK4M, 0.000, 10.000, "/m/01g50p,/m/0284vy3,/m/06d_3,/m/07jdr,/m/07rwm0c"
---2_BBVHAA, 30.000, 40.000, "/m/09x0r"


In [60]:
!rm $PANDAS_IN_FILE

In [70]:
# Clean up for pandas: write a header row and turn the comma-separated label
# list (everything after the third comma) into one space-separated field, so
# each record has exactly four comma-separated columns.

with open(CLEANED_FILE, "r") as cf, open(PANDAS_IN_FILE, "w") as pin:
    pin.write("youtubeid,start,end,tags\n")
    # Stream record by record instead of collecting every line in a list.
    for line in cf:
        # Drop the line terminator: it used to stay attached to the last
        # field and, combined with the "\n" separator, left a blank line
        # after every record in the output.
        line = line.rstrip("\r\n")
        if not line:
            continue
        cols = line.split(",")
        pin.write(",".join([cols[0], cols[1], cols[2], " ".join(cols[3:])]) + "\n")
            
            
            
            

In [71]:
# Show the header row and the first record of the pandas input file.
!head -n 2 $PANDAS_IN_FILE

youtubeid,start,end,tags
---1_cCGK4M, 0.000, 10.000, "/m/01g50p /m/0284vy3 /m/06d_3 /m/07jdr /m/07rwm0c"


In [28]:
# Load the four-column CSV (youtubeid, start, end, tags) written above.
candidate_df = pd.read_csv(PANDAS_IN_FILE)

In [39]:
# Keep the rows whose tag list mentions the label id "/m/01b_21", which this
# notebook treats as the cough class.
coughs_df = candidate_df.loc[
    candidate_df["tags"].str.contains("/m/01b_21")
]

In [95]:
# Complement of the cough selection: every row whose tag list does not
# mention the label id "/m/01b_21".
not_coughs_df = candidate_df.loc[
    ~candidate_df["tags"].str.contains("/m/01b_21")
]

In [126]:
# Balance the classes: draw as many non-cough rows as there are cough rows.
# The pool is rebuilt from candidate_df here so that re-running this cell
# always samples from the full non-cough set instead of from its own
# previous output; the fixed random_state keeps the draw reproducible.
not_coughs_df = candidate_df[~candidate_df["tags"].str.contains("/m/01b_21")].sample(
    n=len(coughs_df), random_state=2312
)

In [0]:
# NEXT STEPS
# Get the cough videos, then download them and convert them to WAV (PCM) audio using ffmpeg

In [35]:
def intToTime(time):
    """Format a number of seconds as an ``M:SS`` string.

    Fractional seconds are truncated. Seconds are zero-padded to two digits
    (65 -> "1:05") and a negative input yields a single leading minus sign
    (-65 -> "-1:05").

    Args:
        time: Number of seconds (int or float).

    Returns:
        The formatted time as a string.
    """
    total = int(time)
    sign = "-" if total < 0 else ""
    mins, secs = divmod(abs(total), 60)

    return "{}{}:{:02d}".format(sign, mins, secs)

In [112]:
def genCmd(cough, name):
    """Build the shell command that saves one clip as a WAV file.

    The command asks youtube-dl for the stream URLs of the video, keeps the
    second line of its output, and passes it to ffmpeg as the input,
    preceded by the seek position. The result is written to
    ``downloaded/<name>/<youtubeid>.wav``.

    Args:
        cough: Row-like object with ``youtubeid``, ``start`` and ``end``
            attributes; ``start``/``end`` are seconds (numbers or numeric
            strings).
        name: Sub-folder of ``downloaded/`` that receives the file.

    Returns:
        The command line as a string.
    """
    start = float(cough.start)
    # ffmpeg's -t expects the clip duration, not the end timestamp.
    duration = float(cough.end) - start

    cmd = "ffmpeg -y -hide_banner -loglevel panic $(youtube-dl -g 'https://youtube.com/watch?v=" + cough.youtubeid
    cmd = cmd + "'"
    # Placeholders in order: seek position, duration, target folder, file name.
    cmd = cmd + """ | sed -n '2p' | sed "s/.*/-ss {} -i &/") -t {} downloaded/{}/{}.wav  """.format(start, duration, name, cough.youtubeid)

    return cmd

In [114]:
# One command string per selected row; the extra positional argument is the
# folder name handed to genCmd.
cough_scripts = list(coughs_df.apply(genCmd, axis=1, args=("cough",)))

not_cough_scripts = list(not_coughs_df.apply(genCmd, axis=1, args=("not_cough",)))

In [120]:
def gen_script(scripts, script_name):
    """Write the commands in ``scripts`` to ``./<script_name>.sh``.

    Every command is followed by an ``echo`` line that reports its 1-based
    position and the total number of commands. The echo is unconditional:
    it is emitted whether or not the preceding command succeeds.

    Args:
        scripts: Sequence of shell command strings.
        script_name: File name without the ``.sh`` extension.
    """
    script_path = "./{}.sh".format(script_name)
    with open(script_path, "w+") as out:
        total = len(scripts)
        for position, command in enumerate(scripts, start=1):
            out.write(command + " \n")
            out.write("echo 'downloaded {}/{}' \n".format(position, total))

In [117]:
# Output folder used by the commands generated with name "cough".
!mkdir -p downloaded/cough

In [118]:
# Output folder used by the commands generated with name "not_cough".
!mkdir -p downloaded/not_cough

In [121]:
# Writes ./cough_downloads.sh
gen_script(cough_scripts, "cough_downloads")

In [129]:
# Writes ./not_cough_downloads.sh
gen_script(not_cough_scripts, "not_cough_downloads")

In [None]:
!sh 3cough_downloads.sh

In [130]:
# Run the generated not-cough download script. Each "downloaded i/N" line in
# the output comes from an unconditional echo and is printed even when the
# preceding command reported an error.
!sh not_cough_downloads.sh

downloaded 1/751
downloaded 2/751
downloaded 3/751
downloaded 4/751
downloaded 5/751
downloaded 6/751
downloaded 7/751
downloaded 8/751
If the owner of this video has granted you access, please sign in.
       This video is private.
downloaded 9/751
downloaded 10/751
If the owner of this video has granted you access, please sign in.
       This video is private.
downloaded 11/751
downloaded 12/751
downloaded 13/751
downloaded 14/751
downloaded 15/751
downloaded 16/751
downloaded 17/751
downloaded 18/751
downloaded 19/751
downloaded 20/751
downloaded 21/751
downloaded 22/751
downloaded 23/751
downloaded 24/751
downloaded 25/751
downloaded 26/751
downloaded 27/751
downloaded 28/751
downloaded 29/751
downloaded 30/751
downloaded 31/751
downloaded 32/751
downloaded 33/751
downloaded 34/751
downloaded 35/751
downloaded 36/751
downloaded 37/751
downloaded 38/751
downloaded 39/751
downloaded 40/751
[0;31mERROR:[0m qk6QUuBIFa0: YouTube said: This video is unavailable.
Sorry about that.
downl

downloaded 222/751
downloaded 223/751
downloaded 224/751
downloaded 225/751
downloaded 226/751
downloaded 227/751
[0;31mERROR:[0m N4EEFZ5dyY8: YouTube said: This video is unavailable.
Sorry about that.
downloaded 228/751
downloaded 229/751
downloaded 230/751
downloaded 231/751
downloaded 232/751
downloaded 233/751
downloaded 234/751
downloaded 235/751
downloaded 236/751
downloaded 237/751
downloaded 238/751
downloaded 239/751
downloaded 240/751
downloaded 241/751
downloaded 242/751
downloaded 243/751
downloaded 244/751
[0;31mERROR:[0m W3BFRCmDdWQ: YouTube said: This video contains content from Disney, who has blocked it on copyright grounds.
Sorry about that.
downloaded 245/751
downloaded 246/751
downloaded 247/751
downloaded 248/751
downloaded 249/751
downloaded 250/751
downloaded 251/751
downloaded 252/751
downloaded 253/751
[0;31mERROR:[0m fslMZNQMAPU: YouTube said: This video is unavailable.
Sorry about that.
downloaded 254/751
[0;31mERROR:[0m 9t5-6ukWISQ: YouTube said: Thi

downloaded 449/751
[0;31mERROR:[0m wKfnrkETEJM: YouTube said: This video is unavailable.
Sorry about that.
downloaded 450/751
downloaded 451/751
downloaded 452/751
[0;31mERROR:[0m mG3e9_Tw4Ns: YouTube said: This video is unavailable.
Sorry about that.
downloaded 453/751
downloaded 454/751
downloaded 455/751
downloaded 456/751
downloaded 457/751
downloaded 458/751
[0;31mERROR:[0m quO_wHYaowY: YouTube said: This video is unavailable.
Sorry about that.
downloaded 459/751
downloaded 460/751
downloaded 461/751
downloaded 462/751
[0;31mERROR:[0m FjPJOkXXvvA: YouTube said: This video contains content from TMSanime, who has blocked it in your country on copyright grounds.
Sorry about that.
downloaded 463/751
downloaded 464/751
downloaded 465/751
[0;31mERROR:[0m IVzguNqubd0: YouTube said: This video is unavailable.
Sorry about that.
downloaded 466/751
downloaded 467/751
downloaded 468/751
downloaded 469/751
downloaded 470/751
downloaded 471/751
downloaded 472/751
downloaded 473/751
do

downloaded 705/751
downloaded 706/751
downloaded 707/751
downloaded 708/751
downloaded 709/751
downloaded 710/751
downloaded 711/751
downloaded 712/751
downloaded 713/751
downloaded 714/751
downloaded 715/751
downloaded 716/751
downloaded 717/751
downloaded 718/751
downloaded 719/751
downloaded 720/751
downloaded 721/751
[0;31mERROR:[0m IO88jldhpAA: YouTube said: This video is unavailable.
Sorry about that.
downloaded 722/751
downloaded 723/751
[0;31mERROR:[0m SbFouxkBAxw: YouTube said: This video is unavailable.
Sorry about that.
downloaded 724/751
downloaded 725/751
downloaded 726/751
downloaded 727/751
downloaded 728/751
downloaded 729/751
downloaded 730/751
downloaded 731/751
downloaded 732/751
downloaded 733/751
downloaded 734/751
downloaded 735/751
downloaded 736/751
downloaded 737/751
downloaded 738/751
downloaded 739/751
downloaded 740/751
downloaded 741/751
downloaded 742/751
downloaded 743/751
downloaded 744/751
downloaded 745/751
downloaded 746/751
downloaded 747/751
dow

In [None]:
cou