In [None]:
# Path to ffmpeg
ffmpeg_path = '/usr/bin/ffmpeg'

%matplotlib inline
import sys
import os.path
# Make sure ffmpeg is on the path so sk-video can find it
sys.path.append(os.path.dirname(ffmpeg_path))
import skvideo.io
import cv2
import matplotlib.pyplot as plt
import numpy as np
import pafy
import soundfile as sf
import subprocess as sp
import random

In [2]:
# Set output settings
audio_codec = 'wav'
audio_container = 'wav'
video_codec = 'h264'
video_container = 'mp4'

In [3]:
# Load the AudioSet training set
with open('../data/unbalanced_train_segments.csv') as f:
    lines = f.readlines()

dl_list = [[s.replace('\"', '').replace(' ','') for s in line.strip().split(',')] for line in lines[3:]]

In [4]:
# Select a YouTube video from the training set
ytid, ts_start, ts_end = dl_list[1][0:3]
ts_start, ts_end = float(ts_start), float(ts_end)
duration = ts_end - ts_start

print("YouTube ID: " + ytid)
print("Trim Window: ({}, {})".format(ts_start, ts_end))

YouTube ID: ---2_BBVHAA
Trim Window: (30.0, 40.0)


In [5]:
# Get the URL to the video page
video_page_url = 'https://www.youtube.com/watch?v={}'.format(ytid)

# Get the direct URLs to the videos with best audio and with best video (with audio)
video = pafy.new(video_page_url)

best_audio = video.getbestaudio()
best_audio_url = best_audio.url
print("Audio URL: " + best_audio_url)

Audio URL: https://r5---sn-4g5e6nsz.googlevideo.com/videoplayback?fvip=5&lmt=1460070157849997&ei=BkrSWrvtMMfh1gK6ur_gCQ&id=o-ADFIBS-jPS9TnfyiR47-ZwI6hOrI1N9u9F5Uza51Vpao&initcwndbps=1813750&sparams=clen%2Cdur%2Cei%2Cgir%2Cid%2Cinitcwndbps%2Cip%2Cipbits%2Citag%2Ckeepalive%2Clmt%2Cmime%2Cmm%2Cmn%2Cms%2Cmv%2Cpl%2Crequiressl%2Csource%2Cexpire&beids=%5B9466593%5D&ip=2001%3A4ca0%3A0%3Af296%3A37aa%3A6b63%3A4536%3Aae7&keepalive=yes&source=youtube&ms=au%2Conr&mv=m&mt=1523730855&mn=sn-4g5e6nsz%2Csn-i5heen7s&mm=31%2C26&requiressl=yes&clen=6934222&key=yt6&itag=251&ipbits=0&signature=B59598842AE5CA9323876AC4A071CA6047EF8F.7D966CFBEAFBEBE6AEDC32508ACF1CE5B9505AD0&expire=1523752550&dur=400.881&pl=32&gir=yes&c=WEB&mime=audio%2Fwebm&ratebypass=yes


In [6]:
# Get output video and audio filepaths
basename_fmt = '{}_{}_{}'.format(ytid, int(ts_start*1000), int(ts_end*1000))
audio_filepath = os.path.join('.', basename_fmt + '.' + audio_codec)

In [7]:
# Download the audio
audio_dl_args = [ffmpeg_path, 
    '-ss', str(ts_start),    # The beginning of the trim window
    '-i', best_audio_url,    # Specify the input video URL
    '-t', str(duration),     # Specify the duration of the output
    '-vn',                   # Suppress the video stream
    '-ac', '2',              # Set the number of channels
    '-y',                    # overwrite
    '-sample_fmt', 's16',    # Specify the bit depth
    #'-acodec', audio_codec,  # Specify the output encoding
    '-ar', '44100',          # Specify the audio sample rate
    audio_filepath]

proc = sp.Popen(audio_dl_args, stdout=sp.PIPE, stderr=sp.PIPE)
stdout, stderr = proc.communicate()
if proc.returncode != 0:
    print(stderr)
else:
    print("Downloaded audio to " + audio_filepath)

Downloaded audio to ./---2_BBVHAA_30000_40000.wav


In [8]:
# Load the AudioSet training set
with open('../data/class_labels_indices.csv') as f:
    lines = f.readlines()

cl_list = [line.strip().split(',')[0:3] for line in lines[1:]]

In [9]:
index_dictionary = {b : c for a,b,c in cl_list}

In [10]:
file_labelling = {a[0]:[index_dictionary[c] for c in a[3:]] for a in dl_list}

In [11]:
file_labelling[dl_list[5][0]]

['"Boat', '"Vehicle"']

In [12]:
reverse_index_dictionary = {c:b for a,b,c in cl_list}

In [13]:
#file_labelling_reverse = {a:[x[0] for x in dl_list if reverse_index_dictionary[a] in x[3:]] for a in reverse_index_dictionary}

In [15]:
#file_labelling_reverse
file_labelling_reverse = {}
for x in dl_list:
    for y in file_labelling[x[0]]:
        if y in file_labelling_reverse:
            file_labelling_reverse[y].append(x[0])
        else:
            file_labelling_reverse[y] = [x[0]]

In [19]:
def dlfile(ytid, ts_start, ts_end):
    # Set output settings
    audio_codec = 'wav'
    audio_container = 'wav'
    video_codec = 'h264'
    video_container = 'mp4'
    ts_start, ts_end = float(ts_start), float(ts_end)
    duration = ts_end - ts_start

    # Get output video and audio filepaths
    basename_fmt = '{}_{}_{}'.format(ytid, int(ts_start*1000), int(ts_end*1000))
    audio_filepath = os.path.join('.', basename_fmt + '.' + audio_codec)
    # Download the audio
    
    # Get the URL to the video page
    video_page_url = 'https://www.youtube.com/watch?v={}'.format(ytid)

    # Get the direct URLs to the videos with best audio and with best video (with audio)
    video = pafy.new(video_page_url)
    
    best_audio = video.getbestaudio()
    best_audio_url = best_audio.url
    
    audio_dl_args = [ffmpeg_path, 
    '-ss', str(ts_start),    # The beginning of the trim window
    '-i', best_audio_url,    # Specify the input video URL
    '-t', str(duration),     # Specify the duration of the output
    '-vn',                   # Suppress the video stream
    '-ac', '2',              # Set the number of channels
    '-y',                    # overwrite
    '-sample_fmt', 's16',    # Specify the bit depth
    #'-acodec', audio_codec,  # Specify the output encoding
    '-ar', '44100',          # Specify the audio sample rate
    audio_filepath]

    proc = sp.Popen(audio_dl_args, stdout=sp.PIPE, stderr=sp.PIPE)
    stdout, stderr = proc.communicate()
    if proc.returncode != 0:
        print(stderr)
    return audio_filepath

In [20]:
def dl_random_file():
    ytid, ts_start, ts_end = random.choice(dl_list)[:3]
    return dlfile(ytid, ts_start, ts_end), ytid, file_labelling[ytid]

In [21]:
dl_random_file()

('./HX_-gtGMjhs_30000_40000.wav', 'HX_-gtGMjhs', ['"Speech"'])