# Keyframe extraction using sound
We want to extract the start and stop times for our abilities using sound. So first we must translate all the videos in our dataset to `.wav` or audio files that can then be analyzed using `librosa`.

In [3]:
import os

import ffmpeg
import threading

os.makedirs("Data/Audios", exist_ok=True)

def run_ffmpeg_with_timeout(command_func, timeout):
    result = {}
    def target():
        try:
            command_func()
            result["success"] = True
        except Exception as e:
            result["error"] = e

    thread = threading.Thread(target=target)
    thread.start()
    thread.join(timeout)

    if thread.is_alive():
        raise TimeoutError("FFmpeg process timed out.")

    if "error" in result:
        raise result["error"]

def webm_to_wav(input_file, output_file, timeout=1):
    try:
        run_ffmpeg_with_timeout(
            lambda: (
                ffmpeg
                .input(input_file)
                .output(output_file, format='wav')
                .overwrite_output()
                .run(capture_stdout=True, capture_stderr=True)
            ),
            timeout
        )
        print(f"Converted {input_file} to {output_file}")
    except TimeoutError:
        print(f"Timeout: Conversion of {input_file} took longer than {timeout} seconds.")
    except ffmpeg.Error as e:
        print(f"Error occurred:\n{e.stderr.decode()}")


In [4]:
path = "Data/Videos/"

for file in os.listdir(path):
    input_file = os.path.join(path, file)
    if file.endswith(".webm"):
        output = input_file.replace(".webm", ".wav").replace("Videos", "Audios", 1)
        webm_to_wav(input_file, output)

Timeout: Conversion of Data/Videos/Akshan_W_.webm took longer than 1 seconds.
Timeout: Conversion of Data/Videos/Ashe_R_.webm took longer than 1 seconds.
Timeout: Conversion of Data/Videos/Ezreal_E.webm took longer than 1 seconds.
Timeout: Conversion of Data/Videos/Amumu_E_.webm took longer than 1 seconds.
Timeout: Conversion of Data/Videos/Braum_W.webm took longer than 1 seconds.
Timeout: Conversion of Data/Videos/Ashe_Passive_.webm took longer than 1 seconds.
Timeout: Conversion of Data/Videos/Anivia_Passive_.webm took longer than 1 seconds.
Timeout: Conversion of Data/Videos/Aatrox_W_.webm took longer than 1 seconds.
Timeout: Conversion of Data/Videos/Anivia_E_.webm took longer than 1 seconds.
Converted Data/Videos/Seraphine_Q.webm to Data/Audios/Seraphine_Q.wav
Timeout: Conversion of Data/Videos/Akali_W_.webm took longer than 1 seconds.
Timeout: Conversion of Data/Videos/Akshan_Q_.webm took longer than 1 seconds.
Converted Data/Videos/Diana_R.webm to Data/Audios/Diana_R.wav
Convert

In [5]:
import librosa
import numpy as np

# TODO: Test with some type of filter
def get_sound_start_end_with_noise_gate(audio_file, threshold_ratio=0.1, padding=0.1):
    """
    Returns start and end times of a sound using an adaptive noise gate.
    
    :param audio_file: Path to the audio file.
    :param threshold_ratio: Proportion of the max amplitude to consider as noise (e.g., 0.1 = 10%).
    :param padding: Time (in seconds) to extend start and end times for safety.
    """
    y, sr = librosa.load(audio_file)

    envelope = np.abs(y)
    noise_threshold = threshold_ratio * np.max(envelope)
    sound_indices = np.where(envelope > noise_threshold)[0]

    if len(sound_indices) == 0:
        # No significant sound above thresholds
        return None, None

    start_idx = sound_indices[0]
    end_idx = sound_indices[-1]

    start_time = max(0, librosa.samples_to_time(start_idx, sr=sr) - padding)
    end_time = librosa.samples_to_time(end_idx, sr=sr) + padding

    return start_time, end_time

In [6]:
AUDIO_PATH = "Data/Audios/"
threshold_ratio = 0.05

def get_sound_start_ends(audio_path, threshold_ratio):
    thresholds = {}
    for sound_file in os.listdir(audio_path):
        filename = os.path.join(audio_path, sound_file)
        start, end = get_sound_start_end_with_noise_gate(filename, threshold_ratio=threshold_ratio)
        if start is not None and end is not None:
            thresholds[sound_file.removesuffix('.wav')] = np.array([start, end])
    return thresholds

In [9]:
threshold_ratios = np.arange(0.1, 0.15, step=0.01)
audio_start_stops_by_threshold = []
for threshold_ratio in threshold_ratios:
    audio_start_stops_by_threshold.append(get_sound_start_ends(AUDIO_PATH, threshold_ratio))

ImportError: Numba needs NumPy 2.0 or less. Got NumPy 2.1.

In [129]:
average_audio_start_stops = {}
for key in audio_start_stops_by_threshold[0].keys():
    values = np.array([audio_start_stop[key] for audio_start_stop in audio_start_stops_by_threshold])
    average_audio_start_stops[key] = np.mean(values, axis=0)

average_audio_start_stops

{'Aatrox_I': array([1.02161451, 2.12857143]),
 'Ezreal_W': array([0.18619501, 3.3700771 ]),
 'Cho%27Gath_Q': array([2.58794558, 5.69720635]),
 'Rek%27Sai_R': array([ 0.89297052, 15.09980045]),
 'Annie_Q': array([0.30193197, 6.15759637]),
 'Seraphine_I': array([ 0.3785941 , 11.42092517]),
 'Amumu_E_': array([0.26556009, 2.1562449 ]),
 'Ekko_I': array([ 7.24837188, 11.8313288 ]),
 'Cho%27Gath_R': array([ 2.0568254 , 10.62470748]),
 'Cho%27Gath_E': array([ 1.58175964, 12.21933787]),
 'Ashe_R_': array([0.56698413, 3.95856689]),
 'Anivia_Passive_': array([ 0.08185034, 10.91654422]),
 'Ashe_Passive_': array([0.44441723, 8.48327438]),
 'Annie_E': array([0.46146939, 9.17856689]),
 'Annie_R': array([0.29657143, 6.69629025]),
 'Azir_I': array([1.01963719, 7.76175057]),
 'Anivia_R_': array([0.26936961, 8.11219955]),
 'Amumu_Q_': array([0.44006349, 1.57580952]),
 'Ezreal_Q': array([0.54224036, 6.2832381 ]),
 'Akali_W_': array([ 0.79597279, 12.2401542 ]),
 'Cho%27Gath_W': array([0.39096599, 4.12258

In [130]:
import statistics
import numpy as np

def analyze_global_error(audio_start_stops_by_threshold):
    keys = audio_start_stops_by_threshold[0].keys()    
    key_errors = {key: [] for key in keys}
    
    for key in keys:
        values = [d[key] for d in audio_start_stops_by_threshold]
        for i in range(len(values)):
            for j in range(i + 1, len(values)):
                key_errors[key].append(abs(np.subtract(values[i], values[j])).sum())
    
    key_errors = {key: statistics.mean(errors) for key, errors in key_errors.items()}
    overall_mae = statistics.mean(key_errors.values())
    
    return overall_mae, key_errors


In [133]:
overall_mae, mae_by_video = analyze_global_error(audio_start_stops_by_threshold)

# Extract those audios for which we are more or less sure - have little background noise for example
low_error_audio_start_stop = [key for key, value in mae_by_video.items() if value < 0.01]
low_error_audio_start_stop

['Seraphine_I',
 'Amumu_E_',
 'Cho%27Gath_E',
 'Amumu_Q_',
 'Quinn_I',
 'Akali_E_',
 'Elise_I',
 'Brand_Q',
 'Alistar_W_',
 'Akali_Q_',
 'Blitzcrank_E',
 'Ahri_QVideo',
 'Elise_W',
 'Anivia_W_',
 'Ahri_Q',
 'Akshan_Passive_',
 'Graves_I',
 'Cassiopeia_R',
 'Draven_R',
 'Aatrox_W_',
 'Seraphine_R']

In [137]:
"Braum_W", average_audio_start_stops["Braum_W"]

('Braum_W', array([0.62175057, 7.68696599]))