# Donders MML: Video Clipping and Video-Audio Alignment
![Donders MML LOGO.png](attachment:dc660b39-b1c5-41cc-b996-3d46c637f152.png)

### Info Documents 
Location Repository
Github Repository 
Jupyter Notebook


### Requirements
Please install the required packages listed in `requirements.txt` by running `pip install -r requirements.txt`.

In [2]:
import os             # Importing the os module which provides functions for interacting with the operating system
import pyxdf          # Importing pyxdf, a Python library for reading XDF files
import glob           # Importing the glob module which helps in finding files/directories with specific patterns
import pandas as pd   # Importing pandas library (abbreviated as pd), which is used for data manipulation and analysis
import numpy as np    # Importing numpy library (abbreviated as np), which is used for numerical computations
import wave           # Importing wave module for reading and writing WAV files (usually audio files) 
import struct         # Importing struct module which provides functions to convert between Python values and C structs
import math           # Importing math module which provides mathematical functions
import random         # Importing random module for generating random numbers
from scipy.io import wavfile  # Importing wavfile module from scipy.io (a library built on numpy), for reading and writing WAV files
import noisereduce as nr      # Importing noisereduce module for noise reduction in audio signals
import json            # Importing json module for working with JSON data
import cv2            # Importing OpenCV library for computer vision tasks
from moviepy.editor import (                # Importing various classes and functions from moviepy.editor module
                            VideoFileClip,  # Class for working with video files
                            AudioFileClip,  # Class for working with audio files
                            CompositeAudioClip)  # Class for composing audio clip
from moviepy.video.io.VideoFileClip import VideoFileClip
from moviepy.video.io.ffmpeg_tools import ffmpeg_extract_subclip  # video  clipping fucntion 
from moviepy.video.io.VideoFileClip import VideoFileClip          # alternative video clipping function
import matplotlib.pyplot as plt                                   # Importing pyplot library to create figures and plot data 
from matplotlib.widgets import Slider  
import tkinter                                                    # GUI toolkit to open and save files
from tkinter import filedialog                                    # GUI toolkit to open and save files
import subprocess 
from tqdm.notebook import tqdm


print("Everything was imported succesfully") #as terminal

Everything was imported succesfully


In [None]:
# ------------ PATHS -----------------------------------------------------
# All paths are relative to the directory the notebook is run from.
input_video_folder      = './video_raw/'      # this folder should only contain the videos you want to process.
output_video_folder_cut = './video_cut/'      # folder for the cut videos

input_file_folder = './data_processed/'

input_folder  = './data_raw/'         # input folder with the raw XDF files (relative path)
input_folder_ = input_folder          # alias kept so code using the old (trailing underscore) name keeps working
output_folder = './data_processed/'   # output folder where the raw extracted data will be saved (relative path)

# Show the resolved absolute paths so a wrong working directory is spotted early.
print("Input folder =", os.path.abspath(input_folder))
print("Output folder =", os.path.abspath(output_folder))




# Lookup table: video file extension (including the leading dot) -> codec name
extension_to_codec = dict([
    ('.mp4', 'libx264'),
    ('.avi', 'libxvid'),
    ('.mov', 'libx264'),
    ('.mkv', 'libx264'),
    ('.flv', 'flv'),
    # Add more (extension, codec) pairs as needed
])


# IF NEEDED: mapping from old stream identity to new stream name (edit / extend as needed).
# The key is the pair (stream_name, stream_type) rather than the name alone, because
# some streams share the same name and differ only in their type.
rename_dict = dict([
    (('MyWebcamFrameStream_1', 'frameNR'), 'Video_P2'),
    (('MyWebcamFrameStream_2', 'frameNR'), 'Video_P1'),
    (('Mic', 'voice'), 'Mic_P1'),
    (('Mic_004', 'voice'), 'Mic_P2'),
    (('OpenSignals', '00:07:80:8C:06:6A'), 'PLUX_P2'),
    (('OpenSignals', '00:07:80:D8:A8:81'), 'PLUX_P1'),
])


# -------------FUNCTIONS------------------------------------------------------------------------------------
# AUDIO: "to_audio" writes audio samples (input) into a 16-bit WAV file (output).
def to_audio(fileloc, timeseries_name, samplerate = 16000, channels = 1, timeseries = None):
    """
    Write the samples of a microphone stream to a 16-bit little-endian WAV file.

    Nothing is written unless the substring 'Mic' occurs in timeseries_name.

    Parameters:
    - fileloc (str): Location to save the audio file.
    - timeseries_name (str): Name of the stream; used only to decide whether the
      stream is a microphone stream.
    - samplerate (int, optional): Sampling rate of the audio data. Defaults to 16000.
    - channels (int, optional): Channel count written to the WAV header. Defaults to 1 (mono).
      Note that only the first value of each sample (sample[0]) is written.
    - timeseries (iterable, optional): The audio samples; each item must be indexable
      and its first element convertible to int. When omitted, the module-level
      variable named `timeseries` is used (the behaviour of earlier versions).

    Raises:
    - NameError: if timeseries is omitted and no global `timeseries` exists.
    - struct.error: if a sample does not fit into a signed 16-bit integer.
    """
    if 'Mic' not in timeseries_name:  # Only microphone streams are converted to audio.
        return

    if timeseries is None:
        # Backward compatibility: older calls relied on a global `timeseries`.
        try:
            timeseries = globals()['timeseries']
        except KeyError:
            raise NameError(
                "to_audio: no audio data given - pass it via the 'timeseries' argument"
            ) from None

    # Pack every sample as a signed 16-bit little-endian integer ('<h').
    # Packing happens before the file is opened, so a bad sample does not leave a
    # half-written file behind.
    pack_sample = struct.Struct('<h').pack
    frames = b''.join(pack_sample(int(sample[0])) for sample in timeseries)

    # The context manager closes the file (and finalises the header) even on errors.
    with wave.open(fileloc, 'w') as wav_file:
        wav_file.setnchannels(channels)           # Default 1 channel (mono).
        wav_file.setsampwidth(2)                  # 2 bytes per sample = 16-bit audio.
        wav_file.setframerate(float(samplerate))  # wave rounds non-integral rates to the nearest integer.
        wav_file.writeframes(frames)              # Single write instead of one call per sample.

print("Function \"to_audio\" created sucesfully") 


# VIDEO: "frame_to_time" converts a frame number to a time string
def frame_to_time(frame, fps):
    """
    Convert a frame number to a time string (HH:MM:SS.SS) based on the frames per second (fps).

    The time is rounded to hundredths of a second before it is split into fields,
    so every field is zero-padded and the seconds field can never read "60.00".

    Arguments:
        frame (int): The frame number to be converted.
        fps (float): The frames per second of the video.
    Returns:
        str: The time as a string in the format "HH:MM:SS.SS"
             (e.g. frame 125 at 25 fps -> "00:00:05.00").
    Raises:
        ZeroDivisionError: if fps is 0.
    """
    # Work in whole centiseconds so rounding is carried over into minutes/hours.
    total_centiseconds = int(round(frame / fps * 100))
    total_seconds, centiseconds = divmod(total_centiseconds, 100)
    total_minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02}:{minutes:02}:{seconds:02}.{centiseconds:02}"

print("Function \"frame_to_time\" created sucesfully") 



# Renaming XDF Stream (if necessary)
def rename_streams(streams, rename_dict):
    """
    Rename, in place, every stream whose (name, type) pair is a key of rename_dict.

        Parameters:
    streams (list of dict): Streams to process; each one is read through
        stream['info']['name'][0] and stream['info']['type'][0].
    rename_dict (dict): Maps (old_name, stream_type) tuples to the new stream name.
        Returns:
    list of dict: The same list object that was passed in; matching streams
        have their stream['info']['name'][0] overwritten, all others are untouched.
    """
    for entry in streams:
        info = entry['info']
        old_name = info['name'][0]
        kind = info['type'][0]
        lookup_key = (old_name, kind)

        # Streams without a mapping keep their current name.
        if lookup_key not in rename_dict:
            continue

        replacement = rename_dict[lookup_key]
        print(f'Renaming stream {old_name} ({kind}) to {replacement}')
        info['name'][0] = replacement  # Overwrite the name inside the original stream
    return streams

print("Function \"rename_streams\" created sucesfully") 


# Function to clip the streams
def clip_nonmarker_streams(streams):
    """
    Clip all streams to the time window shared by every non-marker stream.

    The window runs from the latest first timestamp to the earliest last timestamp
    of the streams whose type is not "Markers". Every stream in `streams`
    (marker streams included) is then cut to that window; samples outside it are dropped.

    Input:
        streams: list of stream dicts, each with 'info', 'time_stamps' and 'time_series'.
                 Timestamps are assumed to be sorted in ascending order (np.searchsorted
                 is used to locate the window).
    Output:
        list of new stream dicts (shallow copies of the inputs) whose 'time_stamps' and
        'time_series' are NumPy arrays restricted to the shared window. The input
        dicts are not modified.
    Raises:
        ValueError: if there is no non-marker stream with at least one sample,
                    so no window can be determined.
    """

    # Filter out marker streams for calculating the latest start and earliest end times
    non_marker_streams = [stream for stream in streams if stream['info']['type'][0] != 'Markers']

    for stream in non_marker_streams:
        print(str(stream['info']['name'][0]))

    # Streams without any sample have no start/end time and cannot define the window.
    populated_streams = [stream for stream in non_marker_streams if len(stream['time_stamps']) > 0]
    if not populated_streams:
        raise ValueError("clip_nonmarker_streams: no non-marker stream with at least one sample was found")

    # Find the latest start time across all non-marker streams
    begintimes = [stream['time_stamps'][0] for stream in populated_streams]
    latest_start_time = max(begintimes)
    print('begin times per stream: ' + str(begintimes))
    print(' latest_start_time' + str(latest_start_time))

    # Find the earliest end time across all non-marker streams
    endtimes = [stream['time_stamps'][-1] for stream in populated_streams]
    earliest_end_time = min(endtimes)
    print('end times per stream: ' + str(endtimes))
    print(' earliest_end_time' + str(earliest_end_time))

    clipped_streams = []  # Collects the clipped copies, in input order

    for stream in streams:
        time_stamps = np.array(stream['time_stamps'])  # Copy, so the input data stays untouched
        time_series = np.array(stream['time_series'])

        # First index whose timestamp is >= latest_start_time.
        # np.searchsorted already returns a value in [0, len], which is a valid slice
        # bound; clamping it to len - 1 would wrongly keep the last sample of a stream
        # that ends before the window starts.
        start_idx = int(np.searchsorted(time_stamps, latest_start_time, side='left'))

        # One past the last index whose timestamp is <= earliest_end_time.
        end_idx = int(np.searchsorted(time_stamps, earliest_end_time, side='right'))

        # Clip timestamps and data to the same index range
        clipped_time_stamps = time_stamps[start_idx:end_idx]
        clipped_time_series = time_series[start_idx:end_idx]

        print(f"Clipping stream {stream['info']['name'][0]}:")
        print(f" start_idx: {start_idx}, end_idx: {end_idx}")
        print(f" clipped_time_stamps: {clipped_time_stamps}")

        # Shallow copy: 'info' is shared with the input, only the data entries are replaced
        clipped_stream = stream.copy()
        clipped_stream['time_stamps'] = clipped_time_stamps
        clipped_stream['time_series'] = clipped_time_series

        clipped_streams.append(clipped_stream)

    return clipped_streams  # Return the list of clipped streams

print("Function \"clip_nonmarker_streams\" created successfully")


# Function to extract specified events (with corresponding LSL times) from an XDF marker stream (useful for plotting)
def get_events(stream, event_names):
    """
    Extract the events of a marker stream whose name contains any of the given substrings.

    Parameters:
    stream (dict): The marker stream; read through stream['info'], stream['time_stamps']
        and stream['time_series'] (the event name is the first element of each sample).
    event_names (list of str, or str): Substrings to look for in the event names.
        A single string is treated as a one-element list.

    Returns:
    np.array: Array of shape (n_events, 2); each row holds a timestamp and the full
        event name. Each matching event appears exactly once, even if it matches
        several of the substrings. Because NumPy stores both columns in one array,
        timestamps are converted to strings when at least one event is found.
        With no match the array has shape (0, 2).

    Raises:
    ValueError: if the type of the stream is not "Markers".
    """
    # Check if the stream type is "Markers"
    if stream['info']['type'][0] != "Markers":
        raise ValueError(f"ERROR: The stream provided ({stream['info']['name'][0]}) is not a Marker stream")

    # A bare string would otherwise be iterated character by character.
    if isinstance(event_names, str):
        names = [event_names]
    else:
        names = list(event_names)

    # Keep each (timestamp, event) pair once if any requested substring occurs in the event name.
    events = [
        [timestamp, event[0]]
        for timestamp, event in zip(stream['time_stamps'], stream['time_series'])
        if any(name in event[0] for name in names)
    ]

    # reshape keeps the two-column layout even when nothing matched.
    return np.array(events).reshape(-1, 2)

print("Function \"get_events\" created sucesfully") 