# Donders MML: XDF processing
This script processes all data files stored in the XDF format.
Steps: 
1. Import libraries necessary for processing audio, video and data files 
    a. See requirements.txt to conda install all the necessary packages. 
    
2. Identify XDF files within a specified directory or its subdirectories.

3. ... 


## 0. Import all the necessary packages to work with XDF, Audio and Video files 

In [1]:
import os  # Importing the os module which provides functions for interacting with the operating system
import pyxdf  # Importing pyxdf, a Python library for reading XDF files
import glob  # Importing the glob module which helps in finding files/directories with specific patterns
import pandas as pd  # Importing pandas library (abbreviated as pd), which is used for data manipulation and analysis
import numpy as np  # Importing numpy library (abbreviated as np), which is used for numerical computations
import wave  # Importing wave module for reading and writing WAV files (usually audio files) 
import struct  # Importing struct module which provides functions to convert between Python values and C structs
import math  # Importing math module which provides mathematical functions
import random  # Importing random module for generating random numbers
from scipy.io import wavfile  # Importing wavfile module from scipy.io (a library built on numpy), for reading and writing WAV files
import noisereduce as nr  # Importing noisereduce module for noise reduction in audio signals
import json  # Importing json module for working with JSON data
import cv2  # Importing OpenCV library for computer vision tasks
from moviepy.editor import (  # Importing various classes and functions from moviepy.editor module
                            VideoFileClip,  # Class for working with video files
                            AudioFileClip,  # Class for working with audio files
                            CompositeAudioClip)  # Class for composing audio clip
from moviepy.video.io.VideoFileClip import VideoFileClip
from moviepy.video.io.ffmpeg_tools import ffmpeg_extract_subclip # video  clipping fucntion 
from moviepy.video.io.VideoFileClip import VideoFileClip # alternative video clipping function
import matplotlib.pyplot as plt  # Importing pyplot library to create figures and plot data 
from matplotlib.widgets import Slider  # 
import tkinter # GUI toolkit to open and save files
from tkinter import filedialog # GUI toolkit to open and save files
import subprocess 
#import ffmpeg     # Question about this
# import xdf

print("Everything was imported succesfully") #as terminal

Everything was imported succesfully


## 1. Define the Relevant Paths, Variables & Functions

In [2]:
# ------------ PATHS -----------------------------------------------------
# Both folders are given relative to the current working directory.
input_folder = './data_raw/'          # where the raw XDF files are read from
output_folder = './data_processed/'   # where the extracted data is written to

# Show the resolved absolute locations so the user can verify them.
for _label, _folder in (("Input folder =", input_folder),
                        ("Output folder =", output_folder)):
    print(_label, os.path.abspath(_folder))


# ------------ VARIABLES ----------------------------------------------
# Passed to noisereduce as n_std_thresh_stationary; adjust as needed.
noise_reducelevel = 1.5


# Video container extension -> codec name (add more mappings as needed).
extension_to_codec = {
    '.mp4': 'libx264',
    '.avi': 'libxvid',
    '.mov': 'libx264',
    '.mkv': 'libx264',
    '.flv': 'flv',
}


# IF NEEDED: mapping from old stream names to new stream names (edit / extend as needed).
# The key is the pair (stream name, stream type) rather than the name alone,
# because in our recordings two streams share the same name and differ only in type.
rename_dict = {
    ('MyWebcamFrameStream_2', 'frameNR'): 'Video_P2',
    ('MyWebcamFrameStream_1', 'frameNR'): 'Video_P1',
    ('Mic', 'voice'): 'Mic_P1',
    ('Mic_004', 'voice'): 'Mic_P2',
    ('OpenSignals', '00:07:80:8C:06:6A'): 'PLUX_P2',
    ('OpenSignals', '00:07:80:D8:A8:81'): 'PLUX_P1',
}


# -------------FUNCTIONS------------------------------------------------------------------------------------
# AUDIO: "to_audio" takes audio samples (input) and writes them to a WAV file (output).
def to_audio(fileloc, timeseries_name, samplerate = 16000, channels = 1, timeseries = None):
    """
    Write the samples of a microphone stream to a 16-bit PCM WAV file.

    Parameters:
    - fileloc (str): Location to save the audio file.
    - timeseries_name (str): Name of the stream. Nothing is written unless the
      name contains 'Mic'.
    - samplerate (int, optional): Sampling rate of the audio data. Defaults to 16000.
    - channels (int, optional): Number of audio channels declared in the WAV header.
      Defaults to 1 (mono). Only the first value of every sample is written, so the
      frame layout only matches the header for mono data.
    - timeseries (sequence, optional): The samples to write; every item must be
      indexable and its first value is converted to an integer. When omitted, the
      notebook-level variable "timeseries" is used, which keeps older calls of the
      form to_audio(fileloc, timeseries_name) working.

    Raises:
    - NameError: if "timeseries" is neither passed nor defined at notebook level.
    - struct.error: if a sample does not fit into a signed 16-bit integer.
    """
    if 'Mic' not in timeseries_name:   # Only microphone streams are converted to audio.
        return

    if timeseries is None:
        # Backward compatibility: earlier versions read the samples from a global variable.
        try:
            timeseries = globals()['timeseries']
        except KeyError:
            raise NameError("name 'timeseries' is not defined") from None

    # '<h' = little-endian signed 16-bit integer. All samples are packed before the
    # file is opened, so a bad sample cannot leave a half-written WAV file behind.
    frames = b''.join(struct.pack('<h', int(sample[0])) for sample in timeseries)

    # The context manager guarantees that the file is closed (and the header finalised).
    with wave.open(fileloc, 'w') as obj:
        obj.setnchannels(channels)           # Number of channels declared in the header.
        obj.setsampwidth(2)                  # 2 bytes per sample = 16-bit audio.
        obj.setframerate(float(samplerate))  # Frame rate of the audio file.
        obj.writeframes(frames)

print("Function \"to_audio\" created sucesfully") 


# VIDEO: "frame_to_time" converts a frame number to a time string
def frame_to_time(frame, fps):
    """
    frame_to_time converts a given frame number to a time format (HH:MM:SS.SS) based on the frames per second (fps).
    Arguments:
        frame (int): The frame number to be converted.
        fps (float): The frames per second of the video.
    Returns:
        str: The time format as a string in the format "HH:MM:SS.SS".
    Raises:
        ZeroDivisionError: if fps is 0.
    """
    # Work in whole hundredths of a second: rounding once, up front, lets a value such as
    # 59.999 s carry over into the minutes ("00:01:00.00") instead of printing "60.00" seconds.
    total_centiseconds = int(round(frame / fps * 100))
    total_seconds, centiseconds = divmod(total_centiseconds, 100)
    total_minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    # Every field is zero-padded so the result always matches HH:MM:SS.SS.
    return f"{hours:02}:{minutes:02}:{seconds:02}.{centiseconds:02}"

print("Function \"frame_to_time\" created sucesfully") 


# XDF Save: writes a header and a list of streams to a (new) XDF file
def save_xdf(filename, streams, header):
    """
    Write the given header and streams to the binary file "filename".

    Parameters:
    - filename (str): Path of the file to create; it is opened in 'wb' mode.
    - streams (list of dict): Streams to write; each one must provide the keys
      'info', 'time_stamps' and 'time_series'.
    - header: Header object that is handed to pyxdf.write_header.

    NOTE(review): this relies on pyxdf.write_header, pyxdf.write_stream_header and
    pyxdf.write_stream_data. The renaming section of this notebook reports that pyxdf
    provides no save function, so these calls presumably fail with an AttributeError -
    confirm against the installed pyxdf version before relying on this function.
    """
    with open(filename, 'wb') as f:
        pyxdf.write_header(f, header)
        for stream in streams:
            # One stream header followed by that stream's samples.
            pyxdf.write_stream_header(f, stream['info'])
            pyxdf.write_stream_data(f, stream['time_stamps'], stream['time_series'])

print("Function \"save_xdf\" created sucesfully") 


# Function to extract specified events (with corresponding LSL times) from an XDF stream (useful for plotting)
def get_events(stream, event_names):
    """
    Extracts events and corresponding LSL times from the given stream that match any of the event_names.

    Parameters:
    stream (dict): The stream containing time stamps and event data.
    event_names (list of str): List of event name substrings to look for in the events.

    Returns:
    np.array: An array where each row contains a timestamp and the full event name.
              An event is listed once, even if it matches several of the event_names.
              When nothing matches, the array is empty with shape (0, 2).

    Raises:
    ValueError: If the type of the stream is not "Markers".
    """
    # Only marker streams carry event names.
    if stream['info']['type'][0] != "Markers":
        raise ValueError(f"ERROR: The stream provided ({stream['info']['name'][0]}) is not a Marker stream")

    # any() stops at the first matching name, so an event that contains several of the
    # requested substrings is still collected only once.
    events = [
        [timestamp, event[0]]
        for timestamp, event in zip(stream['time_stamps'], stream['time_series'])
        if any(name in event[0] for name in event_names)
    ]

    # reshape keeps the "one row per event" layout even when no event matched.
    return np.array(events).reshape(-1, 2)

print("Function \"get_events\" created sucesfully") 


Input folder = C:\Users\ahmar\OneDrive\Documents\GitHub\Mobile-Multimodal-Lab\2_PREPROCESSING\XDF_PROCESSING\data_raw
Output folder = C:\Users\ahmar\OneDrive\Documents\GitHub\Mobile-Multimodal-Lab\2_PREPROCESSING\XDF_PROCESSING\data_processed
Function "to_audio" created sucesfully
Function "frame_to_time" created sucesfully
Function "save_xdf" created sucesfully
Function "get_events" created sucesfully


## 2. Identifying XDF files in Input Folder or any Subfolder 

In [None]:
# Collect the full paths of all XDF files found in input_folder or any of its subfolders.
# - os.walk() visits the folder tree; every visited folder yields (root, dirs, files).
# - The extension check is case-insensitive, so that e.g. "recording.XDF" is found as well.
# - The result is sorted, so the files are always processed in the same order.
xdf_files = sorted(
    os.path.join(root, file)
    for root, dirs, files in os.walk(input_folder)
    for file in files
    if file.lower().endswith(".xdf")
)

print('We have idenified the following XDF files: ' + str(xdf_files))

## 2a. Alternatively, the user can select their own XDF file

In [3]:
# A Tk root window is required for the file dialog. It is kept on top of the other
# windows and minimised, so that only the dialog itself is visible to the user.
root = tkinter.Tk()
root.attributes('-topmost',True)
root.iconify()

try:
    # askopenfilenames() is the multi-selection variant of the dialog; it returns a
    # tuple with the selected paths (nothing selected if the user cancels).
    xdf_files = filedialog.askopenfilenames(title="Select an XDF file", filetypes=[("XDF Files", "*.xdf")])
finally:
    # Always close the Tk root, even if the dialog raised an error.
    root.destroy()

# Convert the tuple returned by askopenfilenames() to a list
xdf_files = list(xdf_files)

print('You have selected the following XDF files: ' + str(xdf_files))

You have selected the following XDF files: ['C:/Users/ahmar/OneDrive/Documents/GitHub/Mobile-Multimodal-Lab/2_PREPROCESSING/XDF_PROCESSING/data_raw/T1_experiment.xdf']


### X. Renaming XDF Streams (Edit as needed & Skip if not needed!) 

In [None]:
# This section renames the streams of each XDF file (in memory only, see the note on saving below). Steps:
# 1. Create a dictionary that maps old names to new names.
# 2. Load each XDF file and iterate over its streams.
# 3. Check whether the (name, type) pair of the stream exists in the dictionary.
# 4. If it exists, replace the stream name with the new name.
# 5. (Not working yet) Save the renamed XDF file in the specified output_folder.

output_folder = './data_raw_renamed/'  # output folder intended for the renamed XDF files (relative path)


# Mapping from old stream names to new stream names (edit / extend as needed).
# The key is the pair (stream name, stream type), because in our case two streams
# share the same name and differ only in their type.
rename_dict = {
    ('MyWebcamFrameStream_2', 'frameNR'): 'Video_P2',
    ('MyWebcamFrameStream_1', 'frameNR'): 'Video_P1',
    ('Mic', 'voice'): 'Mic_P1',
    ('Mic_004', 'voice'): 'Mic_P2',
    ('OpenSignals', '00:07:80:8C:06:6A'): 'PLUX_P2',
    ('OpenSignals', '00:07:80:D8:A8:81'): 'PLUX_P1',
}

# Loading the XDF files one by one
for xdf_file in xdf_files:
    print(f'Loading XDF file: {xdf_file}')

    # pyxdf.load_xdf returns the list of streams and the file header
    streams, header = pyxdf.load_xdf(xdf_file)
    # File name without its folder and without the 4-character '.xdf' extension
    fnam = os.path.basename(xdf_file)[:-4]

    for stream in streams:
        stream_name = stream['info']['name'][0]   # name of the stream
        stream_type = stream['info']['type'][0]   # type of the stream
        stream_key = (stream_name, stream_type)

        # Streams without an entry in the dictionary keep their name
        if stream_key not in rename_dict:
            continue

        new_name = rename_dict[stream_key]
        print(f'Renaming stream {stream_name} ({stream_type}) to {new_name}')
        stream['info']['name'][0] = new_name      # rename the stream (in memory)

    ## !!! HAVING PROBLEMS WITH ACTUALLY SAVING THE XDF FILES. NO SAVE FUNCTION PROVIDED BY THE PYXDF LIBRARY
    # # Saving the renamed.xdf in the specified output_folder
    # output_file_path = os.path.join(output_folder, fnam + '_renamed.xdf')
    # print("Saving the renamed xdf file using the save_xdf function")
    # save_xdf(output_file_path, streams, header)

    # # pyxdf.save_xdf(output_file_path, streams, header)  ## AttributeError: module 'pyxdf' has no attribute 'save_xdf'

    # print(f'Saved renamed XDF file to {output_file_path}')

print("Done renaming streams in all XDF files!")

    



# 3. (NEW) Main Loop that Extracts & RENAMES each data stream from each XDF file and saves it as a CSV or WAV file

TODO: Turn the renaming of old to new stream names in the XDF file into a function.



In [None]:
output_folder = './data_processed_2/'  # output folder for the CSV and WAV files
os.makedirs(output_folder, exist_ok=True)  # the CSV / WAV writers do not create a missing folder themselves


for xdf_file in xdf_files:               # Iterate over each path in the list "xdf_files".

    print('loading xdf file: ' + xdf_file )

    streams, header = pyxdf.load_xdf(xdf_file)               # Load the streams and the header of the XDF file.
    fnam = os.path.splitext(os.path.basename(xdf_file))[0]   # File name without folder and without extension.

    # Go through each stream and extract the relevant information (name, sampling rate, data, time, ...).
    for stream in streams:
        timeseries_name = stream['info']['name'][0]          # Name of the stream (e.g., Mic).
        timeseries_type = stream['info']['type'][0]          # Type of the stream (e.g., voice).

        # Replace the old stream name with the new one if the (name, type) pair is listed in rename_dict.
        if (timeseries_name, timeseries_type) in rename_dict:
            new_name = rename_dict[(timeseries_name, timeseries_type)]

            print(f'Renaming stream {timeseries_name} ({timeseries_type}) to {new_name}')
            stream['info']['name'][0] = new_name             # Rename the stream in the loaded XDF data.
            timeseries_name = new_name                       # Continue with the new name.

        timevec = stream['time_stamps']                                  # LSL time stamps of the stream.
        timeseries = stream['time_series']                               # Data of the stream (also read by to_audio).
        samplerate = round(float(stream['info']['nominal_srate'][0]))    # Rounded nominal sampling rate.
        channelcount = int(stream['info']['channel_count'][0])           # Number of channels.

        # Extract channel labels; fall back to "Channel 1", "Channel 2", ... whenever the
        # stream description cannot provide exactly one label per channel.
        default_labels = [f"Channel {i+1}" for i in range(channelcount)]
        channel_labels = default_labels
        if channelcount > 1 and stream['info']['desc'] is not None:
            try:
                channels_info = stream['info']['desc'][0]['channels'][0]['channel']
                channel_labels = [channel['label'][0] for channel in channels_info]
            except (KeyError, IndexError, TypeError):
                # TypeError covers an empty description (e.g. desc == [None]).
                channel_labels = default_labels
            if len(channel_labels) != channelcount:
                # A label count that differs from the channel count would break the column names below.
                channel_labels = default_labels

        print('working on stream: ' + timeseries_name + '  with a channel count of: ' + str(channelcount) + ' labelled: ' + str(channel_labels) + ' and a sampling rate of ' + str(samplerate))

        # Build a table with one row per sample: the LSL time first, followed by one column per channel.
        matrix_aux = np.vstack([np.transpose(timevec),np.transpose(timeseries)])   # Rows: time, channel 1, channel 2, ...
        matrix     = np.transpose(matrix_aux)                                      # Transposed: one row per sample.
        column_names = ["LSL_Time"] + [f"{timeseries_name}_{label}" for label in channel_labels]
        df_lab = pd.DataFrame(matrix, columns = column_names)

        # Saving the stream as a CSV file named [fnam]_[timeseries_name].csv
        file_stem = os.path.join(output_folder, fnam + '_' + timeseries_name)
        print('Saving: ' + fnam + '_' + timeseries_name)
        df_lab.to_csv(file_stem + '.csv',index=False)

        if "Mic" in timeseries_name:  # Microphone streams are additionally saved as audio.
            wavloc = os.path.abspath(file_stem + '.wav')             # Location of the raw audio file.
            # Write the WAV file with the sampling rate of the stream. If the stream reports no
            # usable rate (0), the default rate of to_audio is used instead.
            audio_options = {'samplerate': samplerate} if samplerate > 0 else {}
            to_audio(wavloc, timeseries_name, **audio_options)
            rate, data = wavfile.read(wavloc)                        # Load the audio data from the saved WAV file.
            reduced_noise = nr.reduce_noise(y=data, sr=rate, n_std_thresh_stationary=noise_reducelevel, stationary=True)    # Noise reduction based on noise_reducelevel.
            wavloc2 = os.path.abspath(file_stem + '_denoised.wav')   # Location of the noise-reduced audio file.
            wavfile.write(wavloc2, rate, reduced_noise)              # Save the noise-reduced audio.

print("Done with extracting all the streams! You can now look into your folder: " + output_folder)

# 3. Main Loop that Extracts each data stream from each XDF file and saves it as a CSV or WAV file

In [None]:
output_folder = './data_processed/'  # output folder for the CSV and WAV files (relative path)
os.makedirs(output_folder, exist_ok=True)  # the CSV / WAV writers do not create a missing folder themselves
event_names = ['_StartParticipantSinging', '_EndParticipantSinging']


for xdf_file in xdf_files:               # Iterate over each path in the list "xdf_files".

    print('loading xdf file: ' + xdf_file )

    streams, header = pyxdf.load_xdf(xdf_file)               # Load the streams and the header of the XDF file.
    fnam = os.path.splitext(os.path.basename(xdf_file))[0]   # File name without folder and without extension.

    stream_count = {}   # Number of streams seen so far per stream name (several streams can share a name, e.g. with multiple people).

    # Go through each stream and extract the relevant information (name, sampling rate, data, time, ...).
    for stream in streams:
        timeseries_name = stream['info']['name'][0]                      # Name of the stream (e.g., Mic).
        timevec = stream['time_stamps']                                  # LSL time stamps of the stream.
        timeseries = stream['time_series']                               # Data of the stream (also read by to_audio).
        samplerate = round(float(stream['info']['nominal_srate'][0]))    # Rounded nominal sampling rate.
        channelcount = int(stream['info']['channel_count'][0])           # Number of channels.

        # Extract channel labels; fall back to "Channel 1", "Channel 2", ... whenever the
        # stream description cannot provide exactly one label per channel.
        default_labels = [f"Channel {i+1}" for i in range(channelcount)]
        channel_labels = default_labels
        if channelcount > 1 and stream['info']['desc'] is not None:
            try:
                channels_info = stream['info']['desc'][0]['channels'][0]['channel']
                channel_labels = [channel['label'][0] for channel in channels_info]
            except (KeyError, IndexError, TypeError):
                # TypeError covers an empty description (e.g. desc == [None]).
                channel_labels = default_labels
            if len(channel_labels) != channelcount:
                # A label count that differs from the channel count would break the column names below.
                channel_labels = default_labels

        # Count how often this stream name has occurred in the current file.
        stream_count[timeseries_name] = stream_count.get(timeseries_name, 0) + 1
        count = stream_count[timeseries_name]

        print('working on stream: ' + timeseries_name + '  with a channel count of: ' + str(channelcount) + ' labelled: ' + str(channel_labels) + ' and a sampling rate of ' + str(samplerate))

        # Build a table with one row per sample: the LSL time first, followed by one column per channel.
        matrix_aux = np.vstack([np.transpose(timevec),np.transpose(timeseries)])   # Rows: time, channel 1, channel 2, ...
        matrix     = np.transpose(matrix_aux)                                      # Transposed: one row per sample.
        column_names = ["LSL_Time"] + [f"{timeseries_name}_{label}" for label in channel_labels]
        df_lab = pd.DataFrame(matrix, columns = column_names)

        # Files are named [fnam]_[timeseries_name]_nominal_srate[rate]; from the second stream with the
        # same name onwards the count is appended. The same stem is used for the CSV and the WAV files,
        # so that streams sharing a name do not overwrite each other's audio.
        file_stem = os.path.join(output_folder, fnam + '_' + timeseries_name + '_nominal_srate' + str(samplerate))
        if count > 1:
            file_stem = file_stem + '_' + str(count)
        df_lab.to_csv(file_stem + '.csv',index=False)

        if "Mic" in timeseries_name:  # Microphone streams are additionally saved as audio.
            wavloc = os.path.abspath(file_stem + '.wav')             # Location of the raw audio file.
            # Write the WAV file with the sampling rate of the stream. If the stream reports no
            # usable rate (0), the default rate of to_audio is used instead.
            audio_options = {'samplerate': samplerate} if samplerate > 0 else {}
            to_audio(wavloc, timeseries_name, **audio_options)
            rate, data = wavfile.read(wavloc)                        # Load the audio data from the saved WAV file.
            reduced_noise = nr.reduce_noise(y=data, sr=rate, n_std_thresh_stationary=noise_reducelevel, stationary=True)    # Noise reduction based on noise_reducelevel.
            wavloc2 = os.path.abspath(file_stem + '_denoised.wav')   # Location of the noise-reduced audio file.
            wavfile.write(wavloc2, rate, reduced_noise)              # Save the noise-reduced audio.

print("Done with extracting all the streams! You can now look into your folder: " + output_folder)

## 4. Plotting Each XDF and Each Stream (Quality Check)

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

event_names = ['_StartParticipantSinging', '_EndParticipantSinging']

# Range-selector buttons ("1m", "10m", "all") shared by the x-axes of all figures.
range_selector = dict(
    buttons=[
        dict(count=1, label="1m", step="minute", stepmode="backward"),
        dict(count=10, label="10m", step="minute", stepmode="backward"),
        dict(step="all"),
    ]
)


for xdf_file in xdf_files:               # Iterate over each path in the list "xdf_files".

    print('loading xdf file: ' + xdf_file )

    streams, header = pyxdf.load_xdf(xdf_file)               # Load the streams and the header of the XDF file.
    fnam = os.path.splitext(os.path.basename(xdf_file))[0]   # File name without folder and without extension.

    # Go through each stream and extract the relevant information (name, sampling rate, data, time, ...).
    print('Navigating through each stream: ') 
    for stream in streams:
        timeseries_name = stream['info']['name'][0]             # Name of the stream (e.g., Mic).
        timeseries_type = stream['info']['type'][0]             # Type of the stream (e.g., Markers).

        # Replace the old stream name with the new one if the (name, type) pair is listed in rename_dict.
        if (timeseries_name, timeseries_type) in rename_dict:
            new_name = rename_dict[(timeseries_name, timeseries_type)]

            print(f'Renaming stream {timeseries_name} ({timeseries_type}) to {new_name}')
            stream['info']['name'][0] = new_name                # Rename the stream in the loaded XDF data.
            timeseries_name = new_name                          # Continue with the new name.

        timevec = stream['time_stamps']                                # LSL time stamps of the stream.
        timeseries = np.array(stream['time_series'])                   # Data of the stream as a NumPy array (needed for column indexing).
        samplerate = round(float(stream['info']['nominal_srate'][0]))  # Rounded nominal sampling rate.
        channelcount = int(stream['info']['channel_count'][0])         # Number of channels.

        # Extract channel labels; fall back to "Channel 1", "Channel 2", ... whenever the
        # stream description cannot provide exactly one label per channel.
        default_labels = [f"Channel {i+1}" for i in range(channelcount)]
        channel_labels = default_labels
        if channelcount > 1 and stream['info']['desc'] is not None:
            try:
                channels_info = stream['info']['desc'][0]['channels'][0]['channel']
                channel_labels = [channel['label'][0] for channel in channels_info]
            except (KeyError, IndexError, TypeError):
                # TypeError covers an empty description (e.g. desc == [None]).
                channel_labels = default_labels
            if len(channel_labels) != channelcount:
                # A label count that differs from the channel count would break the subplot titles below.
                channel_labels = default_labels

        print('working on stream: ' + timeseries_name + '  with a channel count of: ' + str(channelcount) + ' labelled: ' + str(channel_labels) + '\n and a sampling rate of ' + str(samplerate))

        # Marker streams are not plotted; only the requested events (with their LSL times) are extracted.
        if timeseries_type == 'Markers':
            audio_events_stream = get_events(stream, event_names)
            continue

        #--------------- PLOTTING -----------
        if channelcount > 1:
            # One subplot per channel, all sharing the same x-axis.
            fig = make_subplots(rows=channelcount, cols=1, shared_xaxes=True, vertical_spacing=0.01, subplot_titles=channel_labels)

            for i in range(channelcount):
                fig.add_trace(go.Scatter(
                    x=timevec,
                    y=timeseries[:, i],
                    mode='lines',
                    name=channel_labels[i]
                ), row=i+1, col=1)

                fig.update_xaxes(
                    rangeselector=range_selector,
                    rangeslider=dict(visible=True) if i == channelcount-1 else None,  # Range slider only on the last subplot
                    type="linear",
                    row=i+1, col=1
                )

            fig.update_layout(
                title_text=fnam + '_' + timeseries_name + ' Streams',
                height=200 * channelcount,
                yaxis_title='Amplitude'
            )

        else:
            # Single-channel stream: one figure with a range selector and a range slider.
            fig = go.Figure()

            fig.add_trace(go.Scatter(
                x=timevec,
                y=timeseries[:, 0],
                mode='lines',
                name=channel_labels[0]
            ))

            fig.update_layout(
                title_text=fnam + '_' + timeseries_name + ' Streams',
                xaxis=dict(
                    rangeselector=range_selector,
                    rangeslider=dict(visible=True),
                    type="linear"
                ),
                yaxis_title='Amplitude'
            )

        fig.show()




            # Plotting Events (i.e., triggers) 
        # if stream['info']['channel_format'][0] == 'string':
        #     plt.figure(figsize=(12, 6))
        #     y_pos = np.arange(len(timevec))  # Create a y-position for each event to avoid overlap
        #     plt.scatter(timevec, [1] * len(timevec), marker='o')
        #     for i, event in enumerate(timeseries):
        #         plt.text(timevec[i], 1.01, event[0], rotation=45, ha='right', va='bottom', fontsize=6)
        #     plt.title(fnam + '_' + timeseries_type + ' Events')
        #     plt.xlabel('Time')
        #     plt.yticks([])
        #     plt.grid(True)
        #     plt.show()


        #     else:
        #     # Plotting data streams with Plotly subplots if multiple channels
        



            
            # # Plotting data streams with subplots and sliders
            # fig, axs = plt.subplots(channelcount, 1, figsize=(12, 6 * channelcount), sharex=True)
            # timeseries = np.array(timeseries)
            # if channelcount == 1:
            #     axs = [axs]  # Ensure axs is always a list

            # for i in range(channelcount):
            #     axs[i].plot(timevec, timeseries[:, i], label=channel_labels[i])
            #     axs[i].set_title(f'{channel_labels[i]}')
            #     axs[i].set_ylabel('Amplitude')
            #     axs[i].grid(True)
            #     axs[i].legend()

            # axs[-1].set_xlabel('Time')

            # # Adding slider for each subplot
            # sliders = []
            # axcolor = 'lightgoldenrodyellow'
            # for i in range(channelcount):
            #     ax_slider = plt.axes([0.25, 0.02 + i * 0.04, 0.65, 0.03], facecolor=axcolor)
            #     slider = Slider(ax_slider, 'Range', timevec[0], timevec[-1], valinit=timevec[-1])
            #     sliders.append(slider)

            #     def update(val, ax=axs[i], slider=slider):
            #         pos = slider.val
            #         ax.set_xlim(timevec[0], pos)
            #         fig.canvas.draw_idle()

            #     slider.on_changed(update)

            # plt.tight_layout()
            # plt.show()
             
        # else:
        #         # Plotting data streams 
        #     plt.figure(figsize=(12, 6))  # Adjust the figure size as needed
        #     timeseries = np.array(timeseries)
        #     for i in range(channelcount):
        #         plt.plot(timevec, timeseries[:, i], label=channel_labels[i])
        #     plt.title(fnam + '_' + timeseries_type + ' Streams')
        #     plt.xlabel('Time')
        #     plt.ylabel('Amplitude')
        #     plt.legend()
        #     plt.grid(True)
        #     plt.show()
        ## -------------------------------------------------

Add a plotting/quality check passage to see each stream and each channel 

For Opensignals, maybe add subpanels to change the y axis. 
    subplots with total height of bigger plot. 

    Envision box plotly (selecting, smoothing and deriving measures) --> Use the slider range as well 

Naming the files. 
    In the lab setup, we will publish the scripts that stream the data to Lab recorder. 
    Each stream should be named accordingly 
    For now, we can change the actual stream names in the xdf file manually by using pyXDF (the same names will be used). 

Create a separate folder for saving the XDF data of each session. Files will be named Session_X_P_X_streamname.



Downsampling the videos 

Go through timevec (the original frame rate is about 200 fps with LSL),
index it every 1/5 and extract the corresponding timevec (LSL time) and timeseries data.
Put into a n

## 4. Clipping videos 

In [None]:
# Cut every raw video so that it only spans the period recorded by LSL.
#
# For each (CSV name fragment, video name fragment) pair below, the cell:
#   1. reads the frame-stream CSV (column 0: LSL timestamps, column 1: video frame numbers),
#   2. derives the first/last LSL timestamp and frame number,
#   3. builds a list of frame numbers resampled on a regular time grid,
#   4. cuts the matching raw video between the first and last frame with ffmpeg.
#
# Relies on `frame_to_time`, `extension_to_codec`, `subprocess` and `cv2` being
# available from earlier cells.

input_video_folder = './video_raw/'    # This folder should only contain the videos you want to process.
output_video_folder = './video_cut/'
input_file_folder = './data_processed/'

# Rate (in Hz) of the regular time grid used to resample the frame stream.
resample_rate = 50

# List of participant file pairs: (fragment of the CSV file name, fragment of the video file name).
# TODO: should we do this another way? Maybe by saving the CSV files with a consistent name?
participants_files = [
    ('MyWebcamFrameStream_1', 'P1'),
    ('MyWebcamFrameStream_2', 'P2')
]

# ffmpeg does not create missing output directories, so make sure the folder exists.
os.makedirs(output_video_folder, exist_ok=True)

csv_folder = os.path.abspath(input_file_folder)
video_folder = os.path.abspath(input_video_folder)

# Loading the relevant CSV files (called here 'MyWebcamFrameStream_x') that contain
# the LSL timestamps and the corresponding video frames.
for participant_frame, participant_video in participants_files:

    for file in os.listdir(csv_folder):

        if participant_frame not in file:  # Not the CSV of this participant
            continue

        print(f'Processing {participant_frame} for {participant_video}: {file}')

        file_path = os.path.join(csv_folder, file)
        print(f'This is the file path: {file_path}')

        # Loading the CSV file into a DataFrame
        file_data = pd.read_csv(file_path)

        # Extracting relevant information from this CSV file
        LSL_begin_time = file_data.iloc[:, 0].min()        # First LSL timestamp
        LSL_begin_frame = int(file_data.iloc[:, 1].min())  # First video frame
        LSL_end_time = file_data.iloc[:, 0].max()          # Last LSL timestamp
        LSL_end_frame = int(file_data.iloc[:, 1].max())    # Last video frame

        # Without a positive duration the fps below would divide by zero and
        # there would be nothing to cut.
        if LSL_end_time <= LSL_begin_time:
            print(f'Skipping {file}: the LSL timestamps do not span a positive duration.')
            continue

        LSL_tot_frames = LSL_end_frame - LSL_begin_frame    # Total number of frames from start to end of recording
        LSL_frames = range(LSL_begin_frame, LSL_end_frame)  # Sequence of all frame numbers from start to end
        # TODO: add a comment about the theoretical understanding of frame numbers and LSL fps streaming.

        LSL_fps = round((LSL_tot_frames / (LSL_end_time - LSL_begin_time)), 3)

        # Regular time grid from the first to the last LSL timestamp.
        new_range = np.arange(LSL_begin_time, LSL_end_time, 1 / resample_rate)
        print(new_range)

        # For every grid time, pick the frame number whose LSL timestamp is closest.
        # NOTE(review): the original draft of this step was not valid Python; it is
        # interpreted here as a nearest-timestamp lookup -- confirm this is the intent.
        lsl_times = file_data.iloc[:, 0].to_numpy()
        lsl_frame_numbers = file_data.iloc[:, 1].to_numpy()
        order = np.argsort(lsl_times, kind='stable')  # searchsorted needs ascending timestamps
        sorted_times = lsl_times[order]
        right = np.clip(np.searchsorted(sorted_times, new_range), 1, len(sorted_times) - 1)
        left = right - 1
        # Choose the closer of the two neighbouring timestamps (the earlier one on ties).
        nearest = np.where(new_range - sorted_times[left] <= sorted_times[right] - new_range, left, right)
        framelist = lsl_frame_numbers[order][nearest].tolist()

        print('LSL_tot_frames: ' + str(LSL_tot_frames))
        print('LSL frames per second: ' + str(LSL_fps))
        # TODO: is this LSL_fps variable over time?

        # ----------------------------------------
        # Loading the original videos corresponding to the CSV files; they are cut
        # according to the LSL start and end.
        for video in os.listdir(video_folder):

            if participant_video not in video:  # Not the video of this participant
                continue

            print('Now processing video: ' + video)

            video_filepath = os.path.join(video_folder, video)
            capture = cv2.VideoCapture(video_filepath)

            if not capture.isOpened():
                # Unreadable files would report a frame rate of 0 and break the cut below.
                capture.release()
                print(f'Could not open {video_filepath} as a video. Skipping file.')
                continue

            # Extracting relevant meta-data about the video using CV2
            video_frame_width = capture.get(cv2.CAP_PROP_FRAME_WIDTH)
            video_frame_height = capture.get(cv2.CAP_PROP_FRAME_HEIGHT)
            video_frame_rate = capture.get(cv2.CAP_PROP_FPS)
            video_tot_frames = int(capture.get(cv2.CAP_PROP_FRAME_COUNT))
            capture.release()
            print('video_tot_frames: ' + str(video_tot_frames))
            print('Video frames per second: ' + str(video_frame_rate))

            ## ---------- Cutting video using ffmpeg
            # Converting the LSL frames to the video time format to find start_cut and end_cut for the video
            start_cut_time = frame_to_time(LSL_begin_frame, video_frame_rate)
            end_cut_time = frame_to_time(LSL_end_frame, video_frame_rate)

            print('Now cutting the video...')

            # Determine the file extension and codec.
            # The codec is only looked up for reference: the command below copies
            # the streams and therefore does not re-encode.
            file_extension = os.path.splitext(video)[1].lower()
            codec = extension_to_codec.get(file_extension, 'libx264')  # Default to libx264 if not found

            # Construct output file path with the same extension
            output_filename = f'cut_{os.path.splitext(video)[0]}{file_extension}'
            output_filepath = os.path.join(output_video_folder, output_filename)

            # Use ffmpeg to cut the video
            ffmpeg_command = [
                'ffmpeg',
                '-y',                   # Overwrite any existing file with the same name
                '-i', video_filepath,
                '-ss', start_cut_time,  # Start time
                '-to', end_cut_time,    # End time
                '-c', 'copy',           # Copy codec (no re-encoding)
                output_filepath
            ]

            # Execute the command; check=True raises CalledProcessError if ffmpeg fails
            subprocess.run(ffmpeg_command, check=True)

            print(f'Video saved as {output_filepath}')


print("Done with cutting all videos! You can now look into your folder: " + output_video_folder)



            ## -------------- Usign moviepy does not work!! Why??
            
            # # Extract video frame rate using moviepy
            # video_clip = VideoFileClip(video_filepath)
            # video_frame_rate = video_clip.fps
            # print('Video frames per second: ' + str(video_frame_rate))
            

            # # Converting the LSL frames to the video time format to find start_cut and end_cut for the video 
            # start_cut_time = LSL_begin_frame / video_frame_rate
            # end_cut_time   = LSL_end_frame / video_frame_rate

            # print('Now cutting the video...') 

            # # Determine the file extension and codec
            # file_extension = os.path.splitext(video)[1].lower()
            # codec = extension_to_codec.get(file_extension)

            # if codec is None:
            #     print(f"Unsupported file extension: {file_extension}. Skipping file.")
            #     continue

            # # Cut and save the video using MoviePy
            # cut_clip = video_clip.subclip(start_cut_time, end_cut_time)

            # # Construct output file path with the same extension
            # output_filename = f'cut_{os.path.splitext(video)[0]}{file_extension}'
            # output_filepath = os.path.join(output_video_folder, output_filename)

            # # Save the cut video with the determined codec in specified location
            # cut_clip.write_videofile(output_filepath, codec=codec)

            # print(f'Video saved as {output_filepath}')


            # # -------------- This other way does not work either 

        #     # Translating the start and end time points to cut the video by multiplying the duration of each video frame by the 
        #     video_start_frametime_LSL =  np.round((1/int(frate)) * int(begin_frame), 3)
        #     video_end_frametime_LSL = np.round((1/int(frate)) * int(end_frame), 3)


        #     video_cut = VideoFileClip(video_filepath).cutout(video_start_frametime, video_end_frametime)
        #     video_cut.write_videofile(videofolder + "test.mp4")



        ## ---------------- This way takes a very long time 

        #   # Start Writing the Video 
            # fourcc = cv2.VideoWriter_fourcc(*'M', 'J', 'P', 'G')  # For different video formats you could use e.g., *'XVID'
            # vidloc = os.path.join(videofolder, f'{video.split(".")[0]}_cut.mp4')  # Location to save the new video
            # out = cv2.VideoWriter(vidloc, fourcc, fps=originalfps, frameSize=(int(frameWidth), int(frameHeight)))
            # frame_count = 0

            # print('Looping over the frames')
        
            # while capture.isOpened():
            #     # Read the next frame
            #     ret, frame = capture.read()
            #     if ret:
            #         # Increment the frame count
            #         frame_count += 1
            #         print(frame_count)
            #         if frame_count in frames:
            #             out.write(frame)
            #         if frame_count > end_frame:
            #             break

            # capture.release()
            # out.release()
            # print(f'Video saved to {vidloc}')

        

## 5. Combining the (cut) videos with the audio

In [None]:

# Mux each participant's denoised audio with their cut video using ffmpeg.
#
# For every (audio name fragment, video name fragment) pair below, each audio
# file in the audio folder whose name contains the audio fragment is combined
# with each video in the video folder whose name contains the video fragment.
# The result is written to `<output_audiovideo>/<participant>_audiovideo_sync.avi`.
# Relies on `subprocess` being imported in an earlier cell.

input_audio_folder = './data_processed/'
input_video_folder = './video_cut/'
output_audiovideo = './audiovideo_sync/'

# List of participant file pairs: (fragment of the audio file name, fragment of the video file name).
# TODO: should we do this another way? Maybe by saving the CSV files with a consistent name?
participants_files = [
    ('Mic_nominal_srate16000_denoised', 'P1'),
    ('Mic_004_nominal_srate16000_denoised', 'P2')
]

# ffmpeg does not create missing output directories, so make sure the folder exists.
os.makedirs(output_audiovideo, exist_ok=True)

audio_folder = os.path.abspath(input_audio_folder)
video_folder = os.path.abspath(input_video_folder)

# Loop over audio files
for participant_audio, participant_video in participants_files:

    print('Navigating in the input audio folder: ')

    for audio in os.listdir(audio_folder):
        print(audio)

        if participant_audio not in audio:    # Participant check
            continue

        print('Now processing audio ' + audio)

        # Creating audio path
        print('Loading the audio ')
        audio_path = os.path.join(audio_folder, audio)
        print(audio_path)

        # Loop over video files to select the relevant video
        print('Navigating in the input video folder: ')
        for video in os.listdir(video_folder):
            print(video)

            if participant_video not in video:
                continue

            print('Now processing video file ' + video)

            # Creating video path
            print('Loading the video ')
            video_path = os.path.join(video_folder, video)
            print(video_path)

            # --- Combining audio and video using ffmpeg
            # The output name depends only on the participant, so a later match
            # for the same participant overwrites an earlier one.
            output_path = os.path.abspath(
                os.path.join(output_audiovideo, str(participant_video) + '_audiovideo_sync.avi')
            )
            print(output_path)

            # Construct the ffmpeg command
            ffmpeg_command = [
                'ffmpeg',
                '-y',            # Overwrite existing output
                '-i', video_path,
                '-i', audio_path,
                '-c:v', 'copy',  # Copy the video codec
                '-c:a', 'aac',   # Encode audio to AAC
                '-strict', 'experimental',
                output_path
            ]

            # Run the ffmpeg command; a non-zero exit status is reported instead of
            # aborting the whole loop.
            print('Combining Audio and Video')
            try:
                result = subprocess.run(ffmpeg_command, check=True, capture_output=True, text=True)
                print(result.stdout)
                print(f'Video saved as {output_path}')
            except subprocess.CalledProcessError as e:
                print(f"Error combining audio and video {video_path} and {audio_path}: {e.stderr}")

print('Done, you can now look into the folder. ' + output_audiovideo) 

In [None]:

# NOTE(review): this cell is a pasted scratch fragment, not a runnable cell.
# Its indentation is inconsistent, and it reads `file`, `trialfolder`,
# `sessionIndex`, `trialIndex` and the `ffmpeg` module, none of which are
# defined in this cell. It appears to be an earlier draft of the
# audio/video combination loop -- confirm before reusing or deleting it.
audio_path = os.path.join(os.path.abspath(input_audio_folder), file)
                print(audio_path)
        
                
                # Report a missing audio file (execution continues afterwards).
                # NOTE(review): '/n' in the message is printed literally; a newline was presumably intended.
                if not os.path.exists(audio_path):
                    print(f"Audio file not found: {audio_path}" + '/n please check your foler to make sure the audio is there')
                    
                # Open the audio file as an ffmpeg input
                input_audio = ffmpeg.input(audio_path)
                print(input_audio)
        
                # NOTE(review): 0/0 raises ZeroDivisionError; presumably a deliberate debugging stop -- confirm.
                0/0

# Load the video with the matching trialIndex and sessionIndex
        print('Loading the video')
        video_path = os.path.join(trialfolder, f"{sessionIndex}_trial_{trialIndex}_video_raw.mp4")
        if not os.path.exists(video_path):
            print(f"Video file not found: {video_path}")
        input_video = ffmpeg.input(video_path)
        print(input_video)
        # get information about the vid_frate
        #streamloc = trialfolder+sessionIndex+'_trial_'+ str(trialIndex) +'_'+'MyWebcamFrameStream_nominal_srate500'+'.csv'
        #print(streamloc)
        #streamdata = pd.read_csv(streamloc)
        # get the begin and end frame
        #begfr = streamdata['1'].min().astype(int)
        #print(begfr)
        #endfr = streamdata['1'].max().astype(int)
        #print(endfr)
        #totfr = endfr-begfr
        #print(totfr)
        #begin = streamdata['0'].min()
        #print(begin)
        #end = streamdata['0'].max()
        #print(end)
        # what is the original fps of the video
        #origfps = round((totfr/(end-begin)),3)
        # transform it into a real number
        #origfps = float(origfps)
        
        # Combine the audio and video and write the result next to the raw video
        print('Combining audio and video')
        output_path = os.path.abspath(os.path.join(trialfolder, f"{sessionIndex}_trial_{trialIndex}_final.mp4"))
        ffmpeg.concat(input_video, input_audio, v=1, a=1).output(output_path).run(overwrite_output=True)
        
        #save it
        print('Saving the video')
        #print(origfps)
        #print(type(origfps))
        #print(trialIndex)
        #print(sessionIndex)
        # save the final video with audio
        #final.write_videofile(trialfolder+sessionIndex+'_trial_'+ str(trialIndex) +'_'+'video_audio'+'.mp4', fps=origfps)

In [None]:
# Combine every denoised microphone recording with the raw video of the same
# session and trial, writing `<session>_trial_<trial>_final.mp4` into `trialfolder`.
# Relies on `trialfolder` and the `ffmpeg` module being available from earlier cells.

wavloc = os.path.abspath('./data_processed/')

# Without the audio directory there is nothing to process; report it and
# iterate over an empty list instead of letting os.listdir raise.
if os.path.isdir(wavloc):
    audio_files = os.listdir(wavloc)
else:
    print(f"Directory not found: {wavloc}")
    audio_files = []


# Loop over audio files
for file in audio_files:
    print(file)

    if 'Mic_nominal_srate16000_denoised' not in file:
        continue

    print('Now processing file '+file)
    name_parts = file.split('_')
    sessionIndex = name_parts[0]   # Session number (first '_'-separated field of the file name)
    trialIndex = name_parts[2]     # Trial number (third '_'-separated field of the file name)

    # Load in the audio
    print('Loading the audio')
    audio_path = os.path.join(wavloc, file)
    if not os.path.exists(audio_path):
        print(f"Audio file not found: {audio_path}")
        continue
    # Open the audio file as an ffmpeg input
    input_audio = ffmpeg.input(audio_path)
    print(input_audio)

    # Load in the video with the matching trialIndex and sessionIndex
    print('Loading the video')
    video_path = os.path.join(trialfolder, f"{sessionIndex}_trial_{trialIndex}_video_raw.mp4")
    if not os.path.exists(video_path):
        # ffmpeg cannot combine anything without the video, so skip this trial.
        print(f"Video file not found: {video_path}")
        continue
    input_video = ffmpeg.input(video_path)
    print(input_video)

    # Combine the audio and video
    print('Combining audio and video')
    output_path = os.path.abspath(os.path.join(trialfolder, f"{sessionIndex}_trial_{trialIndex}_final.mp4"))
    ffmpeg.concat(input_video, input_audio, v=1, a=1).output(output_path).run(overwrite_output=True)

    # The file has been written by the .run() call above
    print('Saving the video')

            
        

In [None]:
# Quick inspection: show the entries of `videofolder` (defined outside this cell) as the cell output.
os.listdir(os.path.abspath(videofolder))

In [None]:
# Quick inspection: print the audio directory path held in `wavloc`.
print(wavloc)