# Donders MML: XDF processing
The script processes all data files sotred in an XDF format. 
Steps: 
1. Import libraries necessary for processing audio, video and data files 
    a. See requirements.txt to conda install all the necessary packages. 
    
2. Identify XDF files within a specified directory or its subdirectories.

3. ... 


## 0. Import all the necessary packages to work with XDF, Audio and Video files 

In [6]:
import os  # Importing the os module which provides functions for interacting with the operating system
import pyxdf  # Importing pyxdf, a Python library for reading XDF files
import glob  # Importing the glob module which helps in finding files/directories with specific patterns
import pandas as pd  # Importing pandas library (abbreviated as pd), which is used for data manipulation and analysis
import numpy as np  # Importing numpy library (abbreviated as np), which is used for numerical computations
import wave  # Importing wave module for reading and writing WAV files (usually audio files) 
import struct  # Importing struct module which provides functions to convert between Python values and C structs
import math  # Importing math module which provides mathematical functions
import random  # Importing random module for generating random numbers
from scipy.io import wavfile  # Importing wavfile module from scipy.io (a library built on numpy), for reading and writing WAV files
import noisereduce as nr  # Importing noisereduce module for noise reduction in audio signals
import json  # Importing json module for working with JSON data
import cv2  # Importing OpenCV library for computer vision tasks
from moviepy.editor import (  # Importing various classes and functions from moviepy.editor module
                            VideoFileClip,  # Class for working with video files
                            AudioFileClip,  # Class for working with audio files
                            CompositeAudioClip)  # Class for composing audio clip
from moviepy.video.io.ffmpeg_tools import ffmpeg_extract_subclip # video  clipping fucntion 
from moviepy.video.io.VideoFileClip import VideoFileClip # alternative video clipping function
import matplotlib.pyplot as plt  # Importing pyplot library to create figures and plot data 
from matplotlib.widgets import Slider  # 
import tkinter # GUI toolkit to open and save files
from tkinter import filedialog # GUI toolkit to open and save files

print("Everything was imported succesfully") #as terminal

Everything was imported succesfully


## 1. Define the Relevant Paths, Variables & Functions

In [9]:
# PATHS

experiment_to_process = './data_raw/'  # input folder with the raw XDF files 
outputfolder = './data_processed/'  # output folder where the raw extracted data will be saved 

print("Input folder =", os.path.abspath(experiment_to_process))
print("Output folder =", os.path.abspath(dataprocessfolder))


#VARIABLES 
noise_reducelevel = 1.5  #This can be changed accordingly 


# FUNCTIONS
# AUDIO: Creating a function named "to_audio" tht writes audio data (input) and transforms into a WAV file (output). 
def to_audio(fileloc, timeseriestype, samplerate = 16000, channels = 1):   
    """
    This function - named "to_audio" - writes audio data to a WAV file.
    It accepts the following parameters:
    - fileloc (str): Location to save the audio file.
    - timeseriestype (list): Audio data to be written into the file.
    - samplerate (int, optional): Sampling rate of the audio data. Defaults to 16000.
    - channels (int, optional): Number of audio channels (mono or stereo). Defaults to 1 (mono)
    """
    if 'Mic' in timeseriestype:  #Condition check that the timeseriestype belongs to the microphone.
            
        obj = wave.open(fileloc,'w')        # Opens audio file using the wave.open() function write mode ('w'). Assigns data it to the variable obj.
        obj.setnchannels(channels)          # Sets the number of channels in the audio file using obj.setnchannels(channels). Deafault 1 channel (mono).
        obj.setsampwidth(2)                 # Sets the sample width in bytes using obj.setsampwidth(2). The value '2' indicates 16-bit audio.
        obj.setframerate(float(samplerate)) # sets the frame rate of the audio file using obj.setframerate(float(samplerate)), where samplerate is provided as a parameter.
            
        for i in timeseries:                      # Loop to iterate over each time-point in the temeseries stream
            data = struct.pack('<h', int(i[0]))   # Converts the first value of the timeseries to an integer and packs it into a binary string (struck.pack()) according to the '<h' fromat (i.e., short integer (16 bits) in little-endian byte order)   
            obj.writeframesraw( data )            # Writes the packed binary data into an audio file using the wave function writeframesraw() from the wave library 
        obj.close()                               # Closes the audio file 

print("Function \"to_audio\" created sucesfully") 

Input folder = C:\Users\ahmar\OneDrive\Documents\GitHub\Mobile-Multimodal-Lab\2_PREPROCESSING\XDF_PROCESSING\data_raw
Output folder = C:\Users\ahmar\OneDrive\Documents\GitHub\Mobile-Multimodal-Lab\2_PREPROCESSING\XDF_PROCESSING\data_processed
Function "to_audio" created sucesfully


## 2. Identifying XDF files in Input Folder or any Subfolder 

In [8]:
xdf_files = []  # Initialize an empty list to store paths of XDF files

# Traverse through the directory and its subdirectories to find XDF files
for root, dirs, files in os.walk(experiment_to_process):  # 1st loop iterating over the results returned by os.walk().
    
    for file in files:                                    # 2nd loop iterating through each file in the current directory
        
        if file.endswith(".xdf"):                         # checking if the file has and XDF extension 
            
             xdf_files.append(os.path.join(root, file))   # if the file is an XDF file, append its full path to the xdf_files list
            
print('We have idenified the following XDF files: ' + str(xdf_files))

We have idenified the following XDF files: ['./data_raw/T1_experiment.xdf']


## 2a. Alternatively, the user can select their own XDF file

In [4]:
root = tkinter.Tk()
root.attributes('-topmost',True)
root.iconify()

xdf_files = filedialog.askopenfilename(title="Select an XDF file", filetypes=[("XDF Files", "*.xdf")])

root.destroy()

print ('You have selected the following XDF file: ' + str(xdf_files))

You have selected the following XDF file: C:/Users/ahmar/OneDrive/Documents/GitHub/Mobile-Multimodal-Lab/2_PREPROCESSING/XDF_PROCESSING/data_raw/T1_experiment.xdf


# 3. Main Loop that Plots and Extracts each data stream from each XDF and saves as CVS or WAV file

In [31]:

for xdf_file in xdf_files:               # Iterate over each path in the list "xdf_files". 

    print('loading xdf file: ' + xdf_file )

    streams, header = pyxdf.load_xdf(xdf_file)    # Loading steams and header information from each XDF file usign the load_xdf function from the pyxdf library. 
    fnam = os.path.basename(xdf_file)[:-4]        # Extract the file name from the path and assings it to fnam, whilst removing the '.xdf' extension (i.e., the last 4 characters in the string)
    
    # Navigating through each stream and saving it as a csv, or additinoally as a .wav file if audio
       
    stream_count = {}   # Dictionary to keep track of the count of each stream type
    for stream in streams:                        # Iterate over each steam in the loaded steam for each XDF file .
        timeseriestype = stream['info']['name'][0]                       # Extracts information (info and 1st name) for each steams (dictionary structure) and assigns it to timeseriestype (e.g., Mic). 
        samplerate = round(float(stream['info']['nominal_srate'][0]))    # Extracts the rounded sampling rate (nominal_srate) and assings it to samplerate 
        channelcount = stream['info']['channel_count'][0]                # Extracts the number of channel for each steams and assigns it to channelcount 

        # Increment the count for this stream type
        if timeseriestype in stream_count:
            stream_count[timeseriestype] += 1
        else:
            stream_count[timeseriestype] = 1
        
        count = stream_count[timeseriestype]  # Get the current count for the stream type
        
        print('working on stream: ' + timeseriestype + '  with a channel count of ' + str(channelcount) +'\n and a sampling rate of ' + str(samplerate))
        
        timevec = stream['time_stamps']            # Extract the time_stamps (i.e.,LSL TIMESTAMPS) from the "stream" dictionary and assigns it to the variable timevec
        timeseries = stream['time_series']         # Extract the time_series (i.e., DATA) from the "stream" dictionary and 

        ## -------- PLOTTING the Data Stream -----------
        #plt.figure(figsize=(10, 6))  # Adjust the figure size as needed
        # Convert timeseries to a numpy array if it is not already
       # timeseries = np.array(timeseries)
        #for i in range(int(channelcount)):
         #   plt.plot(timevec, timeseries[:, i], label='Channel ' + str(i + 1))
       # plt.title(timeseriestype + 'Stream')
        #plt.xlabel('Time')
        #plt.ylabel('Amplitude')
        #plt.legend()
        #plt.grid(True)
        #plt.show()
        # -------------------------------------------------


        # We save each stream of the XDF file as a CSV (if it doesn't exist yet) 
        timevec = stream['time_stamps']                                            # Extract the time_stamps (i.e.,LSL TIMESTAMPS) from the "stream" dictionary and assigns it to the variable timevec
        timeseries = stream['time_series']                                         # Extract the time_series (i.e., DATA) from the "stream" dictionary and 
        matrix_aux = np.vstack([np.transpose(timevec),np.transpose(timeseries)])   # Create matrix_aux by concatenating the transposed timevec and timeseries
        matrix     = np.transpose(matrix_aux)                                      # Create new matrix by tranposing matrix_aux
        df_lab = pd.DataFrame(matrix)                                              # Converts the numerical array into a DataFrame called df_lab
        if count == 1: 
            df_lab.to_csv(outputfolder + fnam + '_' + timeseriestype + '_nominal_srate' + str(samplerate)  + '.csv',index=False)  
        else: 
            df_lab.to_csv(outputfolder + fnam + '_' + timeseriestype + '_nominal_srate' + str(samplerate) + '_' + str(count) + '.csv',index=False)     #Saving the df_lab as CSV file named [fnam]_[timeseriestype]_[nomilar_srate#].cvs
       

        if "Mic" in timeseriestype:  # Check if the data stream is from a microphone
            wavloc = os.path.abspath(outputfolder + fnam + '_' + timeseriestype + '_nominal_srate' + str(samplerate) + '.wav')  # Define the location to save the initial audio file
            to_audio(wavloc, timeseries)  # Convert the time series data to an audio file and save it at the defined location
            rate, data = wavfile.read(wavloc)  # Load the audio data from the saved WAV file
            reduced_noise = nr.reduce_noise(y=data, sr=rate, n_std_thresh_stationary=noise_reducelevel, stationary=True)    # Perform noise reduction based on the noise_reducelevel 
            wavloc2 = os.path.abspath(outputfolder + fnam + '_' + timeseriestype + '_nominal_srate' + str(samplerate) + '_denoised.wav')  # Define the location to save the noise-reduced audio file
            wavfile.write(wavloc2, rate, reduced_noise)  # Save the noise-reduced audio data as a new WAV file at the defined location

pint("Done! You can now look into your folder: " + outputfolder)

loading xdf file: ./data_raw/T1_experiment.xdf
working on stream: AudioEvents  with a channel count of 1
 and a sampling rate of 0
working on stream: MyWebcamFrameStream_2  with a channel count of 1
 and a sampling rate of 500
working on stream: MyWebcamFrameStream_1  with a channel count of 1
 and a sampling rate of 500
working on stream: OpenSignals  with a channel count of 5
 and a sampling rate of 1000
working on stream: OpenSignals  with a channel count of 5
 and a sampling rate of 1000
working on stream: Mic  with a channel count of 1
 and a sampling rate of 16000


FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\ahmar\\OneDrive\\Documents\\GitHub\\Mobile-Multimodal-Lab\\2_PREPROCESSING\\XDF_PROCESSING\\data_processed\\T1_experiment_Mic_nominal_srate16000.wav'

## 4. Cutting the Videos according to the Start and End of LSL Frames 

In [None]:
videofolder = './video/'

for file in os.listdir(os.path.abspath('./data_processed/')):
    print(file)
    if 'MyWebcamFrameStream' in file:
        print('Now processing file: ' + file)
        filepath = os.path.join(os.path.abspath('./data_processed/'), file)

        # Loading the CSV file 
        trialdata = pd.read_csv(filepath)  # Reads the CSV file at the constructed path into a DataFrame called trialdata
        
        begin_time = trialdata['0'].min()  # Extracts the minimum value from the '0' column in the trialdata DataFrame, which represents the begin time of the trial.
        begin_frame = int(trialdata['1'].min()) # Extracts the minimum value from the 'frame' column in the trialdata DataFrame, converts it to an integer, and assigns it to begin_frame

        end_time = trialdata['0'].max() 
        end_frame = int(trialdata['1'].max())

        tot_frames = end_frame - begin_frame  # Total number of frames from start to finish recording.
        frames = range(begin_frame, end_frame)  # Get all the frames in trial

        # Loading the videos 
        for video in os.listdir(os.path.abspath(videofolder)):
            print(video)
            if 'P1' in video:
                print('Now processing file: ' + video)
                video_filepath = os.path.join(os.path.abspath(videofolder), video)
                capture = cv2.VideoCapture(video_filepath)

                # Meta Data about the Video 
                frameWidth = capture.get(cv2.CAP_PROP_FRAME_WIDTH)  # Check frame width
                frameHeight = capture.get(cv2.CAP_PROP_FRAME_HEIGHT)  # Check frame height
                frate = capture.get(cv2.CAP_PROP_FPS)
                print('frame rate: ' + str(frate))
                originalfps = round((tot_frames / (end_time - begin_time)), 3)
                print('original fps: ' + str(originalfps))

                # Start Writing the Video 
                fourcc = cv2.VideoWriter_fourcc(*'M', 'J', 'P', 'G')  # For different video formats you could use e.g., *'XVID'
                vidloc = os.path.join(videofolder, f'{video.split(".")[0]}_cut.mp4')  # Location to save the new video
                out = cv2.VideoWriter(vidloc, fourcc, fps=originalfps, frameSize=(int(frameWidth), int(frameHeight)))
                frame_count = 0

                print('Looping over the frames')
                while capture.isOpened():
                    # Read the next frame
                    ret, frame = capture.read()
                    if ret:
                        # Increment the frame count
                        frame_count += 1
                        if frame_count in frames:
                            out.write(frame)
                        if frame_count > end_frame:
                            break

                capture.release()
                out.release()
                print(f'Video saved to {vidloc}')


T1_experiment_AudioEvents_nominal_srate0.csv
T1_experiment_Mic_nominal_srate16000.csv
T1_experiment_MyWebcamFrameStream_1_nominal_srate500.csv
Now processing file: T1_experiment_MyWebcamFrameStream_1_nominal_srate500.csv
T1_P1_exp_2024-04-23_output_compr.avi
Now processing file: T1_P1_exp_2024-04-23_output_compr.avi
frame rate: 60.0
original fps: 54.96
Looping over the frames


# 5. Concatenate (cut) Videos with Audios

In [36]:
wavloc = os.path.join(os.path.abspath('./data_processed/')

if not os.path.exists(wavloc):
    print(f"Directory not found: {wavloc}")


# loop over Audio files
for file in os.listdir(wavloc):
    print(file)
    if 'Mic_nominal_srate16000_denoised' in file:
        print('Now processing file '+file)
        sessionIndex = file.split('_')[0]   # this is session number
        trialIndex = file.split('_')[2] # this is trial number
        #load in the audio
        print('Loading the audio')
        audio_path = os.path.join(wavloc, file)
        if not os.path.exists(audio_path):
            print(f"Audio file not found: {audio_path}")
        # input the video with ffmpg
        input_audio = ffmpeg.input(audio_path)
        print(input_audio)
        #load in the video with matchich trialIndex and SessionIndex
        print('Loading the video')
        video_path = os.path.join(trialfolder, f"{sessionIndex}_trial_{trialIndex}_video_raw.mp4")
        if not os.path.exists(video_path):
            print(f"Video file not found: {video_path}")
        input_video = ffmpeg.input(video_path)
        print(input_video)


          #combine the audio and video
        print('Combining audio and video')
        output_path = os.path.abspath(os.path.join(trialfolder, f"{sessionIndex}_trial_{trialIndex}_final.mp4"))
        ffmpeg.concat(input_video, input_audio, v=1, a=1).output(output_path).run(overwrite_output=True)


        #save it
        print('Saving the video')
        

d


In [43]:
os.listdir(os.path.abspath(videofolder))

['T1_P1_exp_2024-04-23_output_compr.avi',
 'T1_P2_exp_2024-04-23_output_compr.avi']