<font size="5.5"><u><i>Build Strain Dataset</i></u></font>

<font size="4">Script to build a dataset with multiple waveforms injections</font>
<br/>
<font size="4">Author: Manuel David Morales</font>

## 1. Library imports

In [1]:
# Data analysis
import numpy as np 
import pandas as pd
import math

# Visualization
import matplotlib as mpl
import matplotlib.pyplot as plt

# Files/folders management
import os, glob, sys, re

# To read csv files
import csv

# Scientific computing
from scipy import signal

# Garbage collector
import gc

# Object serialization
import pickle

# Toolbox functions
from Toolbox import PSD, SNR, WaveletTF_transform

## 2. Input parameters

In [2]:
# Interferometer for noise data
# ------------------------------------------------------------
detector = "V1"    # Options: "L1", "H1", "V1"
# ------------------------------------------------------------

# ------> Input parameters for window samples generation

# Jitter_lim: This value defines a fluctuation range, in which each
# injection will be located on a random sample in the interval
# [inj_time - jitter_lim, inj_time + jitter_lim] (in seconds)

# -------------------------------------------------------------------------------------
jitter_lim = 0.01   # Ensure this value is the same as in Make_Injections script
# -------------------------------------------------------------------------------------

# Scalar parameter in window time for samples (in seconds)
# ------------------------------------------------------------
alpha = 0.02001    # alpha must be greater than 2*jitter_lim
# ------------------------------------------------------------

# Plot windows samples (time & TF domain)
# ------------------------------------------------------------
set_doplots = 0    #  1: yes | 0: no
# ------------------------------------------------------------

In [3]:
# ------> Check

if alpha <= 2*jitter_lim:
    print("")
    print("******** Error: alpha must be greater than 2*jitter_lim ********")
    print("")
    
    raise SystemExit("Code stopped here!")
    
else:
    pass

# Remark: The condition alpha > 2*jitter_lim ensures that
#         the whole injected waveform will be in the window sample.

## 3. Read files

In [4]:
# ------> Specify folder location

rawdata_dir = '/home/manuel/Research Projects/GW Data analysis/GitHub/CCSNeHFGW_ResNetClass/Codes/Preprocessed_Data'

# ------> Initialize lists

time = []
strain = []
inj_time = []
jitt = []
SNR_waveform = []
gmode_slope = []
f0 = []
f1 = []
duration_wf = []

s_files = []
l_files = []

# ------> Scan strain data and log data files

os.chdir(rawdata_dir)

for file in glob.glob("strain_" + detector + "*cond*"):
    s_files.append(file)
    
for file in glob.glob("log_" + detector + "*"):
    l_files.append(file)

In [5]:
print("Available log data files")
for file in range(len(l_files)):
    print(l_files[file] + "  |  Input option :", file)
print("")

file_i = input("======> Enter your option:")
file_i = int(file_i)
print("")

# ------> Load log file with information about injections

print("***** READING LOG DATA FILE", l_files[file_i], " *****")
with open(l_files[file_i]) as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    line_count = 0
    for row in csv_reader:
        if line_count == 0:
            print(f'Data columns: {", ".join(row)}')
            line_count += 1
        else:
            inj_time.append(row[0])
            jitt.append(row[1])
            SNR_waveform.append(row[2])
            gmode_slope.append(row[3])
            f0.append(row[4])
            f1.append(row[5])
            duration_wf.append(row[6])
            #print(f'\t{row[0]} works in the {row[1]}.')
            line_count += 1
    print(f'Processed {line_count} lines')

Available log data files
log_V1_1256783872_wfclass3.dat  |  Input option : 0
log_V1_1256783872_wfclass1.dat  |  Input option : 1
log_V1_1257050112_wfclass3.dat  |  Input option : 2
log_V1_1257050112_wfclass1.dat  |  Input option : 3
log_V1_1257050112_wfclass2.dat  |  Input option : 4
log_V1_1256783872_wfclass2.dat  |  Input option : 5






***** READING LOG DATA FILE log_V1_1257050112_wfclass3.dat  *****
Data columns: Injection time [s], jitter (seconds), Waveform SNR, G-mode slope, Frequency f0 [Hz], Frequency f1 [Hz], Waveform duration [s]
Processed 512 lines


In [6]:
print("Available conditioned strain data files")
for file in range(len(s_files)):
    print(s_files[file] + "  |  Input option :", file)
print("")

file_i = input("======> Enter your option:")
file_i = int(file_i)
print("")

# ------> Load strain data (this data include injections) 

print("***** READING FILE", s_files[file_i], " *****")
with open(s_files[file_i]) as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    line_count = 0
    for row in csv_reader:
        if line_count == 0:
            print(f'Data columns: {", ".join(row)}')
            line_count += 1
        else:
            time.append(row[0])
            strain.append(row[1])
            #print(f'\t{row[0]} works in the {row[1]}.')
            line_count += 1
    print(f'Processed {line_count} lines')

Available conditioned strain data files
strain_V1_1256783872_conditioned_wfclass2.txt  |  Input option : 0
strain_V1_1257050112_conditioned_wfclass1.txt  |  Input option : 1
strain_V1_1256783872_conditioned_wfclass3.txt  |  Input option : 2
strain_V1_1257050112_conditioned_wfclass3.txt  |  Input option : 3
strain_V1_1257050112_conditioned_wfclass2.txt  |  Input option : 4
strain_V1_1256783872_conditioned_wfclass1.txt  |  Input option : 5






***** READING FILE strain_V1_1257050112_conditioned_wfclass3.txt  *****
Data columns: time, strain
Processed 16736257 lines


In [7]:
# ------> Convert lists to numpy arrays, clear memory from lists

t = np.array(time, dtype='float64')
del(time)
gc.collect()

s = np.array(strain, dtype='float64')
del(strain)
gc.collect()

t_inj = np.array(inj_time, dtype='float64')
del(inj_time)
gc.collect()

jitter = np.array(jitt, dtype='float64')
del(jitt)
gc.collect()

wf_SNR = np.array(SNR_waveform, dtype='float64')
del(SNR_waveform)
gc.collect()

Slope = np.array(gmode_slope, dtype='float64')
del(gmode_slope)
gc.collect()

f_ini = np.array(f0, dtype='float64')
del(f0)
gc.collect()

f_end = np.array(f1, dtype='float64')
del(f1)
gc.collect()

wf_duration = np.array(duration_wf, dtype='float64')
del(duration_wf)
gc.collect()

0

In [8]:
# ------> Check array dimensions

print("Time array length:", len(t))
print("Strain array length:", len(s))
print("")
print("Injection times array length:", len(t_inj))
print("WF SNR array length:", len(wf_SNR))
print("WF durations array length:", len(wf_duration))

Time array length: 16736256
Strain array length: 16736256

Injection times array length: 511
WF SNR array length: 511
WF durations array length: 511


## 4. Window samples preparation

In [9]:
# ------> Compute maximum waveform duration (in seconds)
# Remark: For this calculation, we consider all waveforms saved on Waveforms folder

waveforms_dir = "/home/manuel/Research Projects/GW Data analysis/GitHub/CCSNeHFGW_ResNetClass/Waveforms_mod/Phen/"

# Load waveforms log data dictionary
with open(waveforms_dir + "waveforms_log.pkl", 'rb') as fp:
    waveforms_log = pickle.load(fp)
    
# Extract duration of all waveforms

wf_duration_all = []

label = ["class 1", "class 2", "class 3"]

for class_wf in label:
    for i in range(len(waveforms_log[class_wf])):
        wf_duration_i = waveforms_log[class_wf][i][3]
        wf_duration_all.append(float(wf_duration_i))

# Compute maximum duration
        
wf_max = max(wf_duration_all)

In [10]:
# ------> Define a UNIQUE time window duration (in seconds)

Twin = wf_max + alpha

Twin_rounded = math.ceil(Twin * 1e5) / 1e5

In [11]:
# ------> Print waveforms information

print("Maximum waveform duration (in seconds)")
print("wf_max = ", wf_max)
print("")
print("Alpha scalar parameter (in seconds)")
print("alpha = ", alpha)
print("")
print("Duration of window strain samples (in seconds)")
print("Twin = wf_max + alpha = ", Twin)
print("Twin rounded = ", Twin_rounded)

# Use Twin_rounded hereinafter

Twin = Twin_rounded

Maximum waveform duration (in seconds)
wf_max =  0.94970703125

Alpha scalar parameter (in seconds)
alpha =  0.02001

Duration of window strain samples (in seconds)
Twin = wf_max + alpha =  0.96971703125
Twin rounded =  0.96972


In [12]:
# ------> Initialize array for initial times of each sample window

Twin_ini = np.zeros(len(t_inj))

# ------> Populate above array with time values

for i in range(len(t_inj)): # Select all windows
#for i in range(10): # Select only a few windows
    
    # Remark: The following expression ensures beforehand that each window sample
    #         is such that its injection, without jitter variation, is centered.
    
    Twin_ini[i] =  t_inj[i] + 0.5 * ( wf_duration[i] - Twin ) - jitter[i]
    #print(Twin_ini[i])

In [13]:
# ------> Check if window samples at the edges are in the segment

# RIGHT edge of data segment
for i in range(len(Twin_ini)):
    
    if Twin_ini[0] < t[0]:
        print("****** WARNING: Sample at LEFT edge is outside the data segment, it will be removed")
        Twin_ini = np.delete(Twin_ini,0)
        # We also remove the injection
        t_inj = np.delete(t_inj,0)
    else:
        print("End LEFT segment check")
        break

# RIGHT edge of data segment
for i in range(len(Twin_ini)):
    
    #if Twin_ini[-i-1] + Twin > t[-i-1]: # SE CORRIGIO ESTO!!!!
    if Twin_ini[-1] + Twin > t[-1]: 
        print("****** WARNING: Sample at RIGHT edge is outside the data segment, it will be removed")
        #Twin_ini = np.delete(Twin_ini,-i-1) # SE CORRIGIO ESTO!!!!
        Twin_ini = np.delete(Twin_ini,-1)
        # We also remove the injection
        t_inj = np.delete(t_inj,-i-1)
    else:
        print("End RIGHT segment check")
        break

End LEFT segment check
End RIGHT segment check


In [14]:
# ------> Convert Twin_ini values (in seconds) to locating indexes

ts = t[1]-t[0]

locate_win = np.zeros(len(Twin_ini))

for i in range(len(Twin_ini)):    
    locate_win[i] = int(Twin_ini[i]/ts) - int(t[0]/ts)

locate_win = locate_win.astype(int)

In [15]:
# ------> Checks: Twin_ini vs. t_inj

print("Length of vector for time injections: ", len(t_inj))
print("Length of vector for windows's initial time: ", len(Twin_ini))
#print("Length of locate_win vector : ", len(locate_win))
print("Length of window samples (in seconds) : ", Twin)

for i in range(len(Twin_ini)):
    if t_inj[i] <= Twin_ini[i]:
        print("")
        print("****** ERROR: At sample ", i, "Injection's start is outside the window (LEFT)")
        print("Window's initial time (seconds) =", Twin_ini[i])
        print("Injection initial time (seconds) =", t_inj[i])
        print("----------------------------------------------")
        print("")
    elif t_inj[i]+wf_duration[i] >= Twin_ini[i]+Twin:
        print("****** ERROR: At sample ", i, "Injection's end is outside the window (RIGHT)")
        print("Window's end time (seconds) =", Twin_ini[i]+Twin)
        print("Injection end time (seconds) =", t_inj[i]+wf_duration[i])
        print("----------------------------------------------")
        print("")
    else:
        #print("")
        #print("Sample", i)
        #print("Window's initial time (seconds) =", Twin_ini[i])
        #print("----------------------------------------")
        #print("Injection initial time (seconds) =", t_inj[i])
        #print("Injection end time (seconds) =", t_inj[i]+wf_duration[i])
        #print("----------------------------------------")
        #print("Window's end time (seconds) =", Twin_ini[i]+Twin)
        #print("")
        pass

#for i in range(len(locate_win)):
#for i in range(5):
#    print(Twin_ini[i])
#    print(t[locate_win[i]])
#    print(t_inj[i])
#    print("")

Length of vector for time injections:  511
Length of vector for windows's initial time:  511
Length of window samples (in seconds) :  0.96972


In [16]:
# ------> Set directory to save dataset

r = re.search("_conditioned_", s_files[file_i])
i_end = r.span()[0]
gps = s_files[file_i][10:i_end]

waveform_class = s_files[file_i][-5:-4]

save_dir = '/home/manuel/Research Projects/GW Data analysis/GitHub/CCSNeHFGW_ResNetClass/Datasets/'

datasets_dir = save_dir + detector + "_" + gps + "/wfclass_" + waveform_class 

if not os.path.exists(datasets_dir):
    os.makedirs(datasets_dir)
else:
    pass

## 5. Window samples generation

In [17]:
doplots_samples = set_doplots # CHECK, plot samples (time domain | TF representation)

# ------> Generate window samples

# Sampling frequency
ts = t[1] - t[0]
fs = 1 / ts

# Window for spectrograms
wnd="hamming"

# Initialize list of windows
all_windows = []

# Initialize list for updated log file
log_data = []

# FIRST LOOP, j index: window count variable
# --------------------------------------------

for j in range(len(t_inj)): # Extract all windows along the strain segment
#for j in range(5): # Extract only a few windows along the strain segment

    print("Window sample No.", j)
    print("----------------------")
    
    window = np.zeros(int(fs*Twin))

    # SECOND LOOP, k index: Window samples count variable
    # --------------------------------------------------------
    
    for k in range(len(window)):
        
        # Extract strain data along the window
        window[k] = s[locate_win[j]+k]
    
    all_windows.append(window)
    
    print("Waveform SNR: ", wf_SNR[j])
    print("Injection location (seconds): ", t_inj[j])
    print("")
    print("G-mode slope: ", Slope[j])
    print("Initial frequency, f0 (Hz): ", f_ini[j])
    print("End frequency, f1 (Hz): ", f_end[j])
    print("Waveform duration (seconds): ", wf_duration[j])
    print("")
    print("Window initial time (seconds): ", t[locate_win[j]])
    print("Window end time (seconds): ", t[locate_win[j]+k])
    #print("Window length (in seconds):", t[locate_win[j]+k]-t[locate_win[j]])
    print("Extracted ", k, " data points")
    
    # CHECK: To ensure injection location is inside window sample
    
    if t_inj[j] < t[locate_win[j]]:
        print("!!!!!!!!!!!!!!!!! ERROR : t_injection is before the sample window")
    
    elif j < len(t_inj)-1:
        if t_inj[j] > t[locate_win[j+1]]:
            print("!!!!!!!!!!!!!!!!! ERROR : t_injection is after the sample window ")
    
    print("")
    
    # Select window samples belonging in a specific SNR range
    # ----------------------------------------------------------
    
    #if wf_SNR[j] >= 18:
        #if wf_SNR[j] <= 24:
    if wf_SNR[j] <= 100:
        
        if doplots_samples:
            
            # Plot the window sample (strain)
            # ---------------------------------
            
            plt.figure(1, figsize=(6,4))
        
            #plt.plot(np.arange(0, Twin, ts), all_windows[num_win], label='Strain')
            plt.plot(t[locate_win[j]:locate_win[j]+len(window)], all_windows[j], label='Strain')
            #plt.title("Sample", fontsize=15)
            plt.xlabel('Time [sec]', fontsize=14)
            plt.ylabel('strain', fontsize=14)
            plt.grid()
            plt.legend()
            plt.tight_layout()
            plt.show()
            plt.figure(1).clear()
            gc.collect()
    
            # Plot the window sample (TF representation)
            # --------------------------------------
    
            #freq, time, Sxx = signal.spectrogram(all_windows[j], fs, window=wnd, nperseg=256)
            time, freq, Sxx = WaveletTF_transform(all_windows[j], fs, 10, 2000, 10, 7, 0)
    
            plt.figure(2, figsize=(6.5,5))
            plt.pcolormesh(time+t[locate_win[j]], freq, Sxx, shading='gouraud')
            plt.ylabel('Frequency [Hz]', fontsize=14)
            plt.xlabel('Time [sec]', fontsize=14)
            plt.show()
        
            plt.figure(2).clear()
            gc.collect()
            
        # Save information in log_data list
        log_data.append([t_inj[j], jitter[j], wf_SNR[j], Slope[j], f_ini[j], f_end[j], wf_duration[j]])
    
        # Save window samples
        sample_number = str(j).zfill(6)
        df_strain = pd.DataFrame({"time (seconds)" : t[locate_win[j]:locate_win[j]+len(window)], "strain" : all_windows[j]})
        df_strain.to_csv(datasets_dir + "/sample_strain_" + sample_number + ".txt", index=False)
 
        print("****************************************************************")
        print("")

Window sample No. 0
----------------------
Waveform SNR:  43.11939180504554
Injection location (seconds):  8.00390625

G-mode slope:  1131.0
Initial frequency, f0 (Hz):  114.07
End frequency, f1 (Hz):  1399.5
Waveform duration (seconds):  0.8037109375

Window initial time (seconds):  7.9169921875
Window end time (seconds):  8.88623046875
Extracted  3970  data points

****************************************************************

Window sample No. 1
----------------------
Waveform SNR:  52.68671131340743
Injection location (seconds):  15.9970703125

G-mode slope:  990.0
Initial frequency, f0 (Hz):  102.26
End frequency, f1 (Hz):  1192.46
Waveform duration (seconds):  0.849609375

Window initial time (seconds):  15.93994140625
Window end time (seconds):  16.9091796875
Extracted  3970  data points

****************************************************************

Window sample No. 2
----------------------
Waveform SNR:  56.12412559369272
Injection location (seconds):  24.009033203125



## 6. Update and save log data

In [18]:
# ------> Create df_log dataframe
df_log = pd.DataFrame(log_data, columns=['Injection time [s]', 'jitter (seconds)', 'Waveform SNR', 'G-mode slope', 'Frequency f0 [Hz]', 'Frequency f1 [Hz]', 'Waveform duration [s]'])

# ------> Export df_log dataframe to a csv file
df_log.to_csv(datasets_dir + '/log.dat', index=False)

# 