### Get the MMS data for domain adaptation and put it into the desired form

In [4]:
import pyspedas
import pytplot
import numpy as np
import xarray as xr
import h5py
import datetime as dt
from IPython.display import clear_output
import sys  # for debugging
import os
from scipy.constants import physical_constants

In [5]:
# --- Slicing parameters (will change depending on which model we are using) ---
window, stride = 30, 10
n_avg_width = 10  # for smoothing the plasma freq

# --- Physical constants in SI (for unit conversion and plasma freq calc.) ---
# physical_constants[name] is a (value, unit, uncertainty) tuple; keep the value only.
c, e, m_e, m_i, e_0, k = (
    physical_constants[name][0]
    for name in (
        "speed of light in vacuum",      # c   [m/s]
        "elementary charge",             # e   [C]
        "electron mass",                 # m_e [kg]
        "proton mass",                   # m_i [kg]
        "vacuum electric permittivity",  # e_0 [F/m]
        "Boltzmann constant",            # k   [J/K]
    )
)

# --- PIC characteristic values ---
# MAKE SURE THESE ARE CORRECT FOR THE SIMULATION YOU ARE PROCESSING
mime_pic = 25
v_the_sim_cs, v_the_sim_out = 0.25, 0.15
n_sim = 1
n_sim_out = 0.01/2.  # beta/2

# --- MMS characteristic values ---
n_cs = 0.625             # assumed tail current sheet density, cm^-3
e_the_tail = 3.795       # assumed tail electron temperature, keV
n_lobe = 0.1             # assumed lobe density, cm^-3 (Frank 1985)
e_the_lobe_K = 1e6       # assumed lobe temperature, K
e_the_lobe_j = e_the_lobe_K*k  # same lobe temperature expressed in J

In [6]:
# Unit conversion and normalization supportive calculations

# Thermal energies -> thermal speeds, expressed as a fraction of c (PIC units)
e_the_j = e_the_tail*1e3*e  # keV -> joules
v_the_tail = np.sqrt(2*e_the_j/m_e)/c
v_the_lobe = np.sqrt(2*e_the_lobe_j/m_e)/c

# Current sheet density and the plasma frequency computed from it, in SI
n_cs_m3 = n_cs*(100)**3  # cm^-3 to m^-3
plasma_freq = np.sqrt(n_cs_m3*e**2/e_0/m_e)  # s^-1

# convert mms density to pic units
d_e = c/plasma_freq  # in m
n_cs_sim = n_cs_m3*d_e**3  # in d_e^(-3)

# Diagnostics: energies (J), thermal speeds (c), sim/MMS speed ratios, density (d_e^-3)
print(e_the_j, e_the_lobe_j)
print(v_the_tail, v_the_lobe)
print(v_the_sim_cs/v_the_tail, v_the_sim_out/v_the_lobe)
print(f'{n_cs_sim:.2e}')

6.08026032603e-16 1.3806490000000001e-17
0.12187394915783151 0.018365021386649957
2.0512997381929443 8.16770080698295
1.90e+17


In [7]:
# Electric field correction.
# The prefactors outside the logs must be in matching units. Each log argument is
# a ratio, so it only has to be internally consistent: a common unit scale factor
# between numerator and denominator cancels.
log_arg_sim = n_sim_out*v_the_sim_out**2/(n_sim*v_the_sim_cs**2)
log_arg_tail = n_lobe*e_the_lobe_j/(n_cs*e_the_j)

E_val_sim = np.sqrt(1/mime_pic)*v_the_sim_cs**2*np.log(log_arg_sim)
E_val_tail = np.sqrt(m_e/m_i)*(v_the_tail)**2*np.log(log_arg_tail)

# Factor that rescales MMS E values to the simulation's characteristic value
E_corr_factor = E_val_sim/E_val_tail
print(E_val_sim, E_val_tail, E_corr_factor)

-0.07899960767600023 -0.0019472551195471386 40.56972652580453


In [8]:
# Current density correction: ratio of the characteristic current density of the
# simulation to that of the tail (both built from mass ratio, density, v_the^2).
mass_factor_sim = mime_pic**(-0.5)
mass_factor_tail = np.sqrt(m_e/m_i)

j_val_sim = mass_factor_sim*n_sim*v_the_sim_cs**2
j_val_tail = mass_factor_tail*n_cs_sim*v_the_tail**2

j_corr_factor = j_val_sim/j_val_tail
print(j_corr_factor)

1.8997442222394003e-16


### Read in the tail region times created by mms_region_to_time.ipynb

In [43]:
# Pull the stored interval boundaries fully into memory, then close the file.
with h5py.File('/tigress/kendrab/analysis-notebooks/mms_data/interval_times.h5', 'r') as file:
    raw_times = file['times'][()]  # the stored entries are bytes

# Decode every entry element-wise so the rest of the notebook works with str.
times = np.vectorize(lambda x: x.decode())(raw_times)

[['2015-04-05/05:57:00' '2015-04-05/08:09:00']
 ['2015-04-06/05:25:30' '2015-04-06/08:39:00']
 ['2015-04-07/05:02:30' '2015-04-07/09:00:00']
 ['2015-04-08/04:43:00' '2015-04-08/09:17:00']
 ['2015-04-09/04:26:30' '2015-04-09/09:31:00']
 ['2015-04-10/04:11:30' '2015-04-10/09:43:00']
 ['2015-04-11/03:57:30' '2015-04-11/09:53:30']
 ['2015-04-12/03:44:30' '2015-04-12/10:03:00']
 ['2015-04-13/03:32:30' '2015-04-13/10:11:30']
 ['2015-04-14/03:21:00' '2015-04-14/10:18:30']
 ['2015-04-15/03:10:00' '2015-04-15/10:25:30']
 ['2015-04-16/02:59:30' '2015-04-16/10:31:30']
 ['2015-04-17/02:49:30' '2015-04-17/10:36:30']
 ['2015-04-18/02:40:30' '2015-04-18/10:41:30']
 ['2015-04-19/02:31:30' '2015-04-19/10:45:30']
 ['2015-04-20/02:23:00' '2015-04-20/10:49:00']
 ['2015-04-21/02:15:00' '2015-04-21/10:51:30']
 ['2015-04-22/02:07:00' '2015-04-22/10:54:00']
 ['2015-04-23/01:59:30' '2015-04-23/10:56:00']
 ['2015-04-24/01:52:30' '2015-04-24/10:58:00']
 ['2015-04-25/01:45:00' '2015-04-25/10:59:00']
 ['2015-04-26

### Do the processing one time segment at a time for memory reasons

In [24]:
def loop_end(i):
    """Do the end-of-iteration housekeeping and return the next loop index.

    Meant to be called at the end of every pass through the interval loop,
    whether the pass finished normally or is about to ``continue`` early.

    Parameters
    ----------
    i : int
        Index of the iteration that just finished.

    Returns
    -------
    int
        ``i + 1``, the index of the next iteration.
    """
    # Free memory by deleting the data pytplot is currently holding.
    pytplot.del_data()
    # On every 15th step (i = 14, 29, ...), also wipe the accumulated cell output.
    if (i + 1) % 15 == 0:
        clear_output(wait=True)
        print(f"Cleared output at step {i}")
    return i + 1

In [None]:
# Main processing loop: for each tail interval in `times`
#   1. load FPI / MEC / FGM / EDP data with pyspedas (skip the interval if any is missing),
#   2. compute the curlometer current density and put E, B, j in GSM,
#   3. convert to PIC units and apply the PIC-vs-MMS correction factors,
#   4. split the series at data gaps, cut each gap-free segment into sliding windows,
#   5. drop windows containing NaNs in E or j and write the rest to one HDF5 file.
# Every exit path of an iteration goes through loop_end(i) so memory is freed.
i=0
while i < times.shape[0]:  # while loop instead of for loop bc time.shape can change in the loop
    # cut up intervals that are too long and will crash the process TRY WITHOUT DOING THIS NOW
    # num_files = pyspedas.mms.fpi(trange=times[i], probe='1', data_rate='brst', datatype='des-moms',
    #                             time_clip=True, available=True, varnames=["mms1_des_numberdensity_brst"])
    # while len(num_files) > 10:  # should be a generous limit
    #     # datetimes to halve the time interval
    #     start_time = dt.datetime.strptime(times[i][0], '%Y-%m-%d/%H:%M:%S')
    #     end_time = dt.datetime.strptime(times[i][1], '%Y-%m-%d/%H:%M:%S')
    #     mid_time = start_time + (end_time - start_time)/2
    #     # back to strings
    #     start_time = start_time.strftime('%Y-%m-%d/%H:%M:%S')
    #     end_time = end_time.strftime('%Y-%m-%d/%H:%M:%S')
    #     mid_time = mid_time.strftime('%Y-%m-%d/%H:%M:%S')
    #     times = np.insert(times, i, [start_time, mid_time], axis=0)  # half step becomes new ith interval, original now i+1
    #     times[i+1] = [mid_time, end_time]  # updating original interval to be second half step  
    #     print(f"Split long interval {[start_time, end_time]} into {times[i]}, {times[i+1]}")
    #     num_files = pyspedas.mms.fpi(trange=times[i], probe='1', data_rate='brst', datatype='des-moms',
    #                             time_clip=True, available=True, varnames=["mms1_des_numberdensity_brst"])
    # okay we have our dataset now    
    # Build a filesystem-friendly output name from the interval boundaries.
    start_time_filestr = times[i,0].replace('/','T').replace(':','-')
    end_time_filestr = times[i,1].replace('/','T').replace(':','-')
    outfile = f'/tigress/kendrab/analysis-notebooks/mms_data/mms_slices/{start_time_filestr}_{end_time_filestr}.h5'
    # SKIP if file exists already
    if os.path.exists(outfile):
        i = loop_end(i)
        print(f"{outfile} already exists. Skipping")
        continue
    #### FPI data
    vars_tmp = pyspedas.mms.fpi(trange=times[i], probe='1', data_rate='brst', datatype='des-moms',
                                time_clip=True, varnames=["mms1_des_numberdensity_brst"])    
    if vars_tmp is None: 
        print(f"Missing FPI data for interval {times[i]}")
        i = loop_end(i)
        continue
    #### MEC data 
    mec_datarate = 'brst'  # pull this into its own variable in case we want to record whether we used brst or srvy
    vars_tmp = np.empty(4, dtype=object)
    for j in range(1,5):
        vars_tmp[j-1] = pyspedas.mms.mec(trange=times[i], probe=str(j), data_rate=mec_datarate, time_clip=False,
                                    varnames=[f"mms{j}_mec_r_gse",])
    if np.any(vars_tmp == None):  # no burst data (elementwise comparison on the object array is intentional)
        print(f"Missing some MEC burst data, interval {times[i]}. Trying survey data")
        mec_datarate = 'srvy'
        vars_tmp = np.empty(4, dtype=object)   
        for j in range(1,5):  # try srvy data
            vars_tmp[j-1] = pyspedas.mms.mec(trange=times[i], probe=str(j), data_rate=mec_datarate, time_clip=False,
                                             varnames=[f"mms{j}_mec_r_gse",])
        if np.any(vars_tmp == None):  # no survey data either
            print(f"Missing some MEC data, interval {times[i]}")
            i = loop_end(i)
            continue
    #### FGM data    
    vars_tmp = np.empty(4, dtype=object)
    for j in range(1,5):
        vars_tmp[j-1] = pyspedas.mms.fgm(trange=times[i], probe=str(j), data_rate='brst', time_clip=True,
                                    varnames=[f"mms{j}_fgm_b_gsm_brst_l2",])
    if np.any(vars_tmp == None):
        print(f"Missing some FGM data, interval {times[i]}")
        i = loop_end(i)
        continue
    #### EDP data
    vars_tmp = pyspedas.mms.edp(trange=times[i], probe='1', data_rate='brst', time_clip=True,
                                varnames=["mms1_edp_dce_gse_brst_l2"]) 
    if vars_tmp is None:
        print(f"Missing EDP data for interval {times[i]}")
        i = loop_end(i)
        continue
    
    # Diagnostics on what was loaded
    print(pytplot.data_quants.keys())
    print(pytplot.data_quants["mms1_fgm_b_gsm_brst_l2"].shape, pytplot.data_quants["mms1_edp_dce_gse_brst_l2"].shape)
    print(pytplot.data_quants["mms1_fgm_b_gsm_brst_l2"])
    print(pytplot.data_quants["mms1_edp_dce_gse_brst_l2"])
    
    # remove duplicates from data 
    pytplot.data_quants["mms1_edp_dce_gse_brst_l2"] = pytplot.data_quants["mms1_edp_dce_gse_brst_l2"].drop_duplicates(dim='time', keep='first')
    for j in range(1,5):
        pytplot.data_quants[f"mms{j}_fgm_b_gsm_brst_l2_bvec"] = pytplot.data_quants[f"mms{j}_fgm_b_gsm_brst_l2_bvec"].drop_duplicates(dim='time', keep='first')
        pytplot.data_quants[f"mms{j}_mec_r_gse"] = pytplot.data_quants[f"mms{j}_mec_r_gse"].drop_duplicates(dim='time', keep='first')

    
    # Find curlometer j- need to move to GSE and interpolate r for this
    fields=[]
    pos = []
    for j in range(1,5):
        pyspedas.cotrans(name_in = f"mms{j}_fgm_b_gsm_brst_l2_bvec", name_out = f"mms{j}_fgm_b_gse_brst_l2_bvec",
                         coord_in='gsm', coord_out='gse')
        fields.append(f"mms{j}_fgm_b_gse_brst_l2_bvec")
        pos.append(f"mms{j}_mec_r_gse")
    pyspedas.mms.curlometer(fields=fields, positions=pos) # jtotal in A/m^2
    print(pytplot.data_quants['jtotal'])

    # Interpolate E (and possibly B if we want to) to a lower data rate
    pytplot.data_quants["mms1_edp_dce_gse_brst_l2"] = \
        pytplot.data_quants["mms1_edp_dce_gse_brst_l2"].interp(method="linear", assume_sorted=False,
                                                               time=pytplot.data_quants["mms1_fgm_b_gsm_brst_l2_bvec"].time)
    
    # use pyspedas to transform E field and j data to GSM coordinates
    pyspedas.cotrans(name_in="mms1_edp_dce_gse_brst_l2", name_out="mms1_edp_dce_gsm_brst_l2", coord_in='gse', coord_out='gsm')
    pyspedas.cotrans(name_in="jtotal", name_out="jtotal_gsm", coord_in='gse', coord_out='gsm')
    
    # Convert E, B, J to typical PIC units e = 1, m_e = 1, c = 1, d_e = 1, w_pe = 1
    pytplot.data_quants["mms1_fgm_b_gsm_brst_l2_bvec"] *= 10**(-9)/m_e*e/plasma_freq  # T/nT*m_e/kg*C/e*(wpe^-1*s) -> units of m_e wpe / e #TODO FIX THESE
    pytplot.data_quants["mms1_edp_dce_gsm_brst_l2"] *= 10**(-3)/m_e*e/plasma_freq/c  # V/mV*m_e/kg*C/e*(wpe^-1*s)*(c / m/s) -> units of m_e wpe c / e 
    pytplot.data_quants["jtotal_gsm"] *= c*c/plasma_freq**3/e  # units of e wpe^3/c^2 or e wpe / de^2
    # Scale E and j based on parameter differences between PIC and MMS data
    pytplot.data_quants["mms1_edp_dce_gsm_brst_l2"] *= E_corr_factor
    pytplot.data_quants["jtotal_gsm"] *= j_corr_factor
    
    # group the data to get rid of data gaps
    # A "gap" is any step more than 10% longer than the median step. Bin edges are the first
    # sample, the last sample before each gap, and the last sample.
    next_time_interval = np.diff(pytplot.data_quants["mms1_fgm_b_gsm_brst_l2_bvec"].time)
    timestep_max = 1.1*np.median(next_time_interval) # bigger than a timestep to avoid float inaccuracy nonsense
    pre_gap_idxs = np.nonzero(next_time_interval > timestep_max)
    bin_idxs = [0,] + list(pre_gap_idxs[0]) + [len(pytplot.data_quants["mms1_fgm_b_gsm_brst_l2_bvec"].time)-1,]  # using real idx instead of -1 to avoid sorting issues
    bin_idxs = np.unique(bin_idxs)
    groups_B_cots = pytplot.data_quants["mms1_fgm_b_gsm_brst_l2_bvec"].groupby_bins("time", bins=pytplot.data_quants["mms1_fgm_b_gsm_brst_l2_bvec"].time[bin_idxs].sortby("time"),
                                                                              include_lowest=True)
    groups_E_cots = pytplot.data_quants["mms1_edp_dce_gsm_brst_l2"].groupby_bins("time", bins=pytplot.data_quants["mms1_edp_dce_gsm_brst_l2"].time[bin_idxs].sortby("time"),
                                                                              include_lowest=True)
    groups_j_cots = pytplot.data_quants["jtotal_gsm"].groupby_bins("time", bins=pytplot.data_quants["jtotal"].time[bin_idxs].sortby("time"), include_lowest=True)
    # make the data into slices
    sliced_B_list=[]
    sliced_E_list=[]
    sliced_j_list=[]
    sliced_time_list=[]
    # Each item yielded by a groupby is a (bin label, DataArray) pair, hence the [1].
    for B_arr, E_arr, j_arr in zip(groups_B_cots, groups_E_cots, groups_j_cots): 
        # A segment of exactly `window` samples still yields one full window, so use >=.
        if len(B_arr[1].time) >= window:
            # sliding_window_view appends the window axis last; keep every `stride`-th window.
            B_slices = np.lib.stride_tricks.sliding_window_view(B_arr[1].values, window, axis=0)[::stride,:,:].copy()
            E_slices = np.lib.stride_tricks.sliding_window_view(E_arr[1].values, window, axis=0)[::stride,:].copy()
            j_slices = np.lib.stride_tricks.sliding_window_view(j_arr[1].values, window, axis=0)[::stride,:].copy()                                                                                                    
            time_slices = np.lib.stride_tricks.sliding_window_view(B_arr[1].time.values, window, axis=0)[::stride,:].copy()

            sliced_B_list.append(B_slices)
            sliced_E_list.append(E_slices)
            sliced_j_list.append(j_slices)
            sliced_time_list.append(time_slices)
        else:
            print(f"Segment too short for sliding window view with length {len(B_arr[1].time)}")
    # np.concatenate raises ValueError on an empty list, which would abort the whole run;
    # if no gap-free segment was long enough, skip this interval instead.
    if not sliced_B_list:
        print(f"No segments long enough to slice in interval {times[i]}. Skipping")
        i = loop_end(i)
        continue
    # save sliced data
    sliced_B = np.concatenate(sliced_B_list, axis=0)
    sliced_E = np.concatenate(sliced_E_list, axis=0)
    sliced_j = np.concatenate(sliced_j_list, axis=0)
    # Timestamps are stored as ASCII strings: datetime64[us] -> datetime objects -> bytes.
    sliced_time = np.concatenate(sliced_time_list, axis=0, dtype='datetime64[us]')
    sliced_time = sliced_time.astype(object)
    sliced_time = np.vectorize(lambda x: x.strftime('%Y-%m-%dT%H:%M:%S.%f').encode('ascii'))(sliced_time)
    # remove data that has nans in it
    # check j and E then filter all
    clean_j_idxs = ~np.isnan(sliced_j).any(axis=(1,2))
    clean_E_idxs = ~np.isnan(sliced_E).any(axis=(1,2))
    clean_idxs = np.logical_and(clean_j_idxs, clean_E_idxs)
    print(f"Samples: {sliced_j.shape[0]}  Samples with no NaNs: {np.sum(clean_idxs)}")

    with h5py.File(outfile,'w') as file:
        file.create_dataset('B', data=sliced_B[clean_idxs])
        file.create_dataset('E', data=sliced_E[clean_idxs])
        file.create_dataset('j', data=sliced_j[clean_idxs])
        file.create_dataset('time', data=sliced_time[clean_idxs])
    
    i = loop_end(i)

Cleared output at step 194
/tigress/kendrab/analysis-notebooks/mms_data/mms_slices/2017-06-24T12-16-00_2017-06-25T09-57-00.h5 already exists. Skipping
/tigress/kendrab/analysis-notebooks/mms_data/mms_slices/2017-06-25T18-15-00_2017-06-26T02-13-00.h5 already exists. Skipping
/tigress/kendrab/analysis-notebooks/mms_data/mms_slices/2017-06-26T16-49-00_2017-06-26T19-42-30.h5 already exists. Skipping
Loading pydata/mms1/fpi/brst/l2/des-moms/2017/06/27/mms1_fpi_brst_l2_des-moms_20170627150053_v3.3.0.cdf
Loading pydata/mms1/fpi/brst/l2/des-moms/2017/06/27/mms1_fpi_brst_l2_des-moms_20170627150303_v3.3.0.cdf
Loading pydata/mms1/fpi/brst/l2/des-moms/2017/06/27/mms1_fpi_brst_l2_des-moms_20170627150513_v3.3.0.cdf
Loading pydata/mms1/fpi/brst/l2/des-moms/2017/06/27/mms1_fpi_brst_l2_des-moms_20170627150713_v3.3.0.cdf
Loading pydata/mms1/fpi/brst/l2/des-moms/2017/06/27/mms1_fpi_brst_l2_des-moms_20170627150923_v3.3.0.cdf
Loading pydata/mms1/fpi/brst/l2/des-moms/2017/06/27/mms1_fpi_brst_l2_des-moms_201