## Dynamic nanoscale architecture of synaptic vesicle fusion in mouse hippocampal neurons
### Mesoscopic simulation of synaptic vesicle docking -- data pre-processing

Use this notebook to download and pre-process the simulation trajectory files.

In [None]:
import glob
import os
import pickle
import urllib
import urllib.request

import h5py
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

### Downloading trajectory data

The next cell will check if the HDF5 data files are already present in the ```./trajectory``` directory.
If not, they will be automatically downloaded from the public ftp repository at https://ftp.mi.fu-berlin.de/pub/msadeghi/synaptic_vesicle.

In [None]:
# Simulation set to fetch: one HDF5 file per (number of Syt1, replica) pair.
simulation_id = "2024_12_06"
n_syt1_list = [0, 10, 20, 30]
n_replicas = 5

# Local target directory and remote location of the trajectory files.
trajectory_dir = "./trajectory"
base_url = "https://ftp.mi.fu-berlin.de/pub/msadeghi/synaptic_vesicle"

# urlretrieve does not create missing directories, so make sure the target exists.
os.makedirs(trajectory_dir, exist_ok=True)

# Paths of all trajectory files (already present or freshly downloaded),
# consumed by the loading cell further down.
hd5_file_name_list = []

for n_syt1 in tqdm(n_syt1_list):

    for replica_ind in range(1, n_replicas + 1):

        data_file_name = f"synaptic_vesicle_docking_n_syt1_{n_syt1}_replica_{replica_ind}_{simulation_id}.h5"
        data_file_path = f"{trajectory_dir}/{data_file_name}"

        print(f"Checking for {data_file_name}...")

        if not os.path.isfile(data_file_path):

            url = f"{base_url}/{data_file_name}"

            print(f"Downloading data file from {url}...")

            # Download to a temporary name and rename only on success, so an
            # interrupted transfer is never mistaken for a complete file on
            # the next run.
            partial_path = data_file_path + ".part"
            urllib.request.urlretrieve(url, partial_path)
            os.replace(partial_path, data_file_path)

        else:

            print("Data file already exists!")

        hd5_file_name_list.append(data_file_path)

In [None]:
# Load every trajectory file, gather the observables and particle positions
# into one DataFrame per file, and plot the vesicle-target gap of each replica
# on the axis belonging to its number of Syt1 proteins.
n_cases = len(n_syt1_list)

# squeeze=False always returns a 2-D array of axes; without it a single case
# yields a bare Axes object that cannot be indexed.
fig, ax_grid = plt.subplots(nrows=n_cases, ncols=1, figsize=(5, 4 * n_cases), squeeze=False)
ax = ax_grid[:, 0]
plt.subplots_adjust(hspace=0.5)

# DataFrame column -> name of the corresponding dataset inside each series.
observable_keys = {'time': 'Time [ms]',
                   'enthalpy': 'Enthalpy [kJ per mol]',
                   'en_non_bonded': 'Non-bonded Potential Energy [kJ per mol]',
                   'vesicle_target_gap': 'Vesicle-target gap [nm]'}

# Species whose particles are collected in the 'membrane_pos' column.
selected_species = (b"MEMBRANE_1",
                    b"MEMBRANE_2",
                    b"Syt1",
                    b"munc13_anchor",
                    b"t_SNARE_anchor",
                    b"v_SNARE_anchor")

dfs = []

for hd5_file_name in hd5_file_name_list:

    # The file name encodes the case as "..._syt1_<n>_replica_<k>_...".
    name_tokens = hd5_file_name.split('_')

    n_syt1 = int(name_tokens[name_tokens.index('syt1') + 1])
    replica_ind = int(name_tokens[name_tokens.index('replica') + 1])

    print(f"reading data for the simulation with {n_syt1} curvature-inducing proteins, replica = {replica_ind}")

    with h5py.File(hd5_file_name, 'r') as file:

        # Collect per-series chunks and concatenate once at the end instead of
        # re-allocating the full arrays on every iteration. The leading empty
        # float array keeps the result well-defined when there are no series.
        chunks = {key: [np.empty((0,))] for key in observable_keys}

        current_time = 0.0

        # NOTE(review): series are visited in the order h5py yields them;
        # presumably that matches their chronological order -- confirm.
        for series in file['observables'].values():

            # Each series' clock is offset by the end time of the previous one.
            time_chunk = current_time + np.array(series[observable_keys['time']]) * 1.0e3 # in microseconds

            if time_chunk.size:
                current_time = time_chunk[-1]

            chunks['time'].append(time_chunk)

            for key in ('enthalpy', 'en_non_bonded', 'vesicle_target_gap'):
                chunks[key].append(np.array(series[observable_keys[key]]))

        data = {key: np.concatenate(parts) for key, parts in chunks.items()}

        particles = file['particles']['group1']

        species_name_dict = dict(zip(file['parameters']['vmd_structure']['name'],
                                     file['parameters']['vmd_structure']['indexOfSpecies']))

        pos_list = np.array(particles['position']['value'], copy=True, dtype=np.float64)
        type_list = np.array(particles['species'], copy=True, dtype=np.float64)
        box_dim = np.array(particles['box']['edges']['value'], copy=True, dtype=np.float64)

        # Boolean mask of particles belonging to any of the selected species.
        selection = np.zeros(type_list.shape, dtype=bool)
        for species_name in selected_species:
            selection |= (type_list == species_name_dict[species_name])

        df = pd.DataFrame(data)

        # The first stored frame is skipped below, so the remaining frames must
        # line up one-to-one with the observable samples.
        n_frames = pos_list.shape[0] - 1
        if n_frames != len(df):
            raise ValueError(f"{hd5_file_name}: {n_frames} trajectory frames "
                             f"do not match {len(df)} observable samples")

        df["n_syt1"] = n_syt1
        df["replica"] = replica_ind
        df["particle_types"] = [type_list] * n_frames

        # One array per frame in each of the following columns.
        df['membrane_pos'] = list(pos_list[1:, selection, :])
        df['all_pos'] = list(pos_list[1:, :, :])
        df['box_dim'] = list(np.diagonal(box_dim[1:, :], axis1=1, axis2=2))

        dfs.append(df)

    ax_ind = n_syt1_list.index(n_syt1)

    ax[ax_ind].set_title(f"number of proteins = {n_syt1}")
    ax[ax_ind].plot(data['time'], data['vesicle_target_gap'], label=f'replica {replica_ind}')
    ax[ax_ind].set_xlabel(r"Time [$\mu$s]")
    ax[ax_ind].set_ylabel("Vesicle-target gap [nm]")

    ax[ax_ind].legend()

combined_df = pd.concat(dfs, ignore_index=True)

The necessary parts of the loaded data, which are accumulated into a pandas DataFrame, are pickled for further use in other notebooks:

In [None]:
# Persist the combined DataFrame next to the trajectory files so that other
# notebooks can load it without re-reading the HDF5 data.
combined_pickle_path = "./trajectory/combined_data.pkl"
combined_df.to_pickle(combined_pickle_path)