# Extract subsets of the data

This will extract subsets of the data for easier handling while working on the methods.

## Set parameters

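This should be the only place where users have to change anything. (Note: the central-store path used in the "Move to central store" and "Read the data" sections below is still hard-coded and has to be adapted there.)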

In [2]:
# User-tunable settings for this notebook
use_first_N = 10_000  # how many leading trajectories (along `traj`) each subset keeps
platform = "jsc_scratch"  # selects ../intake-catalogs/medseaconnectivity_<platform>.yaml

## Create Dask Cluster

We'll parallelize computation on a single compute node for now.

In [3]:
import dask
from dask.distributed import Client, wait

# Route the dashboard link through the JupyterHub proxy so it is easy to reach
dask.config.set({
    "distributed.dashboard.link":
        "{JUPYTERHUB_BASE_URL}user/{JUPYTERHUB_USER}/{JUPYTERHUB_SERVER_NAME}/proxy/{port}/status",
})

# Local cluster with a single worker running 64 threads and a 240e9-byte memory limit.
# Client forwards these keyword arguments to dask.distributed.LocalCluster.
client = Client(
    n_workers=1,
    threads_per_worker=64,
    memory_limit=240e9,
)
client

0,1
Client  Scheduler: tcp://127.0.0.1:39670  Dashboard: /user/wrath@geomar.de/jupyterlab_1/proxy/8787/status,Cluster  Workers: 1  Cores: 64  Memory: 240.00 GB


👆 _**Don't forget to click on the Dashboard link above!**_

## Open the data catalog

In [4]:
import intake  # data catalogs

In [5]:
# Open the intake catalog that belongs to the configured platform ...
catalog = intake.open_catalog(
    f"../intake-catalogs/medseaconnectivity_{platform}.yaml"
)
# ... and print the names of the entries it provides
print([entry_name for entry_name in catalog])

['medsea-trajectories-stokes', 'medsea-trajectories']


## Load both datasets

In [6]:
# Open the "medsea-trajectories-stokes" catalog entry via to_dask() and show its summary
stokes_source = catalog["medsea-trajectories-stokes"]
ds_stokes = stokes_source.to_dask()
display(ds_stokes)

Unnamed: 0,Array,Chunk
Bytes,10.10 GB,384.80 MB
Shape,"(2625480, 962)","(100000, 962)"
Count,28 Tasks,27 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 10.10 GB 384.80 MB Shape (2625480, 962) (100000, 962) Count 28 Tasks 27 Chunks Type float32 numpy.ndarray",962  2625480,

Unnamed: 0,Array,Chunk
Bytes,10.10 GB,384.80 MB
Shape,"(2625480, 962)","(100000, 962)"
Count,28 Tasks,27 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,10.10 GB,384.80 MB
Shape,"(2625480, 962)","(100000, 962)"
Count,28 Tasks,27 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 10.10 GB 384.80 MB Shape (2625480, 962) (100000, 962) Count 28 Tasks 27 Chunks Type float32 numpy.ndarray",962  2625480,

Unnamed: 0,Array,Chunk
Bytes,10.10 GB,384.80 MB
Shape,"(2625480, 962)","(100000, 962)"
Count,28 Tasks,27 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,10.10 GB,384.80 MB
Shape,"(2625480, 962)","(100000, 962)"
Count,28 Tasks,27 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 10.10 GB 384.80 MB Shape (2625480, 962) (100000, 962) Count 28 Tasks 27 Chunks Type float32 numpy.ndarray",962  2625480,

Unnamed: 0,Array,Chunk
Bytes,10.10 GB,384.80 MB
Shape,"(2625480, 962)","(100000, 962)"
Count,28 Tasks,27 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,10.10 GB,384.80 MB
Shape,"(2625480, 962)","(100000, 962)"
Count,28 Tasks,27 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 10.10 GB 384.80 MB Shape (2625480, 962) (100000, 962) Count 28 Tasks 27 Chunks Type float32 numpy.ndarray",962  2625480,

Unnamed: 0,Array,Chunk
Bytes,10.10 GB,384.80 MB
Shape,"(2625480, 962)","(100000, 962)"
Count,28 Tasks,27 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,10.10 GB,384.80 MB
Shape,"(2625480, 962)","(100000, 962)"
Count,28 Tasks,27 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 10.10 GB 384.80 MB Shape (2625480, 962) (100000, 962) Count 28 Tasks 27 Chunks Type float32 numpy.ndarray",962  2625480,

Unnamed: 0,Array,Chunk
Bytes,10.10 GB,384.80 MB
Shape,"(2625480, 962)","(100000, 962)"
Count,28 Tasks,27 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,10.10 GB,384.80 MB
Shape,"(2625480, 962)","(100000, 962)"
Count,28 Tasks,27 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 10.10 GB 384.80 MB Shape (2625480, 962) (100000, 962) Count 28 Tasks 27 Chunks Type float32 numpy.ndarray",962  2625480,

Unnamed: 0,Array,Chunk
Bytes,10.10 GB,384.80 MB
Shape,"(2625480, 962)","(100000, 962)"
Count,28 Tasks,27 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,20.21 GB,769.60 MB
Shape,"(2625480, 962)","(100000, 962)"
Count,28 Tasks,27 Chunks
Type,datetime64[ns],numpy.ndarray
"Array Chunk Bytes 20.21 GB 769.60 MB Shape (2625480, 962) (100000, 962) Count 28 Tasks 27 Chunks Type datetime64[ns] numpy.ndarray",962  2625480,

Unnamed: 0,Array,Chunk
Bytes,20.21 GB,769.60 MB
Shape,"(2625480, 962)","(100000, 962)"
Count,28 Tasks,27 Chunks
Type,datetime64[ns],numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,10.10 GB,384.80 MB
Shape,"(2625480, 962)","(100000, 962)"
Count,28 Tasks,27 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 10.10 GB 384.80 MB Shape (2625480, 962) (100000, 962) Count 28 Tasks 27 Chunks Type float32 numpy.ndarray",962  2625480,

Unnamed: 0,Array,Chunk
Bytes,10.10 GB,384.80 MB
Shape,"(2625480, 962)","(100000, 962)"
Count,28 Tasks,27 Chunks
Type,float32,numpy.ndarray


In [7]:
# Open the "medsea-trajectories" catalog entry via to_dask() and show its summary
nostokes_source = catalog["medsea-trajectories"]
ds_nostokes = nostokes_source.to_dask()
display(ds_nostokes)

Unnamed: 0,Array,Chunk
Bytes,50.75 GB,384.80 MB
Shape,"(13188600, 962)","(100000, 962)"
Count,133 Tasks,132 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 50.75 GB 384.80 MB Shape (13188600, 962) (100000, 962) Count 133 Tasks 132 Chunks Type float32 numpy.ndarray",962  13188600,

Unnamed: 0,Array,Chunk
Bytes,50.75 GB,384.80 MB
Shape,"(13188600, 962)","(100000, 962)"
Count,133 Tasks,132 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,50.75 GB,384.80 MB
Shape,"(13188600, 962)","(100000, 962)"
Count,133 Tasks,132 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 50.75 GB 384.80 MB Shape (13188600, 962) (100000, 962) Count 133 Tasks 132 Chunks Type float32 numpy.ndarray",962  13188600,

Unnamed: 0,Array,Chunk
Bytes,50.75 GB,384.80 MB
Shape,"(13188600, 962)","(100000, 962)"
Count,133 Tasks,132 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,50.75 GB,384.80 MB
Shape,"(13188600, 962)","(100000, 962)"
Count,133 Tasks,132 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 50.75 GB 384.80 MB Shape (13188600, 962) (100000, 962) Count 133 Tasks 132 Chunks Type float32 numpy.ndarray",962  13188600,

Unnamed: 0,Array,Chunk
Bytes,50.75 GB,384.80 MB
Shape,"(13188600, 962)","(100000, 962)"
Count,133 Tasks,132 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,50.75 GB,384.80 MB
Shape,"(13188600, 962)","(100000, 962)"
Count,133 Tasks,132 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 50.75 GB 384.80 MB Shape (13188600, 962) (100000, 962) Count 133 Tasks 132 Chunks Type float32 numpy.ndarray",962  13188600,

Unnamed: 0,Array,Chunk
Bytes,50.75 GB,384.80 MB
Shape,"(13188600, 962)","(100000, 962)"
Count,133 Tasks,132 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,50.75 GB,384.80 MB
Shape,"(13188600, 962)","(100000, 962)"
Count,133 Tasks,132 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 50.75 GB 384.80 MB Shape (13188600, 962) (100000, 962) Count 133 Tasks 132 Chunks Type float32 numpy.ndarray",962  13188600,

Unnamed: 0,Array,Chunk
Bytes,50.75 GB,384.80 MB
Shape,"(13188600, 962)","(100000, 962)"
Count,133 Tasks,132 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,50.75 GB,384.80 MB
Shape,"(13188600, 962)","(100000, 962)"
Count,133 Tasks,132 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 50.75 GB 384.80 MB Shape (13188600, 962) (100000, 962) Count 133 Tasks 132 Chunks Type float32 numpy.ndarray",962  13188600,

Unnamed: 0,Array,Chunk
Bytes,50.75 GB,384.80 MB
Shape,"(13188600, 962)","(100000, 962)"
Count,133 Tasks,132 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,101.50 GB,769.60 MB
Shape,"(13188600, 962)","(100000, 962)"
Count,133 Tasks,132 Chunks
Type,datetime64[ns],numpy.ndarray
"Array Chunk Bytes 101.50 GB 769.60 MB Shape (13188600, 962) (100000, 962) Count 133 Tasks 132 Chunks Type datetime64[ns] numpy.ndarray",962  13188600,

Unnamed: 0,Array,Chunk
Bytes,101.50 GB,769.60 MB
Shape,"(13188600, 962)","(100000, 962)"
Count,133 Tasks,132 Chunks
Type,datetime64[ns],numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,50.75 GB,384.80 MB
Shape,"(13188600, 962)","(100000, 962)"
Count,133 Tasks,132 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 50.75 GB 384.80 MB Shape (13188600, 962) (100000, 962) Count 133 Tasks 132 Chunks Type float32 numpy.ndarray",962  13188600,

Unnamed: 0,Array,Chunk
Bytes,50.75 GB,384.80 MB
Shape,"(13188600, 962)","(100000, 962)"
Count,133 Tasks,132 Chunks
Type,float32,numpy.ndarray


## Subset the data

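As a starting point, we keep only the first `use_first_N` trajectories of each dataset. (Restricting to just the surface data (`z==z.min()`) and a single MPA (number `3`) is not done by the code below.)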

In [8]:
# Positionally select the first `use_first_N` entries along `traj`,
# convert the result to a pandas DataFrame, and display it
df_stokes = ds_stokes.isel({"traj": slice(0, use_first_N)}).to_dataframe()
df_stokes

Unnamed: 0_level_0,Unnamed: 1_level_0,MPA,distance,land,lat,lon,temp,time,z
obs,traj,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,0,1.0,0.000000,0.0,43.288517,5.171321,13.421764,2017-03-01,1.018237
0,1,1.0,0.000000,0.0,43.281944,5.276339,13.471125,2017-03-01,1.018237
0,2,1.0,0.000000,0.0,43.297012,5.214373,13.423952,2017-03-01,1.018237
0,3,1.0,0.000000,0.0,43.292927,5.178279,13.417353,2017-03-01,1.018237
0,4,1.0,0.000000,0.0,43.289921,5.143188,13.385047,2017-03-01,1.018237
...,...,...,...,...,...,...,...,...,...
961,9995,0.0,582.344666,0.0,40.239780,4.320971,15.917730,2017-04-29,1.018237
961,9996,0.0,583.722717,0.0,40.392910,6.809975,15.733720,2017-04-29,1.018237
961,9997,0.0,316.231628,0.0,41.334206,5.974724,15.190663,2017-04-29,1.018237
961,9998,0.0,584.091064,0.0,40.648731,4.662280,15.443208,2017-04-29,1.018237


In [9]:
# Write the subset (index included) to a CSV in the current working directory
df_stokes.to_csv(path_or_buf=f"df_stokes_first_{use_first_N}_traj.csv")

In [10]:
# Positionally select the first `use_first_N` entries along `traj`,
# convert the result to a pandas DataFrame, and display it
df_nostokes = ds_nostokes.isel({"traj": slice(0, use_first_N)}).to_dataframe()
df_nostokes

Unnamed: 0_level_0,Unnamed: 1_level_0,MPA,distance,land,lat,lon,temp,time,z
obs,traj,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,0,1.0,0.000000,0.000000,43.310062,5.194578,13.392999,2017-03-01,1.018237
0,1,1.0,0.000000,0.000000,43.270927,5.283075,13.488984,2017-03-01,1.018237
0,2,1.0,0.000000,0.000000,43.308865,5.225371,13.404660,2017-03-01,1.018237
0,3,1.0,0.000000,0.000000,43.301994,5.179939,13.402393,2017-03-01,1.018237
0,4,1.0,0.000000,0.000000,43.279408,5.155460,13.414937,2017-03-01,1.018237
...,...,...,...,...,...,...,...,...,...
961,9995,0.0,460.234100,0.000000,42.170036,3.523023,22.223366,2017-07-20,10.536604
961,9996,0.0,148.784012,0.000000,43.107277,4.870928,17.185507,2017-07-21,10.536604
961,9997,1.0,19.152134,0.994507,43.353939,5.183627,0.115257,2017-07-21,10.536604
961,9998,0.0,139.117233,0.000000,43.307251,4.061980,19.593737,2017-07-21,10.536604


In [11]:
# Write the subset (index included) to a CSV in the current working directory
df_nostokes.to_csv(path_or_buf=f"df_nostokes_first_{use_first_N}_traj.csv")

## Move to central store

In [17]:
# Create the target directory (including missing parents) if it does not exist yet,
# then move the CSV files from the current working directory into it (-v prints each move).
# NOTE(review): the globs match every df_*stokes_first_*_traj.csv in the working directory,
# so files written with a different use_first_N are moved as well.
!mkdir -p /p/home/jusers/rath1/jusuf/PROJECT_training2005/geomar_challenge/data/med_sea_connectivity_v2020.11.04.1/.
!mv -v df_nostokes_first_*_traj.csv df_stokes_first_*_traj.csv /p/home/jusers/rath1/jusuf/PROJECT_training2005/geomar_challenge/data/med_sea_connectivity_v2020.11.04.1/.

‘df_nostokes_first_10000_traj.csv’ -> ‘/p/home/jusers/rath1/jusuf/PROJECT_training2005/geomar_challenge/data/med_sea_connectivity_v2020.11.04.1/./df_nostokes_first_10000_traj.csv’
‘df_stokes_first_10000_traj.csv’ -> ‘/p/home/jusers/rath1/jusuf/PROJECT_training2005/geomar_challenge/data/med_sea_connectivity_v2020.11.04.1/./df_stokes_first_10000_traj.csv’


## Read the data

In [13]:
import pandas as pd

In [19]:
# Read the stokes subset back from the central store to check the written file
pd.read_csv(
    filepath_or_buffer=f"/p/home/jusers/rath1/jusuf/PROJECT_training2005/geomar_challenge/data/med_sea_connectivity_v2020.11.04.1/df_stokes_first_{use_first_N}_traj.csv"
)

Unnamed: 0,obs,traj,MPA,distance,land,lat,lon,temp,time,z
0,0,0,1.0,0.00000,0.0,43.288517,5.171321,13.421764,2017-03-01 00:00:00,1.018237
1,0,1,1.0,0.00000,0.0,43.281944,5.276339,13.471125,2017-03-01 00:00:00,1.018237
2,0,2,1.0,0.00000,0.0,43.297012,5.214373,13.423952,2017-03-01 00:00:00,1.018237
3,0,3,1.0,0.00000,0.0,43.292927,5.178279,13.417353,2017-03-01 00:00:00,1.018237
4,0,4,1.0,0.00000,0.0,43.289920,5.143187,13.385047,2017-03-01 00:00:00,1.018237
...,...,...,...,...,...,...,...,...,...,...
9619995,961,9995,0.0,582.34467,0.0,40.239780,4.320971,15.917730,2017-04-29,1.018237
9619996,961,9996,0.0,583.72270,0.0,40.392910,6.809975,15.733720,2017-04-29,1.018237
9619997,961,9997,0.0,316.23163,0.0,41.334206,5.974724,15.190663,2017-04-29,1.018237
9619998,961,9998,0.0,584.09106,0.0,40.648730,4.662280,15.443208,2017-04-29,1.018237


In [20]:
# Read the nostokes subset back from the central store to check the written file
pd.read_csv(
    filepath_or_buffer=f"/p/home/jusers/rath1/jusuf/PROJECT_training2005/geomar_challenge/data/med_sea_connectivity_v2020.11.04.1/df_nostokes_first_{use_first_N}_traj.csv"
)

Unnamed: 0,obs,traj,MPA,distance,land,lat,lon,temp,time,z
0,0,0,1.0,0.000000,0.000000,43.310062,5.194578,13.392999,2017-03-01 00:00:00,1.018237
1,0,1,1.0,0.000000,0.000000,43.270927,5.283075,13.488984,2017-03-01 00:00:00,1.018237
2,0,2,1.0,0.000000,0.000000,43.308865,5.225371,13.404660,2017-03-01 00:00:00,1.018237
3,0,3,1.0,0.000000,0.000000,43.301994,5.179939,13.402393,2017-03-01 00:00:00,1.018237
4,0,4,1.0,0.000000,0.000000,43.279408,5.155460,13.414937,2017-03-01 00:00:00,1.018237
...,...,...,...,...,...,...,...,...,...,...
9619995,961,9995,0.0,460.234100,0.000000,42.170036,3.523023,22.223366,2017-07-20,10.536604
9619996,961,9996,0.0,148.784010,0.000000,43.107277,4.870928,17.185507,2017-07-21,10.536604
9619997,961,9997,1.0,19.152134,0.994507,43.353940,5.183627,0.115257,2017-07-21,10.536604
9619998,961,9998,0.0,139.117230,0.000000,43.307250,4.061980,19.593737,2017-07-21,10.536604
