# DISTRIBUTED GRAPHS

In [1]:
import readfof
from pyspark.sql import SparkSession
import numpy as np
import scipy.spatial as SS

In [2]:
# Create (or reuse) the Spark session attached to the standalone cluster master.
spark = (
    SparkSession.builder
    .master("spark://master:7077")
    .appName("CosmoSparkApplication")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/29 14:40:27 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
sc = spark.sparkContext

In [4]:
# Read data
def read_cosmo_data(file_path, snapnum=2):
    """Load the FoF halo catalogue of one simulation.

    Parameters
    ----------
    file_path : str
        Simulation directory handed to ``readfof.FoF_catalog``.
    snapnum : int, optional
        Snapshot number, which selects the redshift. Defaults to 2, the
        value that used to be hard-coded (z=1 according to the original
        inline comment).

    Returns
    -------
    The catalogue object returned by ``readfof.FoF_catalog``. The rest of
    the notebook reads its ``GroupPos`` and ``GroupMass`` attributes.
    """
    return readfof.FoF_catalog(
        file_path,          # simulation directory
        snapnum,            # snapshot number (redshift selector)
        long_ids=False,
        swap=False,
        SFR=False,
        read_IDs=False,
    )

In [5]:
# get masses and positions

def get_pos_mass(FoF):
    """Build an ``(N, 4)`` array ``[x, y, z, mass]`` from a FoF catalogue.

    Positions are ``FoF.GroupPos / 1e6`` (Gpc/h) and masses are
    ``FoF.GroupMass * 1e10`` (Msun/h); the mass is appended as a fourth
    column next to the three position columns.
    """
    positions = FoF.GroupPos / 1e06
    masses = FoF.GroupMass * 1e10

    # Turn the 1-D mass vector into a column so it can sit beside positions.
    n_halos = positions.shape[0]
    mass_column = masses.reshape(n_halos, 1)

    return np.concatenate((positions, mass_column), axis=1)

In [6]:
# Simulation parameters, loaded from the latin-hypercube parameter file
# (presumably one row per simulation -- TODO confirm the column meaning).

sim_pars_file = np.loadtxt("/mnt/cosmo_GNN/latin_hypercube_params.txt", dtype=float)

sim_pars_file.shape

(2000, 2)

In [7]:
# Number of simulations to process
N_sims = 50

# (key, value) pairs: simulation index -> simulation directory
path_list = [(sim_id, f"/mnt/cosmo_GNN/Data/{sim_id}") for sim_id in range(N_sims)]

In [8]:
test_FoF = read_cosmo_data(path_list[0][1])

In [9]:
test_pos = test_FoF.GroupPos/1e06
test_masses = test_FoF.GroupMass*1e10

In [10]:
test_pos.shape

(62392, 3)

In [11]:
test_masses.shape

(62392,)

In [12]:
# Sanity check: stacking positions and masses column-wise gives an (N, 4) array.
# reshape(-1, 1) infers N from the data instead of hard-coding the halo count
# of one particular simulation.
np.hstack([test_pos, test_masses.reshape(-1, 1)]).shape

(62392, 4)

In [13]:
# RDD of (simulation index, FoF catalogue): the path of each pair is replaced
# by the catalogue that read_cosmo_data loads from it.
cosmo_rdd = sc.parallelize(path_list).mapValues(read_cosmo_data)

In [14]:
cosmo_rdd.count()

                                                                                

50

In [15]:
cosmo_rdd.getNumPartitions()

16

In [15]:
# Array RDD: (simulation index, [x, y, z, mass] matrix built by get_pos_mass)
pos_mass_rdd = cosmo_rdd.mapValues(get_pos_mass)

In [18]:
# Only the second element (cosa[1]) is inspected below, so fetch just two
# catalogues instead of asking the driver to collect up to 100 full arrays.
cosa = pos_mass_rdd.take(2)

In [81]:
len(cosa)

2

In [82]:
type(cosa)

list

In [89]:
cosa[1][1].shape

(212944, 4)

In [19]:
# Split the second sampled catalogue into positions (columns 0-2) and masses (column 3)
cosa_pos = cosa[1][1][:,0:3]
cosa_mass = cosa[1][1][:,3]

# Mass threshold at the 0.997 quantile; the mask keeps halos at or above it
cosa_cut = np.quantile(cosa_mass, 0.997)
cosa_mask = (cosa_mass >= cosa_cut)

In [20]:
cosa_pos[cosa_mask].shape

(641, 3)

In [21]:
cosa_mass[cosa_mask].shape

(641,)

In [22]:
cosa_dim = cosa_mass[cosa_mask].shape[0]
cosa_filtered = np.hstack([cosa_pos[cosa_mask], cosa_mass[cosa_mask].reshape(cosa_dim,1)])
cosa_filtered.shape

(641, 4)

In [23]:
# mass cut function
def mass_filter(pos_mass_matrix, quantile=0.997):
    """Keep only the most massive halos of a catalogue.

    Parameters
    ----------
    pos_mass_matrix : numpy.ndarray
        2-D array whose columns 0-2 are positions and whose column 3 is mass.
    quantile : float, optional
        Mass quantile used as the cut. Rows whose mass is greater than or
        equal to this quantile are kept. Defaults to 0.997, the value that
        used to be hard-coded.

    Returns
    -------
    numpy.ndarray
        New ``(M, 4)`` array ``[x, y, z, mass]`` with the surviving rows, in
        their original order.
    """
    mass = pos_mass_matrix[:, 3]
    keep = mass >= np.quantile(mass, quantile)

    # A single row mask selects positions and masses together, so there is no
    # need to split the matrix, filter each part and stack them again.
    return pos_mass_matrix[keep, :4]


In [24]:
# filtered_rdd feeds both the KD-tree construction and the later join, so keep
# it cached; otherwise every action re-reads and re-filters the catalogues.
filtered_rdd = pos_mass_rdd.mapValues(mass_filter).cache()

In [25]:
# Shape of the second filtered catalogue; two elements are enough to reach index 1.
filtered_rdd.take(2)[1][1].shape

(641, 4)

In [26]:
# get KDTree
def get_tree(pos):
    """Return a ``scipy.spatial.KDTree`` built on the points in ``pos``.

    The tree is created with ``leafsize=16`` and ``boxsize=1.0001``, i.e. with
    periodic wrapping along every axis.
    """
    return SS.KDTree(pos, leafsize=16, boxsize=1.0001)

In [27]:
# One KD-tree per simulation, built from the position columns (0-2) only
kdtree_rdd = filtered_rdd.mapValues(lambda el: get_tree(el[:,0:3]))

In [28]:
kdtree_rdd.take(50)

[(0, <scipy.spatial._kdtree.KDTree at 0x7f6bbf6651d0>),
 (1, <scipy.spatial._kdtree.KDTree at 0x7f6bbf665450>),
 (2, <scipy.spatial._kdtree.KDTree at 0x7f6bbf665ed0>),
 (3, <scipy.spatial._kdtree.KDTree at 0x7f6bbc8818d0>),
 (4, <scipy.spatial._kdtree.KDTree at 0x7f6bbc881550>),
 (5, <scipy.spatial._kdtree.KDTree at 0x7f6bbf6415d0>),
 (6, <scipy.spatial._kdtree.KDTree at 0x7f6bbc882150>),
 (7, <scipy.spatial._kdtree.KDTree at 0x7f6bbf640f50>),
 (8, <scipy.spatial._kdtree.KDTree at 0x7f6bbf6664d0>),
 (9, <scipy.spatial._kdtree.KDTree at 0x7f6bbf6661d0>),
 (10, <scipy.spatial._kdtree.KDTree at 0x7f6bbf6665d0>),
 (11, <scipy.spatial._kdtree.KDTree at 0x7f6bbf667150>),
 (12, <scipy.spatial._kdtree.KDTree at 0x7f6bbf666b50>),
 (13, <scipy.spatial._kdtree.KDTree at 0x7f6bbf6671d0>),
 (14, <scipy.spatial._kdtree.KDTree at 0x7f6bbf667e50>),
 (15, <scipy.spatial._kdtree.KDTree at 0x7f6bbc8827d0>),
 (16, <scipy.spatial._kdtree.KDTree at 0x7f6bbf6658d0>),
 (17, <scipy.spatial._kdtree.KDTree at 0x

In [23]:
# get edge indexes

def get_edges(tree):
    """Return the index pairs of all tree points closer than 0.2 to each other.

    The pairs come from ``tree.query_pairs`` with ``output_type="ndarray"``,
    so the result is an array with one ``(i, j)`` row per pair.
    """
    return tree.query_pairs(r=0.2, output_type="ndarray")

In [24]:
edge_idx_rdd = kdtree_rdd.mapValues(get_edges)

In [25]:
edge_idx_rdd.take(2)

[(0,
  array([[ 30, 176],
         [ 37, 176],
         [ 24, 176],
         ...,
         [ 90, 161],
         [ 90, 145],
         [145, 161]])),
 (1,
  array([[138, 283],
         [138, 562],
         [138, 431],
         ...,
         [476, 563],
         [563, 639],
         [476, 639]]))]

In [26]:
# Small hand-written edge list used to try out the row/col unpacking
edg = np.array([[ 30, 176],
       [ 37, 176],
       [ 24, 176],
       [138, 283],
       [138, 562],
       [138, 431]])

# Transposing gives one array of first indices and one of second indices
row, col = edg.T

In [27]:
# add reverse pairs
def rev_pairs(edge_index_array):
    """Append the reversed ``(j, i)`` pair for every ``(i, j)`` pair.

    The returned array holds the original rows first, followed by the same
    rows with their two columns swapped, cast to ``int``.
    """
    swapped = edge_index_array[:, [1, 0]]
    both_directions = np.concatenate((edge_index_array, swapped), axis=0)
    # Cast so the indices are guaranteed to be integers
    return both_directions.astype(int)

In [28]:
edge_idx_rdd_r = edge_idx_rdd.mapValues(rev_pairs)

In [29]:
# Join on the simulation index: each element becomes
# (key, (pos_mass_matrix, edge_index_array)), the layout dist() unpacks.
joined_rdd = filtered_rdd.join(edge_idx_rdd_r)

In [41]:
joined_rdd.take(1)[0][1][0]

array([[1.6756082e-01, 6.1475968e-01, 8.9784436e-02, 1.9548385e+15],
       [4.1062498e-01, 1.2807573e-01, 6.6436279e-01, 1.8625701e+15],
       [2.0109750e-01, 7.6894172e-02, 6.4360231e-01, 1.5365154e+15],
       ...,
       [2.8888595e-01, 3.7162909e-01, 4.1354087e-01, 3.3001012e+14],
       [4.2436123e-01, 1.2518100e-01, 3.4866741e-01, 3.3001012e+14],
       [6.2315601e-01, 1.3950281e-01, 3.4553781e-01, 3.3001009e+14]],
      dtype=float32)

In [42]:
# get edge features
def dist(joined_tuple, r_link=0.2, boxsize=1.0):
    """Edge feature: periodic pair distance in units of the linking radius.

    Parameters
    ----------
    joined_tuple : tuple
        ``(key, (pos_mass_matrix, edge_index_array))``. Columns 0-2 of
        ``pos_mass_matrix`` are positions; ``edge_index_array`` has one
        ``(i, j)`` row per edge.
    r_link : float, optional
        Linking radius used to build the edges. It is both the wrap threshold
        and the normalisation constant. Defaults to 0.2.
    boxsize : float, optional
        Period of the box applied when wrapping. Defaults to 1.0, the shift
        the original code applied.

    Returns
    -------
    numpy.ndarray
        1-D array with one normalised distance per edge.

    Notes
    -----
    The wrap threshold used to be 0.01, so every component with
    ``|d| > 0.01`` was shifted by a full box length and ordinary neighbours
    ended up with distances close to the box size. Linked pairs are at most
    ``r_link`` apart, so a component larger than ``r_link`` in absolute value
    can only come from a pair that straddles the periodic boundary.
    """
    pos_mass = joined_tuple[1][0]
    edg_idx = joined_tuple[1][1]
    pos = pos_mass[:, 0:3]
    row, col = edg_idx.T
    diff = pos[row] - pos[col]

    # Periodic boundary conditions: bring boundary-straddling components back
    # to the nearest image; leave everything else untouched.
    diff = np.where(
        diff > r_link,
        diff - boxsize,
        np.where(diff < -r_link, diff + boxsize, diff),
    )

    # Translation- and rotation-invariant feature: d = sqrt(dx^2 + dy^2 + dz^2)
    separation = np.linalg.norm(diff, axis=1)

    # Normalize distance by the linking radius
    return separation / r_link

def cos1(joined_tuple):
    """Unfinished edge-feature helper (work in progress).

    NOTE(review): the body stops after computing the centroid of the
    positions and has no ``return`` statement, so a call currently yields
    ``None``. ``row``, ``col`` and ``centroid`` are computed but not used yet.
    """
    # Same unpacking as dist(): (key, (pos_mass_matrix, edge_index_array))
    pos_mass = joined_tuple[1][0]
    edg_idx = joined_tuple[1][1]
    edg_idx = edg_idx.T
    pos = pos_mass[:, 0:3]
    row, col = edg_idx
    
    # Centroid of halo catalogue (3d position of the centroid)
    centroid = np.mean(pos, axis=0)
    


In [43]:
dist_rdd = joined_rdd.map(dist)

In [29]:
dist_rdd.take(1)[0]

NameError: name 'dist_rdd' is not defined

In [30]:
# Shut down: stop the SparkContext, then the SparkSession
sc.stop()
spark.stop()