# Distributed processing to obtain graphs from all simulations

In [1]:
import readfof
from pyspark.sql import SparkSession
import numpy as np
import matplotlib.pyplot as plt
import math

### Spark cluster

In [2]:
spark = SparkSession.builder \
        .master("spark://master:7077")\
        .appName("CosmoSparkApplication")\
        .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/09/04 10:18:39 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
sc = spark.sparkContext

### Useful functions

In [4]:
# Read data
def read_cosmo_data(file_path):

    # Read Fof
    FoF = readfof.FoF_catalog(
        file_path,           # simulation directory
        2,                   # snapnum, indicating the redshift (z=1)
        long_ids = False,
        swap = False,
        SFR = False,
        read_IDs = False
        )

    return FoF


# Get masses and positions from FoF
def get_pos_mass(FoF):

    pos = FoF.GroupPos/1e06             # Halo positions in Gpc/h 
    mass_raw = FoF.GroupMass * 1e10     # Halo masses in Msun/h

    dim = pos.shape[0]
    pos_mass_matrix = np.hstack([pos, mass_raw.reshape(dim, 1)])

    return pos_mass_matrix

# To assign simulation keys to each point in each simulation
def assign_key_to_rows(key_value_pair):
    key, array = key_value_pair
    return [(key, row) for row in array]


# Plot a graph in 3D space
def plot_graph_3D(num, pars_file, pos, masses, edge_idx):

    fig = plt.figure(figsize=(10, 10))
    fontsize = 12

    ax = fig.add_subplot(projection ="3d")

    pos = np.array(pos, dtype=float) * 1.e3   # show in Mpc

    # Draw lines for each edge
    for (src, dst) in edge_idx: #.t().tolist():

        src = pos[int(src)].tolist()
        dst = pos[int(dst)].tolist()

        ax.plot([src[0], dst[0]], [src[1], dst[1]], zs=[src[2], dst[2]], linewidth=0.6, color='dimgrey')

    # Plot nodes
    mass_mean = np.mean(masses)
    for i,m in enumerate(masses):
            ax.scatter(pos[i, 0], pos[i, 1], pos[i, 2], s=50*m*m/(mass_mean**2), zorder=1000, alpha=0.6, color = 'mediumpurple')

    ax.xaxis.set_tick_params(labelsize=fontsize)
    ax.yaxis.set_tick_params(labelsize=fontsize)
    ax.zaxis.set_tick_params(labelsize=fontsize)

    ax.set_xlabel('x (Mpc)', fontsize=16, labelpad=15)
    ax.set_ylabel('y (Mpc)', fontsize=16, labelpad=15)
    ax.set_zlabel('z (Mpc)', fontsize=16, labelpad=15)

    rl = '$R_{link} = 0.2$'

    pars_file = pars_file[num]

    ax.set_title(f'\tGraph n°{num}, Masses $\\geq 99.7$% percentile, {rl} Mpc \t \n \n $\\Omega_m = {float(pars_file[0]):.3f}$ \t $\\sigma_8 = {float(pars_file[1]):.3f}$', fontsize=20)

    plt.show()


# Graph object
class graph:

    def __init__(self, node_f, pos, sim_pars, glob_f, edge_idx, edge_f):
        
        self.node_f = node_f
        self.pos = pos
        self.sim_pars = sim_pars
        self.glob_f = glob_f
        self.edge_idx = edge_idx
        self.edge_f = edge_f

### Read simulations and parallelize data

In [5]:
# simulations parameter
sim_pars_file = np.loadtxt("/mnt/cosmo_GNN/latin_hypercube_params.txt", dtype=float)

# number of simulations to be processed
n_sims = 10

# path list with simulation keys
path_list = [(i, "/mnt/cosmo_GNN/Data/" + str(i)) for i in range(n_sims)]

# parallelize path list and read files
fof_rdd = sc.parallelize(path_list)\
            .mapValues(read_cosmo_data)

# get positions and masses for each point
pos_mass_rdd = fof_rdd.mapValues(get_pos_mass)\
                      .flatMap(assign_key_to_rows)

# cut percentile
cut = 0.997

# get mass cuts 
mass_cut_rdd = fof_rdd.mapValues(get_pos_mass)\
                      .mapValues(lambda x: np.quantile(x[:, -1], cut))

mass_cuts = mass_cut_rdd.values().collect()
mass_cuts = np.array(mass_cuts)

# filter by mass
pos_mass_rdd_filtered = pos_mass_rdd.filter(lambda x: x[1][-1] >= mass_cuts[x[0]])

# number of halos in each simulation
n_halos = pos_mass_rdd_filtered.countByKey()

                                                                                

In [6]:
pos_mass_rdd_filtered.take(3)

[(0,
  array([6.7691261e-01, 6.6048630e-02, 2.6853576e-01, 6.3511346e+14],
        dtype=float32)),
 (0,
  array([1.6845600e-01, 6.2195367e-01, 3.8025469e-01, 4.6018135e+14],
        dtype=float32)),
 (0,
  array([3.1951472e-01, 1.5889315e-01, 3.7413445e-01, 4.3477326e+14],
        dtype=float32))]

In [7]:
n_halos

defaultdict(int,
            {0: 188,
             1: 641,
             2: 248,
             3: 517,
             4: 449,
             5: 681,
             6: 220,
             7: 222,
             8: 478,
             9: 179})

### Clustering phase to get linked halos

In [51]:
# masses rdd
mass_rdd = pos_mass_rdd_filtered.mapValues(lambda x: x[:3])

# positions rdd
pos_rdd = pos_mass_rdd_filtered.mapValues(lambda x: x[:3])

# indexed positions rdd
idx_pos_rdd = pos_rdd.zipWithIndex()\
                     .map(lambda x: (x[1], x[0]))

#cartesian product between the positions rdd and itself (to obtain all the possible pairs)
cartesian_rdd = idx_pos_rdd.cartesian(idx_pos_rdd)

# compute differences between every pair
diff_rdd = cartesian_rdd.map(lambda x: (x[0][0], x[0][1][0], x[1][1][0], np.abs(x[0][1][1][1] - x[1][1][1])))

# compute distances between every pair
# pairs_dist_rdd = diff_rdd.map(lambda x: (x[0], x[1], np.linalg.norm(x[2])))

# # pairs and distances rdd filtered (by linking radius)
# linked_pairs_dist_rdd = pairs_dist_rdd.filter(lambda x: x[2] <= 0.2)

# # pairs rdd
# pairs_rdd = linked_pairs_dist_rdd.map(lambda x: (x[0], x[1]))

# # distances rdd 
# distances_rdd = linked_pairs_dist_rdd.map(lambda x: x[2])

In [53]:
cartesian_rdd.take(3)

                                                                                

[((0, (0, array([0.6769126 , 0.06604863, 0.26853576], dtype=float32))),
  (0, (0, array([0.6769126 , 0.06604863, 0.26853576], dtype=float32)))),
 ((0, (0, array([0.6769126 , 0.06604863, 0.26853576], dtype=float32))),
  (1, (0, array([0.168456  , 0.62195367, 0.3802547 ], dtype=float32)))),
 ((0, (0, array([0.6769126 , 0.06604863, 0.26853576], dtype=float32))),
  (2, (0, array([0.31951472, 0.15889315, 0.37413445], dtype=float32))))]

In [52]:
diff_rdd.take(5)

                                                                                

[(0, 0, 0, array([0.610864  , 0.        , 0.20248714], dtype=float32)),
 (0, 0, 0, array([0.10240737, 0.55590504, 0.31420606], dtype=float32)),
 (0, 0, 0, array([0.2534661 , 0.09284452, 0.30808583], dtype=float32)),
 (0, 0, 0, array([0.47995615, 0.7942723 , 0.23586324], dtype=float32)),
 (0, 0, 0, array([0.1561198 , 0.61533946, 0.6963896 ], dtype=float32))]

### A graph 3D plot

### Retrieving edge features

### Global features and graph objects creation

### Stop Spark Context and Master

In [98]:
sc.stop()
spark.stop()