In [1]:
import readfof
from pyspark.sql import SparkSession
import numpy as np
import scipy.spatial as SS
from scipy.spatial import KDTree

In [2]:
# Get (or create) the Spark session attached to the standalone cluster master.
spark = (
    SparkSession.builder
    .master("spark://master:7077")
    .appName("CosmoSparkApplication")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/31 13:54:47 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# SparkContext handle of the session; used later to parallelize the halo table into an RDD.
sc = spark.sparkContext

### Useful functions

In [4]:
def read_cosmo_data(file_path, snapnum=2):
    """Load a Friends-of-Friends (FoF) halo catalogue from a simulation directory.

    Parameters
    ----------
    file_path : str
        Simulation directory handed to ``readfof.FoF_catalog``.
    snapnum : int, optional
        Snapshot number, which selects the redshift. Defaults to 2 (z=1),
        the value that used to be hard-coded here.

    Returns
    -------
    The catalogue object returned by ``readfof.FoF_catalog``.
    """
    return readfof.FoF_catalog(
        file_path,           # simulation directory
        snapnum,             # snapshot number (redshift selector)
        long_ids=False,
        swap=False,
        SFR=False,
        read_IDs=False,
    )

# Build the per-halo (id, x, y, z, mass) table from a FoF catalogue
def get_pos_mass(FoF):
    """Return a 2-D array with one row ``[id, x, y, z, mass]`` per halo.

    ``FoF.GroupPos`` is divided by 1e6 (positions in Gpc/h) and
    ``FoF.GroupMass`` is multiplied by 1e10 (masses in Msun/h). The id is
    the halo's row index in the catalogue.
    """
    positions = FoF.GroupPos / 1e06
    masses = FoF.GroupMass * 1e10

    n_halos = positions.shape[0]
    halo_ids = np.arange(n_halos, dtype=int).reshape(n_halos, 1)
    mass_column = masses.reshape(n_halos, 1)

    # Column-wise join: id | x y z | mass
    return np.concatenate([halo_ids, positions, mass_column], axis=1)


# Mass cut: keep a halo row only when its mass reaches the threshold
def mass_filter(pos_mass_rdd, cut):
    """Return ``pos_mass_rdd`` unchanged if its mass (index 4) is >= ``cut``, else ``None``."""
    passes_cut = pos_mass_rdd[4] >= cut
    return pos_mass_rdd if passes_cut else None
    
# Pair a point with every box whose bounds contain it
def assign_box(point, boxes):
    """Return a list of ``(box_name, point)`` for each box containing ``point``.

    ``point`` is a 5-element row ``(id, x, y, z, mass)``; ``boxes`` maps a box
    name to ``[(x_min, x_max), (y_min, y_max), (z_min, z_max)]``. Bounds are
    inclusive, so a point lying in an overlap region is returned once per box.
    """
    # Only the coordinates are needed; the unpacking still requires 5 fields.
    _, px, py, pz, _ = point

    return [
        (label, point)
        for label, ((x_lo, x_hi), (y_lo, y_hi), (z_lo, z_hi)) in boxes.items()
        if (x_lo <= px <= x_hi) and (y_lo <= py <= y_hi) and (z_lo <= pz <= z_hi)
    ]

def get_edges(pos_mass_points, r_link=0.2, boxsize=1.00001):
    """Link every pair of halos closer than ``r_link`` and return the id pairs.

    Parameters
    ----------
    pos_mass_points : sequence of rows ``(id, x, y, z, mass)``
        Halos to link; columns 1-3 are used as coordinates, column 0 as id.
    r_link : float, optional
        Linking radius passed to ``KDTree.query_pairs`` (default 0.2).
    boxsize : float, optional
        Periodic box size passed to ``KDTree`` (default 1.00001).

    Returns
    -------
    numpy.ndarray of shape ``(n_edges, 2)``
        One row per linked pair, holding the two halo ids with the smaller
        one first. The result is always 2-D, even when no pair is found, so
        it can be stacked with other edge arrays.
    """
    pos_mass_matrix = np.asarray(pos_mass_points)
    if pos_mass_matrix.size == 0:
        # Nothing to link; keep the (0, 2) shape callers stack on.
        return np.empty((0, 2), dtype=float)

    pos = pos_mass_matrix[:, 1:4]
    ids = pos_mass_matrix[:, 0]

    # NOTE(review): with `boxsize` set the tree measures distances
    # periodically; when this runs on a spatial sub-box rather than the full
    # cube, confirm that wrap-around neighbours are handled as intended.
    kd_tree = SS.KDTree(pos, leafsize=16, boxsize=boxsize)
    pair_idx = kd_tree.query_pairs(r=r_link, output_type="ndarray")
    pair_idx = np.asarray(pair_idx, dtype=np.intp).reshape(-1, 2)

    # Translate row indices into halo ids and order each pair (small, large).
    # Vectorised, and unlike a list comprehension it keeps shape (0, 2) when
    # there are no pairs.
    return np.sort(ids[pair_idx], axis=1)

In [5]:
# Simulation parameter table (loaded here; not used by the cells shown below)
sim_pars_file = np.loadtxt("/mnt/cosmo_GNN/latin_hypercube_params.txt", dtype=float)

SIM_ID = 13             # which simulation directory under Data/ to load
MASS_QUANTILE = 0.997   # halos below this mass quantile are discarded

file_path = "/mnt/cosmo_GNN/Data/" + str(SIM_ID)
test_FoF = read_cosmo_data(file_path)
pos_mass_array = get_pos_mass(test_FoF)

# mass cut (column 4 of the table is the halo mass)
cut = np.quantile(pos_mass_array[:, 4], MASS_QUANTILE)

# parallelize and filter
pos_mass_rdd = sc.parallelize(pos_mass_array)

# One filter pass instead of map -> filter(None): mass_filter returns None
# for rows below the cut, so no placeholder records are materialised.
pos_mass_filtered = pos_mass_rdd.filter(lambda x: mass_filter(x, cut) is not None)

In [7]:
# Bounds of the simulation cube (fixed by hand rather than computed from the data)
min_x, min_y, min_z = 0, 0, 0
max_x, max_y, max_z = 1, 1, 1

# Margin by which each box reaches past the midpoint, creating the overlap
r = 0.1

# Midpoint of the cube along each axis
x_mid = np.mean([min_x, max_x])
y_mid = np.mean([min_y, max_y])
z_mid = np.mean([min_z, max_z])

# Lower and upper half-range per axis, each widened by r across the midpoint
_x_halves = [(min_x, x_mid + r), (x_mid - r, max_x)]
_y_halves = [(min_y, y_mid + r), (y_mid - r, max_y)]
_z_halves = [(min_z, z_mid + r), (z_mid - r, max_z)]

# Eight overlapping octants "box1".."box8"; x toggles fastest, then y, then z
boxes = {
    f"box{1 + ix + 2 * iy + 4 * iz}": [x_rng, y_rng, z_rng]
    for iz, z_rng in enumerate(_z_halves)
    for iy, y_rng in enumerate(_y_halves)
    for ix, x_rng in enumerate(_x_halves)
}


In [9]:
# Tag each halo with every box containing it; assign_box returns a list of
# (box_name, point) pairs, which flatMap flattens into one record per pair.
point_box_rdd = pos_mass_filtered.flatMap(lambda p: assign_box(p, boxes))

In [10]:
# Gather the points of each box into a list keyed by box name
boxes_rdd = point_box_rdd.groupByKey().mapValues(list)


In [12]:
# Compute the edge array of each box independently with get_edges
edges_rdd = boxes_rdd.mapValues(get_edges)

In [264]:
# Peek at the first four (box_name, edge_array) records
edges_rdd.take(4)

[('box1',
  array([[261., 266.],
         [266., 573.],
         [ 30., 266.],
         ...,
         [ 51., 571.],
         [571., 685.],
         [108., 571.]])),
 ('box5',
  array([[228., 607.],
         [228., 315.],
         [228., 481.],
         ...,
         [281., 679.],
         [166., 679.],
         [166., 281.]])),
 ('box8',
  array([[ 90., 500.],
         [ 90., 198.],
         [ 36.,  90.],
         ...,
         [323., 358.],
         [323., 746.],
         [358., 746.]])),
 ('box6',
  array([[381., 740.],
         [ 97., 740.],
         [ 89., 740.],
         ...,
         [465., 541.],
         [195., 541.],
         [195., 465.]]))]

In [278]:
# Shape of the second edge array returned, i.e. (number of edges, 2)
edges_rdd.values().take(2)[1].shape

(972, 2)

In [17]:
def unique_pears(mat1, mat2):
    """Stack two edge arrays and drop duplicate rows.

    Parameters
    ----------
    mat1, mat2 : array-like
        Edge arrays with one pair per row. An empty input of any shape
        (for example a bare ``np.array([])``) is ignored instead of making
        the stacking fail on mismatched column counts.

    Returns
    -------
    numpy.ndarray
        The unique rows of both inputs, sorted as ``np.unique(axis=0)``
        returns them; shape ``(0, 2)`` when both inputs are empty.
    """
    non_empty = [np.atleast_2d(m) for m in (mat1, mat2) if np.size(m) > 0]
    if not non_empty:
        return np.empty((0, 2))
    return np.unique(np.vstack(non_empty), axis=0)

In [18]:
# Drop the box names and merge all per-box edge arrays into one duplicate-free
# array. Despite its name, the result is a NumPy array on the driver, not an RDD.
rdd_no_key = edges_rdd.values().reduce(unique_pears)
                            

In [19]:
# Display the merged edge array (a NumPy array, despite the variable name)
rdd_no_key

array([[  0.,  10.],
       [  0.,  16.],
       [  0.,  33.],
       ...,
       [727., 738.],
       [731., 741.],
       [737., 745.]])

In [132]:
#partizioni_parallelizzate = punti_partizionati.partitionBy(punti_partizionati.count())

In [11]:
#def connessione_partizioni(partizione1, partizione2, r):
#    """
#    Find the links between points in the overlap zones of two partitions.
#    """
#    # Extract the coordinates
#    coord1 = [(p[1], p[2], p[3]) for p in partizione1]
#    coord2 = [(p[1], p[2], p[3]) for p in partizione2]
#    
#    # Build a KDTree for each of the two partitions
#    tree1 = KDTree(coord1)
#    tree2 = KDTree(coord2)
#    
#    # Find the points of partizione1 that are close to partizione2
#    edges = []
#    for i, point in enumerate(coord1):
#        # Find all points in partizione2 within distance r of this point of partizione1
#        indices = tree2.query_ball_point(point, r)
#        
#        for j in indices:
#            # Add an edge between the point of partizione1 and the matching point of partizione2
#            edges.append((partizione1[i], partizione2[j]))
#    
#    return edges
#
## Function that applies the connection step in parallel
#def connessione_in_partizione(iterator, r):
#    partizioni = list(iterator)
#    edges = []
#    
#    # Connect the points between every pair of partitions
#    for i in range(len(partizioni)):
#        for j in range(i + 1, len(partizioni)):
#            nome_part1, punti1 = partizioni[i]
#            nome_part2, punti2 = partizioni[j]
#            edges.extend(connessione_partizioni(punti1, punti2, r))
#    
#    return iter(edges)




In [12]:
# Apply the connection step in parallel
#edges_parallelizzati = partizioni_parallelizzate.mapPartitions(lambda iterator: connessione_in_partizione(iterator, r))


# (stray keystrokes "aaaaa§" removed: not valid Python)

In [13]:
# Reduce all the edges produced by the parallel connection steps
#grafo_finale = edges_parallelizzati.reduce(lambda a, b: a + b)


Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "/mnt/anaconda3/envs/pyspark_env/lib/python3.12/socket.py", line 720, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt


KeyboardInterrupt: 



In [None]:
# Display the final graph.
# `grafo_finale` is only assigned in a commented-out cell, so looping over it
# would raise NameError. The merged, de-duplicated edge array that the notebook
# actually computes is `rdd_no_key`, so show that instead.
N_PREVIEW = 10  # print a short preview rather than flooding the output
for edge in rdd_no_key[:N_PREVIEW]:
    print(edge)

In [298]:
# Release cluster resources: stop the SparkContext, then the SparkSession.
sc.stop()
spark.stop()