In [167]:
import readfof
from pyspark.sql import SparkSession
import numpy as np
import scipy.spatial as SS
from scipy.spatial import KDTree

In [168]:
spark = SparkSession.builder \
        .master("spark://master:7077")\
        .appName("CosmoSparkApplication")\
        .getOrCreate()

In [169]:
sc = spark.sparkContext

In [170]:
def read_cosmo_data(file_path):

    # Read Fof
    FoF = readfof.FoF_catalog(
        file_path,           # simulation directory
        2,                   # snapnum, indicating the redshift (z=1)
        long_ids = False,
        swap = False,
        SFR = False,
        read_IDs = False
        )

    return FoF

# Get masses and positions from FoF
def get_pos_mass(FoF):

    pos = FoF.GroupPos/1e06             # Halo positions in Gpc/h 
    mass_raw = FoF.GroupMass * 1e10     # Halo masses in Msun/h

    dim = pos.shape[0]
    id = np.arange(dim, dtype=int).reshape(dim, 1)
    pos_mass_matrix = np.hstack([id, pos, mass_raw.reshape(dim, 1)])

    return pos_mass_matrix



# Mass cut function
def mass_filter(pos_mass_rdd, cut):
    mass = pos_mass_rdd[4]
    if mass >= cut:
        return pos_mass_rdd
    #cut = pos_mass_rdd.map(lambda x: np.quantile(x[4], quant))
    #return pos_mass_rdd.filter(lambda x: x[4] >= cut), cut
    
#def calculate_bounds(rdd):
#    """Calcola i limiti dello spazio tridimensionale (min e max per X, Y, Z)."""
#    # Estrai i minimi per X, Y, Z
#    min_coords = rdd.map(lambda x: (x[1], x[2], x[3]))\
#                    .reduce(lambda a, b: (min(a[0], b[0]),
#                                          min(a[1], b[1]),
#                                          min(a[2], b[2])))
#    
#    # Estrai i massimi per X, Y, Z
#    max_coords = rdd.map(lambda x: (x[1], x[2], x[3]))\
#                    .reduce(lambda a, b: (max(a[0], b[0]),
#                                          max(a[1], b[1]),
#                                          max(a[2], b[2])))
#    
#    return min_coords, max_coords

In [196]:
sim_pars_file = np.loadtxt("/mnt/cosmo_GNN/latin_hypercube_params.txt", dtype=float)

file_path = "/mnt/cosmo_GNN/Data/" + str(13)
test_FoF = read_cosmo_data(file_path)
pos_mass_array = get_pos_mass(test_FoF)

# mass cut
cut = np.quantile(pos_mass_array[:, 4], 0.997)

# parallelize and filter
pos_mass_rdd = sc.parallelize(pos_mass_array)

pos_mass_filtered = pos_mass_rdd.map(lambda x: mass_filter(x, cut))\
                                .filter(lambda x: x is not None)

In [197]:
cut

np.float64(291009306160011.7)

In [200]:
pos_mass_filtered.take(10)

[array([0.00000000e+00, 7.49109566e-01, 3.92418236e-01, 7.22306192e-01,
        1.34950329e+15]),
 array([1.00000000e+00, 5.21607637e-01, 8.49155128e-01, 7.33287215e-01,
        1.18272608e+15]),
 array([2.00000000e+00, 3.10260355e-01, 2.15019077e-01, 3.10398489e-02,
        1.08891796e+15]),
 array([3.00000000e+00, 5.58637619e-01, 8.15573931e-01, 4.94906664e-01,
        1.03434979e+15]),
 array([4.00000000e+00, 4.21333760e-01, 8.44022214e-01, 7.04580367e-01,
        1.03251041e+15]),
 array([5.00000000e+00, 1.06478073e-01, 8.92812669e-01, 2.68124908e-01,
        9.60161615e+14]),
 array([6.00000000e+00, 8.17335069e-01, 5.66343181e-02, 6.80305004e-01,
        9.57096015e+14]),
 array([7.00000000e+00, 3.42625111e-01, 9.14883912e-01, 2.00545162e-01,
        9.54643522e+14]),
 array([8.00000000e+00, 6.79631770e-01, 3.33230466e-01, 3.93127650e-01,
        9.50964748e+14]),
 array([9.00000000e+00, 3.52646112e-01, 8.74759078e-01, 9.50695932e-01,
        9.47290135e+14])]

In [201]:
#min_coords, max_coords = calculate_bounds(pos_mass_rdd)
min_x, min_y, min_z = 0, 0, 0 #min_coords
max_x, max_y, max_z = 1, 1, 1 #max_coords

r = 0.1  

# Compute the midpoint for every dimension
x_mid = np.mean([min_x, max_x])
y_mid = np.mean([min_y, max_y])
z_mid = np.mean([min_z, max_z])

boxes = {
    "box1": [(min_x    , x_mid + r ), (min_y    , y_mid + r), (min_z    , z_mid + r )],
    "box2": [(x_mid - r, max_x     ), (min_y    , y_mid + r), (min_z    , z_mid + r )],
    "box3": [(min_x    , x_mid + r ), (y_mid - r, max_y    ), (min_z    , z_mid + r )],
    "box4": [(x_mid - r, max_x     ), (y_mid - r, max_y    ), (min_z    , z_mid + r )],
    "box5": [(min_x    , x_mid + r ), (min_y    , y_mid + r), (z_mid - r, max_z    )],
    "box6": [(x_mid - r, max_x     ), (min_y    , y_mid + r), (z_mid - r, max_z    )],
    "box7": [(min_x    , x_mid + r ), (y_mid - r, max_y    ), (z_mid - r, max_z    )],
    "box8": [(x_mid - r, max_x     ), (y_mid - r, max_y    ), (z_mid - r, max_z    )],
}


In [202]:
# Assign each point to a box
def assign_box(point, boxes):
    id, x, y, z, m = point
    box_assign = []
    
    for box_name, ((x_min, x_max), (y_min, y_max), (z_min, z_max)) in boxes.items():
     if (x_min <= x <= x_max) and (y_min <= y <= y_max) and (z_min <= z <= z_max):
           box_assign.append((box_name, point))
    
    return box_assign

In [203]:
point_box_rdd = pos_mass_filtered.flatMap(lambda p: assign_box(p, boxes))


In [204]:
point_box_rdd.take(10)

[('box6',
  array([0.00000000e+00, 7.49109566e-01, 3.92418236e-01, 7.22306192e-01,
         1.34950329e+15])),
 ('box7',
  array([1.00000000e+00, 5.21607637e-01, 8.49155128e-01, 7.33287215e-01,
         1.18272608e+15])),
 ('box8',
  array([1.00000000e+00, 5.21607637e-01, 8.49155128e-01, 7.33287215e-01,
         1.18272608e+15])),
 ('box1',
  array([2.00000000e+00, 3.10260355e-01, 2.15019077e-01, 3.10398489e-02,
         1.08891796e+15])),
 ('box3',
  array([3.00000000e+00, 5.58637619e-01, 8.15573931e-01, 4.94906664e-01,
         1.03434979e+15])),
 ('box4',
  array([3.00000000e+00, 5.58637619e-01, 8.15573931e-01, 4.94906664e-01,
         1.03434979e+15])),
 ('box7',
  array([3.00000000e+00, 5.58637619e-01, 8.15573931e-01, 4.94906664e-01,
         1.03434979e+15])),
 ('box8',
  array([3.00000000e+00, 5.58637619e-01, 8.15573931e-01, 4.94906664e-01,
         1.03434979e+15])),
 ('box7',
  array([4.00000000e+00, 4.21333760e-01, 8.44022214e-01, 7.04580367e-01,
         1.03251041e+15])),
 

In [207]:
#punti_partizionati = punti_in_partizioni.groupByKey().mapValues(list)

boxes_rdd = point_box_rdd.groupByKey().mapValues(list)


In [213]:
boxes_rdd.keys().collect()

['box1', 'box5', 'box8', 'box6', 'box4', 'box2', 'box7', 'box3']

In [208]:
boxes_rdd.take(1)

24/08/31 12:06:40 WARN TaskSetManager: Stage 104 contains a task of very large size (1050 KiB). The maximum recommended task size is 1000 KiB.


[('box1',
  [array([2.00000000e+00, 3.10260355e-01, 2.15019077e-01, 3.10398489e-02,
          1.08891796e+15]),
   array([1.50000000e+01, 3.21817935e-01, 3.12683165e-01, 2.90357500e-01,
          8.62061609e+14]),
   array([3.00000000e+01, 7.56609738e-02, 2.64246106e-01, 1.14312276e-01,
          7.42502269e+14]),
   array([4.40000000e+01, 4.41150844e-01, 4.86439764e-01, 1.03641391e-01,
          6.79963519e+14]),
   array([4.50000000e+01, 2.74498075e-01, 2.20003620e-01, 4.95804042e-01,
          6.78124132e+14]),
   array([4.90000000e+01, 3.99768412e-01, 2.67936915e-01, 8.14262331e-02,
          6.68314158e+14]),
   array([5.10000000e+01, 5.37295699e-01, 3.17027926e-01, 4.17436182e-01,
          6.62795998e+14]),
   array([6.20000000e+01, 5.09309411e-01, 2.01037154e-01, 1.82046503e-01,
          6.26621569e+14]),
   array([6.50000000e+01, 5.61191499e-01, 1.34357795e-01, 3.42717379e-01,
          6.21718662e+14]),
   array([6.60000000e+01, 5.64165294e-01, 3.13291192e-01, 5.14565229e-01

In [260]:
def get_edges(pos_mass_points):
    pos_mass_matrix = np.array(pos_mass_points)
    pos = pos_mass_matrix[:,1:4]
    id = pos_mass_matrix[:,0]
    kd_tree = SS.KDTree(pos, leafsize=16, boxsize=1.00001)
    edge_idx = kd_tree.query_pairs(r=0.2, output_type="ndarray")
    edge_idx = np.array([sorted((id[i], id[j])) for i, j in edge_idx])
    return edge_idx



In [263]:
edges_rdd = boxes_rdd.mapValues(get_edges)

In [264]:
edges_rdd.take(4)

[('box1',
  array([[261., 266.],
         [266., 573.],
         [ 30., 266.],
         ...,
         [ 51., 571.],
         [571., 685.],
         [108., 571.]])),
 ('box5',
  array([[228., 607.],
         [228., 315.],
         [228., 481.],
         ...,
         [281., 679.],
         [166., 679.],
         [166., 281.]])),
 ('box8',
  array([[ 90., 500.],
         [ 90., 198.],
         [ 36.,  90.],
         ...,
         [323., 358.],
         [323., 746.],
         [358., 746.]])),
 ('box6',
  array([[381., 740.],
         [ 97., 740.],
         [ 89., 740.],
         ...,
         [465., 541.],
         [195., 541.],
         [195., 465.]]))]

In [278]:
edges_rdd.values().take(2)[1].shape

(972, 2)

In [279]:
def unique_pears(mat1, mat2):
    mat = np.hstack((mat1, mat2))
    return mat

In [284]:
unique_edges_rdd = edges_rdd.mapValues(lambda x: x)\
                            .reduce(lambda a, b: unique_pears(a, b))

24/08/31 13:39:14 WARN TaskSetManager: Lost task 2.0 in stage 222.0 (TID 922) (10.67.22.240 executor 2): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py", line 1247, in main
    process()
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py", line 1239, in process
    serializer.dump_stream(out_iter, outfile)
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 274, in dump_stream
    vs = list(itertools.islice(iterator, batch))
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/spark/python/pyspark/rdd.py", line 1922, in func
    yield reduce(f, iterator, initial)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/spark/python/pyspark/util.py", line 83, in wrapper
    return f(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^
  File "/tmp/ipykernel_61555/2651338800.py", line 2, in <lambda>
  File "/tmp/ipykernel_61555/1046498075.p

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 2 in stage 222.0 failed 4 times, most recent failure: Lost task 2.3 in stage 222.0 (TID 940) (10.67.22.240 executor 2): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py", line 1247, in main
    process()
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py", line 1239, in process
    serializer.dump_stream(out_iter, outfile)
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 274, in dump_stream
    vs = list(itertools.islice(iterator, batch))
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/spark/python/pyspark/rdd.py", line 1922, in func
    yield reduce(f, iterator, initial)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/spark/python/pyspark/util.py", line 83, in wrapper
    return f(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^
  File "/tmp/ipykernel_61555/2651338800.py", line 2, in <lambda>
  File "/tmp/ipykernel_61555/1046498075.py", line 2, in unique_pears
  File "/mnt/anaconda3/envs/pyspark_env/lib/python3.12/site-packages/numpy/_core/shape_base.py", line 357, in hstack
    arrs = atleast_1d(*tup)
           ^^^^^^^^^^^^^^^^
  File "/mnt/anaconda3/envs/pyspark_env/lib/python3.12/site-packages/numpy/_core/shape_base.py", line 70, in atleast_1d
    result = asanyarray(ary)
             ^^^^^^^^^^^^^^^
ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (2,) + inhomogeneous part.

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:572)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:784)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:766)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:525)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
	at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:105)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:49)
	at scala.collection.TraversableOnce.to(TraversableOnce.scala:366)
	at scala.collection.TraversableOnce.to$(TraversableOnce.scala:364)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toBuffer(TraversableOnce.scala:358)
	at scala.collection.TraversableOnce.toBuffer$(TraversableOnce.scala:358)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toArray(TraversableOnce.scala:345)
	at scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:339)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.rdd.RDD.$anonfun$collect$2(RDD.scala:1049)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2433)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
	at java.base/java.lang.Thread.run(Thread.java:840)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2856)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2792)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2791)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2791)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1247)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3060)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2994)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2983)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:989)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2393)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2414)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2433)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2458)
	at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1049)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:410)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:1048)
	at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:195)
	at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
	at jdk.internal.reflect.GeneratedMethodAccessor243.invoke(Unknown Source)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:569)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:840)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py", line 1247, in main
    process()
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py", line 1239, in process
    serializer.dump_stream(out_iter, outfile)
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 274, in dump_stream
    vs = list(itertools.islice(iterator, batch))
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/spark/python/pyspark/rdd.py", line 1922, in func
    yield reduce(f, iterator, initial)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/spark/python/pyspark/util.py", line 83, in wrapper
    return f(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^
  File "/tmp/ipykernel_61555/2651338800.py", line 2, in <lambda>
  File "/tmp/ipykernel_61555/1046498075.py", line 2, in unique_pears
  File "/mnt/anaconda3/envs/pyspark_env/lib/python3.12/site-packages/numpy/_core/shape_base.py", line 357, in hstack
    arrs = atleast_1d(*tup)
           ^^^^^^^^^^^^^^^^
  File "/mnt/anaconda3/envs/pyspark_env/lib/python3.12/site-packages/numpy/_core/shape_base.py", line 70, in atleast_1d
    result = asanyarray(ary)
             ^^^^^^^^^^^^^^^
ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (2,) + inhomogeneous part.

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:572)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:784)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:766)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:525)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
	at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:105)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:49)
	at scala.collection.TraversableOnce.to(TraversableOnce.scala:366)
	at scala.collection.TraversableOnce.to$(TraversableOnce.scala:364)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toBuffer(TraversableOnce.scala:358)
	at scala.collection.TraversableOnce.toBuffer$(TraversableOnce.scala:358)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toArray(TraversableOnce.scala:345)
	at scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:339)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.rdd.RDD.$anonfun$collect$2(RDD.scala:1049)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2433)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
	... 1 more


In [283]:
unique_edges_rdd

array(['box1',
       <pyspark.resultiterable.ResultIterable object at 0x7fe508cace60>,
       'box5',
       <pyspark.resultiterable.ResultIterable object at 0x7fe50de681d0>,
       'box8',
       <pyspark.resultiterable.ResultIterable object at 0x7fe50dd87500>,
       'box6',
       <pyspark.resultiterable.ResultIterable object at 0x7fe50e457380>,
       'box4',
       <pyspark.resultiterable.ResultIterable object at 0x7fe50dea9dc0>,
       'box2',
       <pyspark.resultiterable.ResultIterable object at 0x7fe50de6a450>,
       'box7',
       <pyspark.resultiterable.ResultIterable object at 0x7fe50de68dd0>,
       'box3',
       <pyspark.resultiterable.ResultIterable object at 0x7fe50de6b440>],
      dtype=object)

In [132]:
#partizioni_parallelizzate = punti_partizionati.partitionBy(punti_partizionati.count())

In [11]:
#def connessione_partizioni(partizione1, partizione2, r):
#    """
#    Trova i collegamenti tra i punti nelle zone di overlap tra due partizioni.
#    """
#    # Estrazione delle coordinate
#    coord1 = [(p[1], p[2], p[3]) for p in partizione1]
#    coord2 = [(p[1], p[2], p[3]) for p in partizione2]
#    
#    # Creazione dei KDTree per le due partizioni
#    tree1 = KDTree(coord1)
#    tree2 = KDTree(coord2)
#    
#    # Trova i punti di partizione1 che sono vicini a partizione2
#    edges = []
#    for i, point in enumerate(coord1):
#        # Trova tutti i punti in partizione2 entro distanza r da point in partizione1
#        indices = tree2.query_ball_point(point, r)
#        
#        for j in indices:
#            # Aggiungi un arco tra il punto di partizione1 e il punto corrispondente di partizione2
#            edges.append((partizione1[i], partizione2[j]))
#    
#    return edges
#
## Funzione per applicare la connessione in parallelo
#def connessione_in_partizione(iterator, r):
#    partizioni = list(iterator)
#    edges = []
#    
#    # Connetti i punti tra ogni coppia di partizioni
#    for i in range(len(partizioni)):
#        for j in range(i + 1, len(partizioni)):
#            nome_part1, punti1 = partizioni[i]
#            nome_part2, punti2 = partizioni[j]
#            edges.extend(connessione_partizioni(punti1, punti2, r))
#    
#    return iter(edges)




In [12]:
# Applica la connessione in parallelo
#edges_parallelizzati = partizioni_parallelizzate.mapPartitions(lambda iterator: connessione_in_partizione(iterator, r))


aaaaa§

In [13]:
# Riduci tutti gli archi ottenuti dalle connessioni parallele
#grafo_finale = edges_parallelizzati.reduce(lambda a, b: a + b)


Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "/mnt/anaconda3/envs/pyspark_env/lib/python3.12/socket.py", line 720, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt


KeyboardInterrupt: 



In [None]:
# Visualizza il grafo finale
for edge in grafo_finale:
    print(edge)

In [15]:
sc.stop()
spark.stop()