In [2]:
# general imports
import matplotlib.pyplot as plt
import numpy as np
import numpy.typing as npt
import os
import pandas as pd
import time

from functools import singledispatch

# dataset
from sklearn.datasets import fetch_kddcup99
from sklearn.preprocessing import StandardScaler

# pyspark module
from pyspark.rdd import RDD

# src module
from src.utils import kddSetup, sparkSetup
from src.kmeans import compute_centroidDistances, get_clusterId, get_minDistance, cost_function, kMeansParallel_init, kMeansPlusPlus_init, kMeansRandom_init, miniBatchKMeans, naiveKMeans

## Start a Spark session (RUN ONLY ONCE)

In [3]:
# creating the zipped environment if it doesn't already exist
! if [ ! -f "environment.tar.gz" ]; then venv-pack -o "environment.tar.gz" ; fi
# creating the zipped module src
! if [ -f "src.tar.gz" ]; then rm src.tar.gz ; fi
! tar -czf src.tar.gz src

In [4]:
# starting the cluster
! $SPARK_HOME/sbin/start-all.sh

starting org.apache.spark.deploy.master.Master, logging to /usr/local/spark/logs/spark-ubuntu-org.apache.spark.deploy.master.Master-1-mapd-b-14-1.out
worker3: starting org.apache.spark.deploy.worker.Worker, logging to /usr/local/spark/logs/spark-ubuntu-org.apache.spark.deploy.worker.Worker-1-mapd-b-14-4.out
worker1: starting org.apache.spark.deploy.worker.Worker, logging to /usr/local/spark/logs/spark-ubuntu-org.apache.spark.deploy.worker.Worker-1-mapd-b-14-2.out
worker2: starting org.apache.spark.deploy.worker.Worker, logging to /usr/local/spark/logs/spark-ubuntu-org.apache.spark.deploy.worker.Worker-1-mapd-b-14-3.out
master: starting org.apache.spark.deploy.worker.Worker, logging to /usr/local/spark/logs/spark-ubuntu-org.apache.spark.deploy.worker.Worker-1-mapd-b-14-1.out


In [5]:
os.environ["PYSPARK_PYTHON"] = "./environment/bin/python"

## Acess Spark Application

In [6]:
# creating a sparkSession
spark = sparkSetup("kMeans")
sc = spark.sparkContext
# exporting the src module
sc.addPyFile("src.tar.gz") # telling spark where to find the python binary

25/09/10 20:19:25 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/09/10 20:19:31 WARN Utils: Untarring behavior will be deprecated at spark.files and SparkContext.addFile. Consider using spark.archives or SparkContext.addArchive instead.


## Load and preprocess the dataset

The first dataset we would like to test is a synthetic GaussMixture. To generate it, we sampled kcenters from a 15-dimensional spherical Gaussian distribution with mean at the origin and variance R∈{1,10,100}. We then added points from Gaussian distributions of unit variance around each center. Given the k centers, this is a mixture of k spherical Gaussians with equal weights.

ref{paper kmeans||}

#### TESTS

In [7]:
def gauss_mixture(
    n: int=50,  
    k: int=10,
    dim: int=15,                  
    R: int=10
) -> RDD:
    seed = 42
    np.random.seed(seed)
    # Centers generation N(0, R*I)
    centers = np.random.normal(loc=0, scale=np.sqrt(R), size=(k, dim))
    # Point generation N(center, I) for each cluster
    return np.concatenate(
        [center + np.random.randn(n, dim) for center in centers],
        axis=0
    )

In [7]:
gm_narray = gauss_mixture(n=50, k=10, dim=15, R=10)
gm_df = pd.DataFrame(gm_narray)
gm_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 15 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       500 non-null    float64
 1   1       500 non-null    float64
 2   2       500 non-null    float64
 3   3       500 non-null    float64
 4   4       500 non-null    float64
 5   5       500 non-null    float64
 6   6       500 non-null    float64
 7   7       500 non-null    float64
 8   8       500 non-null    float64
 9   9       500 non-null    float64
 10  10      500 non-null    float64
 11  11      500 non-null    float64
 12  12      500 non-null    float64
 13  13      500 non-null    float64
 14  14      500 non-null    float64
dtypes: float64(15)
memory usage: 58.7 KB


In [8]:
gm_rdd = sc.parallelize([row for row in gm_narray]).persist()
gm_rdd \
    .map(lambda x: (x)) \
    .count()

                                                                                

500

In [9]:
#################### NAIVE ####################
data_rdd = gm_rdd
k = 10
l = 0.5*k
r = 0
iterations = 10
batch_fraction = 0.1
results = []

start = time.time()
centroidsParallel1 = kMeansParallel_init(data_rdd, k, l, r)
init_time = time.time() - start
start = time.time()
seed_cost = cost_function(data_rdd, centroidsParallel1)
final_centroids = naiveKMeans(data_rdd, centroidsParallel1, iterations)
algo_time = time.time() - init_time

final_cost = cost_function(data_rdd, final_centroids)

results.append({
    "method": "(naive)kmeans|| (l=k/2)",
    "k": k,
    "l": 0.5*k,
    "r": r, 
    "initialization_time (s)": f"{init_time:.3f}",
    "algo time (s)":f"{algo_time:.3f}",
    "seed": seed_cost,
    "final": final_cost
})
results = pd.DataFrame(results)
results

Unnamed: 0,method,k,l,r,initialization_time (s),algo time (s),seed,final
0,(naive)kmeans|| (l=k/2),10,5.0,0,3.89,1757532601.255,8498.847886,7492.558746


In [10]:
#################### MINI BATCH ####################
data_rdd = gm_rdd
k = 10
l = 0.5*k
r = 0
iterations = 10
batch_fraction = 0.1
results = []

# start = time.time()
# centroidsParallel1 = kMeansParallel_init(data_rdd, k, l, r)
# init_time = time.time() - start
# start = time.time()
# seed_cost = cost_function(data_rdd, centroidsParallel1)
# final_centroids = miniBatchKMeans(data_rdd, centroidsParallel1, iterations, batch_fraction)
# algo_time = time.time() - start
# final_cost = cost_function(data_rdd, final_centroids)

# results.append({
#     "method": "(mini)kmeans|| (l=k/2)",
#     "k": k,
#     "l": 0.5*k,
#     "r": r, 
#     "initialization_time (s)":f"{init_time:.3f}",
#     "algo time (s)":f"{algo_time:.3f}",
#     "seed": seed_cost,
#     "final": final_cost
# })
results = pd.DataFrame(results)
results

In [11]:
#################### RANDOM ####################
data_rdd = gm_rdd
k = 10
l = 0.5*k
r = 0
iterations = 10
batch_fraction = 0.1
results = []
algo = "naive"

######## 1 - Random Init ########
start = time.time()
######## Init ########
centroidsRandom = gm_narray[np.random.choice(gm_narray.shape[0], size=k, replace=False)]
init_time = time.time() - start
seed_cost = cost_function(data_rdd, centroidsRandom)
start = time.time()
######## Algo ########
if algo=="naive": final_centroids = naiveKMeans(data_rdd, centroidsRandom, iterations)
else: final_centroids = miniBatchKMeans(data_rdd, centroidsRandom, iterations, batch_fraction)
algo_time = time.time() - start
final_cost = cost_function(data_rdd, final_centroids)

results.append({
    "method": "random",
    "algo": algo,
    "k": k, 
    "l": np.nan,
    "r": r, 
    "initialization_time": init_time,
    "algo time": algo_time,
    "seed": seed_cost,
    "final": final_cost
})

results = pd.DataFrame(results)
results

Unnamed: 0,method,algo,k,l,r,initialization_time,algo time,seed,final
0,random,naive,10,,0,0.000189,3.568637,45480.895055,15187.011284


In [None]:
#################### ++ ####################
data_rdd = gm_rdd
k = 10
l = 0.5*k
r = 0
iterations = 10
batch_fraction = 0.1
results = []
algo = "naive"

######## 2 - k-means++ ########
start = time.time()
######## Init ########
centroidsPlusPlus = kMeansPlusPlus_init(gm_narray, k)
init_time = time.time() - start
seed_cost = cost_function(data_rdd, centroidsPlusPlus)
######## Algo ########
start = time.time()
if algo=="naive": final_centroids = naiveKMeans(data_rdd, centroidsPlusPlus, iterations)
else: final_centroids = miniBatchKMeans(data_rdd, centroidsPlusPlus, iterations, batch_fraction)
algo_time = time.time() - start
final_cost = cost_function(data_rdd, final_centroids)

results.append({
    "method": "k-means++",
    "algo": algo,
    "k": k,
    "l": 0.5*k,
    "r": r, 
    "initialization_time (s)": f"{init_time:.3f}",
    "algo time (s)":f"{algo_time:.3f}",
    "seed": seed_cost,
    "final": final_cost
})

results = pd.DataFrame(results)
results

Unnamed: 0,method,algo,k,l,r,initialization_time,algo time,seed,final
0,k-means++,naive,10,,0,0.037458,3.472321,26492.170915,7492.558746


In [14]:
gm_rdd.unpersist()

ParallelCollectionRDD[0] at readRDDFromFile at PythonRDD.scala:289

## Benchmark testing

In [10]:
def analysis_init(
        data_array: npt.ArrayLike, 
        data_rdd: RDD, 
        r: int, 
        k: int, 
        iterations: int=5, 
        batch_fraction: float=0.1,
        algo: str="naive",
        ) -> npt.DTypeLike:
    results = []
    
    ######## 1 - Random Init ########
    start = time.time()
    ######## Init ########
    centroidsRandom = data_array[np.random.choice(data_array.shape[0], size=k, replace=False)]
    init_time = time.time() - start
    seed_cost = cost_function(data_rdd, centroidsRandom)

    start = time.time()
    ######## Algo ########
    if algo=="naive": final_centroids = naiveKMeans(data_rdd, centroidsRandom, iterations)
    else: final_centroids = miniBatchKMeans(data_rdd, centroidsRandom, iterations, batch_fraction)

    algo_time = time.time() - start
    final_cost = cost_function(data_rdd, final_centroids)
    
    results.append({
        "method": "random",
        "algo": algo,
        "k": k,
        "l": 0.5*k,
        "r": r, 
        "initialization_time (s)": f"{init_time:.3f}",
        "algo time (s)":f"{algo_time:.3f}",
        "seed": f"{seed_cost/1e4:.3f}",
        "final": f"{final_cost/1e4:.3f}"
    })

    ######## 2 - k-means++ ########
    start = time.time()
    ######## Init ########
    centroidsPlusPlus = kMeansPlusPlus_init(data_array, k)
    init_time = time.time() - start
    seed_cost = cost_function(data_rdd, centroidsPlusPlus)

    start = time.time()
    ######## Algo ########
    if algo=="naive": final_centroids = naiveKMeans(data_rdd, centroidsPlusPlus, iterations)
    else: final_centroids = miniBatchKMeans(data_rdd, centroidsPlusPlus, iterations, batch_fraction)
    algo_time = time.time() - start

    final_cost = cost_function(data_rdd, final_centroids)

    results.append({
        "method": "kmeans++",
        "algo": algo,
        "k": k,
        "l": 0.5*k,
        "r": r, 
        "initialization_time (s)": f"{init_time:.3f}",
        "algo time (s)":f"{algo_time:.3f}",
        "seed": f"{seed_cost/1e4:.3f}",
        "final": f"{final_cost/1e4:.3f}"
    })

    ######## 3 - Parallel l=0.5k ########
    start = time.time()
    ######## Init ########
    centroidsParallel1 = kMeansParallel_init(data_rdd, k=k, l=0.5*k, r=r)
    init_time = time.time() - start
    seed_cost = cost_function(data_rdd, centroidsParallel1)

    start = time.time()
    ######## Algo ########
    if algo=="naive": final_centroids = naiveKMeans(data_rdd, centroidsParallel1, iterations)
    else: final_centroids = miniBatchKMeans(data_rdd, centroidsParallel1, iterations, batch_fraction)
    algo_time = time.time() - start

    final_cost = cost_function(data_rdd, final_centroids)

    results.append({
        "method": "kmeans||",
        "algo": algo,
        "k": k,
        "l": 0.5*k,
        "r": r, 
        "initialization_time (s)": f"{init_time:.3f}",
        "algo time (s)":f"{algo_time:.3f}",
        "seed": f"{seed_cost/1e4:.3f}",
        "final": f"{final_cost/1e4:.3f}"
    })

    ######## 4 - Parallel l=2k ########
    start = time.time()
    ######## Init ########
    centroidsParallel2 = kMeansParallel_init(data_rdd, k=k, l=2*k, r=r)
    init_time = time.time() - start
    seed_cost = cost_function(data_rdd, centroidsParallel2)

    start = time.time()
    ######## Algo ########
    if algo=="naive": final_centroids = naiveKMeans(data_rdd, centroidsParallel2, iterations)
    else: final_centroids = miniBatchKMeans(data_rdd, centroidsParallel2, iterations, batch_fraction)
    algo_time = time.time() - start
    
    final_cost = cost_function(data_rdd, final_centroids)
    
    results.append({
        "method": "kmeans||",
        "algo": algo,
        "k": k,
        "l": 2*k,
        "r": r, 
        "initialization_time (s)": f"{init_time:.3f}",
        "algo time (s)":f"{algo_time:.3f}",
        "seed": f"{seed_cost/1e4:.3f}",
        "final": f"{final_cost/1e4:.3f}"
    })

    df = pd.DataFrame(results)
    return df

Paper analysis on Gaussian mixture

In [14]:
R = [1, 10, 100]
res = []

n = 10000
k = 50
dim = 15
r = 5
iterations = 20
batch_fraction = 0.3

for RR in R:
    for algo in ["naive", "mini-batch"]:
        data_array = gauss_mixture(n, k, dim, R=RR)
        data_rdd = sc.parallelize([row for row in data_array]).persist()

        df = analysis_init(data_array, data_rdd, r, k, iterations, batch_fraction, algo=algo)
        print(
            f"R={RR}\t", 
            f"algo={algo}", "\n","------------------------------", "\n"
        )

        res.append(df)
        data_rdd.unpersist()

25/09/10 22:26:53 WARN TaskSetManager: Stage 937 contains a task of very large size (4362 KiB). The maximum recommended task size is 1000 KiB.
25/09/10 22:26:54 WARN TaskSetManager: Stage 938 contains a task of very large size (4362 KiB). The maximum recommended task size is 1000 KiB.
25/09/10 22:26:57 WARN TaskSetManager: Stage 939 contains a task of very large size (4508 KiB). The maximum recommended task size is 1000 KiB.
25/09/10 22:26:58 WARN TaskSetManager: Stage 941 contains a task of very large size (4362 KiB). The maximum recommended task size is 1000 KiB.
25/09/10 22:26:59 WARN TaskSetManager: Stage 942 contains a task of very large size (4508 KiB). The maximum recommended task size is 1000 KiB.
25/09/10 22:27:00 WARN TaskSetManager: Stage 944 contains a task of very large size (4362 KiB). The maximum recommended task size is 1000 KiB.
25/09/10 22:27:01 WARN TaskSetManager: Stage 945 contains a task of very large size (4508 KiB). The maximum recommended task size is 1000 KiB.

R=1	 algo=naive 
 ------------------------------ 



25/09/10 22:31:48 WARN TaskSetManager: Stage 1093 contains a task of very large size (4362 KiB). The maximum recommended task size is 1000 KiB.
25/09/10 22:31:49 WARN TaskSetManager: Stage 1094 contains a task of very large size (4362 KiB). The maximum recommended task size is 1000 KiB.
25/09/10 22:31:50 WARN TaskSetManager: Stage 1095 contains a task of very large size (4508 KiB). The maximum recommended task size is 1000 KiB.
25/09/10 22:31:50 WARN TaskSetManager: Stage 1097 contains a task of very large size (4508 KiB). The maximum recommended task size is 1000 KiB.
25/09/10 22:31:51 WARN TaskSetManager: Stage 1098 contains a task of very large size (4362 KiB). The maximum recommended task size is 1000 KiB.
25/09/10 22:31:51 WARN TaskSetManager: Stage 1100 contains a task of very large size (4362 KiB). The maximum recommended task size is 1000 KiB.
25/09/10 22:31:52 WARN TaskSetManager: Stage 1101 contains a task of very large size (4508 KiB). The maximum recommended task size is 10

R=1	 algo=mini-batch 
 ------------------------------ 



25/09/10 22:36:22 WARN TaskSetManager: Stage 1249 contains a task of very large size (4362 KiB). The maximum recommended task size is 1000 KiB.
25/09/10 22:36:23 WARN TaskSetManager: Stage 1250 contains a task of very large size (4362 KiB). The maximum recommended task size is 1000 KiB.
25/09/10 22:36:24 WARN TaskSetManager: Stage 1251 contains a task of very large size (4508 KiB). The maximum recommended task size is 1000 KiB.
25/09/10 22:36:24 WARN TaskSetManager: Stage 1253 contains a task of very large size (4508 KiB). The maximum recommended task size is 1000 KiB.
25/09/10 22:36:25 WARN TaskSetManager: Stage 1254 contains a task of very large size (4362 KiB). The maximum recommended task size is 1000 KiB.
25/09/10 22:36:26 WARN TaskSetManager: Stage 1256 contains a task of very large size (4362 KiB). The maximum recommended task size is 1000 KiB.
25/09/10 22:36:27 WARN TaskSetManager: Stage 1257 contains a task of very large size (4508 KiB). The maximum recommended task size is 10

R=10	 algo=naive 
 ------------------------------ 



25/09/10 22:41:14 WARN TaskSetManager: Stage 1405 contains a task of very large size (4362 KiB). The maximum recommended task size is 1000 KiB.
25/09/10 22:41:15 WARN TaskSetManager: Stage 1406 contains a task of very large size (4362 KiB). The maximum recommended task size is 1000 KiB.
25/09/10 22:41:15 WARN TaskSetManager: Stage 1407 contains a task of very large size (4508 KiB). The maximum recommended task size is 1000 KiB.
25/09/10 22:41:16 WARN TaskSetManager: Stage 1409 contains a task of very large size (4362 KiB). The maximum recommended task size is 1000 KiB.
25/09/10 22:41:17 WARN TaskSetManager: Stage 1410 contains a task of very large size (4362 KiB). The maximum recommended task size is 1000 KiB.
25/09/10 22:41:17 WARN TaskSetManager: Stage 1412 contains a task of very large size (4362 KiB). The maximum recommended task size is 1000 KiB.
25/09/10 22:41:18 WARN TaskSetManager: Stage 1413 contains a task of very large size (4362 KiB). The maximum recommended task size is 10

R=10	 algo=mini-batch 
 ------------------------------ 



25/09/10 22:45:47 WARN TaskSetManager: Stage 1561 contains a task of very large size (4362 KiB). The maximum recommended task size is 1000 KiB.
25/09/10 22:45:48 WARN TaskSetManager: Stage 1562 contains a task of very large size (4362 KiB). The maximum recommended task size is 1000 KiB.
25/09/10 22:45:49 WARN TaskSetManager: Stage 1563 contains a task of very large size (4508 KiB). The maximum recommended task size is 1000 KiB.
25/09/10 22:45:49 WARN TaskSetManager: Stage 1565 contains a task of very large size (4362 KiB). The maximum recommended task size is 1000 KiB.
25/09/10 22:45:50 WARN TaskSetManager: Stage 1566 contains a task of very large size (4508 KiB). The maximum recommended task size is 1000 KiB.
25/09/10 22:45:51 WARN TaskSetManager: Stage 1568 contains a task of very large size (4508 KiB). The maximum recommended task size is 1000 KiB.
25/09/10 22:45:52 WARN TaskSetManager: Stage 1569 contains a task of very large size (4508 KiB). The maximum recommended task size is 10

R=100	 algo=naive 
 ------------------------------ 



25/09/10 22:50:41 WARN TaskSetManager: Stage 1717 contains a task of very large size (4362 KiB). The maximum recommended task size is 1000 KiB.
25/09/10 22:50:42 WARN TaskSetManager: Stage 1718 contains a task of very large size (4508 KiB). The maximum recommended task size is 1000 KiB.
25/09/10 22:50:43 WARN TaskSetManager: Stage 1719 contains a task of very large size (4508 KiB). The maximum recommended task size is 1000 KiB.
25/09/10 22:50:43 WARN TaskSetManager: Stage 1721 contains a task of very large size (4508 KiB). The maximum recommended task size is 1000 KiB.
25/09/10 22:50:44 WARN TaskSetManager: Stage 1722 contains a task of very large size (4362 KiB). The maximum recommended task size is 1000 KiB.
25/09/10 22:50:44 WARN TaskSetManager: Stage 1724 contains a task of very large size (4508 KiB). The maximum recommended task size is 1000 KiB.
25/09/10 22:50:45 WARN TaskSetManager: Stage 1725 contains a task of very large size (4362 KiB). The maximum recommended task size is 10

R=100	 algo=mini-batch 
 ------------------------------ 



                                                                                

In [None]:
##### batch fraction = 0.1 #####
res_df = pd.concat(objs=res, ignore_index=True)
cols = res_df.columns
res_df["R"] = np.array([1, 1, 1, 1, 1, 1, 1, 1, 10, 10, 10, 10, 10, 10, 10, 10, 100, 100, 100, 100, 100, 100, 100, 100])
columns = []
for i, col in enumerate(cols):
    if i==0: 
        columns.append("R")
    if i==1:
        columns.append(cols[i])
        columns.append(cols[i-1])
    else: columns.append(cols[i])
res_df = res_df[columns]
res_df

Unnamed: 0,R,method,algo,k,l,r,initialization_time (s),algo time (s),seed,final
0,1,random,naive,50,25.0,5,0.01,19.366,1133.762,732.724
1,1,kmeans++,naive,50,25.0,5,202.218,16.88,1137.859,729.04
2,1,kmeans||,naive,50,25.0,5,10.553,16.005,1061.794,730.955
3,1,kmeans||,naive,50,100.0,5,12.795,15.943,882.715,730.687
4,1,random,mini-batch,50,25.0,5,0.01,9.428,1133.762,1132.244
5,1,kmeans++,mini-batch,50,25.0,5,201.217,9.118,1137.859,1136.282
6,1,kmeans||,mini-batch,50,25.0,5,10.237,9.435,1046.364,1045.014
7,1,kmeans||,mini-batch,50,100.0,5,13.086,9.506,869.093,868.654
8,10,random,naive,50,25.0,5,0.01,15.936,3124.08,1448.975
9,10,kmeans++,naive,50,25.0,5,201.472,15.24,2149.988,1034.647


In [18]:
##### batch fraction = 0.3 #####
res_df2 = pd.concat(objs=res, ignore_index=True)
cols = res_df2.columns
res_df2["R"] = np.array([1, 1, 1, 1, 1, 1, 1, 1, 10, 10, 10, 10, 10, 10, 10, 10, 100, 100, 100, 100, 100, 100, 100, 100])
columns = []
for i, col in enumerate(cols):
    if i==0: 
        columns.append("R")
    if i==1:
        columns.append(cols[1])
        columns.append(cols[0])
    else: columns.append(cols[i])
res_df = res_df2[columns]
res_df

Unnamed: 0,R,method,algo,method.1,k,l,r,initialization_time (s),algo time (s),seed,final
0,1,random,naive,random,50,25.0,5,0.057,17.877,1133.762,732.724
1,1,kmeans++,naive,kmeans++,50,25.0,5,200.029,15.276,1137.859,729.04
2,1,kmeans||,naive,kmeans||,50,25.0,5,9.953,15.649,1060.803,730.417
3,1,kmeans||,naive,kmeans||,50,100.0,5,12.884,15.342,881.527,723.254
4,1,random,mini-batch,random,50,25.0,5,0.008,10.398,1133.762,1133.256
5,1,kmeans++,mini-batch,kmeans++,50,25.0,5,201.456,10.448,1137.859,1137.335
6,1,kmeans||,mini-batch,kmeans||,50,25.0,5,10.407,10.634,1007.902,1007.512
7,1,kmeans||,mini-batch,kmeans||,50,100.0,5,12.223,10.648,878.869,878.695
8,10,random,naive,random,50,25.0,5,0.008,15.031,3124.08,1448.975
9,10,kmeans++,naive,kmeans++,50,25.0,5,200.979,15.146,2149.988,1034.647


In [19]:
# stopping the cluster
! $SPARK_HOME/sbin/stop-all.sh

worker2: stopping org.apache.spark.deploy.worker.Worker
worker3: stopping org.apache.spark.deploy.worker.Worker
worker1: stopping org.apache.spark.deploy.worker.Worker


25/09/10 23:00:25 ERROR TaskSchedulerImpl: Lost executor 0 on 10.67.22.202: Command exited with code 143
25/09/10 23:00:25 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_0_12 !
25/09/10 23:00:25 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_0_8 !
25/09/10 23:00:25 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_0_4 !
25/09/10 23:00:25 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_0_0 !
25/09/10 23:00:25 ERROR TaskSchedulerImpl: Lost executor 3 on 10.67.22.208: Command exited with code 143
25/09/10 23:00:25 ERROR TaskSchedulerImpl: Lost executor 2 on 10.67.22.170: Command exited with code 143
25/09/10 23:00:25 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_0_14 !
25/09/10 23:00:25 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_0_6 !
25/09/10 23:00:25 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_0_2 !
25/09/10 23:00:25 WARN BlockManagerMast

master: stopping org.apache.spark.deploy.worker.Worker
stopping org.apache.spark.deploy.master.Master


25/09/10 23:00:26 WARN StandaloneAppClient$ClientEndpoint: Connection to 10.67.22.224:7077 failed; waiting for master to reconnect...
25/09/10 23:00:26 WARN StandaloneSchedulerBackend: Disconnected from Spark cluster! Waiting for reconnection...
25/09/10 23:00:31 ERROR TaskSchedulerImpl: Lost executor 1 on 10.67.22.224: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
25/09/10 23:00:31 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_0_1 !
25/09/10 23:00:31 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_0_9 !
25/09/10 23:00:31 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_0_5 !
25/09/10 23:00:31 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_0_13 !
