In [7]:
# Import libraries
import pickle
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_kddcup99
from pyspark.sql import SparkSession
from time import time, sleep
import subprocess
import pprint
import sys
import logging
import warnings
from internalLibrary.helperFunctions import *
from internalLibrary.local import *
from internalLibrary.parallel import *

In [8]:
# Keep the notebook output readable: ignore Python warnings and let the
# Spark-related Python loggers report errors only.
warnings.filterwarnings(action="ignore")
for _noisy_logger in ("py4j", "pyspark"):
    logging.getLogger(_noisy_logger).setLevel(logging.ERROR)

# Location of the log4j configuration file (relative to the notebook)
log4j_conf_path = "../Settings/log4j.properties"

Global hyperparameters and data paths.

In [9]:
# --- Reproducibility ---------------------------------------------------------
# Seed NumPy's global generator; `spark_seed` is only defined here
# (presumably meant for Spark-side sampling -- TODO confirm where it is used).
spark_seed = 54321
np.random.seed(12345)

# --- Experiment size ---------------------------------------------------------
# Partition counts to benchmark (one full run per entry)
nSlices = [8]

# Number of samples kept from the shuffled data set
subLen = 40_000

# Upper bound on the number of iterations of Lloyd's algorithm
lloydsMaxIterations = 20

# --- Log files ---------------------------------------------------------------
# Pickled data dictionaries, read at the start of a run and rewritten at its end
pickle_fileR = 'dataR/log1U_noPersist.pkl'  # Random initialization data
pickle_fileP = 'dataP/log1P_noPersist.pkl'  # Parallel initialization data

In [11]:
%%time
### SPARK SETUP ###

# Get (or create) the session on the standalone master; the driver JVM is
# pointed at the log4j configuration file defined earlier in the notebook.
spark = (
    SparkSession.builder
    .master("spark://spark-master:7077")
    .appName("Clustering")
    .config("spark.executor.memory", "7g")
    .config("spark.driver.extraJavaOptions", f"-Dlog4j.configuration=file:{log4j_conf_path}")
    .getOrCreate()
)

# The context is what the rest of the notebook uses to build RDDs
sc = spark.sparkContext
sc.setLogLevel("ERROR")

# Drop anything cached by a previous execution of this cell
spark.catalog.clearCache()
# (alternative kept from the original, disabled)
# for (id, rdd) in sc._jsc.getPersistentRDDs().items():
#     rdd.unpersist()

#### IMPORT THE DATA SET ####
# `percent10=True` is also the default of fetch_kddcup99
data = fetch_kddcup99(return_X_y=True, percent10=True)

# Samples first, targets second
x, y = data[0], data[1]

# Shuffle samples and targets with one common permutation, then keep only the
# first `subLen` rows (cut for memory reasons)
shuffled_indices = np.random.permutation(len(x))
x = x[shuffled_indices][:subLen]
y = y[shuffled_indices][:subLen]

# -----------------------------------------------------------------------------
# Helpers shared by the two benchmark runs (parallel and naive initialization)
# -----------------------------------------------------------------------------

# Names of the entries stored in each pickled log, in the order in which the
# loop below unpacks them.
_PARALLEL_LOG_KEYS = ("totalLogParallelInit", "totalLogParallelKmeans",
                      "tDurationsParallel", "tPreOperationsParallel")
_NAIVE_LOG_KEYS = ("totalLogNaiveInit", "totalLogNaiveKmeans",
                   "tDurationsNaive", "tPreOperationsNaive")

# Workers whose Spark work directory is emptied after every run
_WORKER_HOSTS = ("slave2", "slave3")


def _load_log(path, keys):
    """Read the entries ``keys`` from the pickled log dictionary at ``path``.

    Parameters
    ----------
    path : str
        Pickle file written by a previous execution of this cell.
    keys : sequence of str
        Names of the entries to fetch from the stored dictionary.

    Returns
    -------
    tuple
        One object per key, in the order of ``keys``. When ``path`` does not
        exist, a fresh empty dictionary is returned for every key.

    Raises
    ------
    KeyError
        If the stored dictionary lacks one of ``keys``.
    """
    if not os.path.isfile(path):
        return tuple({} for _ in keys)
    # NOTE(security): pickle.load can execute arbitrary code; only load log
    # files produced by this notebook.
    with open(path, "rb") as f:
        saved = pickle.load(f)
    # Look entries up by name instead of relying on the dictionary's order
    return tuple(saved[key] for key in keys)


def _save_log(path, log):
    """Pickle ``log`` to ``path``, creating the parent directory if needed."""
    directory = os.path.dirname(path)
    if directory:
        os.makedirs(directory, exist_ok=True)
    with open(path, "wb") as f:
        pickle.dump(log, f)


def _build_scaled_rdd(nSlice):
    """Distribute the sampled data over ``nSlice`` partitions and rescale it.

    Reads the notebook-level ``sc``, ``x`` and ``y``. The Spark actions are
    issued in the same order as in the original cell (label count, maximum,
    minimum) because their duration is part of the measured pre-operation time.

    Returns
    -------
    tuple
        ``(rdd, kTrue, minS, maxS)``: the rescaled RDD, the number of distinct
        labels, and the element-wise minimum / maximum of the "x" entries
        before rescaling.
    """
    rdd = sc.parallelize(
        [(None, {"x": x[i], "y": y[i], "d2": None}) for i in range(len(y))],
        numSlices=nSlice,
    )

    # Cut the categorical attributes. The RDD is left un-persisted, matching
    # the "noPersist" log file names.
    rdd = rdd.map(deleteBytes)

    # Theoretical number of clusters = number of distinct labels
    kTrue = rdd.map(lambda datum: datum[1]["y"]).distinct().count()

    # Element-wise extrema used by the min-max rescaling
    maxS = rdd.map(lambda datum: datum[1]["x"]).reduce(lambda a, b: np.maximum(a, b))
    minS = rdd.map(lambda datum: datum[1]["x"]).reduce(lambda a, b: np.minimum(a, b))

    rdd = rdd.map(lambda datum: minmaxRescale(datum, minS, maxS))
    return rdd, kTrue, minS, maxS


def _clear_worker_dirs(hosts=_WORKER_HOSTS):
    """Empty /usr/local/spark/work/ on every host in ``hosts`` over ssh.

    The remote command exits non-zero when the directory is already empty
    (the ``[ ... ] &&`` guard fails), so the return code is deliberately not
    checked.
    """
    for host in hosts:
        subprocess.run(f"ssh {host} 'cd /usr/local/spark/work/ && [ \"$(ls -A .)\" ] && rm -r ./*'", shell=True)


# -----------------------------------------------------------------------------
# Benchmark: for every partition count run k-means once with the parallel
# initialization and once with the naive one, appending timings and algorithm
# logs to the corresponding pickle file.
# -----------------------------------------------------------------------------
for nSlice in nSlices:
    # Key under which the results of this partition count are stored
    runKey = "Number of partition" + str(nSlice)

    ### PARALLEL ###

    # Load the accumulated logs if a previous run saved them
    sleep(1)
    (totalLogParallelInit, totalLogParallelKmeans,
     tDurationsParallel, tPreOperationsParallel) = _load_log(pickle_fileP, _PARALLEL_LOG_KEYS)

    # Start the algorithm
    tInit = time()  # beginning of the run for this number of partitions
    print(f"The iteration with {nSlice} number of partition started at time {tInit}")

    # Parallelize, cut categorical attributes, count labels and rescale
    Rdd, kTrue, minS, maxS = _build_scaled_rdd(nSlice)

    # Dictionaries filled in by the algorithms with their own log information
    logParallelInit = {}
    logParallelKmeans = {}

    # Setup k and l (l is passed to parallelInit as twice the cluster count)
    k = kTrue
    l = k * 2

    tInitI = time()
    tPreOperation = tInitI - tInit
    print(f"Finished the pre-steps after {tPreOperation} seconds")

    # Initialization kMeans //
    C_init = parallelInit(Rdd, k, l, logParallelInit)

    tInitialization = time() - tInitI
    print(f"Finished the initialization after {tInitialization} seconds")

    # Run the k-means algorithm
    C = kMeans(Rdd, C_init, lloydsMaxIterations, logParallelKmeans)

    # Time information
    tEnd = time()  # end of the run for this number of partitions
    tDuration = tEnd - tInit
    print(f"The iteration with {nSlice} number of partition ended at time {tEnd} after {tDuration} seconds")

    # Store the results of this run under its partition-count key
    totalLogParallelInit[runKey] = logParallelInit
    totalLogParallelKmeans[runKey] = logParallelKmeans
    tDurationsParallel[runKey] = tDuration
    tPreOperationsParallel[runKey] = tPreOperation

    spark.catalog.clearCache()

    # Compute the total log and save it
    logParallel = {"totalLogParallelInit": totalLogParallelInit, "totalLogParallelKmeans": totalLogParallelKmeans, "tDurationsParallel": tDurationsParallel, "tPreOperationsParallel": tPreOperationsParallel}
    _save_log(pickle_fileP, logParallel)

    # Clear the space on the workers
    _clear_worker_dirs()

    ### NAIVE INIT ###

    # Load the accumulated logs if a previous run saved them
    sleep(1)
    (totalLogNaiveInit, totalLogNaiveKmeans,
     tDurationsNaive, tPreOperationsNaive) = _load_log(pickle_fileR, _NAIVE_LOG_KEYS)

    # Start the algorithm
    tInit = time()  # beginning of the run for this number of partitions
    print(f"The iteration with {nSlice} number of partition started at time {tInit}")

    # Rebuild the RDD from scratch so both runs pay the same pre-operations
    Rdd, kTrue, minS, maxS = _build_scaled_rdd(nSlice)

    # Dictionaries filled in by the algorithms with their own log information
    logNaiveInit = {}
    logNaiveKmeans = {}

    # Setup k and l (l is not used by the naive initialization)
    k = kTrue
    l = k * 2

    tInitI = time()
    tPreOperation = tInitI - tInit
    print(f"Finished the pre-steps after {tPreOperation} seconds")

    # Naive initialization
    C_init = naiveInitFromSet(Rdd, k, logNaiveInit)

    tInitialization = time() - tInitI
    print(f"Finished the initialization after {tInitialization} seconds")

    # Run the k-means algorithm
    C = kMeans(Rdd, C_init, lloydsMaxIterations, logNaiveKmeans)

    # Time information
    tEnd = time()  # end of the run for this number of partitions
    tDuration = tEnd - tInit
    print(f"The iteration with {nSlice} number of partition ended at time {tEnd} after {tDuration} seconds")

    # Store the results of this run under its partition-count key
    totalLogNaiveInit[runKey] = logNaiveInit
    totalLogNaiveKmeans[runKey] = logNaiveKmeans
    tDurationsNaive[runKey] = tDuration
    tPreOperationsNaive[runKey] = tPreOperation

    spark.catalog.clearCache()

    # Compute the total log and save it
    logNaive = {"totalLogNaiveInit": totalLogNaiveInit, "totalLogNaiveKmeans": totalLogNaiveKmeans, "tDurationsNaive": tDurationsNaive, "tPreOperationsNaive": tPreOperationsNaive}
    _save_log(pickle_fileR, logNaive)

    # Clear the space on the workers
    _clear_worker_dirs()


24/07/09 09:22:30 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


The iteration with 8 number of partition started at time 1720516953.3755696


                                                                                

Finished the pre-steps after 6.812000274658203 seconds


                                                                                

Finished the initialization after 944.7888958454132 seconds


                                                                                

The iteration with 8 number of partition ended at time 1720517918.7144194 after 965.3388497829437 seconds
CPU times: user 4.1 s, sys: 799 ms, total: 4.9 s
Wall time: 16min 9s


'\n    ### NAIVE INIT ###\n    \n    # Load log if it exists\n    sleep(1)\n    if os.path.isfile(pickle_fileR):\n        with open(pickle_fileR, "rb") as f:\n            logNaive = pickle.load(f)\n            totalLogNaiveInit, totalLogNaiveKmeans, tDurationsNaive, tPreOperationsNaive = logNaive.values()\n    else:\n        totalLogNaiveInit = {}\n        totalLogNaiveKmeans = {}\n        tDurationsNaive = {}\n        tPreOperationsNaive = {}\n    \n    # Start algo\n    tInit = time() # compute the time of the beginning of the iteration over the number of partitions\n    print(f"The iteration with {nSlice} number of partition started at time {tInit}")\n    \n    # Parallelize over nSlice partitions\n    Rdd = sc.parallelize([(None, {"x": x[i],"y": y[i], "d2":None}) for i in range(len(y))], numSlices = nSlice)\n\n    # Cut the categorical attributes\n    Rdd = Rdd.map(deleteBytes)#.persist()\n\n    # Setting the theoretical number of clusters\n    kTrue = Rdd.map(lambda datum: datum[1

In [11]:
# Stop the SparkContext owned by the session; the session-level stop is
# left disabled, as in the original cell.
spark.sparkContext.stop()
#spark.stop()