In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.functions import split
from pyspark.ml.feature import StringIndexer
from pyspark.sql.functions import col
import time
import numpy as np

# Start (or reuse) the Spark session for this notebook and load the raw KDD CSV.
# No header/schema options are passed to the reader, so the columns get the
# default names _c0, _c1, ... and are all typed as string (see the printed schema).
spark = SparkSession.builder.appName("DecisionTree").getOrCreate()
kdd = spark.read.csv("data/kdd.data")
print(kdd)
# DataFrame.show() prints the preview itself and returns None, so it must not be
# wrapped in print() -- that only adds a stray "None" line after the table.
kdd.show()



DataFrame[_c0: string, _c1: string, _c2: string, _c3: string, _c4: string, _c5: string, _c6: string, _c7: string, _c8: string, _c9: string, _c10: string, _c11: string, _c12: string, _c13: string, _c14: string, _c15: string, _c16: string, _c17: string, _c18: string, _c19: string, _c20: string, _c21: string, _c22: string, _c23: string, _c24: string, _c25: string, _c26: string, _c27: string, _c28: string, _c29: string, _c30: string, _c31: string, _c32: string, _c33: string, _c34: string, _c35: string, _c36: string, _c37: string, _c38: string, _c39: string, _c40: string, _c41: string]
+---+---+---+---+---+-----+---+---+---+---+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+-------+
|_c0|_c1|_c2|_c3|_c4|  _c5|_c6|_c7|_c8|_c9|_c10|_c11|_c12|_c13|_c14|_c15|_c16|_c17|_c18|_c19|_c20|_c21|_c22|_c23|_c24|_c25|_c26|_c27|_c28|_c29|_c30|_c31|_c32|_c33|_c34|_c35|_c36|_c37|_c38|_c39|_c40|   _c41

In [3]:
# One seed per run: the number of seeds is the number of train/test splits evaluated.
# Seeds must be distinct -- a repeated seed reproduces the same split, so that
# run would be counted twice in the max/min/mean/std summary.
seeds = [123, 1234, 777, 888, 1000, 9876545, 123456, 3333, 88, 10000]

# Per-run accuracies; dt_main() appends one value to each list per run
train_accuracies = []
test_accuracies = []
# 1-based number of the next run, used only for the progress printout
run = 1



In [40]:
# Every column except the last is a feature; the last column holds the class label.
feature_columns = kdd.columns[:-1]
label_column = kdd.columns[-1]

# Encode the string label as a numeric index in a new "indexed_lab" column
label_indexer = StringIndexer(inputCol=label_column, outputCol="indexed_lab")
data = label_indexer.fit(kdd).transform(kdd)

# Columns to convert are exactly the feature columns. They are derived from the
# DataFrame instead of being spelled out by hand, so this list cannot drift away
# from what the assembler below consumes.
columns_to_convert = list(feature_columns)
_convert_set = set(columns_to_convert)

# Cast all feature columns from string to double in ONE projection. A single
# select keeps the query plan flat, whereas one withColumn call per column adds
# a projection for each of them. Columns that are not features (the raw label
# and "indexed_lab") pass through unchanged, and column order is preserved.
data = data.select(
    *[
        col(name).cast("double").alias(name) if name in _convert_set else col(name)
        for name in data.columns
    ]
)

# Combine the feature columns into a single vector column named "features"
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
kdd_vec = assembler.transform(data)
kdd_vec.select("features").show(truncate=False)


+------------------------------------------------------------------------------------------------------------------------------------------------+
|features                                                                                                                                        |
+------------------------------------------------------------------------------------------------------------------------------------------------+
|(41,[2,3,4,22,23,28,31,32,33,34,35,39],[17.0,9.0,491.0,2.0,2.0,1.0,150.0,25.0,0.17,0.03,0.17,0.05])                                             |
|(41,[1,2,3,4,22,23,28,29,31,32,34,35],[1.0,42.0,9.0,146.0,13.0,1.0,0.08,0.15,255.0,1.0,0.6,0.88])                                               |
|(41,[2,3,22,23,24,25,28,29,31,32,33,34,37,38],[47.0,5.0,123.0,6.0,1.0,1.0,0.05,0.07,255.0,26.0,0.1,0.05,1.0,1.0])                               |
|(41,[2,3,4,5,11,22,23,24,25,28,31,32,33,35,36,37,38,40],[21.0,9.0,232.0,8153.0,1.0,5.0,5.0,0.2,0.2,1.0,30.0,255.0,1.0

In [41]:
def dt_main(seed, run):
    """Train and evaluate one decision tree on a seeded 70/30 split of ``kdd_vec``.

    Prints the run number, seed, training/test accuracy and the elapsed time,
    and appends the two accuracies to the module-level lists
    ``train_accuracies`` and ``test_accuracies``.

    Args:
        seed: Seed passed to ``randomSplit`` to make the split reproducible.
        run: Number of this run; used only in the progress printout.

    Returns:
        ``run + 1``, i.e. the number the caller should pass for the next run.
    """
    (training_data, test_data) = kdd_vec.randomSplit([0.7, 0.3], seed=seed)
    print(run ," - seed =", seed)

    # perf_counter() is a monotonic clock meant for measuring intervals;
    # time.time() can jump if the system clock is adjusted mid-run.
    # The timed span covers fitting, both transforms and both evaluations.
    start_time = time.perf_counter()
    dt = DecisionTreeClassifier(labelCol="indexed_lab", featuresCol="features")
    dtModel = dt.fit(training_data)

    train_pred = dtModel.transform(training_data)
    test_pred = dtModel.transform(test_data)

    # Evaluation: plain accuracy on both the training and the held-out split
    evaluator = MulticlassClassificationEvaluator(
        labelCol="indexed_lab", predictionCol="prediction", metricName="accuracy"
    )
    train_accuracy = evaluator.evaluate(train_pred)
    test_accuracy = evaluator.evaluate(test_pred)

    print("Training Accuracy:", train_accuracy)
    print("Test Accuracy:", test_accuracy)

    # Record the results for the summary statistics computed after all runs
    train_accuracies.append(train_accuracy)
    test_accuracies.append(test_accuracy)

    running_time = time.perf_counter() - start_time
    print("Running Time:", running_time, "seconds")
    return run + 1



 

In [42]:
# Start from a clean slate so that re-running this cell does not pile new
# results on top of those left over from a previous execution (dt_main appends
# to these lists). clear() empties the existing list objects in place.
train_accuracies.clear()
test_accuracies.clear()
run = 1

# One training/evaluation run per seed; dt_main returns the next run number
for seed in seeds:
    run = dt_main(seed, run)

# Summary statistics over all runs: training split first, then test split.
# np.std is used with its default settings, as before.
summary_stats = (
    ("Max", np.max),
    ("Min", np.min),
    ("Average", np.mean),
    ("Standard Deviation", np.std),
)
for split_name, accuracies in (("Training", train_accuracies), ("Test", test_accuracies)):
    for stat_name, stat_fn in summary_stats:
        print(f"{split_name} Accuracy - {stat_name}:", stat_fn(accuracies))


1  - seed = 123
Training Accuracy: 0.954793944491169
Test Accuracy: 0.9533370257546386
Running Time: 4.997714519500732 seconds
2  - seed = 1234
Training Accuracy: 0.9518729397662571
Test Accuracy: 0.9514130586106084
Running Time: 4.819903612136841 seconds
3  - seed = 777
Training Accuracy: 0.9546314935552684
Test Accuracy: 0.952259046564727
Running Time: 5.570835828781128 seconds
4  - seed = 888
Training Accuracy: 0.947838064438855
Test Accuracy: 0.9463040446304045
Running Time: 5.8547327518463135 seconds
5  - seed = 1000
Training Accuracy: 0.9525559439514804
Test Accuracy: 0.9506484402383456
Running Time: 7.110910177230835 seconds
6  - seed = 9876545
Training Accuracy: 0.9459111590038314
Test Accuracy: 0.9468872138470128
Running Time: 6.441486835479736 seconds
7  - seed = 123456
Training Accuracy: 0.9504406273338312
Test Accuracy: 0.953719935488395
Running Time: 6.048511028289795 seconds
8  - seed = 3333
Training Accuracy: 0.9553424739373301
Test Accuracy: 0.9511887229118452
Running T

In [35]:

    # Stop the Spark session that was created at the top of the notebook
    spark.stop()