In [12]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.functions import split
from pyspark.ml.feature import StringIndexer
from pyspark.sql.functions import col
import time
import numpy as np

# Start (or reuse) a local Spark session for this notebook.
# NOTE(review): the app name still says "DecisionTree" although this notebook
# trains a logistic regression model; left unchanged here - confirm before renaming.
spark = SparkSession.builder.master("local").appName("DecisionTree").getOrCreate()

# No header or schema options are given, so the columns come back as strings
# named _c0, _c1, ... (see the schema printed below).
kdd = spark.read.csv("data/kdd.data")

# Printing the DataFrame object shows its column names and types.
print(kdd)
# show() prints the first rows itself and returns None, so it must not be
# wrapped in print() - that would emit an extra "None" line after the table.
kdd.show()



DataFrame[_c0: string, _c1: string, _c2: string, _c3: string, _c4: string, _c5: string, _c6: string, _c7: string, _c8: string, _c9: string, _c10: string, _c11: string, _c12: string, _c13: string, _c14: string, _c15: string, _c16: string, _c17: string, _c18: string, _c19: string, _c20: string, _c21: string, _c22: string, _c23: string, _c24: string, _c25: string, _c26: string, _c27: string, _c28: string, _c29: string, _c30: string, _c31: string, _c32: string, _c33: string, _c34: string, _c35: string, _c36: string, _c37: string, _c38: string, _c39: string, _c40: string, _c41: string]
+---+---+---+---+---+-----+---+---+---+---+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+-------+
|_c0|_c1|_c2|_c3|_c4|  _c5|_c6|_c7|_c8|_c9|_c10|_c11|_c12|_c13|_c14|_c15|_c16|_c17|_c18|_c19|_c20|_c21|_c22|_c23|_c24|_c25|_c26|_c27|_c28|_c29|_c30|_c31|_c32|_c33|_c34|_c35|_c36|_c37|_c38|_c39|_c40|   _c41

In [13]:
# One seed per run: each seed drives the random train/test split of one run,
# so the length of this list is the number of runs.
# All seeds must be distinct - a repeated seed requests the same split again
# and that run is then counted twice in the summary statistics.
# (The last entry replaces a second occurrence of 1000.)
seeds = [123, 1234, 777, 888, 1000, 9876545, 123456, 3333, 88, 4242]

# Per-run accuracies; each run appends one value to each list.
train_accuracies = []
test_accuracies = []
# 1-based number of the next run, used only for the printed run header.
run = 1



In [14]:
# Every column except the last is a feature; the last column is the class label.
feature_columns = kdd.columns[:-1]
label_column = kdd.columns[-1]

# Encode the string label column as a numeric index column ("indexed_lab").
label_indexer = StringIndexer(inputCol=label_column, outputCol="indexed_lab")
data = label_indexer.fit(kdd).transform(kdd)

# Columns to convert to numeric type: exactly the feature columns. Deriving the
# list keeps it in sync with the data instead of hard-coding "_c0" ... "_c40".
columns_to_convert = list(feature_columns)

# Cast all feature columns to double in ONE projection. Calling withColumn once
# per column in a loop stacks one projection per call and can produce very
# large query plans; a single select keeps the plan flat. Column order and
# names are preserved, and non-feature columns pass through untouched.
numeric_columns = set(columns_to_convert)
data = data.select(
    *[
        col(name).cast("double").alias(name) if name in numeric_columns else col(name)
        for name in data.columns
    ]
)

# Combine the feature columns into a single vector column ("features").
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
assembled = assembler.transform(data)

# Standardize the assembled features into "sfeatures" (default scaler settings).
# NOTE(review): the scaler is fitted on the full dataset here, before the
# train/test split that happens later in lr_main, so test rows contribute to
# the scaling statistics. Consider fitting it on the training split only.
scaler = StandardScaler(inputCol="features", outputCol="sfeatures")
scaler_model = scaler.fit(assembled)
kdd_vec = scaler_model.transform(assembled)

kdd_vec.select("sfeatures").show(truncate=False)


+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|sfeatures                                                                                                                                                                                                                                                                                                                                                                                                            |
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [15]:
def lr_main(seed, run):
    """Run one logistic regression train/evaluate cycle for a given split seed.

    The prepared data (global ``kdd_vec``) is randomly split with weights
    0.7 / 0.3 using ``seed``. A LogisticRegression model is fitted on the
    training part and then used to predict both parts. The accuracy on each
    part is printed and appended to the global ``train_accuracies`` and
    ``test_accuracies`` lists. The wall-clock time measured from just before
    the model is built until the accuracies are recorded is printed as well.

    Parameters:
      seed - seed passed to randomSplit for this run
      run(int) - number of this run, printed in the header line
    Return:
      run(int) - the run number incremented by one
    """
    train_part, test_part = kdd_vec.randomSplit([0.7, 0.3], seed=seed)
    print(run, " - seed =", seed)

    started = time.time()
    model = LogisticRegression(
        labelCol="indexed_lab", featuresCol="sfeatures"
    ).fit(train_part)

    # A single accuracy evaluator is shared by both splits.
    accuracy_eval = MulticlassClassificationEvaluator(
        labelCol="indexed_lab", predictionCol="prediction", metricName="accuracy"
    )
    train_accuracy = accuracy_eval.evaluate(model.transform(train_part))
    test_accuracy = accuracy_eval.evaluate(model.transform(test_part))

    print("Training Accuracy:", train_accuracy)
    print("Test Accuracy:", test_accuracy)

    # Record the results in the notebook-level lists used for the summary.
    train_accuracies.append(train_accuracy)
    test_accuracies.append(test_accuracy)

    print("Running Time:", time.time() - started, "seconds")
    return run + 1



 

In [16]:
# Start from a clean slate so that re-running this cell does not append to
# results left over from an earlier execution (which would skew every
# statistic below) and the printed run numbers start at 1 again.
train_accuracies = []
test_accuracies = []
run = 1

# One full train/evaluate cycle per seed; lr_main returns the next run number.
for seed in seeds:
    run = lr_main(seed, run)

# Summary statistics over all runs: training accuracies first, then test.
for split_name, accuracies in (("Training", train_accuracies), ("Test", test_accuracies)):
    print(split_name + " Accuracy - Max:", np.max(accuracies))
    print(split_name + " Accuracy - Min:", np.min(accuracies))
    print(split_name + " Accuracy - Average:", np.mean(accuracies))
    print(split_name + " Accuracy - Standard Deviation:", np.std(accuracies))


1  - seed = 123
Training Accuracy: 0.8887954340642835
Test Accuracy: 0.8824588121279247
Running Time: 10.112479209899902 seconds
2  - seed = 1234
Training Accuracy: 0.8891149542217701
Test Accuracy: 0.8908760653905268
Running Time: 6.614961862564087 seconds
3  - seed = 777
Training Accuracy: 0.8907739344164018
Test Accuracy: 0.8898010296368443
Running Time: 6.446126699447632 seconds
4  - seed = 888
Training Accuracy: 0.8889818105791344
Test Accuracy: 0.8900035075412136
Running Time: 6.445791006088257 seconds
5  - seed = 1000
Training Accuracy: 0.8887329411413384
Test Accuracy: 0.8859569092567899
Running Time: 6.448025226593018 seconds
6  - seed = 9876545
Training Accuracy: 0.8908759233230659
Test Accuracy: 0.8889277470798069
Running Time: 6.4101996421813965 seconds
7  - seed = 123456
Training Accuracy: 0.8901082471144071
Test Accuracy: 0.886106058486078
Running Time: 6.570448637008667 seconds
8  - seed = 3333
Training Accuracy: 0.8887328992173965
Test Accuracy: 0.8900967877682704
Runni

In [35]:

    # Stop the Spark session created at the top of the notebook.
    spark.stop()