# Model Runs for Cybersecurity Incident Prediction Report

In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import year, month, dayofmonth, hour, minute, second, dayofweek, weekofyear, col, mean, min, max, count
from pyspark.ml import Transformer, Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, MinMaxScaler, RobustScaler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import NaiveBayes, DecisionTreeClassifier, LogisticRegression, RandomForestClassifier
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator


spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()

In [3]:
df_train = spark.read.csv('/content/gdrive/MyDrive/GUIDE_Train.csv', header=True, inferSchema = True)
df_test = spark.read.csv('/content/gdrive/MyDrive/GUIDE_Test.csv', header=True, inferSchema = True)

df_train = df_train.drop('MitreTechniques', 'ActionGrouped', 'ActionGranular', 'EmailClusterId', 'ThreatFamily', 'ResourceType', 'Roles', 'AntispamDirection', 'SuspicionLevel', 'LastVerdict').dropna(subset=['IncidentGrade'])
df_test = df_test.drop('MitreTechniques', 'ActionGrouped', 'ActionGranular', 'EmailClusterId', 'ThreatFamily', 'ResourceType', 'Roles', 'AntispamDirection', 'SuspicionLevel', 'LastVerdict', 'Usage')

In [None]:
df_train.show(5)

+-------------+-----+----------+-------+-------------------+----------+----------+-----------------+--------------+----------+------------+--------+------+---------+------+----------+----------+---------------+-----------+----------+----------------+-----------+-----------------+-----------------+-------------+---------------+------------------+--------+----------+--------------+--------+---------+-----------+-----+-----+
|           Id|OrgId|IncidentId|AlertId|          Timestamp|DetectorId|AlertTitle|         Category| IncidentGrade|EntityType|EvidenceRole|DeviceId|Sha256|IpAddress|   Url|AccountSid|AccountUpn|AccountObjectId|AccountName|DeviceName|NetworkMessageId|RegistryKey|RegistryValueName|RegistryValueData|ApplicationId|ApplicationName|OAuthApplicationId|FileName|FolderPath|ResourceIdName|OSFamily|OSVersion|CountryCode|State| City|
+-------------+-----+----------+-------+-------------------+----------+----------+-----------------+--------------+----------+------------+--------+

In [None]:
df_test.show(5)

+-------------+-----+----------+-------+-------------------+----------+----------+-----------------+--------------+-----------------+------------+--------+------+---------+------+----------+----------+---------------+-----------+----------+----------------+-----------+-----------------+-----------------+-------------+---------------+------------------+--------+----------+--------------+--------+---------+-----------+-----+-----+
|           Id|OrgId|IncidentId|AlertId|          Timestamp|DetectorId|AlertTitle|         Category| IncidentGrade|       EntityType|EvidenceRole|DeviceId|Sha256|IpAddress|   Url|AccountSid|AccountUpn|AccountObjectId|AccountName|DeviceName|NetworkMessageId|RegistryKey|RegistryValueName|RegistryValueData|ApplicationId|ApplicationName|OAuthApplicationId|FileName|FolderPath|ResourceIdName|OSFamily|OSVersion|CountryCode|State| City|
+-------------+-----+----------+-------+-------------------+----------+----------+-----------------+--------------+-----------------+-

In [4]:
class TimestampFeatureTransformer(Transformer):
    def __init__(self):
        super(TimestampFeatureTransformer, self).__init__()

    def _transform(self, df: DataFrame) -> DataFrame:
        return df.withColumn("year", year(df["Timestamp"])) \
                 .withColumn("month", month(df["Timestamp"])) \
                 .withColumn("day", dayofmonth(df["Timestamp"])) \
                 .withColumn("hour", hour(df["Timestamp"])) \
                 .withColumn("minute", minute(df["Timestamp"])) \
                 .withColumn("second", second(df["Timestamp"])) \
                 .withColumn("day_of_week", dayofweek(df["Timestamp"])) \
                 .withColumn("week_of_year", weekofyear(df["Timestamp"]))

timestamp_transformer = TimestampFeatureTransformer()

# Naive Bayes Pipeline and Model Training

## Pipeline

In [None]:
pipe_stages = [StringIndexer(inputCol=c, outputCol=c+'_index') for c in ['Category', 'EntityType', 'EvidenceRole']]
pipe_stages.append(StringIndexer(inputCol='IncidentGrade', outputCol='target'))

pipe_stages.append(OneHotEncoder(inputCols=[c+'_index' for c in ['Category', 'EntityType', 'EvidenceRole']], outputCols=[c+'_ohe' for c in ['Category', 'EntityType', 'EvidenceRole']]))

feature_cols = ['Id', 'OrgId', 'IncidentId', 'AlertId', 'DetectorId',
                'AlertTitle', 'Category_ohe', 'EntityType_ohe', 'EvidenceRole_ohe',
                'DeviceId', 'Sha256', 'IpAddress', 'Url', 'AccountSid', 'AccountUpn',
                'AccountObjectId', 'AccountName', 'DeviceName', 'NetworkMessageId',
                'RegistryKey', 'RegistryValueName', 'RegistryValueData', 'ApplicationId',
                'ApplicationName', 'OAuthApplicationId', 'FileName', 'FolderPath',
                'ResourceIdName', 'OSFamily', 'OSVersion', 'CountryCode', 'State',
                'City', 'year', 'month', 'day', 'hour', 'minute', 'second',
                'day_of_week', 'week_of_year']

assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')
pipe_stages.append(assembler)

pipeline = Pipeline(stages=[timestamp_transformer] + pipe_stages)

pipeline_model = pipeline.fit(df_train)

df_train_transformed = pipeline_model.transform(df_train)
df_test_transformed = pipeline_model.transform(df_test)

In [None]:
df_train_transformed.select(['IncidentGrade', 'target']).show(5)

+--------------+------+
| IncidentGrade|target|
+--------------+------+
|  TruePositive|   1.0|
| FalsePositive|   2.0|
| FalsePositive|   2.0|
|BenignPositive|   0.0|
|  TruePositive|   1.0|
+--------------+------+
only showing top 5 rows



BenignPositive   0.0
FalsePositive    2.0
TruePositive     1.0

## Model Training

In [None]:
nb = NaiveBayes(featuresCol='features', labelCol='target')
nb_model = nb.fit(df_train_transformed)

train_prediction = nb_model.transform(df_train_transformed)
test_prediction = nb_model.transform(df_test_transformed)

## Evaluations

In [None]:
evaluator_macrof1 = MulticlassClassificationEvaluator(labelCol="target", predictionCol="prediction", metricName="f1")
evaluator_f1 = MulticlassClassificationEvaluator(labelCol="target", predictionCol="prediction", metricName="fMeasureByLabel")
evaluator_precision = MulticlassClassificationEvaluator(labelCol="target", predictionCol="prediction", metricName="precisionByLabel")
evaluator_recall = MulticlassClassificationEvaluator(labelCol="target", predictionCol="prediction", metricName="recallByLabel")

### Train Results

In [None]:
macrof1_class_0 = evaluator_macrof1.evaluate(train_prediction, {evaluator_macrof1.metricLabel: 0})
print(f"Macro F1 for class 0: {macrof1_class_0:.4f}")

macrof1_class_1 = evaluator_macrof1.evaluate(train_prediction, {evaluator_macrof1.metricLabel: 1})
print(f"Macro F1 for class 1: {macrof1_class_1:.4f}")

macrof1_class_2 = evaluator_macrof1.evaluate(train_prediction, {evaluator_macrof1.metricLabel: 2})
print(f"Macro F1 for class 2: {macrof1_class_2:.4f}")

Macro F1 for class 0: 0.4651
Macro F1 for class 1: 0.4651
Macro F1 for class 2: 0.4651


In [None]:
f1_class_0 = evaluator_f1.evaluate(train_prediction, {evaluator_f1.metricLabel: 0})
print(f"F1 for class 0: {f1_class_0:.4f}")

f1_class_1 = evaluator_f1.evaluate(train_prediction, {evaluator_f1.metricLabel: 1})
print(f"F1 for class 1: {f1_class_1:.4f}")

f1_class_2 = evaluator_f1.evaluate(train_prediction, {evaluator_f1.metricLabel: 2})
print(f"F1 for class 2: {f1_class_2:.4f}")

F1 for class 0: 0.5110
F1 for class 1: 0.4992
F1 for class 2: 0.3166


In [None]:
precision_class_0 = evaluator_precision.evaluate(train_prediction, {evaluator_precision.metricLabel: 0})
print(f"Precision for class 0: {precision_class_0:.4f}")

precision_class_1 = evaluator_precision.evaluate(train_prediction, {evaluator_precision.metricLabel: 1})
print(f"Precision for class 1: {precision_class_1:.4f}")

precision_class_2 = evaluator_precision.evaluate(train_prediction, {evaluator_precision.metricLabel: 2})
print(f"Precision for class 2: {precision_class_2:.4f}")

Precision for class 0: 0.5077
Precision for class 1: 0.5815
Precision for class 2: 0.2731


In [None]:
recall_class_0 = evaluator_recall.evaluate(train_prediction, {evaluator_recall.metricLabel: 0})
print(f"Recall for class 0: {recall_class_0:.4f}")

recall_class_1 = evaluator_recall.evaluate(train_prediction, {evaluator_recall.metricLabel: 1})
print(f"Recall for class 1: {recall_class_1:.4f}")

recall_class_2 = evaluator_recall.evaluate(train_prediction, {evaluator_recall.metricLabel: 2})
print(f"Recall for class 2: {recall_class_2:.4f}")

Recall for class 0: 0.5144
Recall for class 1: 0.4373
Recall for class 2: 0.3765


To get the individual values for each class I will be doing this manually.
Note the mapping:
- BenignPositive   0.0
- TruePositive     1.0
- FalsePositive    2.0

I've reworked the index to go in the order of 0, 1 and 2 as normal.

In [None]:
# Confusion Matrix for Train Set
train_crosstab = train_prediction.stat.crosstab("target", "prediction")
train_crosstab.show()

+-----------------+-------+-------+-------+
|target_prediction|    0.0|    1.0|    2.0|
+-----------------+-------+-------+-------+
|              1.0|1175786|1453164| 693763|
|              0.0|2114411| 653933|1342473|
|              2.0| 874822| 392030| 765115|
+-----------------+-------+-------+-------+



In [None]:
conf_mat = [[2114411, 653933, 1342473], [1175786, 1453164, 693763], [874822, 392030, 765115]]

for i in range(3):
    tp = conf_mat[i][i]
    fp = sum(conf_mat[j][i] for j in range(3) if j != i)
    fn = sum(conf_mat[i][j] for j in range(3) if j != i)
    tn = sum(conf_mat[j][k] for j in range(3) for k in range(3) if j != i and k != i)

    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f1_score = 2 * (precision * recall) / (precision + recall)

    print(f"Class {i}:")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1_score}")
    print()

Class 0:
Precision: 0.5076593888287184
Recall: 0.5143529862798563
F1 Score: 0.5109842679337774

Class 1:
Precision: 0.5814686488521792
Recall: 0.43734261731302104
F1 Score: 0.4992112459291221

Class 2:
Precision: 0.273123575017911
Recall: 0.3765390874950233
F1 Score: 0.316600314732033



### Test Results

In [None]:
macrof1_class_0 = evaluator_macrof1.evaluate(test_prediction, {evaluator_macrof1.metricLabel: 0})
print(f"Macro F1 for class 0: {macrof1_class_0:.4f}")

macrof1_class_1 = evaluator_macrof1.evaluate(test_prediction, {evaluator_macrof1.metricLabel: 1})
print(f"Macro F1 for class 1: {macrof1_class_1:.4f}")

macrof1_class_2 = evaluator_macrof1.evaluate(test_prediction, {evaluator_macrof1.metricLabel: 2})
print(f"Macro F1 for class 2: {macrof1_class_2:.4f}")

# verify results: while they are the same this may be a weighted average

Macro F1 for class 0: 0.4697
Macro F1 for class 1: 0.4697
Macro F1 for class 2: 0.4697


In [None]:
f1_class_0 = evaluator_f1.evaluate(test_prediction, {evaluator_f1.metricLabel: 0})
print(f"F1 for class 0: {f1_class_0:.4f}")

f1_class_1 = evaluator_f1.evaluate(test_prediction, {evaluator_f1.metricLabel: 1})
print(f"F1 for class 1: {f1_class_1:.4f}")

f1_class_2 = evaluator_f1.evaluate(test_prediction, {evaluator_f1.metricLabel: 2})
print(f"F1 for class 2: {f1_class_2:.4f}")

F1 for class 0: 0.4978
F1 for class 1: 0.5244
F1 for class 2: 0.3246


In [None]:
precision_class_0 = evaluator_precision.evaluate(test_prediction, {evaluator_precision.metricLabel: 0})
print(f"Precision for class 0: {precision_class_0:.4f}")

precision_class_1 = evaluator_precision.evaluate(test_prediction, {evaluator_precision.metricLabel: 1})
print(f"Precision for class 1: {precision_class_1:.4f}")

precision_class_2 = evaluator_precision.evaluate(test_prediction, {evaluator_precision.metricLabel: 2})
print(f"Precision for class 2: {precision_class_2:.4f}")

Precision for class 0: 0.4898
Precision for class 1: 0.6103
Precision for class 2: 0.2831


In [None]:
recall_class_0 = evaluator_recall.evaluate(test_prediction, {evaluator_recall.metricLabel: 0})
print(f"Recall for class 0: {recall_class_0:.4f}")

recall_class_1 = evaluator_recall.evaluate(test_prediction, {evaluator_recall.metricLabel: 1})
print(f"Recall for class 1: {recall_class_1:.4f}")

recall_class_2 = evaluator_recall.evaluate(test_prediction, {evaluator_recall.metricLabel: 2})
print(f"Recall for class 2: {recall_class_2:.4f}")

Recall for class 0: 0.5061
Recall for class 1: 0.4597
Recall for class 2: 0.3803


In [None]:
# Confusion Matrix for Test Set
test_crosstab = test_prediction.stat.crosstab("target", "prediction")
test_crosstab.show()

+-----------------+------+------+------+
|target_prediction|   0.0|   1.0|   2.0|
+-----------------+------+------+------+
|              1.0|527797|686085|278472|
|              0.0|887183|275040|590717|
|              2.0|396358|163056|343284|
+-----------------+------+------+------+



In [None]:
conf_mat = [[887183, 275040, 590717], [527797, 686085, 278472], [396358, 163056, 343284]]

for i in range(3):
    tp = conf_mat[i][i]
    fp = sum(conf_mat[j][i] for j in range(3) if j != i)
    fn = sum(conf_mat[i][j] for j in range(3) if j != i)
    tn = sum(conf_mat[j][k] for j in range(3) for k in range(3) if j != i and k != i)

    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f1_score = 2 * (precision * recall) / (precision + recall)

    print(f"Class {i}:")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1_score}")
    print()

Class 0:
Precision: 0.4897942846669147
Recall: 0.5061114470546625
F1 Score: 0.49781919367681204

Class 1:
Precision: 0.6102976300079791
Recall: 0.45973341445796373
F1 Score: 0.5244225664858295

Class 2:
Precision: 0.28312712942886153
Recall: 0.3802866517927369
F1 Score: 0.32459219609194717



# Decision Tree Classifier Model

## Pipeline

In [None]:
pipe_stages = [StringIndexer(inputCol=c, outputCol=c+'_index') for c in ['Category', 'EntityType', 'EvidenceRole']]
pipe_stages.append(StringIndexer(inputCol='IncidentGrade', outputCol='target'))

feature_cols = ['Id', 'OrgId', 'IncidentId', 'AlertId', 'DetectorId',
                'AlertTitle', 'Category_index', 'EntityType_index', 'EvidenceRole_index',
                'DeviceId', 'Sha256', 'IpAddress', 'Url', 'AccountSid', 'AccountUpn',
                'AccountObjectId', 'AccountName', 'DeviceName', 'NetworkMessageId',
                'RegistryKey', 'RegistryValueName', 'RegistryValueData', 'ApplicationId',
                'ApplicationName', 'OAuthApplicationId', 'FileName', 'FolderPath',
                'ResourceIdName', 'OSFamily', 'OSVersion', 'CountryCode', 'State',
                'City', 'year', 'month', 'day', 'hour', 'minute', 'second',
                'day_of_week', 'week_of_year']

assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')
pipe_stages.append(assembler)

dt = DecisionTreeClassifier(labelCol="target", featuresCol="features", maxBins=33)

pipeline = Pipeline(stages=[timestamp_transformer] + pipe_stages + [dt])

## Model Training

#### With Grid Search Cross Validation

In [None]:
param_grid = (ParamGridBuilder()
              .addGrid(dt.maxDepth, [5, 10])
              # .addGrid(dt.maxBins, [33, 64]) # 33 because min maxBin must be largest categorical
              .addGrid(dt.minInstancesPerNode, [1, 5])
              .build())

In [None]:
evaluator = MulticlassClassificationEvaluator(labelCol="target", metricName="f1")

cross_validator = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=param_grid,
    evaluator=evaluator,
    numFolds=3
)

cv_model = cross_validator.fit(df_train)

KeyboardInterrupt: 

In [None]:
best_dt = cv_model.bestModel.stages[-1]

print("Best maxDepth:", best_dt.getMaxDepth())
print("Best maxBins:", best_dt.getMaxBins())
print("Best minInstancesPerNode:", best_dt.getMinInstancesPerNode())

In [None]:
train_prediction = cv_model.bestModel.transform(df_train)
test_prediction = cv_model.bestModel.transform(df_test)

NameError: name 'cv_model' is not defined

Due to Google Colab limits this is constantly crashing the notebook. I elect to conduct manual tuning instead

#### Without Grid Search CV

In [None]:
model = pipeline.fit(df_train)

In [None]:
train_prediction = model.transform(df_train)
test_prediction = model.transform(df_test)

## Evaluations

In [None]:
evaluator_macrof1 = MulticlassClassificationEvaluator(labelCol="target", predictionCol="prediction", metricName="f1")
evaluator_f1 = MulticlassClassificationEvaluator(labelCol="target", predictionCol="prediction", metricName="fMeasureByLabel")
evaluator_precision = MulticlassClassificationEvaluator(labelCol="target", predictionCol="prediction", metricName="precisionByLabel")
evaluator_recall = MulticlassClassificationEvaluator(labelCol="target", predictionCol="prediction", metricName="recallByLabel")

In [None]:
print("Train Results:")
macrof1 = evaluator_macrof1.evaluate(train_prediction)
print(f"Macro F1: {macrof1:.4f}")

f1_class_0 = evaluator_f1.evaluate(train_prediction, {evaluator_f1.metricLabel: 0})
print(f"F1 for class 0: {f1_class_0:.4f}")
f1_class_1 = evaluator_f1.evaluate(train_prediction, {evaluator_f1.metricLabel: 1})
print(f"F1 for class 1: {f1_class_1:.4f}")
f1_class_2 = evaluator_f1.evaluate(train_prediction, {evaluator_f1.metricLabel: 2})
print(f"F1 for class 2: {f1_class_2:.4f}")

precision_class_0 = evaluator_precision.evaluate(train_prediction, {evaluator_precision.metricLabel: 0})
print(f"Precision for class 0: {precision_class_0:.4f}")
precision_class_1 = evaluator_precision.evaluate(train_prediction, {evaluator_precision.metricLabel: 1})
print(f"Precision for class 1: {precision_class_1:.4f}")
precision_class_2 = evaluator_precision.evaluate(train_prediction, {evaluator_precision.metricLabel: 2})
print(f"Precision for class 2: {precision_class_2:.4f}")


recall_class_0 = evaluator_recall.evaluate(train_prediction, {evaluator_recall.metricLabel: 0})
print(f"Recall for class 0: {recall_class_0:.4f}")
recall_class_1 = evaluator_recall.evaluate(train_prediction, {evaluator_recall.metricLabel: 1})
print(f"Recall for class 1: {recall_class_1:.4f}")
recall_class_2 = evaluator_recall.evaluate(train_prediction, {evaluator_recall.metricLabel: 2})
print(f"Recall for class 2: {recall_class_2:.4f}")

print()

train_crosstab = train_prediction.stat.crosstab("target", "prediction")
train_crosstab.show()

Train Results:
Macro F1: 0.6510
F1 for class 0: 0.7304
F1 for class 1: 0.6796
F1 for class 2: 0.4434
Precision for class 0: 0.5894
Precision for class 1: 0.9440
Precision for class 2: 0.7224
Recall for class 0: 0.9602
Recall for class 1: 0.5309
Recall for class 2: 0.3199

+-----------------+-------+-------+------+
|target_prediction|    0.0|    1.0|   2.0|
+-----------------+-------+-------+------+
|              1.0|1436016|1764142|122555|
|              0.0|3947087|  36551|127179|
|              2.0|1313769|  68190|650008|
+-----------------+-------+-------+------+



In [None]:
print("Test Results:")
macrof1 = evaluator_macrof1.evaluate(test_prediction)
print(f"Macro F1: {macrof1:.4f}")

f1_class_0 = evaluator_f1.evaluate(test_prediction, {evaluator_f1.metricLabel: 0})
print(f"F1 for class 0: {f1_class_0:.4f}")
f1_class_1 = evaluator_f1.evaluate(test_prediction, {evaluator_f1.metricLabel: 1})
print(f"F1 for class 1: {f1_class_1:.4f}")
f1_class_2 = evaluator_f1.evaluate(test_prediction, {evaluator_f1.metricLabel: 2})
print(f"F1 for class 2: {f1_class_2:.4f}")

precision_class_0 = evaluator_precision.evaluate(test_prediction, {evaluator_precision.metricLabel: 0})
print(f"Precision for class 0: {precision_class_0:.4f}")
precision_class_1 = evaluator_precision.evaluate(test_prediction, {evaluator_precision.metricLabel: 1})
print(f"Precision for class 1: {precision_class_1:.4f}")
precision_class_2 = evaluator_precision.evaluate(test_prediction, {evaluator_precision.metricLabel: 2})
print(f"Precision for class 2: {precision_class_2:.4f}")


recall_class_0 = evaluator_recall.evaluate(test_prediction, {evaluator_recall.metricLabel: 0})
print(f"Recall for class 0: {recall_class_0:.4f}")
recall_class_1 = evaluator_recall.evaluate(test_prediction, {evaluator_recall.metricLabel: 1})
print(f"Recall for class 1: {recall_class_1:.4f}")
recall_class_2 = evaluator_recall.evaluate(test_prediction, {evaluator_recall.metricLabel: 2})
print(f"Recall for class 2: {recall_class_2:.4f}")

print()

test_crosstab = test_prediction.stat.crosstab("target", "prediction")
test_crosstab.show()

Test Results:
Macro F1: 0.6663
F1 for class 0: 0.7264
F1 for class 1: 0.7094
F1 for class 2: 0.4782
Precision for class 0: 0.5874
Precision for class 1: 0.9573
Precision for class 2: 0.7422
Recall for class 0: 0.9518
Recall for class 1: 0.5635
Recall for class 2: 0.3527

+-----------------+-------+------+------+
|target_prediction|    0.0|   1.0|   2.0|
+-----------------+-------+------+------+
|              1.0| 605489|840936| 45929|
|              0.0|1668452| 19838| 64650|
|              2.0| 566583| 17700|318415|
+-----------------+-------+------+------+



# Deeper DT2

Trying a deeper Decision Tree since the last was so successful.

In [None]:
pipe_stages = [StringIndexer(inputCol=c, outputCol=c+'_index') for c in ['Category', 'EntityType', 'EvidenceRole']]
pipe_stages.append(StringIndexer(inputCol='IncidentGrade', outputCol='target'))

feature_cols = ['Id', 'OrgId', 'IncidentId', 'AlertId', 'DetectorId',
                'AlertTitle', 'Category_index', 'EntityType_index', 'EvidenceRole_index',
                'DeviceId', 'Sha256', 'IpAddress', 'Url', 'AccountSid', 'AccountUpn',
                'AccountObjectId', 'AccountName', 'DeviceName', 'NetworkMessageId',
                'RegistryKey', 'RegistryValueName', 'RegistryValueData', 'ApplicationId',
                'ApplicationName', 'OAuthApplicationId', 'FileName', 'FolderPath',
                'ResourceIdName', 'OSFamily', 'OSVersion', 'CountryCode', 'State',
                'City', 'year', 'month', 'day', 'hour', 'minute', 'second',
                'day_of_week', 'week_of_year']

assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')
pipe_stages.append(assembler)

dt = DecisionTreeClassifier(labelCol="target", featuresCol="features", maxDepth=10, maxBins=33)

pipeline = Pipeline(stages=[timestamp_transformer] + pipe_stages + [dt])

model = pipeline.fit(df_train)

train_prediction = model.transform(df_train)
test_prediction = model.transform(df_test)

evaluator_macrof1 = MulticlassClassificationEvaluator(labelCol="target", predictionCol="prediction", metricName="f1")
evaluator_f1 = MulticlassClassificationEvaluator(labelCol="target", predictionCol="prediction", metricName="fMeasureByLabel")
evaluator_precision = MulticlassClassificationEvaluator(labelCol="target", predictionCol="prediction", metricName="precisionByLabel")
evaluator_recall = MulticlassClassificationEvaluator(labelCol="target", predictionCol="prediction", metricName="recallByLabel")

In [None]:
print("Train Results:")
macrof1 = evaluator_macrof1.evaluate(train_prediction)
print(f"Macro F1: {macrof1:.4f}")

f1_class_0 = evaluator_f1.evaluate(train_prediction, {evaluator_f1.metricLabel: 0})
print(f"F1 for class 0: {f1_class_0:.4f}")
f1_class_1 = evaluator_f1.evaluate(train_prediction, {evaluator_f1.metricLabel: 1})
print(f"F1 for class 1: {f1_class_1:.4f}")
f1_class_2 = evaluator_f1.evaluate(train_prediction, {evaluator_f1.metricLabel: 2})
print(f"F1 for class 2: {f1_class_2:.4f}")

precision_class_0 = evaluator_precision.evaluate(train_prediction, {evaluator_precision.metricLabel: 0})
print(f"Precision for class 0: {precision_class_0:.4f}")
precision_class_1 = evaluator_precision.evaluate(train_prediction, {evaluator_precision.metricLabel: 1})
print(f"Precision for class 1: {precision_class_1:.4f}")
precision_class_2 = evaluator_precision.evaluate(train_prediction, {evaluator_precision.metricLabel: 2})
print(f"Precision for class 2: {precision_class_2:.4f}")


recall_class_0 = evaluator_recall.evaluate(train_prediction, {evaluator_recall.metricLabel: 0})
print(f"Recall for class 0: {recall_class_0:.4f}")
recall_class_1 = evaluator_recall.evaluate(train_prediction, {evaluator_recall.metricLabel: 1})
print(f"Recall for class 1: {recall_class_1:.4f}")
recall_class_2 = evaluator_recall.evaluate(train_prediction, {evaluator_recall.metricLabel: 2})
print(f"Recall for class 2: {recall_class_2:.4f}")

print()

train_crosstab = train_prediction.stat.crosstab("target", "prediction")
train_crosstab.show()

Train Results:
Macro F1: 0.7790
F1 for class 0: 0.8085
F1 for class 1: 0.8099
F1 for class 2: 0.6687
Precision for class 0: 0.7088
Precision for class 1: 0.8861
Precision for class 2: 0.8945
Recall for class 0: 0.9408
Recall for class 1: 0.7458
Recall for class 2: 0.5339

+-----------------+-------+-------+-------+
|target_prediction|    0.0|    1.0|    2.0|
+-----------------+-------+-------+-------+
|              1.0| 798139|2478045|  46529|
|              0.0|3867348| 162010|  81459|
|              2.0| 790728| 156433|1084806|
+-----------------+-------+-------+-------+



In [None]:
print("Test Results:")
macrof1 = evaluator_macrof1.evaluate(test_prediction)
print(f"Macro F1: {macrof1:.4f}")

f1_class_0 = evaluator_f1.evaluate(test_prediction, {evaluator_f1.metricLabel: 0})
print(f"F1 for class 0: {f1_class_0:.4f}")
f1_class_1 = evaluator_f1.evaluate(test_prediction, {evaluator_f1.metricLabel: 1})
print(f"F1 for class 1: {f1_class_1:.4f}")
f1_class_2 = evaluator_f1.evaluate(test_prediction, {evaluator_f1.metricLabel: 2})
print(f"F1 for class 2: {f1_class_2:.4f}")

precision_class_0 = evaluator_precision.evaluate(test_prediction, {evaluator_precision.metricLabel: 0})
print(f"Precision for class 0: {precision_class_0:.4f}")
precision_class_1 = evaluator_precision.evaluate(test_prediction, {evaluator_precision.metricLabel: 1})
print(f"Precision for class 1: {precision_class_1:.4f}")
precision_class_2 = evaluator_precision.evaluate(test_prediction, {evaluator_precision.metricLabel: 2})
print(f"Precision for class 2: {precision_class_2:.4f}")


recall_class_0 = evaluator_recall.evaluate(test_prediction, {evaluator_recall.metricLabel: 0})
print(f"Recall for class 0: {recall_class_0:.4f}")
recall_class_1 = evaluator_recall.evaluate(test_prediction, {evaluator_recall.metricLabel: 1})
print(f"Recall for class 1: {recall_class_1:.4f}")
recall_class_2 = evaluator_recall.evaluate(test_prediction, {evaluator_recall.metricLabel: 2})
print(f"Recall for class 2: {recall_class_2:.4f}")

print()

test_crosstab = test_prediction.stat.crosstab("target", "prediction")
test_crosstab.show()

Test Results:
Macro F1: 0.7677
F1 for class 0: 0.7936
F1 for class 1: 0.8088
F1 for class 2: 0.6495
Precision for class 0: 0.7014
Precision for class 1: 0.8726
Precision for class 2: 0.8340
Recall for class 0: 0.9137
Recall for class 1: 0.7536
Recall for class 2: 0.5318

+-----------------+-------+-------+------+
|target_prediction|    0.0|    1.0|   2.0|
+-----------------+-------+-------+------+
|              1.0| 344283|1124694| 23377|
|              0.0|1601582|  79153| 72205|
|              2.0| 337526|  85095|480077|
+-----------------+-------+-------+------+



# DT3


In [None]:
pipe_stages = [StringIndexer(inputCol=c, outputCol=c+'_index') for c in ['Category', 'EntityType', 'EvidenceRole']]
pipe_stages.append(StringIndexer(inputCol='IncidentGrade', outputCol='target'))

feature_cols = ['Id', 'OrgId', 'IncidentId', 'AlertId', 'DetectorId',
                'AlertTitle', 'Category_index', 'EntityType_index', 'EvidenceRole_index',
                'DeviceId', 'Sha256', 'IpAddress', 'Url', 'AccountSid', 'AccountUpn',
                'AccountObjectId', 'AccountName', 'DeviceName', 'NetworkMessageId',
                'RegistryKey', 'RegistryValueName', 'RegistryValueData', 'ApplicationId',
                'ApplicationName', 'OAuthApplicationId', 'FileName', 'FolderPath',
                'ResourceIdName', 'OSFamily', 'OSVersion', 'CountryCode', 'State',
                'City', 'year', 'month', 'day', 'hour', 'minute', 'second',
                'day_of_week', 'week_of_year']

assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')
pipe_stages.append(assembler)

dt = DecisionTreeClassifier(labelCol="target", featuresCol="features", maxDepth=15, maxBins=33)

pipeline = Pipeline(stages=[timestamp_transformer] + pipe_stages + [dt])

model = pipeline.fit(df_train)

train_prediction = model.transform(df_train)
test_prediction = model.transform(df_test)

evaluator_macrof1 = MulticlassClassificationEvaluator(labelCol="target", predictionCol="prediction", metricName="f1")
evaluator_f1 = MulticlassClassificationEvaluator(labelCol="target", predictionCol="prediction", metricName="fMeasureByLabel")
evaluator_precision = MulticlassClassificationEvaluator(labelCol="target", predictionCol="prediction", metricName="precisionByLabel")
evaluator_recall = MulticlassClassificationEvaluator(labelCol="target", predictionCol="prediction", metricName="recallByLabel")

In [None]:
print("Train Results:")
macrof1 = evaluator_macrof1.evaluate(train_prediction)
print(f"Macro F1: {macrof1:.4f}")

f1_class_0 = evaluator_f1.evaluate(train_prediction, {evaluator_f1.metricLabel: 0})
print(f"F1 for class 0: {f1_class_0:.4f}")
f1_class_1 = evaluator_f1.evaluate(train_prediction, {evaluator_f1.metricLabel: 1})
print(f"F1 for class 1: {f1_class_1:.4f}")
f1_class_2 = evaluator_f1.evaluate(train_prediction, {evaluator_f1.metricLabel: 2})
print(f"F1 for class 2: {f1_class_2:.4f}")

precision_class_0 = evaluator_precision.evaluate(train_prediction, {evaluator_precision.metricLabel: 0})
print(f"Precision for class 0: {precision_class_0:.4f}")
precision_class_1 = evaluator_precision.evaluate(train_prediction, {evaluator_precision.metricLabel: 1})
print(f"Precision for class 1: {precision_class_1:.4f}")
precision_class_2 = evaluator_precision.evaluate(train_prediction, {evaluator_precision.metricLabel: 2})
print(f"Precision for class 2: {precision_class_2:.4f}")


recall_class_0 = evaluator_recall.evaluate(train_prediction, {evaluator_recall.metricLabel: 0})
print(f"Recall for class 0: {recall_class_0:.4f}")
recall_class_1 = evaluator_recall.evaluate(train_prediction, {evaluator_recall.metricLabel: 1})
print(f"Recall for class 1: {recall_class_1:.4f}")
recall_class_2 = evaluator_recall.evaluate(train_prediction, {evaluator_recall.metricLabel: 2})
print(f"Recall for class 2: {recall_class_2:.4f}")

print()

train_crosstab = train_prediction.stat.crosstab("target", "prediction")
train_crosstab.show()

Train Results:
Macro F1: 0.8661
F1 for class 0: 0.8760
F1 for class 1: 0.8827
F1 for class 2: 0.8191
Precision for class 0: 0.8200
Precision for class 1: 0.9313
Precision for class 2: 0.8827
Recall for class 0: 0.9402
Recall for class 1: 0.8389
Recall for class 2: 0.7641

+-----------------+-------+-------+-------+
|target_prediction|    0.0|    1.0|    2.0|
+-----------------+-------+-------+-------+
|              1.0| 444004|2787281|  91428|
|              0.0|3864842| 130981| 114994|
|              2.0| 404631|  74663|1552673|
+-----------------+-------+-------+-------+



In [None]:
print("Test Results:")
macrof1 = evaluator_macrof1.evaluate(test_prediction)
print(f"Macro F1: {macrof1:.4f}")

f1_class_0 = evaluator_f1.evaluate(test_prediction, {evaluator_f1.metricLabel: 0})
print(f"F1 for class 0: {f1_class_0:.4f}")
f1_class_1 = evaluator_f1.evaluate(test_prediction, {evaluator_f1.metricLabel: 1})
print(f"F1 for class 1: {f1_class_1:.4f}")
f1_class_2 = evaluator_f1.evaluate(test_prediction, {evaluator_f1.metricLabel: 2})
print(f"F1 for class 2: {f1_class_2:.4f}")

precision_class_0 = evaluator_precision.evaluate(test_prediction, {evaluator_precision.metricLabel: 0})
print(f"Precision for class 0: {precision_class_0:.4f}")
precision_class_1 = evaluator_precision.evaluate(test_prediction, {evaluator_precision.metricLabel: 1})
print(f"Precision for class 1: {precision_class_1:.4f}")
precision_class_2 = evaluator_precision.evaluate(test_prediction, {evaluator_precision.metricLabel: 2})
print(f"Precision for class 2: {precision_class_2:.4f}")


recall_class_0 = evaluator_recall.evaluate(test_prediction, {evaluator_recall.metricLabel: 0})
print(f"Recall for class 0: {recall_class_0:.4f}")
recall_class_1 = evaluator_recall.evaluate(test_prediction, {evaluator_recall.metricLabel: 1})
print(f"Recall for class 1: {recall_class_1:.4f}")
recall_class_2 = evaluator_recall.evaluate(test_prediction, {evaluator_recall.metricLabel: 2})
print(f"Recall for class 2: {recall_class_2:.4f}")

print()

test_crosstab = test_prediction.stat.crosstab("target", "prediction")
test_crosstab.show()

Test Results:
Macro F1: 0.8302
F1 for class 0: 0.8430
F1 for class 1: 0.8562
F1 for class 2: 0.7625
Precision for class 0: 0.7949
Precision for class 1: 0.8990
Precision for class 2: 0.8046
Recall for class 0: 0.8972
Recall for class 1: 0.8173
Recall for class 2: 0.7245

+-----------------+-------+-------+------+
|target_prediction|    0.0|    1.0|   2.0|
+-----------------+-------+-------+------+
|              1.0| 212508|1219723| 60123|
|              0.0|1572668|  81514| 98758|
|              2.0| 193169|  55487|654042|
+-----------------+-------+-------+------+



# DT4

Going to depth 30 since its worked so well. 30 is the maximum allowed depth in PySpark

In [None]:
pipe_stages = [StringIndexer(inputCol=c, outputCol=c+'_index') for c in ['Category', 'EntityType', 'EvidenceRole']]
pipe_stages.append(StringIndexer(inputCol='IncidentGrade', outputCol='target'))

feature_cols = ['Id', 'OrgId', 'IncidentId', 'AlertId', 'DetectorId',
                'AlertTitle', 'Category_index', 'EntityType_index', 'EvidenceRole_index',
                'DeviceId', 'Sha256', 'IpAddress', 'Url', 'AccountSid', 'AccountUpn',
                'AccountObjectId', 'AccountName', 'DeviceName', 'NetworkMessageId',
                'RegistryKey', 'RegistryValueName', 'RegistryValueData', 'ApplicationId',
                'ApplicationName', 'OAuthApplicationId', 'FileName', 'FolderPath',
                'ResourceIdName', 'OSFamily', 'OSVersion', 'CountryCode', 'State',
                'City', 'year', 'month', 'day', 'hour', 'minute', 'second',
                'day_of_week', 'week_of_year']

assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')
pipe_stages.append(assembler)

dt = DecisionTreeClassifier(labelCol="target", featuresCol="features", maxDepth=30, maxBins=33)

pipeline = Pipeline(stages=[timestamp_transformer] + pipe_stages + [dt])

model = pipeline.fit(df_train)

train_prediction = model.transform(df_train)
test_prediction = model.transform(df_test)

evaluator_macrof1 = MulticlassClassificationEvaluator(labelCol="target", predictionCol="prediction", metricName="f1")
evaluator_f1 = MulticlassClassificationEvaluator(labelCol="target", predictionCol="prediction", metricName="fMeasureByLabel")
evaluator_precision = MulticlassClassificationEvaluator(labelCol="target", predictionCol="prediction", metricName="precisionByLabel")
evaluator_recall = MulticlassClassificationEvaluator(labelCol="target", predictionCol="prediction", metricName="recallByLabel")

In [None]:
print("Train Results:")
macrof1 = evaluator_macrof1.evaluate(train_prediction)
print(f"Macro F1: {macrof1:.4f}")

f1_class_0 = evaluator_f1.evaluate(train_prediction, {evaluator_f1.metricLabel: 0})
print(f"F1 for class 0: {f1_class_0:.4f}")
f1_class_1 = evaluator_f1.evaluate(train_prediction, {evaluator_f1.metricLabel: 1})
print(f"F1 for class 1: {f1_class_1:.4f}")
f1_class_2 = evaluator_f1.evaluate(train_prediction, {evaluator_f1.metricLabel: 2})
print(f"F1 for class 2: {f1_class_2:.4f}")

precision_class_0 = evaluator_precision.evaluate(train_prediction, {evaluator_precision.metricLabel: 0})
print(f"Precision for class 0: {precision_class_0:.4f}")
precision_class_1 = evaluator_precision.evaluate(train_prediction, {evaluator_precision.metricLabel: 1})
print(f"Precision for class 1: {precision_class_1:.4f}")
precision_class_2 = evaluator_precision.evaluate(train_prediction, {evaluator_precision.metricLabel: 2})
print(f"Precision for class 2: {precision_class_2:.4f}")


recall_class_0 = evaluator_recall.evaluate(train_prediction, {evaluator_recall.metricLabel: 0})
print(f"Recall for class 0: {recall_class_0:.4f}")
recall_class_1 = evaluator_recall.evaluate(train_prediction, {evaluator_recall.metricLabel: 1})
print(f"Recall for class 1: {recall_class_1:.4f}")
recall_class_2 = evaluator_recall.evaluate(train_prediction, {evaluator_recall.metricLabel: 2})
print(f"Recall for class 2: {recall_class_2:.4f}")

print()

train_crosstab = train_prediction.stat.crosstab("target", "prediction")
train_crosstab.show()

Train Results:
Macro F1: 0.9731
F1 for class 0: 0.9743
F1 for class 1: 0.9760
F1 for class 2: 0.9660
Precision for class 0: 0.9663
Precision for class 1: 0.9812
Precision for class 2: 0.9742
Recall for class 0: 0.9825
Recall for class 1: 0.9709
Recall for class 2: 0.9579

+-----------------+-------+-------+-------+
|target_prediction|    0.0|    1.0|    2.0|
+-----------------+-------+-------+-------+
|              1.0|  75412|3225929|  21372|
|              0.0|4039037|  41647|  30133|
|              2.0|  65466|  20076|1946425|
+-----------------+-------+-------+-------+



In [None]:
print("Test Results:")
macrof1 = evaluator_macrof1.evaluate(test_prediction)
print(f"Macro F1: {macrof1:.4f}")

f1_class_0 = evaluator_f1.evaluate(test_prediction, {evaluator_f1.metricLabel: 0})
print(f"F1 for class 0: {f1_class_0:.4f}")
f1_class_1 = evaluator_f1.evaluate(test_prediction, {evaluator_f1.metricLabel: 1})
print(f"F1 for class 1: {f1_class_1:.4f}")
f1_class_2 = evaluator_f1.evaluate(test_prediction, {evaluator_f1.metricLabel: 2})
print(f"F1 for class 2: {f1_class_2:.4f}")

precision_class_0 = evaluator_precision.evaluate(test_prediction, {evaluator_precision.metricLabel: 0})
print(f"Precision for class 0: {precision_class_0:.4f}")
precision_class_1 = evaluator_precision.evaluate(test_prediction, {evaluator_precision.metricLabel: 1})
print(f"Precision for class 1: {precision_class_1:.4f}")
precision_class_2 = evaluator_precision.evaluate(test_prediction, {evaluator_precision.metricLabel: 2})
print(f"Precision for class 2: {precision_class_2:.4f}")


recall_class_0 = evaluator_recall.evaluate(test_prediction, {evaluator_recall.metricLabel: 0})
print(f"Recall for class 0: {recall_class_0:.4f}")
recall_class_1 = evaluator_recall.evaluate(test_prediction, {evaluator_recall.metricLabel: 1})
print(f"Recall for class 1: {recall_class_1:.4f}")
recall_class_2 = evaluator_recall.evaluate(test_prediction, {evaluator_recall.metricLabel: 2})
print(f"Recall for class 2: {recall_class_2:.4f}")

print()

test_crosstab = test_prediction.stat.crosstab("target", "prediction")
test_crosstab.show()

Test Results:
Macro F1: 0.8572
F1 for class 0: 0.8671
F1 for class 1: 0.8758
F1 for class 2: 0.8075
Precision for class 0: 0.8587
Precision for class 1: 0.8818
Precision for class 2: 0.8140
Recall for class 0: 0.8756
Recall for class 1: 0.8699
Recall for class 2: 0.8010

+-----------------+-------+-------+------+
|target_prediction|    0.0|    1.0|   2.0|
+-----------------+-------+-------+------+
|              1.0| 140694|1298177| 53483|
|              0.0|1534955| 106241|111744|
|              2.0| 111873|  67754|723071|
+-----------------+-------+-------+------+



# DT5

I will try to change min instances now. Going extreme first with 500.

In [None]:
pipe_stages = [StringIndexer(inputCol=c, outputCol=c+'_index') for c in ['Category', 'EntityType', 'EvidenceRole']]
pipe_stages.append(StringIndexer(inputCol='IncidentGrade', outputCol='target'))

feature_cols = ['Id', 'OrgId', 'IncidentId', 'AlertId', 'DetectorId',
                'AlertTitle', 'Category_index', 'EntityType_index', 'EvidenceRole_index',
                'DeviceId', 'Sha256', 'IpAddress', 'Url', 'AccountSid', 'AccountUpn',
                'AccountObjectId', 'AccountName', 'DeviceName', 'NetworkMessageId',
                'RegistryKey', 'RegistryValueName', 'RegistryValueData', 'ApplicationId',
                'ApplicationName', 'OAuthApplicationId', 'FileName', 'FolderPath',
                'ResourceIdName', 'OSFamily', 'OSVersion', 'CountryCode', 'State',
                'City', 'year', 'month', 'day', 'hour', 'minute', 'second',
                'day_of_week', 'week_of_year']

assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')
pipe_stages.append(assembler)

dt = DecisionTreeClassifier(labelCol="target", featuresCol="features", maxDepth=30, minInstancesPerNode=500, maxBins=33)

pipeline = Pipeline(stages=[timestamp_transformer] + pipe_stages + [dt])

model = pipeline.fit(df_train)

train_prediction = model.transform(df_train)
test_prediction = model.transform(df_test)

evaluator_macrof1 = MulticlassClassificationEvaluator(labelCol="target", predictionCol="prediction", metricName="f1")
evaluator_f1 = MulticlassClassificationEvaluator(labelCol="target", predictionCol="prediction", metricName="fMeasureByLabel")
evaluator_precision = MulticlassClassificationEvaluator(labelCol="target", predictionCol="prediction", metricName="precisionByLabel")
evaluator_recall = MulticlassClassificationEvaluator(labelCol="target", predictionCol="prediction", metricName="recallByLabel")

In [None]:
print("Train Results:")
macrof1 = evaluator_macrof1.evaluate(train_prediction)
print(f"Macro F1: {macrof1:.4f}")

f1_class_0 = evaluator_f1.evaluate(train_prediction, {evaluator_f1.metricLabel: 0})
print(f"F1 for class 0: {f1_class_0:.4f}")
f1_class_1 = evaluator_f1.evaluate(train_prediction, {evaluator_f1.metricLabel: 1})
print(f"F1 for class 1: {f1_class_1:.4f}")
f1_class_2 = evaluator_f1.evaluate(train_prediction, {evaluator_f1.metricLabel: 2})
print(f"F1 for class 2: {f1_class_2:.4f}")

precision_class_0 = evaluator_precision.evaluate(train_prediction, {evaluator_precision.metricLabel: 0})
print(f"Precision for class 0: {precision_class_0:.4f}")
precision_class_1 = evaluator_precision.evaluate(train_prediction, {evaluator_precision.metricLabel: 1})
print(f"Precision for class 1: {precision_class_1:.4f}")
precision_class_2 = evaluator_precision.evaluate(train_prediction, {evaluator_precision.metricLabel: 2})
print(f"Precision for class 2: {precision_class_2:.4f}")


recall_class_0 = evaluator_recall.evaluate(train_prediction, {evaluator_recall.metricLabel: 0})
print(f"Recall for class 0: {recall_class_0:.4f}")
recall_class_1 = evaluator_recall.evaluate(train_prediction, {evaluator_recall.metricLabel: 1})
print(f"Recall for class 1: {recall_class_1:.4f}")
recall_class_2 = evaluator_recall.evaluate(train_prediction, {evaluator_recall.metricLabel: 2})
print(f"Recall for class 2: {recall_class_2:.4f}")

print()


Train Results:
Macro F1: 0.9106
F1 for class 0: 0.9187
F1 for class 1: 0.9194
F1 for class 2: 0.8797
Precision for class 0: 0.9026
Precision for class 1: 0.9297
Precision for class 2: 0.8969
Recall for class 0: 0.9353
Recall for class 1: 0.9094
Recall for class 2: 0.8631



In [None]:
print("Test Results:")
macrof1 = evaluator_macrof1.evaluate(test_prediction)
print(f"Macro F1: {macrof1:.4f}")

f1_class_0 = evaluator_f1.evaluate(test_prediction, {evaluator_f1.metricLabel: 0})
print(f"F1 for class 0: {f1_class_0:.4f}")
f1_class_1 = evaluator_f1.evaluate(test_prediction, {evaluator_f1.metricLabel: 1})
print(f"F1 for class 1: {f1_class_1:.4f}")
f1_class_2 = evaluator_f1.evaluate(test_prediction, {evaluator_f1.metricLabel: 2})
print(f"F1 for class 2: {f1_class_2:.4f}")

precision_class_0 = evaluator_precision.evaluate(test_prediction, {evaluator_precision.metricLabel: 0})
print(f"Precision for class 0: {precision_class_0:.4f}")
precision_class_1 = evaluator_precision.evaluate(test_prediction, {evaluator_precision.metricLabel: 1})
print(f"Precision for class 1: {precision_class_1:.4f}")
precision_class_2 = evaluator_precision.evaluate(test_prediction, {evaluator_precision.metricLabel: 2})
print(f"Precision for class 2: {precision_class_2:.4f}")


recall_class_0 = evaluator_recall.evaluate(test_prediction, {evaluator_recall.metricLabel: 0})
print(f"Recall for class 0: {recall_class_0:.4f}")
recall_class_1 = evaluator_recall.evaluate(test_prediction, {evaluator_recall.metricLabel: 1})
print(f"Recall for class 1: {recall_class_1:.4f}")
recall_class_2 = evaluator_recall.evaluate(test_prediction, {evaluator_recall.metricLabel: 2})
print(f"Recall for class 2: {recall_class_2:.4f}")

Test Results:
Macro F1: 0.8592
F1 for class 0: 0.8690
F1 for class 1: 0.8813
F1 for class 2: 0.8038
Precision for class 0: 0.8589
Precision for class 1: 0.8878
Precision for class 2: 0.8128
Recall for class 0: 0.8792
Recall for class 1: 0.8748
Recall for class 2: 0.7951


In [None]:
print('Train')
train_crosstab = train_prediction.stat.crosstab("target", "prediction")
train_crosstab.show()

print('Test')
test_crosstab = test_prediction.stat.crosstab("target", "prediction")
test_crosstab.show()

Train
+-----------------+-------+-------+-------+
|target_prediction|    0.0|    1.0|    2.0|
+-----------------+-------+-------+-------+
|              1.0| 220497|3021774|  80442|
|              0.0|3844862| 144712| 121243|
|              2.0| 194295|  83871|1753801|
+-----------------+-------+-------+-------+

Test
+-----------------+-------+-------+------+
|target_prediction|    0.0|    1.0|   2.0|
+-----------------+-------+-------+------+
|              1.0| 135348|1305558| 51448|
|              0.0|1541268|  97843|113829|
|              2.0| 117836|  67163|717699|
+-----------------+-------+-------+------+



# DT6

Going even heavier into minInstancesPerNode

In [None]:
pipe_stages = [StringIndexer(inputCol=c, outputCol=c+'_index') for c in ['Category', 'EntityType', 'EvidenceRole']]
pipe_stages.append(StringIndexer(inputCol='IncidentGrade', outputCol='target'))

feature_cols = ['Id', 'OrgId', 'IncidentId', 'AlertId', 'DetectorId',
                'AlertTitle', 'Category_index', 'EntityType_index', 'EvidenceRole_index',
                'DeviceId', 'Sha256', 'IpAddress', 'Url', 'AccountSid', 'AccountUpn',
                'AccountObjectId', 'AccountName', 'DeviceName', 'NetworkMessageId',
                'RegistryKey', 'RegistryValueName', 'RegistryValueData', 'ApplicationId',
                'ApplicationName', 'OAuthApplicationId', 'FileName', 'FolderPath',
                'ResourceIdName', 'OSFamily', 'OSVersion', 'CountryCode', 'State',
                'City', 'year', 'month', 'day', 'hour', 'minute', 'second',
                'day_of_week', 'week_of_year']

assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')
pipe_stages.append(assembler)

dt = DecisionTreeClassifier(labelCol="target", featuresCol="features", maxDepth=30, minInstancesPerNode=1000, maxBins=33)

pipeline = Pipeline(stages=[timestamp_transformer] + pipe_stages + [dt])

model = pipeline.fit(df_train)

train_prediction = model.transform(df_train)
test_prediction = model.transform(df_test)

evaluator_macrof1 = MulticlassClassificationEvaluator(labelCol="target", predictionCol="prediction", metricName="f1")
evaluator_f1 = MulticlassClassificationEvaluator(labelCol="target", predictionCol="prediction", metricName="fMeasureByLabel")
evaluator_precision = MulticlassClassificationEvaluator(labelCol="target", predictionCol="prediction", metricName="precisionByLabel")
evaluator_recall = MulticlassClassificationEvaluator(labelCol="target", predictionCol="prediction", metricName="recallByLabel")

In [None]:
print("Train Results:")
macrof1 = evaluator_macrof1.evaluate(train_prediction)
print(f"Macro F1: {macrof1:.4f}")

f1_class_0 = evaluator_f1.evaluate(train_prediction, {evaluator_f1.metricLabel: 0})
print(f"F1 for class 0: {f1_class_0:.4f}")
f1_class_1 = evaluator_f1.evaluate(train_prediction, {evaluator_f1.metricLabel: 1})
print(f"F1 for class 1: {f1_class_1:.4f}")
f1_class_2 = evaluator_f1.evaluate(train_prediction, {evaluator_f1.metricLabel: 2})
print(f"F1 for class 2: {f1_class_2:.4f}")

precision_class_0 = evaluator_precision.evaluate(train_prediction, {evaluator_precision.metricLabel: 0})
print(f"Precision for class 0: {precision_class_0:.4f}")
precision_class_1 = evaluator_precision.evaluate(train_prediction, {evaluator_precision.metricLabel: 1})
print(f"Precision for class 1: {precision_class_1:.4f}")
precision_class_2 = evaluator_precision.evaluate(train_prediction, {evaluator_precision.metricLabel: 2})
print(f"Precision for class 2: {precision_class_2:.4f}")


recall_class_0 = evaluator_recall.evaluate(train_prediction, {evaluator_recall.metricLabel: 0})
print(f"Recall for class 0: {recall_class_0:.4f}")
recall_class_1 = evaluator_recall.evaluate(train_prediction, {evaluator_recall.metricLabel: 1})
print(f"Recall for class 1: {recall_class_1:.4f}")
recall_class_2 = evaluator_recall.evaluate(train_prediction, {evaluator_recall.metricLabel: 2})
print(f"Recall for class 2: {recall_class_2:.4f}")

Train Results:
Macro F1: 0.8963
F1 for class 0: 0.9062
F1 for class 1: 0.9069
F1 for class 2: 0.8588
Precision for class 0: 0.8865
Precision for class 1: 0.9191
Precision for class 2: 0.8811
Recall for class 0: 0.9268
Recall for class 1: 0.8951
Recall for class 2: 0.8376


In [None]:
print("Test Results:")
macrof1 = evaluator_macrof1.evaluate(test_prediction)
print(f"Macro F1: {macrof1:.4f}")

f1_class_0 = evaluator_f1.evaluate(test_prediction, {evaluator_f1.metricLabel: 0})
print(f"F1 for class 0: {f1_class_0:.4f}")
f1_class_1 = evaluator_f1.evaluate(test_prediction, {evaluator_f1.metricLabel: 1})
print(f"F1 for class 1: {f1_class_1:.4f}")
f1_class_2 = evaluator_f1.evaluate(test_prediction, {evaluator_f1.metricLabel: 2})
print(f"F1 for class 2: {f1_class_2:.4f}")

precision_class_0 = evaluator_precision.evaluate(test_prediction, {evaluator_precision.metricLabel: 0})
print(f"Precision for class 0: {precision_class_0:.4f}")
precision_class_1 = evaluator_precision.evaluate(test_prediction, {evaluator_precision.metricLabel: 1})
print(f"Precision for class 1: {precision_class_1:.4f}")
precision_class_2 = evaluator_precision.evaluate(test_prediction, {evaluator_precision.metricLabel: 2})
print(f"Precision for class 2: {precision_class_2:.4f}")


recall_class_0 = evaluator_recall.evaluate(test_prediction, {evaluator_recall.metricLabel: 0})
print(f"Recall for class 0: {recall_class_0:.4f}")
recall_class_1 = evaluator_recall.evaluate(test_prediction, {evaluator_recall.metricLabel: 1})
print(f"Recall for class 1: {recall_class_1:.4f}")
recall_class_2 = evaluator_recall.evaluate(test_prediction, {evaluator_recall.metricLabel: 2})
print(f"Recall for class 2: {recall_class_2:.4f}")

Test Results:
Macro F1: 0.8539
F1 for class 0: 0.8614
F1 for class 1: 0.8760
F1 for class 2: 0.8028
Precision for class 0: 0.8514
Precision for class 1: 0.8803
Precision for class 2: 0.8152
Recall for class 0: 0.8716
Recall for class 1: 0.8717
Recall for class 2: 0.7907


In [None]:
print('Train')
train_crosstab = train_prediction.stat.crosstab("target", "prediction")
train_crosstab.show()

print('Test')
test_crosstab = test_prediction.stat.crosstab("target", "prediction")
test_crosstab.show()

Train
+-----------------+-------+-------+-------+
|target_prediction|    0.0|    1.0|    2.0|
+-----------------+-------+-------+-------+
|              1.0| 251545|2974138|  97030|
|              0.0|3810073| 168041| 132703|
|              2.0| 236247|  93709|1702011|
+-----------------+-------+-------+-------+

Test
+-----------------+-------+-------+------+
|target_prediction|    0.0|    1.0|   2.0|
+-----------------+-------+-------+------+
|              1.0| 142755|1300870| 48729|
|              0.0|1527881| 111988|113071|
|              2.0| 123989|  64963|713746|
+-----------------+-------+-------+------+



# Random Forest Classifier

Trying fewer trees to avoid crashing. Starting with 5 and using best DT results

In [None]:
pipe_stages = [StringIndexer(inputCol=c, outputCol=c+'_index') for c in ['Category', 'EntityType', 'EvidenceRole']]
pipe_stages.append(StringIndexer(inputCol='IncidentGrade', outputCol='target'))

feature_cols = ['Id', 'OrgId', 'IncidentId', 'AlertId', 'DetectorId',
                'AlertTitle', 'Category_index', 'EntityType_index', 'EvidenceRole_index',
                'DeviceId', 'Sha256', 'IpAddress', 'Url', 'AccountSid', 'AccountUpn',
                'AccountObjectId', 'AccountName', 'DeviceName', 'NetworkMessageId',
                'RegistryKey', 'RegistryValueName', 'RegistryValueData', 'ApplicationId',
                'ApplicationName', 'OAuthApplicationId', 'FileName', 'FolderPath',
                'ResourceIdName', 'OSFamily', 'OSVersion', 'CountryCode', 'State',
                'City', 'year', 'month', 'day', 'hour', 'minute', 'second',
                'day_of_week', 'week_of_year']

assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')
pipe_stages.append(assembler)

rf = RandomForestClassifier(labelCol="target", featuresCol="features", numTrees=5, maxDepth=30, minInstancesPerNode=500, maxBins=33)

pipeline = Pipeline(stages=[timestamp_transformer] + pipe_stages + [rf])

model = pipeline.fit(df_train)

train_prediction = model.transform(df_train)
test_prediction = model.transform(df_test)

evaluator_macrof1 = MulticlassClassificationEvaluator(labelCol="target", predictionCol="prediction", metricName="f1")
evaluator_f1 = MulticlassClassificationEvaluator(labelCol="target", predictionCol="prediction", metricName="fMeasureByLabel")
evaluator_precision = MulticlassClassificationEvaluator(labelCol="target", predictionCol="prediction", metricName="precisionByLabel")
evaluator_recall = MulticlassClassificationEvaluator(labelCol="target", predictionCol="prediction", metricName="recallByLabel")

In [None]:
print("Train Results:")
macrof1 = evaluator_macrof1.evaluate(train_prediction)
print(f"Macro F1: {macrof1:.4f}")

f1_class_0 = evaluator_f1.evaluate(train_prediction, {evaluator_f1.metricLabel: 0})
print(f"F1 for class 0: {f1_class_0:.4f}")
f1_class_1 = evaluator_f1.evaluate(train_prediction, {evaluator_f1.metricLabel: 1})
print(f"F1 for class 1: {f1_class_1:.4f}")
f1_class_2 = evaluator_f1.evaluate(train_prediction, {evaluator_f1.metricLabel: 2})
print(f"F1 for class 2: {f1_class_2:.4f}")

precision_class_0 = evaluator_precision.evaluate(train_prediction, {evaluator_precision.metricLabel: 0})
print(f"Precision for class 0: {precision_class_0:.4f}")
precision_class_1 = evaluator_precision.evaluate(train_prediction, {evaluator_precision.metricLabel: 1})
print(f"Precision for class 1: {precision_class_1:.4f}")
precision_class_2 = evaluator_precision.evaluate(train_prediction, {evaluator_precision.metricLabel: 2})
print(f"Precision for class 2: {precision_class_2:.4f}")


recall_class_0 = evaluator_recall.evaluate(train_prediction, {evaluator_recall.metricLabel: 0})
print(f"Recall for class 0: {recall_class_0:.4f}")
recall_class_1 = evaluator_recall.evaluate(train_prediction, {evaluator_recall.metricLabel: 1})
print(f"Recall for class 1: {recall_class_1:.4f}")
recall_class_2 = evaluator_recall.evaluate(train_prediction, {evaluator_recall.metricLabel: 2})
print(f"Recall for class 2: {recall_class_2:.4f}")

print()

train_crosstab = train_prediction.stat.crosstab("target", "prediction")
train_crosstab.show()

Train Results:
Macro F1: 0.8723
F1 for class 0: 0.8800
F1 for class 1: 0.8885
F1 for class 2: 0.8300
Precision for class 0: 0.8161
Precision for class 1: 0.9393
Precision for class 2: 0.9185
Recall for class 0: 0.9547
Recall for class 1: 0.8429
Recall for class 2: 0.7571

+-----------------+-------+-------+-------+
|target_prediction|    0.0|    1.0|    2.0|
+-----------------+-------+-------+-------+
|              1.0| 467680|2800562|  54471|
|              0.0|3924795| 103981|  82041|
|              2.0| 416672|  76846|1538449|
+-----------------+-------+-------+-------+



In [None]:
print("Test Results:")
macrof1 = evaluator_macrof1.evaluate(test_prediction)
print(f"Macro F1: {macrof1:.4f}")

f1_class_0 = evaluator_f1.evaluate(test_prediction, {evaluator_f1.metricLabel: 0})
print(f"F1 for class 0: {f1_class_0:.4f}")
f1_class_1 = evaluator_f1.evaluate(test_prediction, {evaluator_f1.metricLabel: 1})
print(f"F1 for class 1: {f1_class_1:.4f}")
f1_class_2 = evaluator_f1.evaluate(test_prediction, {evaluator_f1.metricLabel: 2})
print(f"F1 for class 2: {f1_class_2:.4f}")

precision_class_0 = evaluator_precision.evaluate(test_prediction, {evaluator_precision.metricLabel: 0})
print(f"Precision for class 0: {precision_class_0:.4f}")
precision_class_1 = evaluator_precision.evaluate(test_prediction, {evaluator_precision.metricLabel: 1})
print(f"Precision for class 1: {precision_class_1:.4f}")
precision_class_2 = evaluator_precision.evaluate(test_prediction, {evaluator_precision.metricLabel: 2})
print(f"Precision for class 2: {precision_class_2:.4f}")


recall_class_0 = evaluator_recall.evaluate(test_prediction, {evaluator_recall.metricLabel: 0})
print(f"Recall for class 0: {recall_class_0:.4f}")
recall_class_1 = evaluator_recall.evaluate(test_prediction, {evaluator_recall.metricLabel: 1})
print(f"Recall for class 1: {recall_class_1:.4f}")
recall_class_2 = evaluator_recall.evaluate(test_prediction, {evaluator_recall.metricLabel: 2})
print(f"Recall for class 2: {recall_class_2:.4f}")

print()

test_crosstab = test_prediction.stat.crosstab("target", "prediction")
test_crosstab.show()

Test Results:
Macro F1: 0.8389
F1 for class 0: 0.8463
F1 for class 1: 0.8693
F1 for class 2: 0.7741
Precision for class 0: 0.7859
Precision for class 1: 0.9182
Precision for class 2: 0.8459
Recall for class 0: 0.9168
Recall for class 1: 0.8254
Recall for class 2: 0.7135

+-----------------+-------+-------+------+
|target_prediction|    0.0|    1.0|   2.0|
+-----------------+-------+-------+------+
|              1.0| 226172|1231826| 34356|
|              0.0|1607143|  62860| 82937|
|              2.0| 211694|  46908|644096|
+-----------------+-------+-------+------+



# Logistic Regression

In [None]:
pipe_stages = [StringIndexer(inputCol=c, outputCol=c+'_index') for c in ['Category', 'EntityType', 'EvidenceRole']]
pipe_stages.append(StringIndexer(inputCol='IncidentGrade', outputCol='target'))

pipe_stages.append(OneHotEncoder(inputCols=[c+'_index' for c in ['Category', 'EntityType', 'EvidenceRole']], outputCols=[c+'_ohe' for c in ['Category', 'EntityType', 'EvidenceRole']]))

feature_cols = ['Id', 'OrgId', 'IncidentId', 'AlertId', 'DetectorId',
                'AlertTitle', 'Category_ohe', 'EntityType_ohe', 'EvidenceRole_ohe',
                'DeviceId', 'Sha256', 'IpAddress', 'Url', 'AccountSid', 'AccountUpn',
                'AccountObjectId', 'AccountName', 'DeviceName', 'NetworkMessageId',
                'RegistryKey', 'RegistryValueName', 'RegistryValueData', 'ApplicationId',
                'ApplicationName', 'OAuthApplicationId', 'FileName', 'FolderPath',
                'ResourceIdName', 'OSFamily', 'OSVersion', 'CountryCode', 'State',
                'City', 'year', 'month', 'day', 'hour', 'minute', 'second',
                'day_of_week', 'week_of_year']



assembler = VectorAssembler(inputCols=feature_cols, outputCol='features_raw')

scaler = MinMaxScaler(inputCol="features_raw", outputCol="features")

lr = LogisticRegression(featuresCol='features', labelCol='target')

pipeline = Pipeline(stages=[timestamp_transformer] + pipe_stages + [assembler, scaler, lr])

model = pipeline.fit(df_train)

In [None]:
train_prediction = model.transform(df_train)
test_prediction = model.transform(df_test)

In [None]:
evaluator_macrof1 = MulticlassClassificationEvaluator(labelCol="target", predictionCol="prediction", metricName="f1")
evaluator_f1 = MulticlassClassificationEvaluator(labelCol="target", predictionCol="prediction", metricName="fMeasureByLabel")
evaluator_precision = MulticlassClassificationEvaluator(labelCol="target", predictionCol="prediction", metricName="precisionByLabel")
evaluator_recall = MulticlassClassificationEvaluator(labelCol="target", predictionCol="prediction", metricName="recallByLabel")

In [None]:
print("Train Results:")
macrof1 = evaluator_macrof1.evaluate(train_prediction)
print(f"Macro F1: {macrof1:.4f}")

f1_class_0 = evaluator_f1.evaluate(train_prediction, {evaluator_f1.metricLabel: 0})
print(f"F1 for class 0: {f1_class_0:.4f}")
f1_class_1 = evaluator_f1.evaluate(train_prediction, {evaluator_f1.metricLabel: 1})
print(f"F1 for class 1: {f1_class_1:.4f}")
f1_class_2 = evaluator_f1.evaluate(train_prediction, {evaluator_f1.metricLabel: 2})
print(f"F1 for class 2: {f1_class_2:.4f}")

precision_class_0 = evaluator_precision.evaluate(train_prediction, {evaluator_precision.metricLabel: 0})
print(f"Precision for class 0: {precision_class_0:.4f}")
precision_class_1 = evaluator_precision.evaluate(train_prediction, {evaluator_precision.metricLabel: 1})
print(f"Precision for class 1: {precision_class_1:.4f}")
precision_class_2 = evaluator_precision.evaluate(train_prediction, {evaluator_precision.metricLabel: 2})
print(f"Precision for class 2: {precision_class_2:.4f}")


recall_class_0 = evaluator_recall.evaluate(train_prediction, {evaluator_recall.metricLabel: 0})
print(f"Recall for class 0: {recall_class_0:.4f}")
recall_class_1 = evaluator_recall.evaluate(train_prediction, {evaluator_recall.metricLabel: 1})
print(f"Recall for class 1: {recall_class_1:.4f}")
recall_class_2 = evaluator_recall.evaluate(train_prediction, {evaluator_recall.metricLabel: 2})
print(f"Recall for class 2: {recall_class_2:.4f}")

print()

train_crosstab = train_prediction.stat.crosstab("target", "prediction")
train_crosstab.show()

Train Results:
Macro F1: 0.6114
F1 for class 0: 0.7038
F1 for class 1: 0.6705
F1 for class 2: 0.3278
Precision for class 0: 0.6038
Precision for class 1: 0.7070
Precision for class 2: 0.6219
Recall for class 0: 0.8434
Recall for class 1: 0.6376
Recall for class 2: 0.2225

+-----------------+-------+-------+------+
|target_prediction|    0.0|    1.0|   2.0|
+-----------------+-------+-------+------+
|              1.0|1095921|2118666|108126|
|              0.0|3466871| 477215|166731|
|              2.0|1178822| 401008|452137|
+-----------------+-------+-------+------+



In [None]:
print("Test Results:")
macrof1 = evaluator_macrof1.evaluate(test_prediction)
print(f"Macro F1: {macrof1:.4f}")

f1_class_0 = evaluator_f1.evaluate(test_prediction, {evaluator_f1.metricLabel: 0})
print(f"F1 for class 0: {f1_class_0:.4f}")
f1_class_1 = evaluator_f1.evaluate(test_prediction, {evaluator_f1.metricLabel: 1})
print(f"F1 for class 1: {f1_class_1:.4f}")
f1_class_2 = evaluator_f1.evaluate(test_prediction, {evaluator_f1.metricLabel: 2})
print(f"F1 for class 2: {f1_class_2:.4f}")

precision_class_0 = evaluator_precision.evaluate(test_prediction, {evaluator_precision.metricLabel: 0})
print(f"Precision for class 0: {precision_class_0:.4f}")
precision_class_1 = evaluator_precision.evaluate(test_prediction, {evaluator_precision.metricLabel: 1})
print(f"Precision for class 1: {precision_class_1:.4f}")
precision_class_2 = evaluator_precision.evaluate(test_prediction, {evaluator_precision.metricLabel: 2})
print(f"Precision for class 2: {precision_class_2:.4f}")


recall_class_0 = evaluator_recall.evaluate(test_prediction, {evaluator_recall.metricLabel: 0})
print(f"Recall for class 0: {recall_class_0:.4f}")
recall_class_1 = evaluator_recall.evaluate(test_prediction, {evaluator_recall.metricLabel: 1})
print(f"Recall for class 1: {recall_class_1:.4f}")
recall_class_2 = evaluator_recall.evaluate(test_prediction, {evaluator_recall.metricLabel: 2})
print(f"Recall for class 2: {recall_class_2:.4f}")

print()

test_crosstab = test_prediction.stat.crosstab("target", "prediction")
test_crosstab.show()

Test Results:
Macro F1: 0.6276
F1 for class 0: 0.7032
F1 for class 1: 0.6964
F1 for class 2: 0.3670
Precision for class 0: 0.6051
Precision for class 1: 0.7306
Precision for class 2: 0.6469
Recall for class 0: 0.8393
Recall for class 1: 0.6654
Recall for class 2: 0.2562

+-----------------+-------+------+------+
|target_prediction|    0.0|   1.0|   2.0|
+-----------------+-------+------+------+
|              1.0| 452870|992938| 46546|
|              0.0|1471230|202010| 79700|
|              2.0| 507275|164176|231247|
+-----------------+-------+------+------+



In [None]:
test_prediction.show(5)

+-------------+-----+----------+-------+-------------------+----------+----------+-----------------+--------------+-----------------+------------+--------+------+---------+------+----------+----------+---------------+-----------+----------+----------------+-----------+-----------------+-----------------+-------------+---------------+------------------+--------+----------+--------------+--------+---------+-----------+-----+-----+----+-----+---+----+------+------+-----------+------------+--------------+----------------+------------------+------+---------------+---------------+----------------+--------------------+--------------------+--------------------+--------------------+----------+
|           Id|OrgId|IncidentId|AlertId|          Timestamp|DetectorId|AlertTitle|         Category| IncidentGrade|       EntityType|EvidenceRole|DeviceId|Sha256|IpAddress|   Url|AccountSid|AccountUpn|AccountObjectId|AccountName|DeviceName|NetworkMessageId|RegistryKey|RegistryValueName|RegistryValueData|A

# LR with Robust Scaler

In [5]:
pipe_stages = [StringIndexer(inputCol=c, outputCol=c+'_index') for c in ['Category', 'EntityType', 'EvidenceRole']]
pipe_stages.append(StringIndexer(inputCol='IncidentGrade', outputCol='target'))

pipe_stages.append(OneHotEncoder(inputCols=[c+'_index' for c in ['Category', 'EntityType', 'EvidenceRole']], outputCols=[c+'_ohe' for c in ['Category', 'EntityType', 'EvidenceRole']]))

feature_cols = ['Id', 'OrgId', 'IncidentId', 'AlertId', 'DetectorId',
                'AlertTitle', 'Category_ohe', 'EntityType_ohe', 'EvidenceRole_ohe',
                'DeviceId', 'Sha256', 'IpAddress', 'Url', 'AccountSid', 'AccountUpn',
                'AccountObjectId', 'AccountName', 'DeviceName', 'NetworkMessageId',
                'RegistryKey', 'RegistryValueName', 'RegistryValueData', 'ApplicationId',
                'ApplicationName', 'OAuthApplicationId', 'FileName', 'FolderPath',
                'ResourceIdName', 'OSFamily', 'OSVersion', 'CountryCode', 'State',
                'City', 'year', 'month', 'day', 'hour', 'minute', 'second',
                'day_of_week', 'week_of_year']



assembler = VectorAssembler(inputCols=feature_cols, outputCol='features_raw')

scaler = RobustScaler(inputCol="features_raw", outputCol="features")

lr = LogisticRegression(featuresCol='features', labelCol='target')

pipeline = Pipeline(stages=[timestamp_transformer] + pipe_stages + [assembler, scaler, lr])

model = pipeline.fit(df_train)

train_prediction = model.transform(df_train)
test_prediction = model.transform(df_test)

evaluator_macrof1 = MulticlassClassificationEvaluator(labelCol="target", predictionCol="prediction", metricName="f1")
evaluator_f1 = MulticlassClassificationEvaluator(labelCol="target", predictionCol="prediction", metricName="fMeasureByLabel")
evaluator_precision = MulticlassClassificationEvaluator(labelCol="target", predictionCol="prediction", metricName="precisionByLabel")
evaluator_recall = MulticlassClassificationEvaluator(labelCol="target", predictionCol="prediction", metricName="recallByLabel")

In [6]:
print("Train Results:")
macrof1 = evaluator_macrof1.evaluate(train_prediction)
print(f"Macro F1: {macrof1:.4f}")

f1_class_0 = evaluator_f1.evaluate(train_prediction, {evaluator_f1.metricLabel: 0})
print(f"F1 for class 0: {f1_class_0:.4f}")
f1_class_1 = evaluator_f1.evaluate(train_prediction, {evaluator_f1.metricLabel: 1})
print(f"F1 for class 1: {f1_class_1:.4f}")
f1_class_2 = evaluator_f1.evaluate(train_prediction, {evaluator_f1.metricLabel: 2})
print(f"F1 for class 2: {f1_class_2:.4f}")

precision_class_0 = evaluator_precision.evaluate(train_prediction, {evaluator_precision.metricLabel: 0})
print(f"Precision for class 0: {precision_class_0:.4f}")
precision_class_1 = evaluator_precision.evaluate(train_prediction, {evaluator_precision.metricLabel: 1})
print(f"Precision for class 1: {precision_class_1:.4f}")
precision_class_2 = evaluator_precision.evaluate(train_prediction, {evaluator_precision.metricLabel: 2})
print(f"Precision for class 2: {precision_class_2:.4f}")


recall_class_0 = evaluator_recall.evaluate(train_prediction, {evaluator_recall.metricLabel: 0})
print(f"Recall for class 0: {recall_class_0:.4f}")
recall_class_1 = evaluator_recall.evaluate(train_prediction, {evaluator_recall.metricLabel: 1})
print(f"Recall for class 1: {recall_class_1:.4f}")
recall_class_2 = evaluator_recall.evaluate(train_prediction, {evaluator_recall.metricLabel: 2})
print(f"Recall for class 2: {recall_class_2:.4f}")

print()

train_crosstab = train_prediction.stat.crosstab("target", "prediction")
train_crosstab.show()

Train Results:
Macro F1: 0.5612
F1 for class 0: 0.6680
F1 for class 1: 0.6177
F1 for class 2: 0.2525
Precision for class 0: 0.5906
Precision for class 1: 0.6017
Precision for class 2: 0.5472
Recall for class 0: 0.7689
Recall for class 1: 0.6346
Recall for class 2: 0.1641

+-----------------+-------+-------+------+
|target_prediction|    0.0|    1.0|   2.0|
+-----------------+-------+-------+------+
|              1.0|1084913|2108498|129302|
|              0.0|3160607| 803503|146707|
|              2.0|1106169| 592267|333531|
+-----------------+-------+-------+------+



In [7]:
print("Test Results:")
macrof1 = evaluator_macrof1.evaluate(test_prediction)
print(f"Macro F1: {macrof1:.4f}")

f1_class_0 = evaluator_f1.evaluate(test_prediction, {evaluator_f1.metricLabel: 0})
print(f"F1 for class 0: {f1_class_0:.4f}")
f1_class_1 = evaluator_f1.evaluate(test_prediction, {evaluator_f1.metricLabel: 1})
print(f"F1 for class 1: {f1_class_1:.4f}")
f1_class_2 = evaluator_f1.evaluate(test_prediction, {evaluator_f1.metricLabel: 2})
print(f"F1 for class 2: {f1_class_2:.4f}")

precision_class_0 = evaluator_precision.evaluate(test_prediction, {evaluator_precision.metricLabel: 0})
print(f"Precision for class 0: {precision_class_0:.4f}")
precision_class_1 = evaluator_precision.evaluate(test_prediction, {evaluator_precision.metricLabel: 1})
print(f"Precision for class 1: {precision_class_1:.4f}")
precision_class_2 = evaluator_precision.evaluate(test_prediction, {evaluator_precision.metricLabel: 2})
print(f"Precision for class 2: {precision_class_2:.4f}")


recall_class_0 = evaluator_recall.evaluate(test_prediction, {evaluator_recall.metricLabel: 0})
print(f"Recall for class 0: {recall_class_0:.4f}")
recall_class_1 = evaluator_recall.evaluate(test_prediction, {evaluator_recall.metricLabel: 1})
print(f"Recall for class 1: {recall_class_1:.4f}")
recall_class_2 = evaluator_recall.evaluate(test_prediction, {evaluator_recall.metricLabel: 2})
print(f"Recall for class 2: {recall_class_2:.4f}")

print()

test_crosstab = test_prediction.stat.crosstab("target", "prediction")
test_crosstab.show()

Test Results:
Macro F1: 0.5664
F1 for class 0: 0.6680
F1 for class 1: 0.6362
F1 for class 2: 0.2537
Precision for class 0: 0.5925
Precision for class 1: 0.6109
Precision for class 2: 0.5635
Recall for class 0: 0.7655
Recall for class 1: 0.6637
Recall for class 2: 0.1637

+-----------------+-------+------+------+
|target_prediction|    0.0|   1.0|   2.0|
+-----------------+-------+------+------+
|              1.0| 446995|990427| 54932|
|              0.0|1341808|351588| 59544|
|              2.0| 475791|279134|147773|
+-----------------+-------+------+------+

