# ML on Electra Dataset using PySpark
###### *Samruddhi Khairnar*

#### 1. Downloading the Electra Dataset

In [None]:
!wget http://perception.inf.um.es/ICS-datasets/csv/electra_modbus.zip

--2023-05-30 15:40:02--  http://perception.inf.um.es/ICS-datasets/csv/electra_modbus.zip
Resolving perception.inf.um.es (perception.inf.um.es)... 155.54.204.141
Connecting to perception.inf.um.es (perception.inf.um.es)|155.54.204.141|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 57856090 (55M) [application/zip]
Saving to: ‘electra_modbus.zip.1’


2023-05-30 15:40:04 (21.9 MB/s) - ‘electra_modbus.zip.1’ saved [57856090/57856090]



In [None]:
# NOTE(review): the extraction cell below calls the shell command `unzip`, not a
# Python module, so this PyPI package looks unused — confirm and drop if so.
!pip install unzip

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
!unzip electra_modbus.zip

Archive:  electra_modbus.zip
replace electra_modbus.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

#### 2. Loading the Electra Dataset

In [None]:
import pandas as pd

In [None]:
# Load the extracted CSV into a pandas DataFrame for a first look at the data.
df_p = pd.read_csv(filepath_or_buffer='electra_modbus.csv')

In [None]:
# Total number of missing cells: per-column counts first, then summed together.
df_p.isna().sum().sum()

0

#### 3. Printing the unique values for the differentiating features

![](https://drive.google.com/uc?export=view&id=1hm0dXZyhNaY9qjRnND4g0iiO-RRVux8J)

In [None]:
# Number the distinct `smac` values in order of first appearance.
dict(enumerate(df_p['smac'].unique()))

{0: '00:1b:1b:c1:41:1b',
 1: '08:00:27:79:b0:4a',
 2: '00:0e:8c:e1:dd:58',
 3: '00:0e:8c:e1:de:9c'}

In [None]:
# Number the distinct `dmac` values in order of first appearance.
dict(enumerate(df_p['dmac'].unique()))

{0: '08:00:27:79:b0:4a',
 1: '00:0e:8c:e1:dd:58',
 2: '00:0e:8c:e1:de:9c',
 3: '00:1b:1b:c1:41:1b'}

In [None]:
# Number the distinct `sip` values in order of first appearance.
dict(enumerate(df_p['sip'].unique()))

{0: '10.70.38.51', 1: '10.70.38.55', 2: '10.70.38.56', 3: '10.70.38.131'}

In [None]:
# Number the distinct `dip` values in order of first appearance.
dict(enumerate(df_p['dip'].unique()))

{0: '10.70.38.55', 1: '10.70.38.56', 2: '10.70.38.51', 3: '10.70.38.131'}

In [None]:
# Number the distinct `label` values (the target classes) in order of first appearance.
dict(enumerate(df_p['label'].unique()))

{0: 'MITM_UNALTERED',
 1: 'NORMAL',
 2: 'RESPONSE_ATTACK',
 3: 'FORCE_ERROR_ATTACK',
 4: 'RECOGNITION_ATTACK',
 5: 'WRITE_ATTACK',
 6: 'READ_ATTACK',
 7: 'REPLAY_ATTACK'}

#### 4. Loading the dataset using Pyspark

In [None]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import pyspark
from pyspark.ml import Pipeline
from pyspark.ml import PipelineModel
from pyspark.ml.classification import LinearSVC, OneVsRest, OneVsRestModel
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType

#### 5. Creating a Spark session

In [None]:
# Local-mode Spark session named "electra"; getOrCreate() reuses a session if one exists.
spark = (
    SparkSession.builder
    .master("local")
    .appName("electra")
    .getOrCreate()
)

#### 6. Creating a Schema to load the CSV dataset

In [None]:
# Explicit schema for the CSV: four string address columns, a string label,
# and integer columns for everything else.
electraSchema = (
    StructType()
    .add("Time", "integer")
    .add("smac", "string")
    .add("dmac", "string")
    .add("sip", "string")
    .add("dip", "string")
    .add("request", "integer")
    .add("fc", "integer")
    .add("error", "integer")
    .add("address", "integer")
    .add("data", "integer")
    .add("label", "string")
)

#### 7. Loading the electra-modbus.csv, from the electra directory in HDFS
##### Passing the directory path is enough; the file name does not need to be specified

In [None]:
# Read the CSV (first row is the header) from the HDFS directory using the explicit schema.
data = (
    spark.read
    .option("header", "true")
    .csv("hdfs://localhost:9000/electra", schema=electraSchema)
)

In [None]:
# Peek at the first row to check that the schema was applied as expected.
data.show(n=1)

+----+-----------------+-----------------+-----------+-----------+-------+---+-----+-------+----+--------------+
|Time|             smac|             dmac|        sip|        dip|request| fc|error|address|data|         label|
+----+-----------------+-----------------+-----------+-----------+-------+---+-----+-------+----+--------------+
|   0|00:1b:1b:c1:41:1b|08:00:27:79:b0:4a|10.70.38.51|10.70.38.55|      1|  3|    0|      1|  68|MITM_UNALTERED|
+----+-----------------+-----------------+-----------+-----------+-------+---+-----+-------+----+--------------+
only showing top 1 row



#### 8. Pipeline for ML training - one-hot encoding + training an SVM using the OneVsRest approach

In [None]:
# Stage 1: replace every string column (including the target) with a numeric index column.
indexer = StringIndexer(
    inputCols=['smac', 'dmac', 'sip', 'dip', 'label'],
    outputCols=['smac_d', 'dmac_d', 'sip_d', 'dip_d', 'label_d'],
)

# Stage 2: one-hot encode the indexed feature columns (the target index is left as-is).
encoder = OneHotEncoder(
    inputCols=['smac_d', 'dmac_d', 'sip_d', 'dip_d'],
    outputCols=['smac_en', 'dmac_en', 'sip_en', 'dip_en'],
)

# Stage 3: pack the numeric and one-hot columns into a single `features` vector;
# rows with invalid values are skipped.
va = VectorAssembler(
    inputCols=[
        'Time',
        'smac_en', 'dmac_en', 'sip_en', 'dip_en',
        'request', 'fc', 'error', 'address', 'data',
    ],
    outputCol='features',
    handleInvalid='skip',
)

# Stage 4: linear SVM wrapped in OneVsRest, trained against the indexed label.
lsvc = LinearSVC()
ovr = OneVsRest(classifier=lsvc, featuresCol='features', labelCol="label_d")

In [None]:
# Chain the stages in execution order: index -> one-hot encode -> assemble -> classify.
pipeline = Pipeline(
    stages=[indexer, encoder, va, ovr],
)

In [None]:
# Fit all pipeline stages on the complete `data` frame.
model = pipeline.fit(dataset=data)

In [None]:
# Display the first row of `data`.
data.show(n=1)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|(18,[5,9,13,14,16...|  1.0|
+--------------------+-----+
only showing top 1 row



#### 9. Saving the trained model for prediction

In [None]:
# Persist the fitted pipeline under 'electra-model', replacing any earlier save.
(
    model.write()
    .overwrite()
    .save('electra-model')
)

#### 10. Evaluating the model accuracy



In [None]:
# Seeded 90/10 split so the evaluation is reproducible across runs.
train, test = data.randomSplit([0.9, 0.1], seed=42)

# The model saved above was fitted on ALL of `data`, which includes the rows that
# end up in `test`. Refit the same pipeline on `train` only, so the accuracy
# reported below is measured on rows the model has never seen.
model = pipeline.fit(train)

In [None]:
# Preview the first ten rows of the training split.
train.show(n=10)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|(18,[0,1,4,7,10,1...|  0.0|
|(18,[0,1,4,7,10,1...|  0.0|
|(18,[0,1,4,7,10,1...|  0.0|
|(18,[0,1,4,7,10,1...|  0.0|
|(18,[0,1,4,7,10,1...|  0.0|
|(18,[0,1,4,7,10,1...|  0.0|
|(18,[0,1,4,7,10,1...|  0.0|
|(18,[0,1,4,7,10,1...|  0.0|
|(18,[0,1,4,7,10,1...|  0.0|
|(18,[0,1,4,7,10,1...|  0.0|
+--------------------+-----+
only showing top 10 rows



In [None]:
# Run the fitted pipeline on the held-out rows; this adds the `prediction` column.
pred = model.transform(test)

# `label` is the original string class; `label_d` is the numeric index produced by
# the StringIndexer stage and is what the classifier was trained against.
pred.select("features", "label", "label_d", "prediction").show(10)

# The evaluator must compare `prediction` with the numeric `label_d` column; its
# default labelCol ('label') is the raw string column. The variable is named
# `evaluator` so that it does not shadow the built-in `eval`.
evaluator = MulticlassClassificationEvaluator(
    labelCol='label_d',
    predictionCol='prediction',
    metricName='accuracy',
)
accuracy = evaluator.evaluate(pred)
print('Test accuracy = ',accuracy)

+--------------------+-----+----------+
|            features|label|prediction|
+--------------------+-----+----------+
|(18,[0,1,4,7,10,1...|  0.0|       0.0|
|(18,[0,1,4,7,10,1...|  0.0|       0.0|
|(18,[0,1,4,7,10,1...|  0.0|       0.0|
|(18,[0,1,4,7,10,1...|  0.0|       0.0|
|(18,[0,1,4,7,10,1...|  0.0|       0.0|
|(18,[0,1,4,7,10,1...|  0.0|       0.0|
|(18,[0,1,4,7,10,1...|  0.0|       0.0|
|(18,[0,1,4,7,10,1...|  0.0|       0.0|
|(18,[0,1,4,7,10,1...|  0.0|       0.0|
|(18,[0,1,4,7,10,1...|  0.0|       0.0|
+--------------------+-----+----------+
only showing top 10 rows

Test accuracy =  0.9975959398094559


#### 11. Loading the trained model

In [None]:
# 'electra-model' was written from the result of `pipeline.fit(...)`, i.e. a whole
# fitted pipeline (indexer, encoder, assembler and classifier), so it has to be
# loaded back as a PipelineModel rather than as a bare OneVsRestModel.
model = PipelineModel.load('electra-model')