In [1]:
from pyspark.sql import SparkSession
# Obtain the SparkSession used by the rest of the notebook via the builder's
# getOrCreate() entry point.
spark = SparkSession.builder.getOrCreate()

In [2]:
from pyspark import SparkFiles
# Remote location of the ASA log CSV.
url="https://raw.githubusercontent.com/IPGreene/FW-Neural-net/master/ASA_log.csv"
# Register the remote file with the Spark context, then look it up by file name.
spark.sparkContext.addFile(url)
# header=True: column names come from the first CSV row. No schema is supplied here;
# the numeric columns are cast explicitly in a later cell.
data = spark.read.csv(SparkFiles.get("ASA_log.csv"), header=True)
# Inspect the data. The original note is cut off after "remove categorydescription and even".
# NOTE(review): it presumably refers to the description columns dropped later - confirm.

In [5]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler, StandardScaler
from pyspark.sql.functions import udf
from pyspark.sql.types import *
from pyspark.sql.functions import rand
from sklearn.model_selection import train_test_split
import pandas as pd

In [6]:
# Expose the raw frame as the temp view "firewall" and read it back through Spark SQL.
data.createOrReplaceTempView("firewall")
data = spark.sql('SELECT * FROM firewall')

# Columns that are cast to IntegerType, one withColumn call per column.
# Replacing an existing column keeps its position, so column order is unchanged.
int_columns = [
    'sourcePort',
    'destinationPort',
    'deviceId',
    'event_category',
    'relevance',
    'credibility',
    'severity',
    'magnitude',
]
for col_name in int_columns:
    data = data.withColumn(col_name, data[col_name].cast(IntegerType()))

# eventCount is not cast because it is dropped here together with the
# timestamp and the two free-text description columns.
data = data.drop('Event_DateTime', 'categoryDescription', 'eventDescription', 'eventCount')

# Fill missing values with the literal 'Unknown'.
# NOTE(review): a string fill value should only affect string-typed columns - confirm against fillna docs.
data = data.fillna('Unknown')
data.show()

+----------+---------------+------------+-------------+--------+--------------+---------+-----------+--------+---------+
|sourcePort|destinationPort|protocolName|        IPgeo|deviceId|event_category|relevance|credibility|severity|magnitude|
+----------+---------------+------------+-------------+--------+--------------+---------+-----------+--------+---------+
|     52217|           2000|      tcp_ip|      Unknown|   31410|          5010|        8|         10|       9|        9|
|     51405|             80|      tcp_ip|      Unknown|   31410|          7024|        8|         10|       1|        6|
|     36002|            445|      tcp_ip|      Unknown|   31410|          7024|        8|         10|       1|        6|
|     35074|            445|      tcp_ip|      Unknown|   31410|          7024|        8|         10|       1|        6|
|     55631|            443|      tcp_ip|United States|   31410|          4002|       10|         10|       0|        6|
|     55991|            443|    

In [7]:
# String-valued columns that get replaced by numeric label indices.
str_col = ['protocolName', 'IPgeo']

# One StringIndexer stage per string column, each writing to "<column>_index".
# handleInvalid="keep" is passed through unchanged (see StringIndexer docs for its semantics).
indexer = list(map(
    lambda name: StringIndexer(
        inputCol=name,
        outputCol="{0}_index".format(name),
        handleInvalid="keep",
    ),
    str_col,
))

# Fit all indexers on the full frame, apply them, then discard the original string columns.
pipeline = Pipeline(stages=indexer)
model = pipeline.fit(data)
transformed = model.transform(data)
transformed = transformed.drop(*str_col)



In [8]:
# Show column names and types of the indexed frame (string columns now replaced by *_index).
transformed.printSchema()

root
 |-- sourcePort: integer (nullable = true)
 |-- destinationPort: integer (nullable = true)
 |-- deviceId: integer (nullable = true)
 |-- event_category: integer (nullable = true)
 |-- relevance: integer (nullable = true)
 |-- credibility: integer (nullable = true)
 |-- severity: integer (nullable = true)
 |-- magnitude: integer (nullable = true)
 |-- protocolName_index: double (nullable = false)
 |-- IPgeo_index: double (nullable = false)



In [9]:
# Random split with weights 0.70 / 0.30; the fixed seed keeps the split repeatable.
train, test = transformed.randomSplit([0.70, 0.30], seed=1234)

# event_category is the target; every other column is a feature.
x_train, y_train = train.drop('event_category'), train.select('event_category')
x_test, y_test = test.drop('event_category'), test.select('event_category')


In [10]:
# Collect the four Spark frames into pandas DataFrames, in the same order as before:
# train features, train target, test features, test target.
x_train_pd, y_train_pd, x_test_pd, y_test_pd = (
    spark_frame.toPandas()
    for spark_frame in (x_train, y_train, x_test, y_test)
)

In [32]:
# Distinct target values present in the training split.
y_train_pd.event_category.unique()


array([4002, 4003, 4015, 7024, 5010])

In [33]:
# Distinct target values present in the test split (compare with the training split).
y_test_pd.event_category.unique()

array([4002, 4003, 7024, 4015])

In [26]:
from keras.models import Sequential
from keras.layers import Dense
from keras.callbacks import EarlyStopping

# Number of input features = number of columns in the training feature frame.
n_cols = x_train_pd.shape[1]
print(n_cols)

# Fully connected network: two hidden ReLU layers of 250 units each,
# followed by a single output unit with no activation.
model = Sequential()
model.add(Dense(250, activation='relu', input_shape=(n_cols,)))
model.add(Dense(250, activation='relu'))
model.add(Dense(1))

# NOTE(review): the target is event_category, which appears to hold a handful of
# category codes. Fitting it with mean squared error and reporting 'accuracy' is
# probably not meaningful - consider a classification head instead. TODO confirm.
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['accuracy'])

# Stop training once the monitored validation metric has not improved for 3 epochs.
early_stopping_monitor = EarlyStopping(patience=3)
model.summary()

# Train with 20% of the training rows held out for validation.
# Fix: the early-stopping callback was created but never handed to fit(), so it
# had no effect and all 30 epochs always ran. It is now passed via `callbacks`.
model.fit(
    x_train_pd,
    y_train_pd,
    validation_split=0.2,
    epochs=30,
    callbacks=[early_stopping_monitor],
)


9
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_27 (Dense)             (None, 250)               2500      
_________________________________________________________________
dense_28 (Dense)             (None, 250)               62750     
_________________________________________________________________
dense_29 (Dense)             (None, 1)                 251       
Total params: 65,501
Trainable params: 65,501
Non-trainable params: 0
_________________________________________________________________
Train on 873 samples, validate on 219 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7f7f8c067908>

In [27]:
# Run the trained network on the held-out test features; one prediction row per test row.
test_y_predictions = model.predict(x_test_pd)
# Prints the full prediction array (can be long for large test sets).
print(test_y_predictions)

[[  72.52179]
 [3976.2793 ]
 [3977.0276 ]
 [3980.1028 ]
 [3980.1028 ]
 [3980.1028 ]
 [4030.889  ]
 [3977.4604 ]
 [3978.919  ]
 [3978.919  ]
 [3978.919  ]
 [3978.919  ]
 [3978.919  ]
 [3978.919  ]
 [3978.905  ]
 [  78.43674]
 [3978.1719 ]
 [3980.379  ]
 [3980.379  ]
 [3977.986  ]
 [3977.986  ]
 [3977.986  ]
 [3977.986  ]
 [3977.986  ]
 [3977.986  ]
 [3977.986  ]
 [3977.986  ]
 [3977.986  ]
 [3977.986  ]
 [3977.986  ]
 [3977.986  ]
 [3977.9731 ]
 [3977.9731 ]
 [3977.9731 ]
 [3977.9731 ]
 [3977.9731 ]
 [3977.9731 ]
 [3977.9731 ]
 [3977.9731 ]
 [3952.0862 ]
 [3964.906  ]
 [3960.4094 ]
 [3960.3875 ]
 [3961.807  ]
 [3961.7954 ]
 [3918.2893 ]
 [3924.0166 ]
 [3960.1924 ]
 [3958.5454 ]
 [3965.8403 ]
 [3884.9639 ]
 [3992.8662 ]
 [3972.75   ]
 [3974.3137 ]
 [3971.61   ]
 [3960.6953 ]
 [3929.0967 ]
 [3936.476  ]
 [3939.0518 ]
 [3926.123  ]
 [3907.13   ]
 [3913.1833 ]
 [3875.9167 ]
 [3875.905  ]
 [3898.2598 ]
 [3868.9373 ]
 [3882.219  ]
 [3907.2314 ]
 [1569.8213 ]
 [3966.502  ]
 [3890.6763 ]
 [3885