In [1]:
import pyspark
from pyspark import SparkContext, SparkFiles
from pyspark.sql import SQLContext,Row 
from pyspark.sql.functions import udf
from pyspark.ml.feature import CountVectorizer, VectorAssembler,Tokenizer,IDF
from pyspark.mllib.linalg import VectorUDT, DenseVector
import pandas as pd
# Obtain the SparkContext explicitly: getOrCreate() reuses a context that is
# already running (e.g. the one a pyspark kernel pre-defines) and otherwise
# starts a new one, so this cell no longer fails with a NameError on `sc`
# after "Restart Kernel -> Run All" in a plain Python kernel.
sc = SparkContext.getOrCreate()
# Register the remote dataset with Spark; SparkFiles.get("payloads.csv") is
# used afterwards to resolve the downloaded local copy.
url = 'https://github.com/grananqvist/Machine-Learning-Web-Application-Firewall-and-Dataset/raw/master/data/payloads.csv'
sc.addFile(url)
sqlc = SQLContext(sc)
sc

In [51]:
# Read the CSV that Spark fetched into pandas, replacing missing cells with
# empty strings in the same expression.
df_pd = pd.read_csv(SparkFiles.get("payloads.csv")).fillna("")

# Hand the cleaned pandas frame over to Spark.
df = sqlc.createDataFrame(df_pd)

# Data Preprocessing
   - dataset columns: index, payload, is_malicious
   - the first step is turning the payload (type string) into numerical values (vectors)

In [76]:
def get_ngrams(payload_obj,n=1):
    """Return the character n-grams of a payload as one space-separated string.

    Args:
        payload_obj: Value to split; it is converted with ``str()`` first, so
            non-string inputs are accepted.
        n: Length of each n-gram in characters. Must be at least 1.

    Returns:
        All contiguous substrings of length ``n``, in order of appearance,
        joined by single spaces. An empty string when the payload is shorter
        than ``n``.

    Raises:
        ValueError: If ``n`` is smaller than 1. Previously such values were
            accepted silently and produced whitespace-only or wrap-around
            slices instead of n-grams.
    """
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}")
    payload = str(payload_obj)
    # Slide a window of width n over the text; the last valid start index is
    # len(payload) - n, hence the "+ 1" in the range bound.
    return " ".join(payload[i:i+n] for i in range(len(payload)-n+1))

get_ngrams("<script>",2)

'<s sc cr ri ip pt t>'

In [77]:
def payload_ngram(row,n=1):
    """Return a copy of ``row`` extended with a ``payload_{n}gram`` field.

    The new field holds the n-gram string that ``get_ngrams`` builds from the
    lower-cased ``payload`` value; every original field is carried over.
    """
    fields = row.asDict()
    lowered_payload = fields['payload'].lower()
    fields[f'payload_{n}gram'] = get_ngrams(lowered_payload, n=n)
    return Row(**fields)

# n-gram width and the name of the column that will hold the n-gram string.
n = 1
payload_gram = f'payload_{n}gram'

# Attach the n-gram column to every row of the DataFrame's underlying RDD.
df_rdd = df.rdd.map(lambda record: payload_ngram(record, n))
print('created grams')

# Print the n-gram string of the first row as a quick sanity check.
first_gram = df_rdd.map(lambda record: record[payload_gram]).first()
print(first_gram)

created grams
3 7 6 6 2 5 7 7 p


In [78]:
# Turn the enriched rows back into a DataFrame and keep only these columns.
df = sqlc.createDataFrame(df_rdd).select(
    "index", 'payload', payload_gram, 'is_malicious'
)

In [79]:

# Split each space-separated n-gram string into a list of tokens.
# NOTE(review): Tokenizer lowercases its input and splits on whitespace, so an
# n-gram that itself contains a whitespace character would presumably not
# survive as a single token — confirm this is acceptable for the payload data.
tokenizer = Tokenizer(inputCol=payload_gram, outputCol="tokens")
wordsData = tokenizer.transform(df)

# Fit a vocabulary over the tokens, then encode every row as term counts.
vectorizer = CountVectorizer(inputCol='tokens', outputCol='vectorizer').fit(wordsData)
wordsData = vectorizer.transform(wordsData)

# Fit IDF on the count vectors and add the re-weighted 'tfidf_features' column.
idf = IDF(inputCol='vectorizer', outputCol='tfidf_features')
idf_model = idf.fit(wordsData)
wordsData = idf_model.transform(wordsData)

def to_dense(in_vec):
    """Build a DenseVector from the array returned by ``in_vec.toArray()``."""
    values = in_vec.toArray()
    return DenseVector(values)

# Wrap to_dense as a column-level UDF whose declared return type is VectorUDT.
to_dense_udf = udf(lambda vec: to_dense(vec), VectorUDT())

# Add a dense copy of the TF-IDF features as a new column.
wordsData = wordsData.withColumn(
    "tfidf_features_dense",
    to_dense_udf('tfidf_features'),
)



In [84]:
# Feature vectors (X) and labels (y) as two RDDs mapped from the same rows.
X = wordsData.rdd.map(lambda record: record['tfidf_features_dense'])
y = wordsData.rdd.map(lambda record: record['is_malicious'])

In [85]:
# Display the first element of the label RDD.
y.first()

0.0

In [86]:
# Display the first element of the feature RDD.
X.first()

DenseVector([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.7647, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.5434, 1.8923, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.1693, 0.0, 4.463, 6.5971, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])

# Training models