In [1]:
import pyspark
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql import Row
from pyspark import SparkFiles

# Reuse the SparkContext the kernel already provides (e.g. a pyspark-launched
# notebook) or create one, so this cell also works after "Restart & Run All"
# on a plain Python kernel where `sc` is not predefined.
sc = SparkContext.getOrCreate()

# Register the remote CSV with Spark; it is fetched under its base name
# ("payloads.csv") and can then be located through SparkFiles.get().
url = 'https://github.com/grananqvist/Machine-Learning-Web-Application-Firewall-and-Dataset/raw/master/data/payloads.csv'
sc.addFile(url)

# SQL entry point used by the following cells to build DataFrames.
sqlc = SQLContext(sc)

# Display the context summary as the cell output.
sc

In [2]:
import pandas as pd

# Load the file registered with sc.addFile() into pandas and replace every
# missing value with an empty string before handing the frame to Spark.
df_pd = pd.read_csv(SparkFiles.get("payloads.csv")).fillna("")

# Spark DataFrame built from the cleaned pandas frame.
df = sqlc.createDataFrame(df_pd)

In [3]:
def get_ngrams(payload_obj,n=1):
    """Return the character n-grams of a payload as one space-separated string.

    The payload is converted with ``str`` first. Each window of ``n``
    consecutive characters, taken at every start offset from left to right,
    becomes one n-gram. When the text is shorter than ``n`` the result is
    the empty string.
    """
    text = str(payload_obj)
    window_count = len(text) - n + 1
    return " ".join(text[start:start + n] for start in range(window_count))

def payload_ngram(row,n=1):
    """Return a new Row holding the fields of ``row`` plus its payload n-grams.

    The extra field is named ``payload_{n}gram`` and contains the
    space-separated character n-grams of the row's ``payload`` field, as
    produced by ``get_ngrams``.
    """
    fields = row.asDict()
    ngram_column = f'payload_{n}gram'
    fields[ngram_column] = get_ngrams(fields['payload'],n=n)
    return Row(**fields)

# Character n-gram size and the matching column name used by later cells.
n = 1
payload_gram = f'payload_{n}gram'

# Append the n-gram column to every row through the RDD API, rebuild a
# DataFrame from the result and keep only the columns used downstream.
df_rdd = df.rdd.map(lambda record: payload_ngram(record, n))
df = sqlc.createDataFrame(df_rdd).select(
    "index", 'payload', 'is_malicious', payload_gram
)

# TFIDF - Without RDDs

In [None]:
# from pyspark.mllib.feature import HashingTF
from pyspark.mllib.linalg import VectorUDT, DenseVector
from pyspark.sql.functions import udf
from pyspark.ml.feature import NGram, CountVectorizer, VectorAssembler,Tokenizer,IDF

# This pipeline uses the DataFrame-based pyspark.ml API, whose transformers
# emit pyspark.ml.linalg vectors. Use the matching ml vector types for the
# dense conversion below (this rebinds the mllib names imported above);
# a legacy pyspark.mllib.linalg column is not accepted by pyspark.ml stages.
from pyspark.ml.linalg import DenseVector, VectorUDT

# tokenizer: split the space-joined n-gram string into a list of tokens
tokenizer = Tokenizer(inputCol=payload_gram, outputCol="words")
wordsData = tokenizer.transform(df)

# vectorizer: learn the vocabulary, then turn each token list into term counts
vectorizer = CountVectorizer(inputCol='words', outputCol='vectorizer').fit(wordsData)
wordsData = vectorizer.transform(wordsData)

# calculate scores: rescale the term counts by inverse document frequency
idf = IDF(inputCol="vectorizer", outputCol="tfidf_features")
idf_model = idf.fit(wordsData)
wordsData = idf_model.transform(wordsData)

# dense the current response variable
def to_dense(in_vec):
    """Return a DenseVector holding the same values as ``in_vec``.

    ``in_vec`` must expose ``toArray()`` (as the vectors produced by the
    stages above do).
    """
    return DenseVector(in_vec.toArray())

# The function is passed directly; no wrapping lambda is needed.
to_dense_udf = udf(to_dense, VectorUDT())

# create dense vector
wordsData = wordsData.withColumn("tfidf_features_dense", to_dense_udf('tfidf_features'))



# TFIDF - With RDDs

In [4]:
from pyspark.mllib.feature import HashingTF, IDF
from pyspark.ml.feature import Tokenizer
from pyspark.mllib.linalg import VectorUDT, DenseVector
from pyspark.sql.functions import udf


# Split the space-joined n-gram column into a list of tokens ("words").
tokenizer = Tokenizer(
    inputCol=payload_gram,
    outputCol="words",
)
wordsData = tokenizer.transform(df)

In [16]:
# Keep only the row id and its token list. Despite the name, this is still a
# DataFrame (the result of DataFrame.select), not an RDD.
df_rdd2 = wordsData.select('index','words')

In [17]:
# Preview the first rows to sanity-check the tokenization.
df_rdd2.show()

+-----+--------------------+
|index|               words|
+-----+--------------------+
|    0|[3, 7, 6, 6, 2, 5...|
|    1|[s, h, i, r, t, i...|
|    2|[&, k, w, =, %, 2...|
|    3|[o, b, e, y, i, n...|
|    4|[d, i, c, t, a, t...|
|    5|[l, a, f, l, e, u...|
|    6|[c, a, p, t, u, r...|
|    7|[8, n, c, a, 5, 8...|
|    8|[a, u, t, o, c, r...|
|    9|[g, r, o, c, e, r...|
|   10|[d, a, n, c, i, e...|
|   11|[k, o, r, e, s, s...|
|   12|[b, o, w, i, e, 0...|
|   13|[a, u, q, u, e, +...|
|   14|[b, r, o, a, d, e...|
|   15|[x, a, u, b, e, t...|
|   16|[v, o, c, a, t, i...|
|   17|     [y, e, m, e, n]|
|   18|     [2, 2, 5, 8, 4]|
|   19|[<, b, r, , , s, ...|
+-----+--------------------+
only showing top 20 rows

