In [1]:

!pip install -q findspark

import findspark
findspark.init('/home/ec2-user/spark-2.4.5-bin-hadoop2.7')



In [2]:
import pyspark
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.sql import SQLContext, Row
from pyspark.sql.types import StructType, StructField, DoubleType, StringType, IntegerType
from pyspark.sql.functions import udf
from pyspark.ml.feature import CountVectorizerModel, IDFModel, StandardScalerModel, Tokenizer
from pyspark.ml.classification import LogisticRegressionModel


from urllib.parse import unquote

In [3]:
APP_NAME = "BigData"

# Spark settings for this notebook: application name, executor/driver memory
# and the cap on the number of cores.
conf = pyspark.SparkConf().setAll([
    ('spark.app.name', APP_NAME),
    ('spark.executor.memory', '8g'),
    ('spark.cores.max', '2'),
    ('spark.driver.memory', '8g'),
])

# getOrCreate reuses a context that is already running, so re-executing this
# cell does not fail with "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=conf)
sqlc = SQLContext(sc)
sc

In [4]:
def to_ngram(payload_obj, n=2):
    """Return the character n-grams of a payload as one space-separated string.

    Parameters
    ----------
    payload_obj : object
        Value to split; it is converted with ``str()`` first.
    n : int, optional
        Length of each n-gram. Defaults to 2 (bigrams).

    Returns
    -------
    str
        Overlapping n-grams joined by single spaces, e.g. ``"abc"`` -> ``"ab bc"``.
        An input shorter than ``n`` characters yields the empty string.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError('n must be >= 1')
    payload = str(payload_obj)
    # Slide a window of width n over the text; joining once avoids rebuilding
    # the result string on every iteration.
    return ' '.join(payload[i:i + n] for i in range(len(payload) - n + 1))

# Persisted pipeline stages, loaded from the local models/ directory.
tokenizer = Tokenizer.load('models/Tokenizer')
vectorizer = CountVectorizerModel.load('models/Vectorizer')
idf_model = IDFModel.load('models/idf')
scalerModel = StandardScalerModel.load('models/scalerModel')
model = LogisticRegressionModel.load('models/Logistic_Regression_Model')

# Column-level wrapper around to_ngram that produces a string column.
ngrams = udf(f=to_ngram, returnType=StringType())

In [5]:
# A handful of hand-written payloads to push through the loaded pipeline.
queries = sc.parallelize([
    'Alice',
    'bigdata',
    "hellworld",
    "select* from students where '1'='1",
    "<sctipt>alert('hacked');</sctipt>",
]).map(lambda q: Row(payload=q))
sample_df = sqlc.createDataFrame(queries)

# Add the n-gram column, then apply each loaded stage in turn.
sample_df = sample_df.withColumn('ngrams', ngrams(sample_df['payload']))
for _stage in (tokenizer, vectorizer, idf_model, scalerModel):
    sample_df = _stage.transform(sample_df)
sample_df = sample_df.cache()

preds = model.transform(sample_df)
sample_df.select('payload', 'scaledFeatures').show()

+--------------------+--------------------+
|             payload|      scaledFeatures|
+--------------------+--------------------+
|               Alice|(4472,[34,42,53,1...|
|             bigdata|(4472,[24,43,169,...|
|           hellworld|(4472,[23,75,86,9...|
|select* from stud...|(4472,[0,13,17,19...|
|<sctipt>alert('ha...|(4472,[3,6,7,13,2...|
+--------------------+--------------------+



In [6]:
# Score the cached feature frame and print each payload next to its predicted label.
preds = model.transform(sample_df)
preds.select(['payload', 'prediction']).show(truncate=False)

+----------------------------------+----------+
|payload                           |prediction|
+----------------------------------+----------+
|Alice                             |0.0       |
|bigdata                           |0.0       |
|hellworld                         |0.0       |
|select* from students where '1'='1|1.0       |
|<sctipt>alert('hacked');</sctipt> |1.0       |
+----------------------------------+----------+



In [None]:
def to_ngram(payload_obj, n=2):
    """Return the character n-grams of a payload as one space-separated string.

    Parameters
    ----------
    payload_obj : object
        Value to split; it is converted with ``str()`` first.
    n : int, optional
        Length of each n-gram. Defaults to 2 (bigrams).

    Returns
    -------
    str
        Overlapping n-grams joined by single spaces, e.g. ``"abc"`` -> ``"ab bc"``.
        An input shorter than ``n`` characters yields the empty string.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError('n must be >= 1')
    payload = str(payload_obj)
    # Slide a window of width n over the text; joining once avoids rebuilding
    # the result string on every iteration.
    return ' '.join(payload[i:i + n] for i in range(len(payload) - n + 1))

# Column-level wrapper around to_ngram that produces a string column.
ngrams = udf(f=to_ngram, returnType=StringType())

# define a function to compute sentiments of the received tweets
def get_prediction(queries):
    print(queries)
    try:
        queries = queries.map(lambda w: Row(payload=w))
        queries = sqlc.createDataFrame(queries)
        print(queries)
        queries = queries.withColumn('ngrams', ngrams(queries['payload']))
        queries = tokenizer.transform(queries)
        queries = vectorizer.transform(queries)
        queries = idf_model.transform(queries)
        queries = scalerModel.transform(queries)
        preds = model.transform(queries)
        preds.select('payload','prediction').show()
    except : 
        print('No data')
    

# Where the payload stream is read from and how often a micro-batch is cut.
STREAM_HOST = "ec2-52-90-109-113.compute-1.amazonaws.com"
STREAM_PORT = 9999
BATCH_SECONDS = 3

ssc = StreamingContext(sc, batchDuration=BATCH_SECONDS)
lines = ssc.socketTextStream(STREAM_HOST, STREAM_PORT)

# Score every micro-batch as it arrives.
lines.foreachRDD(get_prediction)

ssc.start()

try:
    # Blocks until the stream terminates or the cell is interrupted.
    ssc.awaitTermination()
finally:
    # Stop only the streaming layer so the cell can be re-run; the SparkContext
    # stays alive for the rest of the notebook.
    ssc.stop(stopSparkContext=False, stopGraceFully=False)

BlockRDD[58] at socketTextStream at NativeMethodAccessorImpl.java:0
No data
BlockRDD[59] at socketTextStream at NativeMethodAccessorImpl.java:0
No data
BlockRDD[60] at socketTextStream at NativeMethodAccessorImpl.java:0
No data
BlockRDD[61] at socketTextStream at NativeMethodAccessorImpl.java:0
No data
BlockRDD[62] at socketTextStream at NativeMethodAccessorImpl.java:0
No data
BlockRDD[63] at socketTextStream at NativeMethodAccessorImpl.java:0
No data
BlockRDD[64] at socketTextStream at NativeMethodAccessorImpl.java:0
No data
BlockRDD[65] at socketTextStream at NativeMethodAccessorImpl.java:0
No data
BlockRDD[66] at socketTextStream at NativeMethodAccessorImpl.java:0
No data
BlockRDD[67] at socketTextStream at NativeMethodAccessorImpl.java:0
No data
BlockRDD[68] at socketTextStream at NativeMethodAccessorImpl.java:0
No data
BlockRDD[69] at socketTextStream at NativeMethodAccessorImpl.java:0
No data
BlockRDD[70] at socketTextStream at NativeMethodAccessorImpl.java:0
No data
BlockRDD[71]