In [None]:
# !apt-get install openjdk-8-jdk-headless -qq > /dev/null
# !wget -q https://www-us.apache.org/dist/spark/spark-2.4.5/spark-2.4.5-bin-hadoop2.7.tgz
# !tar xf spark-2.4.5-bin-hadoop2.7.tgz
# !rm spark-2.4.5-bin-hadoop2.7.tgz
!pip install -q findspark

# # path variables
# import os
# os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
# os.environ["SPARK_HOME"] = "/content/spark-2.4.5-bin-hadoop2.7"

# find pyspark library
import findspark
findspark.init('/usr/local/spark/spark-2.4.0-bin-hadoop2.7/')



In [None]:
import pyspark
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.sql import SQLContext, Row
from pyspark.sql.types import StructType, StructField, DoubleType, StringType, IntegerType
from pyspark.sql.functions import udf
from pyspark.ml.feature import CountVectorizerModel, IDFModel, StandardScalerModel, Tokenizer
from pyspark.ml.classification import LogisticRegressionModel


from urllib.parse import unquote

In [None]:
# Spark application settings: app name plus memory / core limits.
APP_NAME = "BigData"
conf = pyspark.SparkConf().setAll([
    ('spark.app.name', APP_NAME),
    ('spark.executor.memory', '8g'),
    ('spark.cores.max', '2'),
    ('spark.driver.memory', '8g'),
])

# getOrCreate() reuses the context already running in this kernel, so re-running
# this cell no longer fails with "Cannot run multiple SparkContexts at once".
# Note: when a context already exists it keeps its original configuration.
sc = SparkContext.getOrCreate(conf=conf)
sqlc = SQLContext(sc)

# Display the context summary as the cell output.
sc

In [None]:
def to_ngram(payload_obj, n=2):
    """Split a payload into overlapping character n-grams joined by spaces.

    Args:
        payload_obj: Value to tokenise. It is converted with ``str()`` first,
            so non-string inputs are accepted (``None`` becomes ``'None'``).
        n: Number of characters per n-gram. Defaults to 2 (bigrams), which is
            the value this function previously hard-coded.

    Returns:
        str: The n-grams in order of appearance, separated by single spaces.
        An empty string when the payload has fewer than ``n`` characters.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError('n must be a positive integer, got {!r}'.format(n))
    payload = str(payload_obj)
    # One join instead of repeated `+=` on a growing string; a negative range
    # bound (payload shorter than n) simply yields no n-grams.
    return ' '.join(payload[i:i + n] for i in range(len(payload) - n + 1))

# Expose the n-gram helper to Spark SQL as a UDF returning a string column.
ngrams = udf(f=to_ngram, returnType=StringType())

# Load the saved pipeline stages from the models/ directory, in the order
# they are applied to a DataFrame further down.
tokenizer = Tokenizer.load("models/Tokenizer")
vectorizer = CountVectorizerModel.load("models/Vectorizer")
idf_model = IDFModel.load("models/idf")
scalerModel = StandardScalerModel.load("models/scalerModel")
model = LogisticRegressionModel.load("models/Logistic_Regression_Model")

In [None]:
# A few hand-written payloads used to exercise the loaded pipeline.
sample_payloads = [
    'Alice',
    'bigdata',
    "hellworld",
    "select* from students where '1'='1",
    "<sctipt>alert('hacked');</sctipt>",
]
queries = sc.parallelize(sample_payloads).map(lambda text: Row(payload=text))
sample_df = sqlc.createDataFrame(queries)

# Derive the space-separated n-gram column, then run the feature stages in order.
sample_df = sample_df.withColumn('ngrams', ngrams(sample_df['payload']))
for stage in (tokenizer, vectorizer, idf_model):
    sample_df = stage.transform(sample_df)

# Cache the scaled frame: it is reused by the prediction step.
sample_df = scalerModel.transform(sample_df).cache()
preds = model.transform(sample_df)

# Show each payload next to its feature vector
# (presumably the scaler's output column -- confirm against the saved model).
sample_df.select('payload', 'scaledFeatures').show()

In [None]:
# Classify the sample payloads and print payload / predicted label pairs
# without truncating long payload strings.
preds = model.transform(sample_df)
labelled = preds.select('payload', 'prediction')
labelled.show(truncate=False)

In [None]:
# NOTE(review): this duplicates the `to_ngram` defined in an earlier cell;
# consider keeping a single definition.
def to_ngram(payload_obj, n=2):
    """Split a payload into overlapping character n-grams joined by spaces.

    Args:
        payload_obj: Value to tokenise. It is converted with ``str()`` first,
            so non-string inputs are accepted (``None`` becomes ``'None'``).
        n: Number of characters per n-gram. Defaults to 2 (bigrams), which is
            the value this function previously hard-coded.

    Returns:
        str: The n-grams in order of appearance, separated by single spaces.
        An empty string when the payload has fewer than ``n`` characters.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError('n must be a positive integer, got {!r}'.format(n))
    payload = str(payload_obj)
    # One join instead of repeated `+=` on a growing string; a negative range
    # bound (payload shorter than n) simply yields no n-grams.
    return ' '.join(payload[i:i + n] for i in range(len(payload) - n + 1))

# Expose the n-gram helper to Spark SQL as a UDF returning a string column.
ngrams = udf(f=to_ngram, returnType=StringType())

# Callback that classifies each micro-batch of received payloads with the loaded model pipeline
def get_prediction(queries):
    """Classify one batch of payloads and print the predicted labels.

    Used as the ``foreachRDD`` callback of the socket text stream, so it is
    called once per batch. It relies on the module-level ``sqlc``, ``ngrams``,
    ``tokenizer``, ``vectorizer``, ``idf_model``, ``scalerModel`` and ``model``.

    Args:
        queries: RDD whose elements are the raw payloads of the current batch.

    Returns:
        None. Results are printed: the payload/prediction table on success,
        ``'No data'`` for an empty batch, or a one-line error description.
    """
    try:
        # An empty batch is the normal idle case; report it explicitly instead
        # of relying on createDataFrame() failing.
        if queries.isEmpty():
            print('No data')
            return

        rows = queries.map(lambda w: Row(payload=w))
        batch_df = sqlc.createDataFrame(rows)

        batch_df = batch_df.withColumn('ngrams', ngrams(batch_df['payload']))
        batch_df = tokenizer.transform(batch_df)
        batch_df = vectorizer.transform(batch_df)
        batch_df = idf_model.transform(batch_df)
        batch_df = scalerModel.transform(batch_df)
        preds = model.transform(batch_df)
        preds.select('payload','prediction').show()
    except Exception as exc:
        # Keep the stream alive on a bad batch, but surface the real error
        # rather than mislabelling it as "No data". KeyboardInterrupt and
        # SystemExit are deliberately not caught.
        print('Prediction failed: {!r}'.format(exc))
    

# Streaming source settings.
STREAM_HOST = "localhost"
STREAM_PORT = 9999
BATCH_SECONDS = 3  # length of each micro-batch

ssc = StreamingContext(sc, batchDuration=BATCH_SECONDS)
lines = ssc.socketTextStream(STREAM_HOST, STREAM_PORT)

# Score every batch of received lines.
lines.foreachRDD(get_prediction)

ssc.start()

try:
    # Blocks until the stream terminates or the kernel is interrupted.
    ssc.awaitTermination()
except KeyboardInterrupt:
    print('Streaming interrupted')
finally:
    # Stop only the streaming context and keep `sc` alive; otherwise the stream
    # keeps running in the background and this cell cannot be re-run.
    ssc.stop(stopSparkContext=False)