In [16]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.streaming import StreamingContext
from pyspark.sql.functions import col
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import Normalizer
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import StandardScaler
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.tuning import CrossValidatorModel
from pyspark.ml import Pipeline
import pandas as pd

In [17]:
# Reuse the active SparkContext if there is one, otherwise create it with the
# local[*] master, then wrap it in a SparkSession for DataFrame work.
sc = SparkContext.getOrCreate(conf=SparkConf().setMaster("local[*]"))
spark = SparkSession(sc)

In [18]:
# Read the training CSV; the "header" option makes the first line the column names.
df = spark.read.load("./datasets/training.csv", format="csv", header="true")

In [19]:
columns = df.columns

# Cast the first four columns by position: float, float, string, float.
dataset = df.select(*[
    col(columns[position]).cast(type_name)
    for position, type_name in enumerate(('float', 'float', 'string', 'float'))
])

In [20]:
# Categorical "code" column: string -> numeric index -> one-hot vector.
indexer = StringIndexer(outputCol="codeIndex", inputCol="code")
encoder = OneHotEncoder(outputCol="codeVec", inputCol="codeIndex")

# Merge the numeric "value" column and the encoded code into one feature vector.
vectorAssembler = VectorAssembler(
    outputCol="features",
    inputCols=['value', 'codeVec'],
)

# Scale the assembled vector into "features_norm" (StandardScaler defaults).
scaler = StandardScaler(outputCol="features_norm", inputCol="features")

# Chain the four stages and fit them on the training data; the fitted
# pipeline is what later transforms the streamed rows.
pipeline = Pipeline(
    stages=[
        indexer,
        encoder,
        vectorAssembler,
        scaler,
    ]
)
model = pipeline.fit(dataset)

In [21]:
# Load the persisted cross-validation result and keep only its best model.
path = r'./SavedModel/lr_model'
lr_model = CrossValidatorModel.load(path)
lr = lr_model.bestModel

In [22]:
# Accumulators filled by the streaming callback: true classes and model predictions.
label, prediction = [], []

In [23]:
def tp(label,prediction) :
    '''
    Count true positives, where the positive class is 0.

    label      : pandas Series of true classes.
    prediction : pandas Series of predicted classes, indexed like ``label``.

    Returns the number of instances labelled 0.0 that were also predicted 0.0.
    '''
    is_positive = label.values == 0.0
    predicted_for_positives = prediction[label.index[is_positive]]
    correct = predicted_for_positives[predicted_for_positives.values == 0.0]
    return len(correct)


def tn(label,prediction) :
    '''
    Count true negatives, where the negative class is 1.

    label      : pandas Series of true classes.
    prediction : pandas Series of predicted classes, indexed like ``label``.

    Returns the number of instances labelled 1.0 that were also predicted 1.0.
    '''
    is_negative = label.values == 1.0
    predicted_for_negatives = prediction[label.index[is_negative]]
    correct = predicted_for_negatives[predicted_for_negatives.values == 1.0]
    return len(correct)


def fp(label,prediction) :
    '''
    Count false positives (positive class is 0, negative class is 1).

    A false positive is an instance whose true label is negative (1.0) but
    which was not predicted as negative. Any prediction other than 1.0 for
    such an instance is counted, so tp + tn + fp + fn still equals the number
    of instances labelled 0.0 or 1.0.

    label      : pandas Series of true classes.
    prediction : pandas Series of predicted classes, indexed like ``label``.
    '''
    negative_indexes = label[label.values == 1.0].index
    predicted_for_negatives = prediction[negative_indexes]
    correctly_negative = predicted_for_negatives[predicted_for_negatives.values == 1.0]
    # Actual negatives minus those correctly predicted negative.
    return len(predicted_for_negatives) - len(correctly_negative)


def fn(label,prediction) :
    '''
    Count false negatives (positive class is 0, negative class is 1).

    A false negative is an instance whose true label is positive (0.0) but
    which was not predicted as positive. Any prediction other than 0.0 for
    such an instance is counted.

    label      : pandas Series of true classes.
    prediction : pandas Series of predicted classes, indexed like ``label``.
    '''
    positive_indexes = label[label.values == 0.0].index
    predicted_for_positives = prediction[positive_indexes]
    correctly_positive = predicted_for_positives[predicted_for_positives.values == 0.0]
    # Actual positives minus those correctly predicted positive.
    return len(predicted_for_positives) - len(correctly_positive)

In [24]:
def accuracy(labels,predictions) :
    """
    Fraction of instances classified correctly: (tp + tn) / (tp + tn + fp + fn).

    labels      : pandas Series of true classes.
    predictions : pandas Series of predicted classes.

    Returns 0.0 instead of raising ZeroDivisionError when no instance was
    counted (for example when the stream delivered no data).
    """
    _tp = tp(labels,predictions)
    _tn = tn(labels,predictions)
    _fp = fp(labels,predictions)
    _fn = fn(labels,predictions)

    total = _tp+_tn+_fp+_fn
    if total == 0 :
        # Nothing was evaluated; the ratio is undefined, report 0.0.
        return 0.0
    return ((_tp+_tn)/total)

def precision(labels,predictions) :
    """
    Precision of the positive class: tp / (tp + fp).

    labels      : pandas Series of true classes.
    predictions : pandas Series of predicted classes.

    Returns 0.0 instead of raising ZeroDivisionError when tp + fp is zero.
    """
    _tp = tp(labels,predictions)
    _fp = fp(labels,predictions)

    denominator = _tp+_fp
    if denominator == 0 :
        # The ratio is undefined when both counts are zero; report 0.0.
        return 0.0
    return (_tp/denominator)

def recall(labels,predictions) :
    """
    Recall of the positive class: tp / (tp + fn).

    labels      : pandas Series of true classes.
    predictions : pandas Series of predicted classes.

    Returns 0.0 instead of raising ZeroDivisionError when tp + fn is zero.
    """
    _tp = tp(labels,predictions)
    _fn = fn(labels,predictions)

    denominator = _tp+_fn
    if denominator == 0 :
        # The ratio is undefined when both counts are zero; report 0.0.
        return 0.0
    return (_tp / denominator)


def f1_score(labels,predictions) :
    """
    Harmonic mean of precision and recall: 2 * pr * re / (pr + re).

    labels      : pandas Series of true classes.
    predictions : pandas Series of predicted classes.

    Returns 0.0 instead of raising ZeroDivisionError when precision and
    recall are both zero.
    """
    pr = precision(labels,predictions)
    re = recall(labels,predictions)

    if pr+re == 0 :
        # The harmonic mean is undefined when both terms are zero; report 0.0.
        return 0.0
    return ((2*pr*re)/(pr+re))

In [25]:
def test_train(text) :
    """
    Score one micro-batch of the socket stream and record the outcome.

    text : RDD of '|'-separated lines, each holding
           timestamp|value|code|Class (Class is the true label).

    For a non-empty batch, every line is parsed, the whole batch is run
    through the fitted preprocessing pipeline (``model``) and the loaded
    classifier (``lr``), and the predicted / true classes are appended to the
    module-level ``prediction`` and ``label`` lists.

    An empty batch is treated as the end of the stream and stops the
    streaming context ``ssc``.
    """
    # Collect once: every collect() launches a separate Spark job.
    rows = text.collect()
    if rows :
        features = []
        for line in rows :
            fields = line.split('|')
            features.append([float(fields[0]),float(fields[1]),fields[2],float(fields[3])])

        # Build and score a single DataFrame for the whole batch instead of
        # one DataFrame (and several Spark jobs) per line.
        batch_df = sc.parallelize(features).toDF(('timestamp','value','code','Class'))
        scored = lr.transform(model.transform(batch_df))

        # Bring back only the two columns that are recorded, in one job.
        results = scored.select('pred_lr','Class').toPandas()
        prediction.extend(results.pred_lr.values)
        label.extend(results.Class.values)
    else :
        try :
            # NOTE(review): stop() is called with its defaults; confirm whether
            # the underlying SparkContext should be kept alive for re-runs.
            ssc.stop()
        except Exception as err:
            # The original clause named Py4JJavaError, which is never imported
            # here (and was instantiated), so any failure became a NameError.
            print(err)
            print('Streaming Stopped')

In [26]:
# Stream text lines from the local socket with a batch duration of 2 and
# hand every micro-batch RDD to test_train.
ssc = StreamingContext(sc, batchDuration=2)
lines = ssc.socketTextStream(hostname='localhost', port=9995)
lines.foreachRDD(test_train)

In [27]:
# Run the stream. This cell blocks until the context is stopped, which
# test_train does when it receives an empty batch.
ssc.start()             # Begin receiving and processing micro-batches
ssc.awaitTermination()  # Block until the streaming context has been stopped

Empty!!!!


In [28]:
def classification_report(label,prediction) :
    """
    Print accuracy, precision, recall and f1-score for the collected results.

    label      : sequence of true classes.
    prediction : sequence of predicted classes, in the same order.

    Both sequences are converted to pandas Series before the metric
    functions are called.
    """
    label_, prediction_ = pd.Series(label), pd.Series(prediction)
    report_lines = (
        ('Accuracy : {}', accuracy),
        ('Precision : {}', precision),
        ('Recall : {}', recall),
        ('f1-score : {}', f1_score),
    )
    # Compute and print one metric at a time, in the listed order.
    for template, metric in report_lines :
        print(template.format(metric(label_,prediction_)))

In [29]:
# Summarise the labels and predictions accumulated while the stream was running.
classification_report(label,prediction)

Accuracy : 1.0
Precision : 1.0
Recall : 1.0
f1-score : 1.0
