# In [1]:
from sklearn import linear_model

import numpy as np
import pandas as pnd

from sklearn.linear_model import LogisticRegression

from sklearn.feature_extraction.text import TfidfVectorizer


# Names given to the two tab-separated fields of the dataset file
# (passed to read_csv as `names`): the label first, then the message text.
columns = ["class", "message"]

def convertToBool(el):
    """Encode a raw class label as an integer.

    Returns 0 when the label is exactly "ham" and 1 for any other value.
    """
    if el == "ham":
        return 0
    return 1

# Per-column converters applied by read_csv while parsing: the "class"
# column is mapped to 0/1 by convertToBool.
converters = {"class": convertToBool}

# Load the labelled SMS messages: tab-separated, column names supplied
# through `names`.
#
# quoting=3 is csv.QUOTE_NONE. The dataset is raw tab-separated text rather
# than quoted CSV, so double quotes inside a message must be treated as
# ordinary characters. With the default quoting, a message containing an
# unbalanced '"' makes the parser swallow the following lines into a single
# field, silently dropping training rows.
df = pnd.read_csv(
    "file:///home/cloudera/workspace/spark-streaming/data/SMSSpamCollection",
    sep="\t",
    converters=converters,
    names=columns,
    quoting=3,
)

# Upper bound of the n-gram range used by the vectorizer (1 => unigrams only).
ngram = 1

# Fit the TF-IDF vocabulary on the training messages and vectorize them
# in the same pass.
tf_idf_model = TfidfVectorizer(
    min_df=1,
    ngram_range=(1, ngram),
)
tf_idf = tf_idf_model.fit_transform(df["message"])

# L2-penalised logistic regression trained with the lbfgs solver; the seed
# is fixed so repeated runs use the same random state.
lr_model = LogisticRegression(
    penalty="l2",
    C=1389.495494,
    fit_intercept=True,
    solver="lbfgs",
    max_iter=100,
    random_state=12345,
)

# Train on the TF-IDF features against the 0/1 labels.
lr_model.fit(tf_idf, df["class"])

import sys

from pyspark import SparkContext
from pyspark.streaming import StreamingContext

# Spark entry point for this application.
sc = SparkContext(appName="SpamClassification")

#sc.setLogLevel("INFO")

# Wrap the fitted vectorizer and classifier in broadcast variables so the
# stream functions can read them through `.value`.
tf_idf_model_broadcast = sc.broadcast(tf_idf_model)
lr_model_broadcast = sc.broadcast(lr_model)

# Streaming context on top of the Spark context, with a batch interval of 10.
ssc = StreamingContext(sc, 10)

# Input stream: text lines read from the socket at localhost:9999.
lines = ssc.socketTextStream("localhost", 9999)


def _with_tf_idf(row):
    """Pair a raw message with its TF-IDF features from the broadcast vectorizer."""
    return (row, tf_idf_model_broadcast.value.transform([row]))


tf_idf_messages = lines.map(_with_tf_idf)

def predic_message_class(message_tf_idf):
    """Classify one vectorized message with the broadcast logistic-regression model.

    `message_tf_idf` is the feature matrix handed to the model's `predict`;
    only the first prediction is inspected. Returns "spam" when that
    prediction is truthy and "ham" otherwise.
    """
    classifier = lr_model_broadcast.value
    first_label = classifier.predict(message_tf_idf)[0]
    if first_label:
        return "spam"
    return "ham"

def _classify_pair(pair):
    """Turn a (message, tf-idf features) pair into (message, "spam"/"ham")."""
    message, features = pair
    return (message, predic_message_class(features))


predictions = tf_idf_messages.map(_classify_pair)

# Print the leading records of every batch (pprint shows 10 by default).
predictions.pprint()
# Alternative sink, currently disabled: write each batch to a single text file.
#predictions.transform(lambda rdd: rdd.coalesce(1)).saveAsTextFiles("file:///home/cloudera/workspace/spark-streaming/output/wordCount")

# Start the streaming computation.
ssc.start()

# Block until the streaming computation terminates.
ssc.awaitTermination()