### Reading the dataset from Spark local

In [1]:
import os

# Add asset from file system
textData = sc.textFile(os.environ['DSX_PROJECT_DIR']+'/datasets/SMSSpamCollection.csv')

### Creating a Spark data pipeline

In [2]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import HashingTF, IDF    
from pyspark.ml.feature import Tokenizer
from pyspark.ml.classification import LogisticRegression
    
tokenizer = Tokenizer(inputCol="message",outputCol="words")
hashingTF = HashingTF(inputCol = tokenizer.getOutputCol(),outputCol="tempfeatures")
idf = IDF(inputCol = hashingTF.getOutputCol(),outputCol="features")
lrClassifier = LogisticRegression()
        
pipeline = Pipeline(stages=[tokenizer,hashingTF,idf,lrClassifier])

### Cleaning the Data

In [3]:
# creating a labeled vector
def TransformToVector(string):
    attList = string.split(",")
    smsType = 0.0 if attList[0] == "ham" else 1.0
    return [smsType,attList[1]]
        
textTransformed = textData.map(TransformToVector)
        
# creating a data frame from labeled vector
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)
textDF = sqlContext.createDataFrame(textTransformed,["label","message"])


### Build your LR Model using SparkML

In [4]:
# split data frame into training and testing
(trainingData,testData) = textDF.randomSplit([0.9,0.1])
        
#Build a model with Pipeline
lrModel = pipeline.fit(trainingData)
        
#Compute Predictions
prediction = lrModel.transform(testData)
            
#Evaluate Accuracy
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", \
                                                      labelCol="label", \
                                                      metricName = "accuracy")
accuracy = evaluator.evaluate(prediction)
print "Model Accuracy: " + str(round(accuracy*100,2))
        
# Draw a confusion matrix
prediction.groupby("label","prediction").count().show()

Model Accuracy: 87.64
+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  1.0|       1.0|   36|
|  0.0|       1.0|    3|
|  1.0|       0.0|    8|
|  0.0|       0.0|   42|
+-----+----------+-----+

