In [0]:
### SMS classifier

#### By Matthieu Hanania & Karis Gwet

## change the text features into numeric using the suitable classes

In [0]:
from pyspark.sql.functions import col
from pyspark.ml.feature import StringIndexer, Tokenizer, StopWordsRemover, CountVectorizer, IDF, VectorAssembler
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, DecisionTreeClassifier, NaiveBayes
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline

In [0]:
#Loading the data

df = spark.read.csv("/FileStore/tables/SMSSpamCollection.csv", inferSchema=True, header = False,sep="\\t")
df = df.selectExpr("_c0 as label","_c1 as text")
df.show(2,truncate=False)

+-----+---------------------------------------------------------------------------------------------------------------+
|label|text                                                                                                           |
+-----+---------------------------------------------------------------------------------------------------------------+
|ham  |Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...|
|ham  |Ok lar... Joking wif u oni...                                                                                  |
+-----+---------------------------------------------------------------------------------------------------------------+
only showing top 2 rows



In [0]:
#Converting the class column to numeric using StringIndexer

indexer = StringIndexer(inputCol="label", outputCol="labelIndex")
df = indexer.fit(df).transform(df)
df.show(2,truncate=False)

+-----+---------------------------------------------------------------------------------------------------------------+----------+
|label|text                                                                                                           |labelIndex|
+-----+---------------------------------------------------------------------------------------------------------------+----------+
|ham  |Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...|0.0       |
|ham  |Ok lar... Joking wif u oni...                                                                                  |0.0       |
+-----+---------------------------------------------------------------------------------------------------------------+----------+
only showing top 2 rows



In [0]:
#Tokenizing the SMS column

tokenizer = Tokenizer(inputCol="text", outputCol="sms_words")
df = tokenizer.transform(df)
df.show(2)

+-----+--------------------+----------+--------------------+
|label|                text|labelIndex|           sms_words|
+-----+--------------------+----------+--------------------+
|  ham|Go until jurong p...|       0.0|[go, until, juron...|
|  ham|Ok lar... Joking ...|       0.0|[ok, lar..., joki...|
+-----+--------------------+----------+--------------------+
only showing top 2 rows



In [0]:
#Removing stop words from the SMS words column

remover = StopWordsRemover(inputCol="sms_words", outputCol="filtered_sms_words")
df = remover.transform(df)
df.show(2)

+-----+--------------------+----------+--------------------+--------------------+
|label|                text|labelIndex|           sms_words|  filtered_sms_words|
+-----+--------------------+----------+--------------------+--------------------+
|  ham|Go until jurong p...|       0.0|[go, until, juron...|[go, jurong, poin...|
|  ham|Ok lar... Joking ...|       0.0|[ok, lar..., joki...|[ok, lar..., joki...|
+-----+--------------------+----------+--------------------+--------------------+
only showing top 2 rows



In [0]:
#Creating bag of words using CountVectorizer

count_vectorizer = CountVectorizer(inputCol="filtered_sms_words", outputCol="raw_features")
df = count_vectorizer.fit(df).transform(df)

df.show(2)

+-----+--------------------+----------+--------------------+--------------------+--------------------+
|label|                text|labelIndex|           sms_words|  filtered_sms_words|        raw_features|
+-----+--------------------+----------+--------------------+--------------------+--------------------+
|  ham|Go until jurong p...|       0.0|[go, until, juron...|[go, jurong, poin...|(13423,[7,11,31,6...|
|  ham|Ok lar... Joking ...|       0.0|[ok, lar..., joki...|[ok, lar..., joki...|(13423,[0,24,301,...|
+-----+--------------------+----------+--------------------+--------------------+--------------------+
only showing top 2 rows



In [0]:
#Calculating the IDF

idf = IDF(inputCol="raw_features", outputCol="features")
df = idf.fit(df).transform(df)
df.show(2)

+-----+--------------------+----------+--------------------+--------------------+--------------------+--------------------+
|label|                text|labelIndex|           sms_words|  filtered_sms_words|        raw_features|            features|
+-----+--------------------+----------+--------------------+--------------------+--------------------+--------------------+
|  ham|Go until jurong p...|       0.0|[go, until, juron...|[go, jurong, poin...|(13423,[7,11,31,6...|(13423,[7,11,31,6...|
|  ham|Ok lar... Joking ...|       0.0|[ok, lar..., joki...|[ok, lar..., joki...|(13423,[0,24,301,...|(13423,[0,24,301,...|
+-----+--------------------+----------+--------------------+--------------------+--------------------+--------------------+
only showing top 2 rows



In [0]:
#Creating a vector assembler to combine all features

assembler = VectorAssembler(inputCols=["features"], outputCol="features_vector")

## Train 4 classifiers

In [0]:
#Creating pipelines for the different classifiers

lr = LogisticRegression(featuresCol="features_vector", labelCol="labelIndex")
pipeline_lr = Pipeline(stages=[assembler, lr])

dt = DecisionTreeClassifier(featuresCol="features_vector", labelCol="labelIndex")
pipeline_dt = Pipeline(stages=[assembler, dt])

rf = RandomForestClassifier(featuresCol="features_vector", labelCol="labelIndex")
pipeline_rf = Pipeline(stages=[assembler, rf])

nb = NaiveBayes(featuresCol="features_vector", labelCol="labelIndex")
pipeline_nb = Pipeline(stages=[assembler, nb])

In [0]:
#Splitting the data into training and testing

training, testing = df.randomSplit([0.8, 0.2])

In [0]:
#Training the models using the pipelines

lr_model = pipeline_lr.fit(training)
dt_model = pipeline_dt.fit(training)
rf_model = pipeline_rf.fit(training)
nb_model = pipeline_nb.fit(training)

In [0]:
#Evaluating the models on the testing data

evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="labelIndex", metricName="accuracy")

lr_accuracy = evaluator.evaluate(lr_model.transform(testing))
dt_accuracy = evaluator.evaluate(dt_model.transform(testing))
rf_accuracy = evaluator.evaluate(rf_model.transform(testing))
nb_accuracy = evaluator.evaluate(nb_model.transform(testing))

In [0]:
#Printing the accuracy of all models

print("Logistic Regression Accuracy: {}".format(lr_accuracy))
print("Decision Tree Accuracy: {}".format(dt_accuracy))
print("Random Forest Accuracy: {}".format(rf_accuracy))
print("Naive Bayes Accuracy: {}".format(nb_accuracy))

Logistic Regression Accuracy: 0.9868073878627969
Decision Tree Accuracy: 0.9173262972735268
Random Forest Accuracy: 0.8654353562005277
Naive Bayes Accuracy: 0.9067722075637643


### Tunning random forest

In [0]:
#Tuning the hyperparameters for Random Forest using ParamGridBuilder and CrossValidator

param_grid = (ParamGridBuilder().addGrid(rf.numTrees, [10, 20, 50]).addGrid(rf.maxDepth, [5, 10, 15]).build())

crossval = CrossValidator(estimator=pipeline_rf, estimatorParamMaps=param_grid, evaluator=evaluator, numFolds=4)
rf_model_tuned = crossval.fit(training)

#Evaluating the tuned model on the testing data
tuned_rf_accuracy = evaluator.evaluate(rf_model_tuned.transform(testing))

#Printing the accuracy of the tuned model
print("Tuned Random Forest Accuracy: {}".format(tuned_rf_accuracy))

Tuned Random Forest Accuracy: 0.9085312225153914


In [0]:
results = [('Logistic Regression Accuracy',lr_accuracy), ('Decision Tree Accuracy',dt_accuracy),
        ('Random Forest Accurac', rf_accuracy), ('Naive Bayes AccuracyNaive Bayes Accuracy', nb_accuracy),
        ('Tuned Random Forest Accuracy', tuned_rf_accuracy)]

df_results = sqlContext.createDataFrame(results,['model','accuracy'])
df_results.display()


model,accuracy
Logistic Regression Accuracy,0.9868073878627968
Decision Tree Accuracy,0.9173262972735268
Random Forest Accurac,0.8654353562005277
Naive Bayes AccuracyNaive Bayes Accuracy,0.9067722075637644
Tuned Random Forest Accuracy,0.9085312225153914
