# Spark NLP

Building a spam detection model using NLP.


In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vector
from pyspark.ml.classification import NaiveBayes
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [None]:
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('nlp')
    .getOrCreate()
)

In [None]:
# Load the tab-separated SMS collection, letting Spark infer the column types.
data = spark.read.csv(
    "DataSets/SMSSpamCollection",
    inferSchema=True,
    sep='\t',
)

In [None]:
# Replace the auto-generated column names with descriptive ones.
data = (
    data
    .withColumnRenamed('_c0', 'class')
    .withColumnRenamed('_c1', 'text')
)

In [None]:
# Preview the DataFrame with its renamed 'class' and 'text' columns.
data.show()

## Clean and Prepare the Data

**Create a new length feature:**

In [None]:
from pyspark.sql.functions import length

In [None]:
# Add a 'length' column holding the length of each message's text.
data = data.withColumn(
    'length',
    length(data['text']),
)

In [None]:
# Preview the table now that it carries the 'length' column ...
data.show()
# ... and look at a single row in full.
data.head()

In [None]:
# Per-class means of the numeric columns; the two classes show a
# pretty clear difference.
(
    data
    .groupby('class')
    .mean()
    .show()
)

## Feature Transformations

In [None]:
from pyspark.ml.feature import Tokenizer,StopWordsRemover, CountVectorizer,IDF,StringIndexer

# Convert the target variable ('class') to numeric labels.
ham_spam_to_num = StringIndexer(
    inputCol='class',
    outputCol='label',
)

# Text-processing stages: each one reads the column written by the one above it
# (text -> token_text -> stop_tokens -> c_vec -> tf_idf).
tokenizer = Tokenizer(
    inputCol="text",
    outputCol="token_text",
)
stopremove = StopWordsRemover(
    inputCol='token_text',
    outputCol='stop_tokens',
)
count_vec = CountVectorizer(
    inputCol='stop_tokens',
    outputCol='c_vec',
)
idf = IDF(
    inputCol="c_vec",
    outputCol="tf_idf",
)



In [None]:
# Combine the TF-IDF vector and the message length into one 'features' vector.
clean_up = VectorAssembler(
    inputCols=[
        'tf_idf',
        'length',
    ],
    outputCol='features',
)


### The Model

We'll use Naive Bayes, but feel free to play around with this choice!

In [None]:
# Use defaults: the classifier is constructed without any explicit parameters.
nb = NaiveBayes()

### Pipeline

In [None]:
# Chain the preprocessing stages in execution order: label indexing, text
# processing, then assembly of the final feature vector.
data_prep_pipe = Pipeline(
    stages=[
        ham_spam_to_num,
        tokenizer,
        stopremove,
        count_vec,
        idf,
        clean_up,
    ]
)

In [None]:
# Fit the preprocessing pipeline on the full dataset.
cleaner = data_prep_pipe.fit(data)

In [None]:
# Run the fitted pipeline over the data to add the 'label' and 'features'
# columns (plus the intermediate ones), then inspect one transformed row.
clean_data = cleaner.transform(data)
clean_data.head()

### Training and Evaluation!

In [None]:
# Keep only the two columns needed for training and evaluation.
clean_data = clean_data.select('label', 'features')

In [None]:
# Preview the reduced label/features table.
clean_data.show()

In [None]:
# Split into training and test sets with 0.7 / 0.3 weights.  No seed is passed,
# so the rows in each split differ from run to run.
training, testing = clean_data.randomSplit([0.7, 0.3])

In [None]:
# Train the Naive Bayes model on the training split.
spam_predictor = nb.fit(training)

In [None]:
# Print the schema of `data` (the DataFrame before the pipeline transform).
data.printSchema()

In [None]:
# Apply the trained model to the held-out test split.
test_results = spam_predictor.transform(testing)

In [None]:
# Preview the model's output on the test split.
test_results.show()

In [None]:
# Score the predictions on the test split.  MulticlassClassificationEvaluator
# defaults to the F1 metric, so accuracy is requested explicitly to make the
# number match what the message below reports.
acc_eval = MulticlassClassificationEvaluator(metricName='accuracy')
acc = acc_eval.evaluate(test_results)
print("Accuracy of model at predicting spam was: {}".format(acc))