### Initializing a SparkSession

In [2]:
from pyspark.sql import SparkSession

# Entry point for all DataFrame work in this notebook. getOrCreate() hands back
# the already-running session when there is one, otherwise it starts a new
# session registered under the application name 'NLP'.
spark = (
    SparkSession.builder
    .appName('NLP')
    .getOrCreate()
)

### Loading the dataset

In [4]:
# The file is tab-separated and has no header row, so Spark names the two
# columns '_c0' and '_c1'; rename them to something meaningful right away.
df = (
    spark.read
    .csv('/FileStore/tables/SMSSpamCollection', sep='\t', inferSchema=True)
    .withColumnRenamed('_c0', 'class')
    .withColumnRenamed('_c1', 'text')
)
df.show()

### Feature engineering

In [6]:
from pyspark.sql.functions import length

# Add the character count of every message as a numeric 'length' column;
# it is later combined with the tf-idf vector into the model's features.
df = df.withColumn(
    'length',
    length(df['text']),
)
df.show()

In [7]:
# Average of every numeric column per class (at this point that is only
# 'length'), to see whether message length differs between the classes.
(
    df.groupBy('class')
    .mean()
    .show()
)

doc 1 'i, like, apples, i'

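tf-idf = tf * log10(N / df), where tf = number of times the term occurs in the doc, df = number of docs containing the term, and N = total number of docs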

i = 2 -> 2 * log(2/2) -> 0

like = 1 -> 1 * log(2/1) -> 0.3 ('like' is a relatively relevant string in this doc)

apples = 1 -> 1 * log(2/2) -> 0

hate = 0 -> 0 * log(2/1) -> 0

doc 2 'i, hate, apples'

i = 1 -> 1 * log(2/2) -> 0

like = 0 -> 0 * log(2/1) -> 0

apples = 1 -> 1 * log(2/2) -> 0

hate = 1 -> 1 * log(2/1) -> 0.3 ('hate' is a relatively relevant string in this doc)

df (document frequency: the number of docs that contain the term, used in the idf factor log(N/df))

i = 2

like = 1

apples = 2

hate = 1

N = 2

### Formatting the text column

In [10]:
from pyspark.ml.feature import StringIndexer, Tokenizer, StopWordsRemover, CountVectorizer, IDF, VectorAssembler

# This cell only *declares* the feature stages; nothing is fitted or
# transformed until they are placed in a Pipeline. Each stage reads the
# column written by the previous one:
#   class -> label
#   text  -> token_text -> token_stop -> count_vec -> tf-idf
#   (length, tf-idf) -> features

# Encode the string class column as a numeric label index.
ham_spam_to_numeric = StringIndexer(
    inputCol='class',
    outputCol='label',
)

# Split each message into a list of tokens.
tokenizer = Tokenizer(
    inputCol='text',
    outputCol='token_text',
)

# Drop stop words from the token list.
stop_remove = StopWordsRemover(
    inputCol='token_text',
    outputCol='token_stop',
)

# Term-frequency vector over the vocabulary learned at fit time.
count_vec = CountVectorizer(
    inputCol='token_stop',
    outputCol='count_vec',
)

# Rescale the raw counts by inverse document frequency.
idf = IDF(
    inputCol='count_vec',
    outputCol='tf-idf',
)

# Despite its name this is the VectorAssembler stage, not a DataFrame: it
# concatenates the message length and the tf-idf vector into 'features'.
transformed_df = VectorAssembler(
    inputCols=['length', 'tf-idf'],
    outputCol='features',
)

In [11]:
from pyspark.ml import Pipeline

# Run the feature stages in dependency order: label encoding, tokenising,
# stop-word removal, term counts, idf weighting, and finally assembling the
# single 'features' vector.
df_pipe = Pipeline(
    stages=[
        ham_spam_to_numeric,
        tokenizer,
        stop_remove,
        count_vec,
        idf,
        transformed_df,
    ]
)

# Fit every stage on the full DataFrame, apply it, and keep only the two
# columns the classifier consumes.
# NOTE(review): the vocabulary and idf weights are learned here, before the
# train/test split, so test rows influence them; consider fitting on the
# training portion only.
final_df = (
    df_pipe.fit(df)
    .transform(df)
    .select('label', 'features')
)
final_df.show()

### Splitting the dataset

In [13]:
# 70/30 train/test split. The seed is fixed so that the same rows land in each
# partition on every Restart & Run All; without it the split, and every metric
# computed from it, changes from run to run.
train_data, test_data = final_df.randomSplit([0.7, 0.3], seed=42)

### Building the ML model

In [15]:
from pyspark.ml.classification import NaiveBayes

# Naive Bayes classifier reading the assembled 'features' vector and the
# numeric 'label'; its predicted class is written to 'prediction'.
classifier = NaiveBayes(
    featuresCol='features',
    labelCol='label',
    predictionCol='prediction',
)

# Train on the training partition only; the test partition is held out.
fittied_classifer = classifier.fit(train_data)

### Predicting the test data

In [17]:
# Score the held-out test rows with the fitted model; the result is the test
# DataFrame with the model's output columns appended.
preds = fittied_classifer.transform(test_data)
# Preview the first rows of the predictions.
preds.show()

### Evaluating the model

In [19]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

# Area under the ROC curve must be computed from a continuous score, not from
# the hard 0/1 'prediction' column: with only two distinct values the ROC
# collapses to a single threshold and the number is not a real AUC. The
# evaluator accepts a length-2 vector column, so feed it the model's
# per-class 'probability' output (the default probability column of the
# fitted classifier) and let it rank rows by the probability of label 1.
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol='probability',
    labelCol='label',
    metricName='areaUnderROC',
)
area_under_curve = evaluator.evaluate(preds)

# Accuracy compares the hard class decision with the true label. The evaluator
# gets its own name so `accuracy` is only ever bound to the resulting float.
accuracy_evaluator = MulticlassClassificationEvaluator(
    metricName='accuracy',
    labelCol='label',
    predictionCol='prediction',
)
accuracy = accuracy_evaluator.evaluate(preds)

print(area_under_curve)
print(accuracy)