In [2]:
#### Import dependencies
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import confusion_matrix

from pyspark.ml import Pipeline

from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, StringIndexer
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator



In [3]:
from pyspark.sql import SparkSession

In [4]:
# Obtain the Spark session used by every later cell (Hive support enabled so
# that tables can be read by name).
spark = (
    SparkSession.builder
    .appName("SMS spam filtering with Multilayer Perceptron Classifier")
    .enableHiveSupport()
    .getOrCreate()
)

In [5]:
# Load the SMS spam collection table as a Spark DataFrame.
raw_dataset = spark.read.table(
    "public_datasets.sms_spam_collection"
)

ivysettings.xml file not found in HIVE_HOME or HIVE_CONF_DIR,/etc/hive/conf.dist/ivysettings.xml will be used


In [6]:
# Encode the string "label" column as a numeric "label_index" column.
# NOTE(review): the confusion-matrix cell labels index 0 as "ham" and index 1
# as "spam"; this presumably relies on StringIndexer's default ordering —
# confirm against the fitted indexer's labels.
label_indexer = StringIndexer(
    inputCol="label",
    outputCol="label_index",
)

In [7]:
# Split each SMS "text" value into a list of tokens stored in "words".
tokenizer = Tokenizer(
    inputCol="text",
    outputCol="words",
)

In [8]:
# Drop stop words from "words", writing the remaining tokens to "filtered_words".
remover = StopWordsRemover(
    inputCol="words",
    outputCol="filtered_words",
)

In [9]:
# Hash the filtered tokens into a fixed-length term-frequency vector.
# numFeatures must stay equal to the first entry of `layers` in the MLP cell
# (the network's input size).
hashingTF = HashingTF(
    inputCol="filtered_words",
    outputCol="numerical_features",
    numFeatures=5000,
)

In [10]:
# Re-weight the hashed term frequencies with IDF; the result is the final
# "features" column consumed by the classifier.
idf = IDF(
    inputCol="numerical_features",
    outputCol="features",
)

In [11]:
# Preprocessing pipeline: label indexing followed by the text-feature stages,
# applied in the order listed.
prep_pipeline = Pipeline(
    stages=[label_indexer, tokenizer, remover, hashingTF, idf]
)

# Fit the pipeline on the raw data and apply it to that same data.
# NOTE(review): the fit happens before the train/test split further down, so
# rows that later land in the test set also contribute to the fitted stages
# (e.g. IDF weights) — confirm this is acceptable for the evaluation.
processed_dataset = (
    prep_pipeline
    .fit(raw_dataset)
    .transform(raw_dataset)
)

23/10/13 15:08:27 WARN org.apache.spark.scheduler.cluster.YarnScheduler: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
23/10/13 15:08:42 WARN org.apache.spark.scheduler.cluster.YarnScheduler: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources


KeyboardInterrupt: 

In [None]:
# Keep only the two columns the classifier needs: numeric label and features.
dataset = processed_dataset.select(
    "label_index",
    "features",
)

In [None]:
# Print the column names and types of the modelling dataset as a sanity check.
dataset.printSchema()

In [None]:
# Preview the first 5 rows of the modelling dataset.
dataset.show(5)

In [None]:
# Partition the rows into training and test sets with 0.8 / 0.2 weights;
# the fixed seed keeps the split repeatable between runs.
trainingData, testData = dataset.randomSplit(weights=[0.8, 0.2], seed=42)

In [None]:
# Network topology: 5000 inputs (one per hashed feature), hidden layers of
# 100 and 50 units, and 2 outputs (spam or ham).
layers = [5000, 100, 50, 2]

# Multilayer perceptron reading "features" and learning "label_index".
mlp_classifier = MultilayerPerceptronClassifier(
    featuresCol="features",
    labelCol="label_index",
    layers=layers,
    blockSize=128,
    seed=42,
)

# Wrap the classifier in a single-stage pipeline and train it on the
# training split.
pipeline = Pipeline(stages=[mlp_classifier])
mlp_model = pipeline.fit(trainingData)

In [None]:
# Score the held-out test set with the trained model.
predictions = mlp_model.transform(testData)

# Accuracy = fraction of test rows whose "prediction" equals "label_index".
evaluator = MulticlassClassificationEvaluator(
    labelCol="label_index",
    predictionCol="prediction",
    metricName="accuracy")

accuracy = evaluator.evaluate(predictions)

print(f"Accuracy: {accuracy}")

# The Spark session is deliberately NOT stopped in this cell: `predictions`
# is still a Spark DataFrame and the next cell collects it with toPandas(),
# which requires a live session. Stop the session only after the last cell
# that touches a Spark DataFrame.


In [None]:
# Bring just the true and predicted label columns to the driver as a pandas
# DataFrame; scikit-learn works on in-memory data.
predictions_pd = (
    predictions
    .select("label_index", "prediction")
    .toPandas()
)

# Confusion matrix of true labels against predicted labels.
confusion = confusion_matrix(
    y_true=predictions_pd["label_index"],
    y_pred=predictions_pd["prediction"],
)

# Visualize the confusion matrix
def plot_confusion_matrix(cm, classes, normalize=False, title="Confusion Matrix", cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated heat map on a new figure.

    Parameters
    ----------
    cm : array-like, 2-D
        Confusion matrix. Rows are drawn along the "True label" axis and
        columns along the "Predicted label" axis.
    classes : sequence of str
        Tick labels, in the same order as the rows/columns of ``cm``.
    normalize : bool, default False
        If True, every row is divided by its sum before plotting, so both the
        colours and the cell annotations show per-row fractions.
    title : str
        Figure title.
    cmap : matplotlib colormap
        Colormap used for the heat map.

    Returns
    -------
    None
        The matrix is printed and drawn on the current pyplot figure; the
        caller is responsible for calling ``plt.show()``.
    """
    cm = np.asarray(cm)

    # Normalise BEFORE drawing so that the image, the annotations and the
    # text-colour threshold all refer to the same values.
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # Rows without any samples stay all-zero instead of turning into NaN.
        cm = np.divide(
            cm,
            row_totals,
            out=np.zeros(cm.shape, dtype=float),
            where=row_totals != 0,
        )
        print("Normalized confusion matrix")
    else:
        print("Confusion matrix, without normalization")

    print(cm)

    plt.figure(figsize=(8, 6))
    plt.imshow(cm, interpolation="nearest", cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Integer counts are printed as-is; fractions are rounded to two decimals.
    cell_format = "d" if np.issubdtype(cm.dtype, np.integer) else ".2f"

    # Cells darker than half the maximum get white text for contrast.
    thresh = cm.max() / 2.0
    for i, j in np.ndindex(*cm.shape):
        plt.text(
            j,
            i,
            format(cm[i, j], cell_format),
            horizontalalignment="center",
            color="white" if cm[i, j] > thresh else "black",
        )

    plt.ylabel("True label")
    plt.xlabel("Predicted label")
    # Run the layout pass last so the axis labels are taken into account.
    plt.tight_layout()

# Tick labels in label-index order.
# NOTE(review): assumes label_index 0 is "ham" and 1 is "spam" — confirm
# against the fitted StringIndexer.
class_names = ["ham", "spam"]
plot_confusion_matrix(confusion, classes=class_names, title="Confusion Matrix")

plt.show()

# All Spark-backed results have been collected by this point, so the session
# can be released. Stopping an already stopped session is harmless.
spark.stop()


23/10/13 15:08:57 WARN org.apache.spark.scheduler.cluster.YarnScheduler: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
