In [1]:
# Import libraries

# sparkML
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF, StringIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import NaiveBayes
from pyspark.ml import Pipeline

# sparkSQL
from pyspark.sql.functions import udf, col
from pyspark.sql.types import StringType
from pyspark.sql import SparkSession

# other
from sklearn.metrics import classification_report
from nltk.stem import WordNetLemmatizer
import nltk

In [2]:
# Download nltk functions --> Necessary to remove stopwords and more to get better accuracy in our NLP ML model
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Initialize SparkSession
spark = SparkSession.builder.appName("TextClassification").getOrCreate()

In [4]:
# functions
def preprocess_text(text):
    '''Remove stop words, tokenize and clean the data from the title column. Original: |Cis Men of Reddit, if you were a woman, what would you like about Men?  --> PROCESSED: men reddit woman would like men''' 
    tokens = nltk.word_tokenize(text.lower(), language="english")
    tokens = [word for word in tokens if word.isalnum() and len(word) > 1]
    tokens = [lemmatizer.lemmatize(word, pos="v") for word in tokens if word not in stop_words]
    return ' '.join(tokens)

In [5]:
# Load the labeled dataset into spark
df = spark.read.csv("labeled-training-dataset.csv", header=True, inferSchema=True)

# We need a copy of the column, as we are going to tokenize and vektorize the title to better classification
df = df.withColumn("original_title", df["title"])

# Data Preprocessing
lemmatizer = WordNetLemmatizer()
stop_words = set(nltk.corpus.stopwords.words("english"))

# Create a list of unique values of the topic_name column in the csv file. Eventually, you will find some rows that contain a really rare character combination, so you avoid it creating specific previous topics.
TARGET_CLASSES = ["money", "food", "job", "life", "music", "media", "movie", "sexual", "health", "kid", "game", "book", "tech", "relationships"]

# Filter out the elements that are not falling in any of the classes due to a SyntaxError in the Reddit Sentence --> "Is calling someone a ""plaything"" a porn term? If so, what type of porn/ kink content?",sexual [Look at that combination of double ""]
df = df.filter(df["topic_name"].isin(TARGET_CLASSES))

# Call the preprocess_text function to tokenize and lematize the title of the reddit post
preprocess_udf = udf(preprocess_text, StringType())
df = df.withColumn("title", preprocess_udf(df["title"]))

# Convert the topic_name column to numeric
indexer = StringIndexer(inputCol="topic_name", outputCol="label")
indexed_df = indexer.fit(df).transform(df)
indexed_df = indexed_df.withColumn("label", col("label").cast("integer"))

# Split the dataset
(train_df, test_df) = indexed_df.randomSplit([0.8, 0.2], seed=42)

# TF-IDF Vectorization --> Classic steps prior to train a Multiclassification model in NLP
tokenizer = Tokenizer(inputCol="title", outputCol="words")
remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")
cv = CountVectorizer(inputCol="filtered_words", outputCol="raw_features", vocabSize=1500)
idf = IDF(inputCol="raw_features", outputCol="features")

# Naive Bayes model
nb = NaiveBayes(labelCol="label", featuresCol="features", smoothing=1.0, modelType="multinomial")

# Modify the pipeline to use Naive Bayes
pipeline_nb = Pipeline(stages=[tokenizer, remover, cv, idf, nb])

evaluator = MulticlassClassificationEvaluator(labelCol="label", metricName="accuracy")

+-------------+-----+
|   topic_name|count|
+-------------+-----+
|relationships|  140|
|        money|  155|
|         food|  139|
|          job|  142|
|         life|  112|
|        music|  118|
|        media|  122|
|        movie|  140|
|       sexual|  152|
|       health|  145|
|          kid|  143|
|         game|  126|
|         book|  113|
|         tech|  116|
+-------------+-----+



In [6]:
# Fit the model
nb_model = pipeline_nb.fit(train_df)

# Make predictions
predictions = nb_model.transform(test_df)

In [7]:
# Make predictions
predictions = nb_model.transform(test_df)

# Evaluation
accuracy_nb = evaluator.evaluate(predictions)
print("Naive Bayes Accuracy: {:.4f}".format(accuracy_nb))

# Classification Report
y_true = predictions.select("label").rdd.flatMap(lambda x: x).collect()
y_pred = predictions.select("prediction").rdd.flatMap(lambda x: x).collect()
report = classification_report(y_true, y_pred, target_names=TARGET_CLASSES, output_dict=True)

print("Classification Report for Testing Data:")
for topic, metrics in report.items():
    if topic == 'accuracy':
        print(f"Accuracy: {metrics:.4f}")
    else:
        print(f"Topic: {topic}")
        print(f"Precision: {metrics['precision']:.4f}")
        print(f"Recall: {metrics['recall']:.4f}")
        print(f"F1-Score: {metrics['f1-score']:.4f}")
        print(f"Support: {metrics['support']:.0f}")
        print()

Naive Bayes Accuracy: 0.6024
Classification Report for Testing Data:
Topic: money
Precision: 0.4375
Recall: 0.5185
F1-Score: 0.4746
Support: 27

Topic: food
Precision: 0.6176
Recall: 0.7241
F1-Score: 0.6667
Support: 29

Topic: job
Precision: 0.6087
Recall: 0.5385
F1-Score: 0.5714
Support: 26

Topic: life
Precision: 0.6250
Recall: 0.6250
F1-Score: 0.6250
Support: 24

Topic: music
Precision: 0.6500
Recall: 0.5000
F1-Score: 0.5652
Support: 26

Topic: media
Precision: 0.7222
Recall: 0.5652
F1-Score: 0.6341
Support: 23

Topic: movie
Precision: 0.4333
Recall: 0.4333
F1-Score: 0.4333
Support: 30

Topic: sexual
Precision: 0.6471
Recall: 0.5000
F1-Score: 0.5641
Support: 22

Topic: health
Precision: 0.8462
Recall: 0.9167
F1-Score: 0.8800
Support: 24

Topic: kid
Precision: 0.7200
Recall: 0.7200
F1-Score: 0.7200
Support: 25

Topic: game
Precision: 0.8095
Recall: 0.8095
F1-Score: 0.8095
Support: 21

Topic: book
Precision: 0.3077
Recall: 0.2353
F1-Score: 0.2667
Support: 17

Topic: tech
Precision: 0.

In [8]:
spark.stop()