![Alt text](image.png)


In [7]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF, StringIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.classification import LogisticRegression
from pyspark.sql.functions import udf, col
from pyspark.sql.types import StringType
from nltk.stem import WordNetLemmatizer
import nltk
import re 

nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")

# Initialize SparkSession
spark = SparkSession.builder.appName("TextClassification").getOrCreate()

# UDF for preprocessing
def preprocess_text(text):
    '''Remove stop words, tokenize and clean the data from the title column. Original: |Cis Men of Reddit, if you were a woman, what would you like about Men?  --> PROCESSED: men reddit woman would like men''' 
    text = re.sub(r'""', '"', text)
    tokens = nltk.word_tokenize(text.lower(), language="english")
    tokens = [word for word in tokens if word.isalnum() and len(word) > 1]
    tokens = [lemmatizer.lemmatize(word, pos="v") for word in tokens if word not in stop_words]
    return ' '.join(tokens)

# Load the labeled dataset into spark
df = spark.read.csv("labeled-training-dataset.csv", header=True, inferSchema=True)

# We need a copy of the column, as we are going to tokenize and vektorize the title to better classification
df = df.withColumn("original_title", df["title"])

# Data Preprocessing
lemmatizer = WordNetLemmatizer()
stop_words = set(nltk.corpus.stopwords.words("english"))

# Create a list of unique values of the topic_name column in the csv file. Eventually, you will find some rows that contain a really rare character combination, so you avoid it creating specific previous topics.
TARGET_CLASSES = ["money", "food", "job", "life", "music", "media", "movie", "sexual", "health", "kid", "game", "book", "tech", "relationships"]

# Filter out the elements that are not falling in any of the classes due to a SyntaxError in the Reddit Sentence --> "Is calling someone a ""plaything"" a porn term? If so, what type of porn/ kink content?",sexual [Look at that combination of double ""]
df = df.filter(df["topic_name"].isin(TARGET_CLASSES))
df.groupby("topic_name").count().show()

preprocess_udf = udf(preprocess_text, StringType())
df = df.withColumn("title", preprocess_udf(df["title"]))

# Convert the topic_name column to numeric
indexer = StringIndexer(inputCol="topic_name", outputCol="label")
indexed_df = indexer.fit(df).transform(df)
indexed_df = indexed_df.withColumn("label", col("label").cast("integer"))

# Split the dataset
(train_df, test_df) = indexed_df.randomSplit([0.8, 0.2], seed=42)

# TF-IDF Vectorization --> Classic steps prior to train a Multiclassification model in NLP
tokenizer = Tokenizer(inputCol="title", outputCol="words")
remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")
cv = CountVectorizer(inputCol="filtered_words", outputCol="raw_features", vocabSize=1500)
idf = IDF(inputCol="raw_features", outputCol="features")

# Logistic Regression model
lr = LogisticRegression(labelCol="label", featuresCol="features")

# Modify the pipeline to use Logistic Regression
pipeline = Pipeline(stages=[tokenizer, remover, cv, idf, lr])


# Hyperparameter Tuning for Logistic Regression
paramGrid = ParamGridBuilder() \
    .addGrid(lr.regParam, [0.01, 0.1, 0.5]).addGrid(lr.elasticNetParam, [0.0, 0.1, 0.5]).build()

evaluator = MulticlassClassificationEvaluator(labelCol="label", metricName="accuracy")

crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=5,
                          seed=42)

# Fit the model
cvModel = crossval.fit(train_df)

# Make predictions
predictions = cvModel.transform(test_df)

# Evaluation
accuracy = evaluator.evaluate(predictions)
print("Accuracy: {:.4f}".format(accuracy))

# Classification Report
from sklearn.metrics import classification_report
y_true = predictions.select("label").rdd.flatMap(lambda x: x).collect()
y_pred = predictions.select("prediction").rdd.flatMap(lambda x: x).collect()
report = classification_report(y_true, y_pred, target_names=TARGET_CLASSES, output_dict=True)



print("Classification Report for Testing Data:")
for topic, metrics in report.items():
    if topic == 'accuracy':
        print(f"Accuracy: {metrics:.4f}")
    else:
        print(f"Topic: {topic}")
        print(f"Precision: {metrics['precision']:.4f}")
        print(f"Recall: {metrics['recall']:.4f}")
        print(f"F1-Score: {metrics['f1-score']:.4f}")
        print(f"Support: {metrics['support']:.0f}")
        print()


# Create a DataFrame with original title, real label, and predicted label
output_df = predictions.select("original_title", "label", "prediction")


# Rename columns for clarity
output_df = output_df.withColumnRenamed("label", "real_label").withColumnRenamed("prediction", "predicted_label")
output_df.show(500, truncate=False)
# Save the DataFrame to a CSV file
failed_df = output_df.where(col('real_label') != col('predicted_label'))
failed_df.show(300, truncate=False)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


+-------------+-----+
|   topic_name|count|
+-------------+-----+
|relationships|  140|
|        money|  155|
|         food|  139|
|          job|  142|
|         life|  112|
|        music|  118|
|        media|  122|
|        movie|  140|
|       sexual|  152|
|       health|  145|
|          kid|  143|
|         game|  126|
|         book|  113|
|         tech|  116|
+-------------+-----+

Accuracy: 0.6544
Classification Report for Testing Data:
Topic: money
Precision: 0.4828
Recall: 0.5185
F1-Score: 0.5000
Support: 27

Topic: food
Precision: 0.6774
Recall: 0.7241
F1-Score: 0.7000
Support: 29

Topic: job
Precision: 0.3696
Recall: 0.6538
F1-Score: 0.4722
Support: 26

Topic: life
Precision: 0.8571
Recall: 0.7500
F1-Score: 0.8000
Support: 24

Topic: music
Precision: 0.7368
Recall: 0.5385
F1-Score: 0.6222
Support: 26

Topic: media
Precision: 0.7619
Recall: 0.6957
F1-Score: 0.7273
Support: 23

Topic: movie
Precision: 0.4000
Recall: 0.4667
F1-Score: 0.4308
Support: 30

Topic: sexual
Prec

In [6]:
print(failed_df.count())
print(output_df.count())

116
327
