![Alt text](../images/logistic-regression-tokenizer.png)

# Multinomial Logistic Regression

In this notebook, the MLR is trained and tested. Afterwards, [unseen unlabeled reddit messages](unseen-data.csv) is passed through the recently trained model obtaining a [labeled dataset](predictions-unseen.csv) which predicts so good so far. This is the model that will be used in production, running in a EC2 instance.

In [1]:
# Import libraries

# sparkML
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF, StringIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline

# sparkSQL
from pyspark.sql.functions import udf, col, when
from pyspark.sql.types import StringType, DoubleType
from pyspark.sql import SparkSession

# other
from nltk.stem import WordNetLemmatizer
import nltk
import csv

In [2]:
# Download nltk functions --> Necessary to remove stopwords and more to get better accuracy in our NLP ML model
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Initialize SparkSession
spark = SparkSession.builder.appName("TextClassification").getOrCreate()

In [36]:
# UDF for preprocessing
def preprocess_text(text):
    '''Remove stop words, tokenize and clean the data from the title column. Original: |Cis Men of Reddit, if you were a woman, what would you like about Men?  --> PROCESSED: men reddit woman would like men''' 
    tokens = nltk.word_tokenize(text.lower(), language="english")
    tokens = [word for word in tokens if word.isalnum() and len(word) > 1]
    tokens = [lemmatizer.lemmatize(word, pos="v") for word in tokens if word not in stop_words]
    return ' '.join(tokens)

In [37]:

# Load the labeled dataset into spark
df = spark.read.csv("../labeled-dataset/labeled-training-dataset.csv", header=True, inferSchema=True)

# We need a copy of the column, as we are going to tokenize and vektorize the title to better classification
df = df.withColumn("original_title", df["title"])

# Data Preprocessing
lemmatizer = WordNetLemmatizer()
stop_words = set(nltk.corpus.stopwords.words("english"))

# Create a list of unique values of the topic_name column in the csv file. Eventually, you will find some rows that contain a really rare character combination, so you avoid it creating specific previous topics.
TARGET_CLASSES = ["money", "food", "job", "life", "music", "media", "movie", "sexual", "health", "kid", "game", "book", "tech", "relationships"]

# Filter out the elements that are not falling in any of the classes due to a SyntaxError in the Reddit Sentence --> "Is calling someone a ""plaything"" a porn term? If so, what type of porn/ kink content?",sexual [Look at that combination of double ""]
df = df.filter(df["topic_name"].isin(TARGET_CLASSES))

preprocess_udf = udf(preprocess_text, StringType())
df = df.withColumn("title", preprocess_udf(df["title"]))

# Convert the topic_name column to numeric
indexer = StringIndexer(inputCol="topic_name", outputCol="label")
indexed_df = indexer.fit(df).transform(df)
indexed_df = indexed_df.withColumn("label", col("label").cast("integer"))

# Split the dataset
(train_df, test_df) = indexed_df.randomSplit([0.8, 0.2], seed=42)

# TF-IDF Vectorization --> Classic steps prior to train a Multiclassification model in NLP
tokenizer = Tokenizer(inputCol="title", outputCol="words")
remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")
cv = CountVectorizer(inputCol="filtered_words", outputCol="raw_features", vocabSize=1500)
idf = IDF(inputCol="raw_features", outputCol="features")

# Logistic Regression model
lr = LogisticRegression(labelCol="label", featuresCol="features")

# Modify the pipeline to use Logistic Regression
pipeline = Pipeline(stages=[tokenizer, remover, cv, idf, lr])

# Hyperparameter Tuning for Logistic Regression
paramGrid = ParamGridBuilder() \
    .addGrid(lr.regParam, [0.01, 0.1, 0.5]).addGrid(lr.elasticNetParam, [0.0, 0.1, 0.5]).build()

evaluator = MulticlassClassificationEvaluator(labelCol="label", metricName="accuracy")

crossval = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5, seed=42)

In [38]:
# Fit the model
cvModel = crossval.fit(train_df)

# Make predictions
predictions = cvModel.transform(test_df)

In [39]:
# Model Evaluation
accuracy = evaluator.evaluate(predictions)
print("Accuracy: {:.4f}".format(accuracy))

# Classification Report
from sklearn.metrics import classification_report
y_true = predictions.select("label").rdd.flatMap(lambda x: x).collect()
y_pred = predictions.select("prediction").rdd.flatMap(lambda x: x).collect()
report = classification_report(y_true, y_pred, target_names=TARGET_CLASSES, output_dict=True)

print("Classification Report for Testing Data:")
for topic, metrics in report.items():
    if topic == 'accuracy':
        print(f"Accuracy: {metrics:.4f}")
    else:
        print(f"Topic: {topic}")
        print(f"Precision: {metrics['precision']:.4f}")
        print(f"Recall: {metrics['recall']:.4f}")
        print(f"F1-Score: {metrics['f1-score']:.4f}")
        print(f"Support: {metrics['support']:.0f}")
        print()

Accuracy: 0.6544
Classification Report for Testing Data:
Topic: money
Precision: 0.4828
Recall: 0.5185
F1-Score: 0.5000
Support: 27

Topic: food
Precision: 0.6774
Recall: 0.7241
F1-Score: 0.7000
Support: 29

Topic: job
Precision: 0.3696
Recall: 0.6538
F1-Score: 0.4722
Support: 26

Topic: life
Precision: 0.8571
Recall: 0.7500
F1-Score: 0.8000
Support: 24

Topic: music
Precision: 0.7368
Recall: 0.5385
F1-Score: 0.6222
Support: 26

Topic: media
Precision: 0.7619
Recall: 0.6957
F1-Score: 0.7273
Support: 23

Topic: movie
Precision: 0.4000
Recall: 0.4667
F1-Score: 0.4308
Support: 30

Topic: sexual
Precision: 0.7222
Recall: 0.5909
F1-Score: 0.6500
Support: 22

Topic: health
Precision: 0.8750
Recall: 0.8750
F1-Score: 0.8750
Support: 24

Topic: kid
Precision: 0.9500
Recall: 0.7600
F1-Score: 0.8444
Support: 25

Topic: game
Precision: 0.9444
Recall: 0.8095
F1-Score: 0.8718
Support: 21

Topic: book
Precision: 0.5000
Recall: 0.2941
F1-Score: 0.3704
Support: 17

Topic: tech
Precision: 0.8125
Recall:

In [40]:
unseen_df = spark.read.csv("unseen-data.csv", header=True, inferSchema=True)
unseen_df_title = unseen_df.select("title")
unseen_df_title_original = unseen_df_title.withColumn("original_title", unseen_df_title["title"])

preprocess_udf = udf(preprocess_text, StringType())
unseen_df_title_original_processed = unseen_df_title_original.withColumn("title", preprocess_udf(unseen_df_title_original["title"]))

unseen_df_title_original_processed.show(truncate=False)

+---------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|title                                                                                  |original_title                                                                                                                                                    |
+---------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|week inclusive affect weight                                                           |How did a week all inclusive affect your weight?                                                                                                        

In [41]:
predictions_unseen = cvModel.transform(unseen_df_title_original_processed)

In [42]:
# Define a UDF to calculate the maximum probability
def max_probability(prob_list):
    return float(max(prob_list))

# Register the UDF
max_prob_udf = udf(max_probability, DoubleType())

# Add the max_prob column using the UDF
predictions_unseen = predictions_unseen.withColumn("max_prob", max_prob_udf(col("probability")))

label_to_class = {
    0: "money",
    1: "sexual",
    2: "health",
    3: "kid",
    4: "job",
    5: "movies",
    6: "relationships",
    7: "food",
    8: "videogame",
    9: "media",
    10: "music",
    11: "tech",
    12: "book",
    13: "life"
}

def map_label_to_class(label):
    if label != "other":
        return label_to_class.get(float(label))
    else:
        return "other"

# Register the UDF
map_label_udf = udf(map_label_to_class, StringType())

predictions_unseen = predictions_unseen.withColumn(
    "predicted_category",
    when(predictions_unseen["max_prob"] >= 0.3, predictions_unseen["prediction"]).otherwise("other")
)

predictions_unseen = predictions_unseen.withColumn(
    "predicted_category",
    map_label_udf(predictions_unseen["predicted_category"])
)


# Select the desired columns
selected_columns = predictions_unseen.select("original_title", "max_prob", "predicted_category")

# Collect the DataFrame into the driver program
collected_data = selected_columns.collect()


with open("predictions-unseen.csv", "w", newline="", encoding='UTF-8') as csv_file:
    csv_writer = csv.writer(csv_file)
    # Write header
    csv_writer.writerow(["original_title", "max_prob", "predicted_category"])
    # Write data
    for row in collected_data:
        csv_writer.writerow([row.original_title, row.max_prob, row.predicted_category])