In [34]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF
from pyspark.ml.classification import LinearSVC
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.sql.functions import udf, col
from pyspark.sql.types import StringType
from nltk.stem import WordNetLemmatizer
import nltk
import re

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize SparkSession
spark = SparkSession.builder.appName("TextClassification").getOrCreate()


# Load your dataset into a DataFrame
df = spark.read.csv("testing-dataset.csv", header=True, inferSchema=True)

# Data Preprocessing
lemmatizer = WordNetLemmatizer()
stop_words = set(nltk.corpus.stopwords.words('english'))

# UDF for preprocessing
def preprocess_text(text):
    text = re.sub(r'""', '"', text)
    tokens = nltk.word_tokenize(text.lower())
    tokens = [word for word in tokens if word.isalnum()]
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

preprocess_udf = udf(preprocess_text, StringType())
df = df.withColumn("title", preprocess_udf(df["title"]))
target_classes = [row.topic_name for row in df.select("topic_name").distinct().collect()]

df.groupBy('topic_name').count().show(truncate=False)


from pyspark.ml.feature import StringIndexer

# Convert the topic_name column to numeric
indexer = StringIndexer(inputCol="topic_name", outputCol="label")
indexed_df = indexer.fit(df).transform(df)
indexed_df = indexed_df.withColumn("label", col("label").cast("integer"))


# Split the dataset
(train_df, test_df) = indexed_df.randomSplit([0.8, 0.2], seed=42)

# TF-IDF Vectorization
tokenizer = Tokenizer(inputCol="title", outputCol="words")
remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")
cv = CountVectorizer(inputCol="filtered_words", outputCol="raw_features", vocabSize=1500)
idf = IDF(inputCol="raw_features", outputCol="features")

# Linear SVM model
from pyspark.ml.classification import RandomForestClassifier

# Random Forest model
rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=100)

# Modify the pipeline to use Random Forest
pipeline = Pipeline(stages=[tokenizer, remover, cv, idf, rf])

# Hyperparameter Tuning (Randomized Search)
paramGrid = ParamGridBuilder() \
    .addGrid(rf.maxDepth, [5, 10, 15]) \
    .build()

evaluator = MulticlassClassificationEvaluator(labelCol="label", metricName="accuracy")

crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=5,
                          seed=42)

# Fit the model
cvModel = crossval.fit(train_df)

# Make predictions
predictions = cvModel.transform(test_df)

# Evaluation
accuracy = evaluator.evaluate(predictions)
print("Accuracy: {:.4f}".format(accuracy))

# Classification Report
from sklearn.metrics import classification_report
y_true = predictions.select("label").rdd.flatMap(lambda x: x).collect()
y_pred = predictions.select("prediction").rdd.flatMap(lambda x: x).collect()
report = classification_report(y_true, y_pred, target_names=target_classes, output_dict=True)

print("Classification Report for Testing Data:")
for topic, metrics in report.items():
    if topic == 'accuracy':
        print(f"Accuracy: {metrics:.4f}")
    else:
        print(f"Topic: {topic}")
        print(f"Precision: {metrics['precision']:.4f}")
        print(f"Recall: {metrics['recall']:.4f}")
        print(f"F1-Score: {metrics['f1-score']:.4f}")
        print(f"Support: {metrics['support']:.0f}")
        print()

# Stop Spark
spark.stop()


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


+-------------+-----+
|topic_name   |count|
+-------------+-----+
|relationships|125  |
|money        |152  |
|food         |137  |
|job          |134  |
|life         |83   |
|music        |76   |
|media        |76   |
|movie        |90   |
|sexual       |149  |
|health       |140  |
|kid          |140  |
|would        |110  |
|game         |72   |
|book         |73   |
|tech         |112  |
+-------------+-----+



Accuracy: 0.5087
Classification Report for Testing Data:
Topic: relationships
Precision: 0.7647
Recall: 0.3514
F1-Score: 0.4815
Support: 37

Topic: money
Precision: 0.7143
Recall: 0.5556
F1-Score: 0.6250
Support: 27

Topic: food
Precision: 0.1053
Recall: 0.7143
F1-Score: 0.1835
Support: 14

Topic: job
Precision: 0.7600
Recall: 0.7600
F1-Score: 0.7600
Support: 25

Topic: life
Precision: 0.6667
Recall: 0.8235
F1-Score: 0.7368
Support: 17

Topic: music
Precision: 0.7778
Recall: 0.5600
F1-Score: 0.6512
Support: 25

Topic: media
Precision: 0.4286
Recall: 0.2857
F1-Score: 0.3429
Support: 21

Topic: movie
Precision: 0.7143
Recall: 0.2083
F1-Score: 0.3226
Support: 24

Topic: sexual
Precision: 0.6000
Recall: 0.1579
F1-Score: 0.2500
Support: 19

Topic: health
Precision: 0.8333
Recall: 0.3846
F1-Score: 0.5263
Support: 13

Topic: kid
Precision: 0.5600
Recall: 0.8750
F1-Score: 0.6829
Support: 16

Topic: would
Precision: 0.8750
Recall: 0.3333
F1-Score: 0.4828
Support: 21

Topic: game
Precision: 1.00