In [4]:
from pyspark.sql import SparkSession
from operator import add

spark_session = SparkSession.builder\
        .master("spark://192.168.2.156:7077") \
        .appName("RedditTextClassification_1")\
        .config("spark.executor.memory", "8g")\
        .config("spark.driver.memory", "4g")\
        .config("spark.executor.cores", 4)\
        .config("spark.shuffle.service.enabled", False)\
        .config("spark.dynamicAllocation.enabled", True)\
        .config("spark.dynamicAllocation.minExecutors", 2)\
        .config("spark.dynamicAllocation.maxExecutors", 8)\
        .config("spark.shuffle.service.enabled", True)\
        .config("spark.dynamicAllocation.executorIdleTimeout", "120s")\
        .config("spark.driver.port",9999)\
        .config("spark.blockManager.port",10005)\
        .getOrCreate()

# RDD API
spark_context = spark_session.sparkContext
spark_context.setLogLevel("ERROR")


## Setting up Spark Session / Context

In [4]:
# from pyspark.sql import SparkSession
# from operator import add

# spark_session = SparkSession.builder\
#         .master("spark://192.168.2.156:7077") \
#         .appName("Project Group 32")\
#         .config("spark.dynamicAllocation.enabled", True)\
#         .config("spark.dynamicAllocation.shuffleTracking.enabled",True)\
#         .config("spark.shuffle.service.enabled", False)\
#         .config("spark.dynamicAllocation.executorIdleTimeout","120s")\
#         .config("spark.executor.cores", 2)\
#         .config("spark.driver.port",9999)\
#         .config("spark.blockManager.port",10005)\
#         .getOrCreate()

# # RDD API
# spark_context = spark_session.sparkContext
# spark_context.setLogLevel("ERROR")

## Loading the data from HDFS

In [None]:
# The same example, this time using map and reduce from the Spark API, and loading the text file from HDFS.

lines = spark_context.textFile("hdfs://192.168.2.156:9000/data/reddit/reddit_50k.json")
print(lines.first())

[Stage 0:>                                                          (0 + 0) / 1]

## Create a dataframe to analyse the posts line by line

In [None]:
from pyspark.sql import SparkSession

# Initialize Spark Session
# spark = SparkSession.builder.appName("RedditJSONProcessing").getOrCreate()

# Load JSON file into a Spark DataFrame
df = spark_session.read.json("hdfs://192.168.2.156:9000/data/reddit/reddit_500k.json")

# Show schema to understand the structure
df.printSchema()

# Show first few rows to inspect data
df.show(5, truncate=False)


## Count Total Posts

In [8]:
print(f"Total Posts: {df.count()}")


25/03/09 14:29:24 ERROR TransportClient: Failed to send RPC RPC 8537826835214227631 to /192.168.2.203:48312: io.netty.channel.StacklessClosedChannelException
io.netty.channel.StacklessClosedChannelException
	at io.netty.channel.AbstractChannel$AbstractUnsafe.write(Object, ChannelPromise)(Unknown Source)
25/03/09 14:29:24 ERROR TransportClient: Failed to send RPC RPC 8227967262488942120 to /192.168.2.237:39166: io.netty.channel.StacklessClosedChannelException
io.netty.channel.StacklessClosedChannelException
	at io.netty.channel.AbstractChannel$AbstractUnsafe.write(Object, ChannelPromise)(Unknown Source)
25/03/09 14:29:24 ERROR TransportClient: Failed to send RPC RPC 8357084806247150639 to /192.168.2.178:44172: io.netty.channel.StacklessClosedChannelException
io.netty.channel.StacklessClosedChannelException
	at io.netty.channel.AbstractChannel$AbstractUnsafe.write(Object, ChannelPromise)(Unknown Source)
25/03/09 14:29:24 ERROR TransportClient: Failed to send RPC RPC 7673876306390507360 t

Total Posts: 1000001


                                                                                

## How many Subreddits do exist?

In [9]:
unique_subreddits = df.select("subreddit").distinct().count()
print(f"Unique Subreddits: {unique_subreddits}")




Unique Subreddits: 9610


                                                                                

In [10]:
from pyspark.sql.functions import col, count

df.groupBy("subreddit").count().orderBy(col("count").desc()).show(10, False)


[Stage 19:>                                                         (0 + 1) / 1]

+-----------------+------+
|subreddit        |count |
+-----------------+------+
|NULL             |501789|
|AskReddit        |117305|
|leagueoflegends  |12088 |
|AdviceAnimals    |9413  |
|funny            |8578  |
|politics         |8005  |
|gaming           |7911  |
|pics             |7803  |
|atheism          |6766  |
|explainlikeimfive|5701  |
+-----------------+------+
only showing top 10 rows



                                                                                

-- We see that many post are not assigned to a Subreddit, since we want to train a Classification model, we delete the NULL post --

In [13]:
from pyspark.sql.functions import col

# Filter out rows where subreddit is NULL
df_filtered = df.filter(col("subreddit").isNotNull())

# Show first few rows after filtering
# df_filtered.show(5, truncate=False)

# Count remaining rows
print(f"Total Posts After Filtering: {df_filtered.count()}")




Total Posts After Filtering: 498212


                                                                                

## To prepare the Data for our ML Classification Model, we use the columns summary and content

In [14]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Filter out NULL subreddit, summary, or content
df_filtered = df.filter((col("subreddit").isNotNull()) & (col("summary").isNotNull()) & (col("content").isNotNull()))

# Show filtered data
df_filtered.select("subreddit", "summary", "content").show(5, truncate=False)


+-----------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

## We have to make the Text understandable for the algorithm

1. We first tokenize the the columns
2. Remove stop words, since they do not add information to the text
3. We convert the Text with TF-IDF to numbers

In [15]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, StringIndexer, VectorAssembler

# Tokenize summary and content
tokenizer = Tokenizer(inputCol="summary", outputCol="summary_tokens")
tokenizer2 = Tokenizer(inputCol="content", outputCol="content_tokens")

# Remove stopwords
stopwords_remover = StopWordsRemover(inputCol="summary_tokens", outputCol="summary_clean")
stopwords_remover2 = StopWordsRemover(inputCol="content_tokens", outputCol="content_clean")

# Convert words to numerical features using TF-IDF
hashing_tf = HashingTF(inputCol="summary_clean", outputCol="summary_tf", numFeatures=1000)
idf = IDF(inputCol="summary_tf", outputCol="summary_features")

hashing_tf2 = HashingTF(inputCol="content_clean", outputCol="content_tf", numFeatures=1000)
idf2 = IDF(inputCol="content_tf", outputCol="content_features")

# Convert subreddit (text label) into a numerical label
label_indexer = StringIndexer(inputCol="subreddit", outputCol="label")

# Combine summary and content features
feature_assembler = VectorAssembler(inputCols=["summary_features", "content_features"], outputCol="features")


In [17]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline

# Split data into training and test sets
train_data, test_data = df_filtered.randomSplit([0.8, 0.2], seed=42)

# Define the classification model
classifier = LogisticRegression(featuresCol="features", labelCol="label", maxIter=10)

# Create a pipeline to apply all transformations and train the model
pipeline = Pipeline(stages=[tokenizer, tokenizer2, stopwords_remover, stopwords_remover2,
                            hashing_tf, idf, hashing_tf2, idf2, label_indexer, feature_assembler, classifier])

# Train the model
model = pipeline.fit(train_data)

# Save the trained model
# model.save("hdfs://192.168.2.156:9000/data/reddit/model/reddit_text_classifier")


25/03/09 14:47:55 ERROR TaskSchedulerImpl: Lost executor 40 on 192.168.2.60: Command exited with code 52
25/03/09 14:49:13 ERROR TaskSchedulerImpl: Lost executor 46 on 192.168.2.144: Command exited with code 52
25/03/09 14:49:15 ERROR TaskSchedulerImpl: Lost executor 38 on 192.168.2.237: Command exited with code 52
25/03/09 14:49:17 ERROR TaskSchedulerImpl: Lost executor 37 on 192.168.2.218: Command exited with code 52
25/03/09 14:49:18 ERROR TaskSchedulerImpl: Lost executor 39 on 192.168.2.57: Command exited with code 52
25/03/09 14:49:19 ERROR TaskSchedulerImpl: Lost executor 36 on 192.168.2.144: Command exited with code 52
25/03/09 14:49:23 ERROR TaskSchedulerImpl: Lost executor 35 on 192.168.2.11: Command exited with code 52
25/03/09 14:49:24 ERROR TaskSchedulerImpl: Lost executor 45 on 192.168.2.11: Command exited with code 52
25/03/09 14:49:25 ERROR TaskSchedulerImpl: Lost executor 43 on 192.168.2.131: Command exited with code 52
25/03/09 14:49:25 ERROR TaskSchedulerImpl: Lost ex

Py4JJavaError: An error occurred while calling o330.fit.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 18 in stage 39.0 failed 4 times, most recent failure: Lost task 18.4 in stage 39.0 (TID 492) (192.168.2.60 executor 69): ExecutorLostFailure (executor 69 exited caused by one of the running tasks) Reason: Command exited with code 52
Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2856)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2792)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2791)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2791)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1247)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3060)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2994)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2983)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:989)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2393)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2488)
	at org.apache.spark.rdd.RDD.$anonfun$fold$1(RDD.scala:1202)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:410)
	at org.apache.spark.rdd.RDD.fold(RDD.scala:1196)
	at org.apache.spark.rdd.RDD.$anonfun$treeAggregate$2(RDD.scala:1289)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:410)
	at org.apache.spark.rdd.RDD.treeAggregate(RDD.scala:1256)
	at org.apache.spark.rdd.RDD.$anonfun$treeAggregate$1(RDD.scala:1242)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:410)
	at org.apache.spark.rdd.RDD.treeAggregate(RDD.scala:1242)
	at org.apache.spark.ml.optim.loss.RDDLossFunction.calculate(RDDLossFunction.scala:61)
	at org.apache.spark.ml.optim.loss.RDDLossFunction.calculate(RDDLossFunction.scala:47)
	at breeze.optimize.CachedDiffFunction.calculate(CachedDiffFunction.scala:24)
	at breeze.optimize.FirstOrderMinimizer.calculateObjective(FirstOrderMinimizer.scala:53)
	at breeze.optimize.FirstOrderMinimizer.initialState(FirstOrderMinimizer.scala:47)
	at breeze.optimize.FirstOrderMinimizer.iterations(FirstOrderMinimizer.scala:99)
	at org.apache.spark.ml.classification.LogisticRegression.trainImpl(LogisticRegression.scala:1005)
	at org.apache.spark.ml.classification.LogisticRegression.$anonfun$train$1(LogisticRegression.scala:634)
	at org.apache.spark.ml.util.Instrumentation$.$anonfun$instrumented$1(Instrumentation.scala:191)
	at scala.util.Try$.apply(Try.scala:213)
	at org.apache.spark.ml.util.Instrumentation$.instrumented(Instrumentation.scala:191)
	at org.apache.spark.ml.classification.LogisticRegression.train(LogisticRegression.scala:497)
	at org.apache.spark.ml.classification.LogisticRegression.train(LogisticRegression.scala:287)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:114)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:829)


In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Make predictions on test data
predictions = model.transform(test_data)

# Evaluate model accuracy
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)

print(f"Model Accuracy: {accuracy:.4f}")


25/03/09 14:50:56 ERROR TaskSchedulerImpl: Lost executor 52 on 192.168.2.218: Command exited with code 52
25/03/09 14:51:06 ERROR TaskSchedulerImpl: Lost executor 53 on 192.168.2.57: Command exited with code 52
25/03/09 14:51:07 ERROR TaskSchedulerImpl: Lost executor 56 on 192.168.2.11: Command exited with code 52
[Stage 39:>                                                       (0 + 24) / 34]

In [22]:
# release the cores for another application!
spark_context.stop()