## Setting up Spark Session / Context

In [1]:
from pyspark.sql import SparkSession
from operator import add

# Spark properties for this application, applied to the builder in the order listed.
spark_options = {
    "spark.dynamicAllocation.enabled": True,
    "spark.dynamicAllocation.shuffleTracking.enabled": True,
    "spark.shuffle.service.enabled": False,
    "spark.dynamicAllocation.executorIdleTimeout": "220s",
    "spark.executor.cores": 4,
    "spark.driver.port": 9999,
    "spark.blockManager.port": 10005,
}

# Point the builder at the standalone master and name the application.
session_builder = SparkSession.builder.master("spark://192.168.2.156:7077")
session_builder = session_builder.appName("Project Group 32")
for option_name, option_value in spark_options.items():
    session_builder = session_builder.config(option_name, option_value)

# Reuse an already running session if there is one, otherwise start a new one.
spark_session = session_builder.getOrCreate()

# Handle for the RDD API; from here on only ERROR-level log messages are shown.
spark_context = spark_session.sparkContext
spark_context.setLogLevel("ERROR")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/03/09 21:47:25 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Loading the data from HDFS

In [2]:
# Read the raw file from HDFS as an RDD of text lines and print the first line
# to get an impression of what a single record looks like.
# NOTE(review): the recorded output starts with a BOM-like marker and shows a gap
# between every character, which looks like UTF-16 data decoded as UTF-8 -- confirm
# the file encoding.
# NOTE(review): this cell reads reddit_50k.json while the DataFrame below is built
# from reddit_500k.json -- confirm that this is intended.
raw_text_uri = "hdfs://192.168.2.156:9000/data/reddit/reddit_50k.json"
lines = spark_context.textFile(raw_text_uri)

first_line = lines.first()
print(first_line)

[Stage 0:>                                                          (0 + 1) / 1]

��{ " a u t h o r " : " r a y s o f d a r k m a t t e r " , " b o d y " : " I   t h i n k   i t   s h o u l d   b e   f i x e d   o n   e i t h e r   U T C   s t a n d a r d   o r   U T C + 1   y e a r   a r o u n d ,   w i t h   t h e   c u r r e n t   z o n e   o f f s e t s . \ n \ n M o v i n g   t i m e s c a l e s   a d d   a   l o t   o f   c o m p l e x i t y   t o   t h e   i m p l e m e n t a t i o n   o f   t i m e k e e p i n g   s y s t e m s   a n d   h a v e   [ d u b i o u s   v a l u e ] (   \ n \ n I   t h i n k   s e a s o n a l   s h i f t i n g   t i m e   m a d e   s e n s e   i n   t h e   p r e - e l e c t r i c   p a s t ,   w h e n   t i m e k e e p i n g   w a s   m o r e   f l e x i b l e   a n d   a r t i f i c i a l   l i g h t   w a s   i n e f f i c i e n t   a n d   o f t e n   d a n g e r o u s .   \ n \ n N o w   w e   h a v e   m a c h i n e s   t h a t   w o r k   e a s i l y   w i t h   s i m p l e   t i m e k e e p i n g   r u l e s ,   a n d   i 

                                                                                

## Create a dataframe to analyse the posts line by line

In [3]:
from pyspark.sql import SparkSession

# Build a DataFrame from the JSON dump on HDFS using the session created at the
# top of the notebook; the column types are inferred by Spark.
reddit_json_uri = "hdfs://192.168.2.156:9000/data/reddit/reddit_500k.json"
json_reader = spark_session.read
df = json_reader.json(reddit_json_uri)

# Print the inferred columns and their types.
# NOTE(review): the recorded schema contains a `_corrupt_record` column, so at
# least some input lines could not be parsed as JSON -- worth investigating.
df.printSchema()


                                                                                

root
 |-- _corrupt_record: string (nullable = true)
 |-- author: string (nullable = true)
 |-- body: string (nullable = true)
 |-- content: string (nullable = true)
 |-- content_len: long (nullable = true)
 |-- id: string (nullable = true)
 |-- normalizedBody: string (nullable = true)
 |-- subreddit: string (nullable = true)
 |-- subreddit_id: string (nullable = true)
 |-- summary: string (nullable = true)
 |-- summary_len: long (nullable = true)
 |-- title: string (nullable = true)



## Count Total Posts

In [4]:
# count() is an action: it runs a Spark job over the whole DataFrame.
total_posts = df.count()
print(f"Total Posts: {total_posts}")




Total Posts: 1000001


                                                                                

## How many subreddits exist?

In [5]:
# One row per distinct value of the subreddit column (a NULL subreddit is
# counted as a value of its own), then count those rows.
subreddit_names = df.select("subreddit").distinct()
unique_subreddits = subreddit_names.count()
print(f"Unique Subreddits: {unique_subreddits}")


[Stage 7:>                                                          (0 + 1) / 1]

Unique Subreddits: 9610


                                                                                

In [6]:
from pyspark.sql.functions import col, count

# Number of posts per subreddit, largest groups first; show the ten biggest
# without truncating the subreddit names.
posts_per_subreddit = df.groupBy("subreddit").count()
posts_per_subreddit.orderBy("count", ascending=False).show(n=10, truncate=False)




+-----------------+------+
|subreddit        |count |
+-----------------+------+
|NULL             |501789|
|AskReddit        |117305|
|leagueoflegends  |12088 |
|AdviceAnimals    |9413  |
|funny            |8578  |
|politics         |8005  |
|gaming           |7911  |
|pics             |7803  |
|atheism          |6766  |
|explainlikeimfive|5701  |
+-----------------+------+
only showing top 10 rows



                                                                                

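-- We see that many posts have no subreddit (NULL). Since we want to train a classification model that predicts the subreddit, we remove these posts. (The `_corrupt_record` column in the schema suggests that some of these rows may be lines that failed to parse rather than real posts.) --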

In [7]:
from pyspark.sql.functions import col

# Keep only the rows that carry a subreddit value.
has_subreddit = col("subreddit").isNotNull()
df_filtered = df.where(has_subreddit)

# Report how many rows are left after the filter.
remaining_posts = df_filtered.count()
print(f"Total Posts After Filtering: {remaining_posts}")




Total Posts After Filtering: 498212


                                                                                

## To prepare the Data for our ML Classification Model, we use the columns summary and content

In [8]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Columns the classifier needs: the label and the two text fields.
required_columns = ["subreddit", "summary", "content"]

# Drop every row in which at least one of the required columns is NULL.
df_filtered = df.dropna(how="any", subset=required_columns)

# Last expression of the cell: displays the (lazy) projection onto the required columns.
df_filtered.select(*required_columns)


DataFrame[subreddit: string, summary: string, content: string]

## We have to make the Text understandable for the algorithm

1. We first tokenize the columns
2. Remove stop words, since they do not add information to the text
3. We convert the Text with TF-IDF to numbers

In [9]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, StringIndexer, VectorAssembler

# Pipeline stages that turn the two text columns into one numeric feature vector
# and the subreddit name into a numeric label. The stages are only defined here;
# they are fitted later as part of a Pipeline.

# Split each text column into tokens.
tokenizer = Tokenizer(inputCol="summary", outputCol="summary_tokens")
tokenizer2 = Tokenizer(inputCol="content", outputCol="content_tokens")

# Remove stop words, since they carry little information about the topic.
stopwords_remover = StopWordsRemover(inputCol="summary_tokens", outputCol="summary_clean")
stopwords_remover2 = StopWordsRemover(inputCol="content_tokens", outputCol="content_clean")

# TF-IDF: hash the tokens of each column into 1000 buckets (term frequencies),
# then rescale them by inverse document frequency.
hashing_tf = HashingTF(inputCol="summary_clean", outputCol="summary_tf", numFeatures=1000)
idf = IDF(inputCol="summary_tf", outputCol="summary_features")

hashing_tf2 = HashingTF(inputCol="content_clean", outputCol="content_tf", numFeatures=1000)
idf2 = IDF(inputCol="content_tf", outputCol="content_features")

# Convert subreddit (text label) into a numerical label.
# handleInvalid defaults to "error", which makes transform() fail as soon as the
# test data contains a subreddit that was not present in the data the indexer was
# fitted on -- very likely when training on a small sample of thousands of
# subreddits. "skip" drops such rows instead; note that they are then excluded
# from the evaluation metrics as well.
label_indexer = StringIndexer(inputCol="subreddit", outputCol="label", handleInvalid="skip")

# Concatenate the summary and content TF-IDF vectors into a single feature vector.
feature_assembler = VectorAssembler(inputCols=["summary_features", "content_features"], outputCol="features")


In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Evaluate the fitted logistic-regression pipeline on the held-out test split.
# This cell depends on `model` and `test_data`, which are created by the training
# cell further down in the notebook. On a fresh kernel ("Restart & Run All") they
# do not exist yet when this cell runs, so skip with a clear message instead of
# failing with a NameError; re-run this cell after training.
if "model" not in globals() or "test_data" not in globals():
    print("Skipping evaluation: run the training cell first to define `model` and `test_data`.")
else:
    # Make predictions on test data
    predictions = model.transform(test_data)

    # Accuracy = fraction of test rows whose predicted label equals the true label.
    evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)

    print(f"Model Accuracy: {accuracy:.4f}")


In [11]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml import Pipeline

# Work on a 2% sample and split it 80/20 into train and test data. Both the
# sample and the split are seeded so that re-running the cell yields the same rows.
train_data, test_data = df_filtered.sample(fraction=0.02, seed=42).randomSplit([0.8, 0.2], seed=42)

# Define the Random Forest classifier.
# The recorded run lost its executors with exit code 52, which is Spark's exit
# code for an executor OutOfMemoryError, while collecting split statistics
# (RandomForest.findBestSplits). maxMemoryInMB (default 256) bounds the memory
# used for that histogram aggregation; lowering it only makes training take more
# passes over the data, it does not change the resulting model.
# NOTE(review): presumably 4 concurrent tasks per executor at 256 MB each exceeded
# the executor heap -- confirm, and consider raising spark.executor.memory instead.
rf_classifier = RandomForestClassifier(featuresCol="features", labelCol="label", numTrees=100,
                                       maxMemoryInMB=64)

# Same preprocessing stages as the logistic-regression pipeline, with the
# Random Forest as the final estimator.
pipeline_rf = Pipeline(stages=[tokenizer, tokenizer2, stopwords_remover, stopwords_remover2,
                               hashing_tf, idf, hashing_tf2, idf2, label_indexer, feature_assembler, rf_classifier])

# Train the model
model_rf = pipeline_rf.fit(train_data)

# Save the trained model. overwrite() lets the cell be re-run: a plain save()
# raises an error when the target path already exists.
model_rf.write().overwrite().save("hdfs://192.168.2.156:9000/data/reddit/model/reddit_text_classifier_rf")


25/03/09 21:51:11 ERROR TaskSchedulerImpl: Lost executor 1 on 192.168.2.134: Command exited with code 52
25/03/09 21:51:11 ERROR TaskSchedulerImpl: Lost executor 3 on 192.168.2.178: Command exited with code 52
25/03/09 21:51:12 ERROR TaskSchedulerImpl: Lost executor 0 on 192.168.2.144: Command exited with code 52
25/03/09 21:51:13 ERROR TaskSchedulerImpl: Lost executor 2 on 192.168.2.131: Command exited with code 52
25/03/09 21:51:13 ERROR TaskSchedulerImpl: Lost executor 5 on 192.168.2.144: Command exited with code 52
25/03/09 21:51:18 ERROR TaskSchedulerImpl: Lost executor 6 on 192.168.2.237: Command exited with code 52
25/03/09 21:51:32 ERROR TaskSchedulerImpl: Lost executor 7 on 192.168.2.57: Command exited with code 52
25/03/09 21:51:32 ERROR TaskSchedulerImpl: Lost executor 8 on 192.168.2.134: Command exited with code 52
25/03/09 21:51:32 ERROR TaskSchedulerImpl: Lost executor 9 on 192.168.2.131: Command exited with code 52
25/03/09 21:51:32 ERROR TaskSchedulerImpl: Lost executor

Py4JJavaError: An error occurred while calling o116.fit.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 9 in stage 28.0 failed 4 times, most recent failure: Lost task 9.4 in stage 28.0 (TID 427) (192.168.2.144 executor 12): ExecutorLostFailure (executor 12 exited caused by one of the running tasks) Reason: Command exited with code 52
Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2856)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2792)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2791)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2791)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1247)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3060)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2994)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2983)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:989)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2393)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2414)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2433)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2458)
	at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1049)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:410)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:1048)
	at org.apache.spark.rdd.PairRDDFunctions.$anonfun$collectAsMap$1(PairRDDFunctions.scala:738)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:410)
	at org.apache.spark.rdd.PairRDDFunctions.collectAsMap(PairRDDFunctions.scala:737)
	at org.apache.spark.ml.tree.impl.RandomForest$.findBestSplits(RandomForest.scala:663)
	at org.apache.spark.ml.tree.impl.RandomForest$.runBagged(RandomForest.scala:208)
	at org.apache.spark.ml.tree.impl.RandomForest$.run(RandomForest.scala:302)
	at org.apache.spark.ml.classification.RandomForestClassifier.$anonfun$train$1(RandomForestClassifier.scala:168)
	at org.apache.spark.ml.util.Instrumentation$.$anonfun$instrumented$1(Instrumentation.scala:191)
	at scala.util.Try$.apply(Try.scala:213)
	at org.apache.spark.ml.util.Instrumentation$.instrumented(Instrumentation.scala:191)
	at org.apache.spark.ml.classification.RandomForestClassifier.train(RandomForestClassifier.scala:139)
	at org.apache.spark.ml.classification.RandomForestClassifier.train(RandomForestClassifier.scala:47)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:114)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:829)


In [11]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline

# Split data into training and test sets.
# Only a 2% sample is used to keep training time manageable; to train on all
# rows, split df_filtered directly instead of the sample.
# Both the sample and the split are seeded: without a seed on randomSplit the
# train/test rows differ on every run and results cannot be reproduced.
train_data, test_data = df_filtered.sample(fraction=0.02, seed=42).randomSplit([0.8, 0.2], seed=42)


# Define the classification model
classifier = LogisticRegression(featuresCol="features", labelCol="label", maxIter=10)

# Create a pipeline to apply all transformations and train the model
pipeline = Pipeline(stages=[tokenizer, tokenizer2, stopwords_remover, stopwords_remover2,
                            hashing_tf, idf, hashing_tf2, idf2, label_indexer, feature_assembler, classifier])

# Train the model
model = pipeline.fit(train_data)

# Save the trained model (left disabled, as in the original notebook)
# model.save("hdfs://192.168.2.156:9000/data/reddit/model/reddit_text_classifier")


25/03/09 21:39:21 ERROR StandaloneSchedulerBackend: Application has been killed. Reason: Master removed our application: KILLED
25/03/09 21:39:21 ERROR Instrumentation: org.apache.spark.SparkException: Job aborted due to stage failure: Master removed our application: KILLED
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2856)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2792)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2791)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2791)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.D

Py4JJavaError: An error occurred while calling o383.fit.
: org.apache.spark.SparkException: Job aborted due to stage failure: Master removed our application: KILLED
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2856)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2792)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2791)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2791)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1247)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3060)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2994)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2983)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:989)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2393)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2488)
	at org.apache.spark.rdd.RDD.$anonfun$fold$1(RDD.scala:1202)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:410)
	at org.apache.spark.rdd.RDD.fold(RDD.scala:1196)
	at org.apache.spark.rdd.RDD.$anonfun$treeAggregate$2(RDD.scala:1289)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:410)
	at org.apache.spark.rdd.RDD.treeAggregate(RDD.scala:1256)
	at org.apache.spark.rdd.RDD.$anonfun$treeAggregate$1(RDD.scala:1242)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:410)
	at org.apache.spark.rdd.RDD.treeAggregate(RDD.scala:1242)
	at org.apache.spark.ml.optim.loss.RDDLossFunction.calculate(RDDLossFunction.scala:61)
	at org.apache.spark.ml.optim.loss.RDDLossFunction.calculate(RDDLossFunction.scala:47)
	at breeze.optimize.CachedDiffFunction.calculate(CachedDiffFunction.scala:24)
	at breeze.optimize.LineSearch$$anon$1.calculate(LineSearch.scala:52)
	at breeze.optimize.LineSearch$$anon$1.calculate(LineSearch.scala:31)
	at breeze.optimize.StrongWolfeLineSearch.phi$1(StrongWolfe.scala:76)
	at breeze.optimize.StrongWolfeLineSearch.$anonfun$minimizeWithBound$7(StrongWolfe.scala:152)
	at scala.collection.immutable.Range.foreach$mVc$sp(Range.scala:158)
	at breeze.optimize.StrongWolfeLineSearch.minimizeWithBound(StrongWolfe.scala:151)
	at breeze.optimize.StrongWolfeLineSearch.minimize(StrongWolfe.scala:62)
	at breeze.optimize.LBFGS.determineStepSize(LBFGS.scala:82)
	at breeze.optimize.LBFGS.determineStepSize(LBFGS.scala:38)
	at breeze.optimize.FirstOrderMinimizer.$anonfun$infiniteIterations$1(FirstOrderMinimizer.scala:63)
	at scala.collection.Iterator$$anon$7.next(Iterator.scala:140)
	at breeze.util.IteratorImplicits$RichIterator$$anon$2.next(Implicits.scala:79)
	at org.apache.spark.ml.classification.LogisticRegression.trainImpl(LogisticRegression.scala:1015)
	at org.apache.spark.ml.classification.LogisticRegression.$anonfun$train$1(LogisticRegression.scala:634)
	at org.apache.spark.ml.util.Instrumentation$.$anonfun$instrumented$1(Instrumentation.scala:191)
	at scala.util.Try$.apply(Try.scala:213)
	at org.apache.spark.ml.util.Instrumentation$.instrumented(Instrumentation.scala:191)
	at org.apache.spark.ml.classification.LogisticRegression.train(LogisticRegression.scala:497)
	at org.apache.spark.ml.classification.LogisticRegression.train(LogisticRegression.scala:287)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:114)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:78)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:829)


In [None]:
# Stop the SparkContext once the work is done, so that the cores held by this
# application are released for other applications on the cluster.
spark_context.stop()