In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.types import (
    StructType,
    StructField,
    DoubleType,
)
from pyspark.sql.functions import (
    col,
    array_join,
    udf,
)
from sparknlp.base import DocumentAssembler

from sparknlp.annotator import Tokenizer
from nltk.corpus import stopwords
from sparknlp.annotator import (
    StopWordsCleaner,
    PerceptronModel,
    Chunker,
    LemmatizerModel,
    Normalizer,
)
from sparknlp.base import Finisher
from pyspark.ml import Pipeline
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Single VADER analyzer instance for the notebook; get_sentiment() reads it as a global.
analyzer = SentimentIntensityAnalyzer()

In [4]:
# Glob matching the raw parquet files that are read as pipeline input.
input_dir = "data/raw/*.parquet"
# Directory the scored result is written to (as parquet).
output_dir = "data/sentiment"

In [5]:
# Local Spark session; the Spark NLP package is requested through
# spark.jars.packages so it is resolved when the session starts.
spark = (
    SparkSession.builder
    .master("local[*]")  # run locally with the local[*] master
    .appName("Reddit sentiment Processing")
    .config(
        "spark.jars.packages",
        "com.johnsnowlabs.nlp:spark-nlp-spark32_2.12:3.4.2",
    )
    .getOrCreate()
)

24/06/11 18:44:37 WARN Utils: Your hostname, Exporo-MBP-219.local resolves to a loopback address: 127.0.0.1; using 10.10.5.83 instead (on interface en0)
24/06/11 18:44:37 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


:: loading settings :: url = jar:file:/Users/johnomole/anaconda3/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/johnomole/.ivy2/cache
The jars for the packages stored in: /Users/johnomole/.ivy2/jars
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
com.johnsnowlabs.nlp#spark-nlp-spark32_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-45065e8d-9e3b-408e-9ef3-99379a5b0815;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.0.1 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.0.1 in central
	found org.apache.kafka#kafka-clients;2.4.1 in central
	found com.github.luben#zstd-jni;1.4.4-3 in central
	found org.lz4#lz4-java;1.7.1 in central
	found org.xerial.snappy#snappy-java;1.1.7.5 in central
	found org.slf4j#slf4j-api;1.7.30 in central
	found org.spark-project.spark#unused;1.0.0 in central
	found org.apache.commons#commons-pool2;2.6.2 in central
	found com.johnsnowlabs.nlp#spark-nlp-spark32_2.12;3.4.2 in central
	found com.typesafe#config;1.4.1 in central
	found

In [6]:
# Load every parquet file matched by input_dir into a single DataFrame.
df = spark.read.parquet(input_dir)

                                                                                

In [7]:
# Preview the first 10 rows to check the schema and contents.
# NOTE(review): the captured preview repeats the same comment `id` on several
# rows - confirm whether duplicates are expected or should be dropped upstream.
df.show(10)

                                                                                

+-------+------------------+-------------+-----+----------+---------+--------------------+--------------------+-------------------+
|     id|            author|  created_utc|score| parent_id|subreddit|           permalink|                text|          timestamp|
+-------+------------------+-------------+-----+----------+---------+--------------------+--------------------+-------------------+
|l70kegt|5chrodingers_pussy|1.717472683E9|    2|t1_l70ii9i| JoeRogan|/r/JoeRogan/comme...|You bring up a co...|2024-06-04 05:44:43|
|l70kegt|5chrodingers_pussy|1.717472683E9|    2|t1_l70ii9i| JoeRogan|/r/JoeRogan/comme...|You bring up a co...|2024-06-04 05:44:43|
|l70kegt|5chrodingers_pussy|1.717472683E9|    2|t1_l70ii9i| JoeRogan|/r/JoeRogan/comme...|You bring up a co...|2024-06-04 05:44:43|
|l6z1s7d|     Sardoodledome|1.717450311E9|    1|t1_l6z1cw9| JoeRogan|/r/JoeRogan/comme...|Well I forgot abo...|2024-06-03 23:31:51|
|l6z1s7d|     Sardoodledome|1.717450311E9|    1|t1_l6z1cw9| JoeRogan|/r/JoeR

In [8]:
# --- Spark NLP preprocessing pipeline -------------------------------------
# Each stage reads the annotation column(s) produced by an earlier stage.

# Wrap the raw `text` column into a Spark NLP document annotation.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Tokenisation: split each document into tokens.
tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("tokenized")
)

# Normalise the tokens; lower-casing is switched on explicitly, any other
# clean-up is whatever the Normalizer does by default.
normalizer = (
    Normalizer()
    .setLowercase(True)
    .setInputCols(["tokenized"])
    .setOutputCol("normalized")
)

# Lemmatize the normalised tokens with the default pretrained model.
lemmatizer = (
    LemmatizerModel.pretrained()
    .setInputCols(["normalized"])
    .setOutputCol("lemmatized")
)

# Remove English stop words (list supplied by NLTK) from the lemmas.
eng_stopwords = stopwords.words("english")
stopwords_cleaner = (
    StopWordsCleaner()
    .setStopWords(eng_stopwords)
    .setInputCols(["lemmatized"])
    .setOutputCol("no_stop_lemmatized")
)

# Part-of-speech tags for the lemmas, using the pretrained "pos_anc" model.
pos_tagger = (
    PerceptronModel.pretrained("pos_anc")
    .setInputCols(["document", "lemmatized"])
    .setOutputCol("pos")
)

# Chunk adjective+noun and noun+noun tag sequences into n-grams.
allowed_tags = ["<JJ>+<NN>", "<NN>+<NN>"]
chunker = (
    Chunker()
    .setRegexParsers(allowed_tags)
    .setInputCols(["document", "pos"])
    .setOutputCol("ngrams")
)

# Convert the selected annotation columns into plain `finished_*` columns.
# NOTE(review): "pos" and "ngrams" are not listed here and do not appear in
# the output shown by later cells, so the tagger/chunker results look unused -
# confirm whether "ngrams" should be finished too or those stages removed.
finisher = Finisher().setInputCols(
    ["no_stop_lemmatized", "normalized", "tokenized"]
)

# Assemble the stages in execution order.
pipeline = Pipeline(
    stages=[
        documentAssembler,
        tokenizer,
        normalizer,
        lemmatizer,
        stopwords_cleaner,
        pos_tagger,
        chunker,
        finisher,
    ]
)

lemma_antbnc download started this may take some time.


24/06/11 18:46:32 WARN BasicProfileConfigLoader: Your profile name includes a 'profile ' prefix. This is considered part of the profile name in the Java SDK, so you will need to include this prefix in your profile name when you reference this profile from your Java code.


Approximate size to download 907.6 KB
[ / ]lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[ — ]Download done! Loading the resource.


                                                                                

[OK!]


24/06/11 18:47:37 WARN StopWordsCleaner: Default locale set was [en_PT]; however, it was not found in available locales in JVM, falling back to en_US locale. Set param `locale` in order to respect another locale.


pos_anc download started this may take some time.
Approximate size to download 3.9 MB
[ | ]pos_anc download started this may take some time.
Approximate size to download 3.9 MB
Download done! Loading the resource.
[ / ]



[ \ ]

                                                                                

[ | ]



CodeCache: size=131072Kb used=35668Kb max_used=35674Kb free=95403Kb
 bounds [0x0000000106010000, 0x0000000108330000, 0x000000010e010000]
 total_blobs=13716 nmethods=12668 adapters=960
 compilation: disabled (not enough contiguous free space left)
[OK!]


In [10]:
# Fit and apply the NLP pipeline, then flatten the cleaned lemma array into a
# single space-separated string column for the sentiment scorer.
processed_df = (
    pipeline.fit(df)
    .transform(df)
    .withColumn(
        "processed_text",
        array_join(col("finished_no_stop_lemmatized"), " "),
    )
)

In [11]:
# Return type of the sentiment UDF: a struct with one double field per score,
# in the same order as the tuple returned by get_sentiment().
sentiment_schema = StructType(
    [
        StructField(score_name, DoubleType())
        for score_name in ("compound", "positive", "neutral", "negative")
    ]
)

In [12]:
def get_sentiment(text):
    """Score one piece of text with the module-level VADER ``analyzer``.

    Args:
        text: The string to score, or ``None`` when the column value is null.

    Returns:
        A ``(compound, positive, neutral, negative)`` tuple of floats, or
        ``None`` when ``text`` is ``None`` so that a null input produces a
        null result instead of an exception inside the analyzer.
    """
    if text is None:
        return None

    scores = analyzer.polarity_scores(text)
    # The UDF schema declares every field as DoubleType, so hand back plain
    # Python floats in the schema's field order.
    return tuple(float(scores[key]) for key in ("compound", "pos", "neu", "neg"))

# Expose get_sentiment to Spark as a UDF that returns the sentiment_schema struct.
sentiment_udf = udf(get_sentiment, returnType=sentiment_schema)

# Score the cleaned text; all four scores land in one struct column `sentiment`.
# NOTE(review): the scorer sees the lower-cased, stop-word-stripped lemmas, not
# the raw `text`. VADER is documented to use capitalisation, punctuation and
# negation words - confirm that scoring `processed_text` is intended.
sentiment_df = processed_df.withColumn(
    "sentiment",
    sentiment_udf(col("processed_text")),
)

In [13]:
# Flatten the `sentiment` struct: expand its fields (compound, positive,
# neutral, negative) into top-level columns after the existing ones, then drop
# the struct itself.
sentiment__df = (
    sentiment_df
    .select("*", "sentiment.*")
    .drop("sentiment")
)

In [14]:
# Preview a few scored rows (this triggers the NLP pipeline and the UDF).
sentiment__df.show(5)

[Stage 9:>                                                          (0 + 1) / 1]

+-------+------------------+-------------+-----+----------+---------+--------------------+--------------------+-------------------+---------------------------+--------------------+--------------------+--------------------+--------+--------+-------+--------+
|     id|            author|  created_utc|score| parent_id|subreddit|           permalink|                text|          timestamp|finished_no_stop_lemmatized| finished_normalized|  finished_tokenized|      processed_text|compound|positive|neutral|negative|
+-------+------------------+-------------+-----+----------+---------+--------------------+--------------------+-------------------+---------------------------+--------------------+--------------------+--------------------+--------+--------+-------+--------+
|l70kegt|5chrodingers_pussy|1.717472683E9|    2|t1_l70ii9i| JoeRogan|/r/JoeRogan/comme...|You bring up a co...|2024-06-04 05:44:43|       [bring, couple, s...|[you, bring, up, ...|[You, bring, up, ...|bring couple some...|  0.

                                                                                

In [15]:
# Persist the scored rows as parquet under output_dir.
# NOTE(review): mode("append") adds to whatever is already in output_dir, and
# the input is read in full from input_dir each run, so re-running this
# notebook looks like it will write the same rows again - confirm whether
# "overwrite" or de-duplication is wanted.
sentiment__df.write.mode("append").parquet(output_dir)

24/06/11 18:49:34 WARN DAGScheduler: Broadcasting large task binary with size 1200.7 KiB
24/06/11 18:49:41 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 96.54% for 7 writers
24/06/11 18:49:41 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 84.47% for 8 writers
24/06/11 18:49:41 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 96.54% for 7 writers
                                                                                

In [18]:
# Read back the written parquet files to verify the output. The location is
# derived from output_dir instead of repeating the literal path, so the
# read-back cannot drift from the directory the write used.
df_result = spark.read.parquet(output_dir + "/*.parquet")

                                                                                

In [19]:
# Preview the rows read back from disk.
df_result.show(5)

+-------+------------------+-------------+-----+----------+---------+--------------------+--------------------+-------------------+---------------------------+--------------------+--------------------+--------------------+--------+--------+-------+--------+
|     id|            author|  created_utc|score| parent_id|subreddit|           permalink|                text|          timestamp|finished_no_stop_lemmatized| finished_normalized|  finished_tokenized|      processed_text|compound|positive|neutral|negative|
+-------+------------------+-------------+-----+----------+---------+--------------------+--------------------+-------------------+---------------------------+--------------------+--------------------+--------------------+--------+--------+-------+--------+
|l70kegt|5chrodingers_pussy|1.717472683E9|    2|t1_l70ii9i| JoeRogan|/r/JoeRogan/comme...|You bring up a co...|2024-06-04 05:44:43|       [bring, couple, s...|[you, bring, up, ...|[You, bring, up, ...|bring couple some...|  0.

In [20]:
 # Scratch sanity check; presumably the three proportion scores of one row,
 # which should add up to 1.0 - TODO confirm which row these came from.
 0.177 +  0.679 +   0.144

1.0

24/06/11 21:28:29 WARN TransportChannelHandler: Exception in connection from /10.10.5.83:64002
java.io.IOException: Operation timed out
	at sun.nio.ch.FileDispatcherImpl.read0(Native Method)
	at sun.nio.ch.SocketDispatcher.read(SocketDispatcher.java:39)
	at sun.nio.ch.IOUtil.readIntoNativeBuffer(IOUtil.java:223)
	at sun.nio.ch.IOUtil.read(IOUtil.java:192)
	at sun.nio.ch.SocketChannelImpl.read(SocketChannelImpl.java:378)
	at io.netty.buffer.PooledByteBuf.setBytes(PooledByteBuf.java:254)
	at io.netty.buffer.AbstractByteBuf.writeBytes(AbstractByteBuf.java:1132)
	at io.netty.channel.socket.nio.NioSocketChannel.doReadBytes(NioSocketChannel.java:357)
	at io.netty.channel.nio.AbstractNioByteChannel$NioByteUnsafe.read(AbstractNioByteChannel.java:151)
	at io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:788)
	at io.netty.channel.nio.NioEventLoop.processSelectedKeysOptimized(NioEventLoop.java:724)
	at io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:6