In [1]:
from pyspark.sql import SparkSession

spark_session = SparkSession\
        .builder\
        .master("spark://group4-0:7077") \
        .appName("DE-1-4-SparkSession")\
        .config("spark.dynamicAllocation.enabled", True)\
        .config("spark.dynamicAllocation.shuffleTracking.enabled",True)\
        .config("spark.shuffle.service.enabled", False)\
        .config("spark.dynamicAllocation.executorIdleTimeout","30s")\
        .config("spark.executor.cores",2)\
        .config("spark.driver.port",9999)\
        .config("spark.blockManager.port",10005)\
        .getOrCreate()

24/03/04 12:02:42 WARN Utils: Your hostname, group4-0 resolves to a loopback address: 127.0.0.1; using 192.168.2.91 instead (on interface ens3)
24/03/04 12:02:42 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/03/04 12:02:45 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
reddit_df = spark_session.read.json("hdfs://localhost:9000/user/ubuntu/corpus-webis-tldr-17.json")
reddit_df.show()
reddit_df.printSchema()

                                                                                

+------------------+--------------------+--------------------+-----------+-------+--------------------+--------------------+------------+--------------------+-----------+--------+
|            author|                body|             content|content_len|     id|      normalizedBody|           subreddit|subreddit_id|             summary|summary_len|   title|
+------------------+--------------------+--------------------+-----------+-------+--------------------+--------------------+------------+--------------------+-----------+--------+
|  raysofdarkmatter|I think it should...|I think it should...|        178|c69al3r|I think it should...|                math|    t5_2qh0n|Shifting seasonal...|          8|    NULL|
|           Stork13|Art is about the ...|Art is about the ...|        148|c6a9nxd|Art is about the ...|               funny|    t5_2qh33|Personal opinions...|          4|    NULL|
|     Cloud_dreamer|Ask me what I thi...|Ask me what I thi...|         76|c6acx4l|Ask me what I thi.

In [6]:
path_pos = "./opinion-lexicon-English/positive-words.txt"
path_neg = "./opinion-lexicon-English/negative-words.txt"
positive_words = set()
negative_words = set()
with open(path_pos, 'r', encoding='utf-8') as file:
    for line in file:
        positive_words.add(line.strip())
with open(path_neg, 'r', encoding='utf-8') as file:
    for line in file:
        negative_words.add(line.strip())

In [8]:
print(reddit_df.first())

Row(author='raysofdarkmatter', body="I think it should be fixed on either UTC standard or UTC+1 year around, with the current zone offsets.\n\nMoving timescales add a lot of complexity to the implementation of timekeeping systems and have [dubious value]( \n\nI think seasonal shifting time made sense in the pre-electric past, when timekeeping was more flexible and artificial light was inefficient and often dangerous. \n\nNow we have machines that work easily with simple timekeeping rules, and it's more beneficial to spend a small amount on energy for lighting, and save the larger cost of engineering things to work with the complex timekeeping rules, as well as saving the irritation to humans.\n\nLighting has gotten much more efficient over time; we can squeeze out a lot more photons per unit of energy from a 2012 CFL or LED than a candle could in 1780, or a lightbulb could in 1950. \n\nThere's a lot of room for improvement in how we use lights as well; as lighting control gets more int

In [7]:
reddit_df = reddit_df.drop("author", "body", "normalizedBody", "id", "subreddit_id", "title")
reddit_df.printSchema()


root
 |-- content: string (nullable = true)
 |-- content_len: long (nullable = true)
 |-- subreddit: string (nullable = true)
 |-- summary: string (nullable = true)
 |-- summary_len: long (nullable = true)



In [8]:
sampled_reddit_df = reddit_df.sample(False, 0.05)
sampled_reddit_df = sampled_reddit_df.filter(sampled_reddit_df["subreddit"] != "NULL")

In [9]:
broadcast_positive_words = spark_session.sparkContext.broadcast(positive_words)
broadcast_negative_words = spark_session.sparkContext.broadcast(negative_words)

In [10]:
import re
def pre_process(summary):
    line = summary.lower()
    line = re.sub(r"[.,]",'',line).split(" ")
    positive_count = sum([word in broadcast_positive_words.value for word in line])
    negative_count = sum([word in broadcast_negative_words.value for word in line])
    if positive_count > negative_count:
        return (1,positive_count,negative_count)
    elif negative_count > positive_count:
        return (-1,positive_count,negative_count)
    else:
        return (0, positive_count, negative_count)
    

In [11]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType, IntegerType, StructType, StructField

schema = StructType([
    StructField("sentiment", IntegerType(), False),
    StructField("positive_count", IntegerType(), False),
    StructField("negative_count", IntegerType(), False)
])

conotation_udf = udf(pre_process, schema)
reddit_df_conotation = sampled_reddit_df.withColumn("conotation", conotation_udf("summary"))
reddit_df_conotation.printSchema()
print(reddit_df_conotation.first()['conotation'])

root
 |-- content: string (nullable = true)
 |-- content_len: long (nullable = true)
 |-- subreddit: string (nullable = true)
 |-- summary: string (nullable = true)
 |-- summary_len: long (nullable = true)
 |-- conotation: struct (nullable = true)
 |    |-- sentiment: integer (nullable = false)
 |    |-- positive_count: integer (nullable = false)
 |    |-- negative_count: integer (nullable = false)



[Stage 2:>                                                          (0 + 1) / 1]

Row(sentiment=-1, positive_count=0, negative_count=1)


                                                                                

In [113]:
for i in reddit_df_conotation.take(10):
    print(i['conotation'])
    #print(f"Positive count: {i['conotation']['positive_count']}, negative count: {i['conotation']['negative_count']}")
    print(i['summary'])

Row(sentiment=0, positive_count=0, negative_count=0)
Class only items dropped from high-lvl monsters.
Row(sentiment=0, positive_count=0, negative_count=0)
OPI Nail Envy!
Row(sentiment=1, positive_count=2, negative_count=1)
get a good CPA - they aren't that expensive but are 100% worth it
Row(sentiment=0, positive_count=1, negative_count=1)
just because you're close "at times" doesn't mean you didn't get stomped in a best of 5 that you lost 6 games in.
Row(sentiment=0, positive_count=0, negative_count=0)
It's a half-assed fan-art that literally put effort into one-half of the picture.
Row(sentiment=0, positive_count=0, negative_count=0)
it is possible to be in a race you didnt know you were in.
Row(sentiment=-1, positive_count=4, negative_count=12)
As if Sweep/Smash spec wasn't already borderline OP in PvP, prepare to see a lot more of them come 1.4 buffs. 
 EDIT: Don't want to give the impression that I thought Focus spec was totally and absolutely broken. It does have weaknesses like 

In [12]:
grouped = reddit_df_conotation.rdd.map(lambda x: (x["subreddit"], x["conotation"]["sentiment"]))
grouped.take(10)

                                                                                

[('linux', -1),
 ('uwaterloo', 1),
 ('AskReddit', 0),
 ('AskReddit', 0),
 ('funny', 0),
 ('minimalism', -1),
 ('gardening', 0),
 ('funny', 1),
 ('orlando', -1),
 ('leagueoflegends', 0)]

In [13]:
from operator import add
grouped = grouped.reduceByKey(add)
grouped.take(1)

                                                                                

[('KerbalSpaceProgram', 2)]

In [17]:
grouped = grouped.sortBy(lambda x: x[1], ascending = False)
grouped.take(10)

                                                                                

[('leagueoflegends', 337),
 ('buildapc', 217),
 ('summonerschool', 152),
 ('starcraft', 89),
 ('electronic_cigarette', 87),
 ('seduction', 85),
 ('malefashionadvice', 78),
 ('magicTCG', 72),
 ('DotA2', 67),
 ('Guitar', 58)]

In [18]:
grouped = grouped.sortBy(lambda x: x[1], ascending = True)
grouped.take(10)

                                                                                

[('AskReddit', -6103),
 ('relationships', -3749),
 ('tifu', -869),
 ('relationship_advice', -379),
 ('WTF', -348),
 ('funny', -324),
 ('trees', -302),
 ('offmychest', -256),
 ('atheism', -256),
 ('depression', -248)]