In [5]:
from pyspark.sql import SparkSession
import timeit 

# New API
spark_session = SparkSession\
        .builder\
        .master("spark://192.168.2.131:7077") \
        .appName("Team7_code.ipynb")\
        .config("spark.dynamicAllocation.enabled", True)\
        .config("spark.shuffle.service.enabled", True)\
        .config("spark.dynamicAllocation.executorIdleTimeout","30s")\
        .config("spark.cores.max",2)\
        .config("spark.worker.instances",5)\
        .config("spark.executor.cores",1)\
        .config("spark.driver.port",9998)\
        .config("spark.blockManager.port",10005)\
        .getOrCreate()
#\#)\
# Old API (RDD)
spark_context = spark_session.sparkContext

spark_context.setLogLevel("INFO")
print("Started Session")

Started Session


In [6]:
def create_spark_dataframe(filename):
    df = spark_session.read.json('hdfs://192.168.2.131:9000/user/ubuntu/{}'.format(filename))
    return df

start_time_0 = timeit.default_timer() 

#df = create_spark_dataframe('RC_2005-12.json')
#df = create_spark_dataframe('RC_2011-08.json')
df = create_spark_dataframe('RC_2009-05.json')
df.printSchema()
print("execution time {}s".format(timeit.default_timer()-start_time_0))

root
 |-- archived: boolean (nullable = true)
 |-- author: string (nullable = true)
 |-- author_flair_css_class: string (nullable = true)
 |-- author_flair_text: string (nullable = true)
 |-- body: string (nullable = true)
 |-- controversiality: long (nullable = true)
 |-- created_utc: string (nullable = true)
 |-- distinguished: string (nullable = true)
 |-- downs: long (nullable = true)
 |-- edited: string (nullable = true)
 |-- gilded: long (nullable = true)
 |-- id: string (nullable = true)
 |-- link_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- parent_id: string (nullable = true)
 |-- retrieved_on: long (nullable = true)
 |-- score: long (nullable = true)
 |-- score_hidden: boolean (nullable = true)
 |-- subreddit: string (nullable = true)
 |-- subreddit_id: string (nullable = true)
 |-- ups: long (nullable = true)

execution time 22.259210607997375s


In [67]:
from pyspark.sql.functions import desc, col

#num_comments = df.count()
#print('number of comments: {}'.format(num_comments))

start_time_0 = timeit.default_timer() 

author_comment_counts = df.groupBy('author').count()
author_comment_counts.cache()
author_comment_counts.sort(desc("count")).show()
print("execution time {}s".format(timeit.default_timer()-start_time_0))


+------------------+-------+
|            author|  count|
+------------------+-------+
|         [deleted]|2442948|
|         mileylols|   4594|
|            Lots42|   4047|
|       GenJonesMom|   3294|
|     dispatcher_83|   3006|
|           JBgreen|   2817|
|            Spongi|   2731|
|Release_the_KRAKEN|   2680|
|          Osmodius|   2579|
|      redhatnation|   2481|
|      mavriksfan11|   2465|
|   original-finder|   2421|
|    GhostedAccount|   2406|
|         Warlizard|   2296|
|       StickDoctor|   2273|
|           Moridyn|   2053|
|          Ali-Sama|   2041|
|         edubation|   2009|
|          G_Morgan|   1974|
|       silverhydra|   1952|
+------------------+-------+
only showing top 20 rows

execution time 70.14901283899962s


In [7]:

start_time_0 = timeit.default_timer() 
#author_comment_counts.select("count").rdd.map(tuple).map(lambda x: x[0]).map(lambda x: x**2).collect()
df.select("score").rdd.map(tuple).map(lambda x: x[0]).map(lambda x: x**2).collect()

print("execution time {}s".format(timeit.default_timer()-start_time_0))

execution time 8.168689876009012s


In [32]:
start_time_0 = timeit.default_timer() 
author_score = df.groupBy('author').sum('score')
author_score.sort(desc("sum(score)")).show()
print("execution time {}s".format(timeit.default_timer()-start_time_0))

+-------------+----------+
|       author|sum(score)|
+-------------+----------+
|   paulgraham|        98|
|     mattknox|        77|
|    [deleted]|        58|
|      bugbear|        43|
| michaelneale|        39|
|     dstowell|        38|
|         spez|        36|
|      AaronSw|        30|
|     JimThome|        29|
|        sempf|        29|
|    bolinfest|        28|
|       davidw|        27|
|          Zak|        26|
|     symbiont|        25|
|     binladen|        24|
|     enjahova|        23|
|         cg84|        23|
|brendankohler|        21|
|       dylanm|        20|
|     beastboy|        19|
+-------------+----------+
only showing top 20 rows

execution time 2.5746123989956686s


In [40]:
start_time_0 = timeit.default_timer() 

author_df = author_comment_counts.join(author_score, author_comment_counts.author == author_score.author).withColumnRenamed("sum(score)", "score")
author_df.withColumn('Average Score', author_df['score']/author_df['count']).sort(desc('Average Score')).show()
print("execution time {}s".format(timeit.default_timer()-start_time_0))

root
 |-- author: string (nullable = true)
 |-- count: long (nullable = false)
 |-- author: string (nullable = true)
 |-- score: long (nullable = true)

+--------------+-----+--------------+-----+-----------------+
|        author|count|        author|score|    Average Score|
+--------------+-----+--------------+-----+-----------------+
|          Grue|    1|          Grue|   16|             16.0|
|TenebraeVision|    1|TenebraeVision|   16|             16.0|
|        fergie|    1|        fergie|   12|             12.0|
|        bstard|    1|        bstard|   11|             11.0|
|         sempf|    3|         sempf|   29|9.666666666666666|
|     bolinfest|    4|     bolinfest|   28|              7.0|
|    HiggsBoson|    1|    HiggsBoson|    7|              7.0|
|      kanagawa|    1|      kanagawa|    7|              7.0|
|    paulgraham|   15|    paulgraham|   98|6.533333333333333|
|         arkas|    2|         arkas|   13|              6.5|
|      dstowell|    6|      dstowell|   3

In [4]:
# release the cores for another application!
spark_context.stop()
print("Ended Session")

Ended Session
