# Task 2

### Display the top 20 users by likes made and by reposts made (for reposts, use the "copy_history" field from the posts subset)

In [1]:
import findspark
import pyspark
import pyspark.sql.functions
import os

In [2]:
# Make the local Spark installation importable as a regular Python library.
# "C:/spark" is only a fallback: a SPARK_HOME already present in the
# environment is respected, so the notebook is not tied to one machine's layout.
os.environ.setdefault("SPARK_HOME", "C:/spark")
findspark.init()

In [3]:
# Create (or reuse) the Spark context, the Spark session and the SQL context.
# The getOrCreate variants keep this cell safe to re-run in the same kernel:
# constructing a second SparkContext while one is already active raises an error.
configs = pyspark.SparkConf().setAppName("task2").setMaster("local")
spark_context = pyspark.SparkContext.getOrCreate(conf=configs)
spark = pyspark.sql.SparkSession.builder.config(conf=configs).getOrCreate()
# Kept so that any code referring to `sql_context` continues to work.
sql_context = pyspark.sql.SQLContext(spark_context)

In [6]:
# Load the post-likes data from every parquet part file in the directory.
data_posts_likes = spark.read.load("posts_likes.parquet/*.parquet")

# Count the rows per "likerId" and keep the 20 largest counts.
# The descending order comes from the column expression itself. The former
# `asc=False` keyword was dropped: `orderBy` only reads `ascending`, so the
# misspelled keyword was silently ignored and merely misleading.
data_likes_count = (
    data_posts_likes
    .groupby("likerId")
    .count()
    .orderBy(pyspark.sql.functions.col("count").desc())
    .limit(20)
)

# Show the top 20 users by likes.
data_likes_count.show()

# Save the top 20 users by likes as JSON, replacing any previous output.
# coalesce(1) collapses the result to a single partition before writing.
data_likes_count.coalesce(1).write.format("json").mode("overwrite").save(
    "task2_output/top20_users_likes.json"
)

+---------+-----+
|  likerId|count|
+---------+-----+
|  2070090| 4801|
|  2397858| 2055|
|  1475301| 1829|
|    18239| 1569|
|   546612| 1245|
|     6371|  907|
|  1841959|  746|
| 78440957|  709|
|   120248|  699|
| 40981497|  611|
|    22158|  553|
|207628162|  548|
|329377723|  504|
| 76071304|  474|
| 14805173|  440|
|   317799|  385|
| 56355640|  375|
| 52042971|  338|
|  7437271|  336|
|136506644|  335|
+---------+-----+



In [8]:
# Load the followers' posts and explode "copy_history" so that every element of
# the array becomes its own row; explode drops posts whose "copy_history" is
# null or empty. The data is read through the `spark` session rather than the
# legacy SQLContext entry point.
data_copy_history = spark.read.json(
    "followers_posts_api_final.json/*.json"
).withColumn("copy_history", pyspark.sql.functions.explode("copy_history"))

# Keep only the rows whose reposted entry has owner_id -94, then count the rows
# per top-level "owner_id" and keep the 20 largest counts.
# NOTE(review): -94 is presumably the id of the community whose posts form the
# subset, and the top-level "owner_id" presumably identifies the reposting
# user - confirm against the dataset description.
data_copy_history_count = (
    data_copy_history
    .filter("copy_history.owner_id == '-94'")
    .groupby("owner_id")
    .count()
    .sort("count", ascending=False)
    .limit(20)
)

# Show the top 20 users by reposts.
data_copy_history_count.show()

# Save the top 20 users by reposts as JSON, replacing any previous output.
# coalesce(1) collapses the result to a single partition before writing.
data_copy_history_count.coalesce(1).write.format("json").mode("overwrite").save(
    "task2_output/top20_users_reposts.json"
)

+---------+-----+
| owner_id|count|
+---------+-----+
|180907432|   48|
|   317799|   16|
|  4068532|   13|
|  2547211|    9|
|484122052|    8|
|268247082|    5|
|  1077823|    5|
|  2070090|    5|
|281951154|    5|
|217400123|    4|
|172808182|    4|
|  1533614|    4|
| 44361144|    4|
|527580876|    4|
|157728618|    4|
| 18467645|    4|
|168543860|    4|
|   256973|    3|
|    86002|    3|
|113773552|    3|
+---------+-----+

