# Task 3

### Find the reposts of the ITMO group's original posts (posts.json) among the users' posts. The result should have the form (group_post_id, Array(user_post_ids)).

In [1]:
import pyspark
import pyspark.sql.functions
import os

In [2]:
# Configuration of the SPARK_HOME system variable.
# A SPARK_HOME that is already exported in the environment takes precedence, so
# the notebook also runs on machines where Spark is installed elsewhere;
# "C:/spark" is only the fallback for an unconfigured environment.
os.environ.setdefault("SPARK_HOME", "C:/spark")

In [3]:
# pyspark session and SQL context creation.
# getOrCreate() reuses a context/session that is already running in this kernel,
# so re-executing the cell does not fail with
# "Cannot run multiple SparkContexts at once".
configs = pyspark.SparkConf().setAppName("task3").setMaster("local")
spark_context = pyspark.SparkContext.getOrCreate(conf=configs)
spark = pyspark.sql.SparkSession.builder.config(conf=configs).getOrCreate()
# SQLContext is kept because other cells may still read through it; it is handed
# the session created above so that both entry points share the same session.
sql_context = pyspark.sql.SQLContext(spark_context, spark)

In [4]:
# Followers data loading.
# NOTE(review): data_followers is not used by the repost pipeline in this cell;
# confirm whether another cell needs it before dropping the read.
data_followers = spark.read.load("followers.parquet/*.parquet")

# Followers posts loading and copy history extracting.
# explode() produces one row per copy_history entry; posts whose copy_history is
# null or empty produce no rows, so only reposts remain.
data_copy_history = (
    spark.read.json("followers_posts_api_final.json/*.json")
    .withColumn("copy_history", pyspark.sql.functions.explode("copy_history"))
)

# Copy history filtering: keep only entries whose original owner is -94
# (presumably the ITMO group's owner id; verify against posts.json).
data_copy_history_filtered = data_copy_history.filter("copy_history.owner_id == '-94'")

# Reposts table creating similar to (group_post_id, Array (user_post_ids)).
# No sort is done before groupby: collect_list gives no ordering guarantee after
# the shuffle, so a pre-sort only costs an extra global sort. The rows are
# ordered once, after the aggregation.
data_reposts = (
    data_copy_history_filtered
    .select(
        data_copy_history_filtered.copy_history.id.alias("id_group_post"),
        data_copy_history_filtered.id.alias("id_user_post"),
    )
    .groupby("id_group_post")
    .agg(pyspark.sql.functions.collect_list("id_user_post"))
    .sort("id_group_post")
)

# Reposts showing (first 20 rows, long arrays truncated by show()).
data_reposts.show()

# Reposts saving: coalesce(1) makes Spark write a single part file into the
# "task3_output.json" output directory; an existing output is overwritten.
data_reposts.coalesce(1).write.format("json").mode("overwrite").save("task3_output.json")

+-------------+--------------------------+
|id_group_post|collect_list(id_user_post)|
+-------------+--------------------------+
|        38730|        [9523, 2590, 8187]|
|        38738|                    [8188]|
|        38740|      [1060, 31900, 113...|
|        38748|                    [9574]|
|        38751|                    [8054]|
|        38754|                   [10318]|
|        38755|                     [303]|
|        38764|                    [5076]|
|        38767|                [778, 364]|
|        38791|                     [622]|
|        38814|                    [9678]|
|        38818|                    [9664]|
|        38823|                    [3512]|
|        38847|              [3373, 9697]|
|        38854|                    [2613]|
|        38857|                    [2618]|
|        38858|                    [2620]|
|        38859|                    [2633]|
|        38862|                    [2642]|
|        38867|              [2644, 2914]|
+----------