# Task 5

### Find friends. The idea: if two users like each other's posts, they are friends

In [1]:
import findspark
import pyspark
import pyspark.sql.functions
import os

In [2]:
# Make the local Spark installation importable as a regular Python library.
# A SPARK_HOME that is already exported takes precedence; "C:/spark" is only
# the fallback, so the notebook is not tied to one machine's directory layout.
os.environ.setdefault("SPARK_HOME", "C:/spark")
findspark.init()

In [3]:
# Spark context, session and SQL context creation.
# getOrCreate() reuses the context already running in this kernel, so
# re-executing the cell does not fail with
# "Cannot run multiple SparkContexts at once".
# NOTE: when an existing context is reused, `configs` is not applied to it.
configs = pyspark.SparkConf().setAppName("task5").setMaster("local")
spark_context = pyspark.SparkContext.getOrCreate(conf=configs)
spark = pyspark.sql.SparkSession(spark_context)
sql_context = pyspark.sql.SQLContext(spark_context)

In [10]:
# Load the (ownerId, likerId) pairs of the followers' post likes, drop
# "self likes" (rows where ownerId equals likerId) and collapse repeated
# ownerId - likerId pairs into a single row.
data_followers_posts_likes = (
    spark.read.load("followers_posts_likes.parquet/*.parquet")
    .select("ownerId", "likerId")
    .filter(pyspark.sql.functions.col("ownerId") != pyspark.sql.functions.col("likerId"))
    .distinct()
)

# Friends data creation: (id_user, Array(his_friend_ids)).
# The self-join keeps only pairs that are present in both directions,
# (owner=A, liker=B) and (owner=B, liker=A), i.e. users who liked each
# other's posts.
data_friends = (
    data_followers_posts_likes.alias("t1")
    .join(
        data_followers_posts_likes.alias("t2"),
        (pyspark.sql.functions.col("t1.ownerId") == pyspark.sql.functions.col("t2.likerId"))
        & (pyspark.sql.functions.col("t1.likerId") == pyspark.sql.functions.col("t2.ownerId")),
    )
    .select(
        pyspark.sql.functions.col("t1.ownerId").alias("id_user"),
        pyspark.sql.functions.col("t2.ownerId").alias("id_friend"),
    )
    .groupby("id_user")
    # collect_list() gives no ordering guarantee after the groupby shuffle, so
    # a sort() placed before the aggregation does not reliably order the lists.
    # sort_array() orders each list itself (ascending). The alias keeps the
    # column name that the bare collect_list() aggregate produced, so the
    # output schema is unchanged.
    .agg(
        pyspark.sql.functions.sort_array(
            pyspark.sql.functions.collect_list("id_friend")
        ).alias("collect_list(id_friend)")
    )
    .sort("id_user")
)

# Print the first 20 rows (long cell values truncated) as a quick check
data_friends.show(n=20, truncate=True)

# Write the result as JSON from a single partition, replacing any previous output
data_friends.coalesce(1).write.json("task5_output.json", mode="overwrite")

+-------+-----------------------+
|id_user|collect_list(id_friend)|
+-------+-----------------------+
|    637|   [1567, 94494, 815...|
|   1119|              [2004962]|
|   1127|   [27857, 317799, 2...|
|   1174|   [2134327, 139499389]|
|   1567|     [637, 2212, 94494]|
|   2212|                 [1567]|
|   4023|   [1548876, 1034920...|
|   7373|   [180092, 317799, ...|
|   8909|      [27905, 12742533]|
|  12671|   [12977, 234753, 3...|
|  12977|   [12671, 269559, 3...|
|  15221|   [25554, 50601, 36...|
|  18589|                [18751]|
|  18751|                [18589]|
|  18994|                [45781]|
|  20972|                [29840]|
|  21571|               [410199]|
|  22304|   [27857, 507824, 3...|
|  24147|   [27419, 81102, 42...|
|  24770|              [4656597]|
+-------+-----------------------+
only showing top 20 rows

