In [1]:
from pyspark.sql import SparkSession
# Build (or reuse, via getOrCreate) the notebook's Spark session with Hive
# support enabled and the master set to "local".
sparkSession = (
    SparkSession.builder
    .enableHiveSupport()
    .master("local")
    .getOrCreate()
)

In [2]:
# Location of the input graph dataset; a later cell loads it with read.parquet().
graphPath = "/data/graphDFSample"

In [3]:
from pyspark.sql.functions import explode, collect_list, size, col, row_number, sort_array, count, udf
from pyspark.sql.types import ArrayType, IntegerType
from pyspark.sql import Window

# Load the graph dataset stored at graphPath (Parquet format) as a DataFrame.
# NOTE(review): later cells reference columns "user" and "friends"; the actual
# schema is not visible here — confirm against the data.
graphData = sparkSession.read.parquet(graphPath)

In [4]:
# Invert the graph: emit one row per entry of "friends", then gather, for each
# friend, the list of users that reference it.
users_by_friend = (
    graphData
    .withColumn("friend", explode("friends"))
    .groupBy("friend")
    .agg(collect_list("user").alias("users"))
)

# Sort each user list and keep only friends referenced by more than one user,
# since a single user cannot form a pair.
friend_users = (
    users_by_friend
    .withColumn("users", sort_array("users"))
    .filter(size("users") > 1)
)

In [None]:
def make_pairs(users):
    """Return every two-element combination of ``users`` as a list of lists.

    Pairs keep the input order: for positions i < j the pair is
    ``[users[i], users[j]]``. Inputs with fewer than two elements yield an
    empty list.
    """
    total = len(users)
    return [
        [users[first], users[second]]
        for first in range(total - 1)
        for second in range(first + 1, total)
    ]

# Expose make_pairs as a Spark UDF; the declared return type is an array of
# integer arrays (one inner array per pair).
# NOTE(review): IntegerType presumes the user ids fit Spark's integer type —
# confirm against the type of the "user" column.
pair_type = ArrayType(IntegerType())
pairs_udf = udf(make_pairs, ArrayType(pair_type))

In [None]:
# Turn each friend's user list into user pairs (one row per pair).
pair_rows = (
    friend_users
    .select(pairs_udf("users").alias("pairs"))
    .withColumn("user_pair", explode("pairs"))
)

# Count how many times each pair occurs; that count is reported as
# common_friends_count.
pair_counts = (
    pair_rows
    .groupBy("user_pair")
    .agg(count("user_pair").alias("common_friends_count"))
)

# Split the two-element pair array into separate user_1 / user_2 columns.
result = pair_counts.select(
    col("common_friends_count"),
    col("user_pair")[0].alias("user_1"),
    col("user_pair")[1].alias("user_2")
)


In [None]:
# Ordering shared by the window and the final sort: highest common-friend
# count first, then user_1 and user_2, all descending.
ranking = [
    col("common_friends_count").desc(),
    col("user_1").desc(),
    col("user_2").desc(),
]

# The window has no partitionBy, so rows are numbered across the whole result.
window = Window.orderBy(*ranking)

numbered = result.withColumn("row_number", row_number().over(window))

# NOTE(review): the filter keeps row numbers 1..49 (49 rows) although the
# variable is called top50 — confirm whether "<= 50" was intended.
top50 = (
    numbered
    .filter(col("row_number") < 50)
    .orderBy(*ranking)
    .collect()
)

In [None]:
# Print each collected row as "<common_friends_count> <user_1> <user_2>".
# print is called with one parenthesised argument so the cell runs on both
# Python 2 and Python 3; the bare print statement is a SyntaxError on Python 3.
for value in top50:
    print('%s %s %s' % (value.common_friends_count, value.user_1, value.user_2))