In [3]:
import os
import sys
# Environment variables are assigned before the PySpark shell bootstrap
# script is executed at the end of this cell.
# NOTE(review): the graphframes artifact is built for Spark 2.3 / Scala 2.11,
# while the startup banner reports Spark 2.4.3 -- confirm compatibility.
os.environ["PYSPARK_SUBMIT_ARGS"] = '--packages graphframes:graphframes:0.6.0-spark2.3-s_2.11 pyspark-shell'
os.environ["PYSPARK_PYTHON"] = '/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"] = '/usr/hdp/current/spark2-client'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

# Put the pyspark sources and the py4j archive shipped under SPARK_HOME at the
# front of the import path.
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

# Execute the PySpark shell script in this namespace (the banner reports that
# it makes the session available as `spark`). The file is read inside a `with`
# block so the handle is closed deterministically instead of being leaked.
with open(os.path.join(spark_home, 'python/pyspark/shell.py')) as shell_file:
    shell_source = shell_file.read()
exec(shell_source)

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.3
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [4]:
# Number of partitions Spark SQL uses when it shuffles data.
shuffle_partitions = 500
spark.conf.set("spark.sql.shuffle.partitions", shuffle_partitions)

In [5]:
# Root directory of the data set; the other locations hang off it.
dataPath = "/share/graphx"

# List of users to produce predictions for.
usersToPredictPath = dataPath + "/prediction.csv"
# Training graph: one row per user with the serialized friends list.
graphPath = dataPath + "/trainGraph"

In [6]:
# Read the tab-separated training graph and give the two positional CSV
# columns meaningful names.
data = (
    spark.read
    .format("csv")
    .option("delimiter", "\t")
    .load(graphPath)
    .withColumnRenamed("_c0", "user")
    .withColumnRenamed("_c1", "friendsString")
)

In [7]:
# Preview the first 20 rows with long cell values truncated.
data.show(n=20, truncate=True)

+-----+--------------------+
| user|       friendsString|
+-----+--------------------+
| 1424|{(846,0),(1691,25...|
| 4128|{(49747,0),(53568...|
| 4480|{(4677,0),(22256,...|
| 4656|{(520,0),(12380,0...|
| 5040|{(629,0),(2471,0)...|
| 6288|{(24231,49152),(2...|
| 9088|{(3887,32),(3921,...|
|13360|{(58570,0),(74833...|
|13568|{(852,0),(8729,0)...|
|16480|{(10189,0),(10679...|
|18048|{(24637,0),(24646...|
|20464|{(7425,17408),(15...|
|22256|{(4480,0),(9902,0...|
|23424|{(1306,0),(8843,0...|
|26320|{(5148,0),(5953,0...|
|27744|{(12310,0),(25763...|
|32128|{(26784,0),(39582...|
|32352|{(8700,1024),(357...|
|32896|{(9523,1),(12310,...|
|35856|{(540,0),(3880,0)...|
+-----+--------------------+
only showing top 20 rows



In [7]:
from pyspark.sql.functions import abs, col, explode, collect_list, sort_array, size, split, lit
from pyspark.sql.types import *

In [8]:
from pyspark.sql.functions import udf

def cutStartEndBrackets(s):
    """Drop the first two and the last two characters of ``s``.

    For a friends string such as ``"{(846,0),(1691,25)}"`` this removes the
    leading ``{(`` and the trailing ``)}``, leaving ``"846,0),(1691,25"``.

    Args:
        s: The string to trim, or ``None``.

    Returns:
        ``s[2:-2]``, or ``None`` when ``s`` is ``None`` so that a null column
        value passes through the UDF instead of raising ``TypeError``.
    """
    if s is None:
        return None
    return s[2:-2]

# Spark SQL wrapper around cutStartEndBrackets; the result is a string column.
cutStartEndBracketsUDF = udf(f=cutStartEndBrackets, returnType=StringType())

In [9]:
# Turn every row of `data` into one (user, friend) row per listed friend:
#   1. trim the outer brackets and split the remainder on "),(" so each array
#      element looks like "friend,mask";
#   2. explode the array into rows;
#   3. keep the part before the comma as the friend id and cast both ids to
#      integers.
# The split pattern is a raw string: "\)" and "\(" are regex escapes, not
# Python escapes, and a non-raw literal triggers an invalid-escape warning.
userFriend = (
    data
    .select(
        col("user"),
        split(cutStartEndBracketsUDF(col("friendsString")), r"\),\(").alias("friendsMasks"),
    )
    .withColumn("friendMask", explode("friendsMasks"))
    .withColumn("friend", split(col("friendMask"), ",")[0])
    .select(col("user").cast("integer"), col("friend").cast("integer"))
)

# For every friend, gather the sorted list of users that list that friend and
# keep only lists with at least two users (a single user cannot form a pair).
# The result is cached because several later cells reuse it.
usersWithCommonFriend = (
    userFriend
    .groupBy("friend")
    .agg(sort_array(collect_list("user")).alias("usersWithCommonFriend"))
    .filter(size("usersWithCommonFriend") >= 2)
    .select("usersWithCommonFriend")
    .cache()
)

In [10]:
# Load the tab-separated list of users to predict for; the first CSV column
# holds the user id.
usersToPredictDF = (
    spark.read
    .format("csv")
    .option("delimiter", "\t")
    .load(usersToPredictPath)
    .withColumnRenamed("_c0", "user")
)

# Pull the ids to the driver as a set of ints and broadcast the set.
usersToPredict = {int(row.user) for row in usersToPredictDF.collect()}
usersToPredictBC = sc.broadcast(usersToPredict)

In [11]:
from pyspark.sql.functions import udf

def pairsWithCommonFriend(usersWithCommonFriend, usersToPredict=None):
    """Build the user pairs of one common-friend group that need a prediction.

    A pair ``(user1, user2)`` with ``user1`` positioned before ``user2`` in the
    input list is kept when at least one of the two user ids belongs to
    ``usersToPredict``.

    The membership test is done on the user ids themselves. Testing the list
    positions (``user1Index`` / ``user2Index``) against the id set, as the
    previous version did, selects pairs unrelated to the users to predict.

    Args:
        usersWithCommonFriend: List of user ids that share one friend.
        usersToPredict: Optional container of user ids to predict for. When
            omitted, the value of the ``usersToPredictBC`` broadcast is used,
            which is what happens when the function is called as a Spark UDF
            with a single column.

    Returns:
        List of ``(user1, user2)`` tuples; empty for an empty or one-element
        input.
    """
    if usersToPredict is None:
        usersToPredict = usersToPredictBC.value
    if not usersWithCommonFriend:
        return []

    pairs = []
    userCount = len(usersWithCommonFriend)
    for user1Index in range(userCount - 1):
        user1 = usersWithCommonFriend[user1Index]
        # Evaluate the first user's membership once per outer iteration.
        user1ToPredict = user1 in usersToPredict
        for user2Index in range(user1Index + 1, userCount):
            user2 = usersWithCommonFriend[user2Index]
            if user1ToPredict or user2 in usersToPredict:
                pairs.append((user1, user2))
    return pairs

# Return type of the pair-building UDF: an array of (user1, user2) structs
# whose two integer fields are declared non-nullable.
schema = ArrayType(
    StructType([
        StructField("user1", IntegerType(), nullable=False),
        StructField("user2", IntegerType(), nullable=False),
    ])
)

pairsWithCommonFriendUdf = udf(pairsWithCommonFriend, schema)

# Output location for the per-group pair lists.
pairsPath = dataPath + "/pairsUsersForPrediction"

# Build the pair list for every common-friend group, skip groups that yield
# no pairs, and store the result as parquet (replacing any previous output).
(
    usersWithCommonFriend
    .select(pairsWithCommonFriendUdf("usersWithCommonFriend").alias("pairsWithCommonFriend"))
    .filter(size("pairsWithCommonFriend") > 0)
    .write.mode("overwrite")
    .parquet(pairsPath)
)

In [12]:
# Flatten the stored pair lists to one row per pair, count how many times each
# pair occurs, and write the counts as parquet (replacing any previous output).
(
    spark.read.parquet(pairsPath)
    .select(explode("pairsWithCommonFriend").alias("pairWithCommonFriend"))
    .groupBy("pairWithCommonFriend")
    .count()
    .write.mode("overwrite")
    .parquet(dataPath + "/pairsCount")
)

### Из-за чего переполняется память executor'а?

In [15]:
# Show the common-friend groups with at least 1000 users. The number of pairs
# a group produces grows quadratically with its size.
(
    usersWithCommonFriend
    .withColumn("numberOfFriends", size("usersWithCommonFriend"))
    .filter(col("numberOfFriends") >= 1000)
    .show()
)

+---------------------+---------------+
|usersWithCommonFriend|numberOfFriends|
+---------------------+---------------+
| [8952, 33884, 352...|           1559|
| [37023, 57666, 10...|           1703|
| [103433, 171851, ...|           1019|
| [31996, 32873, 35...|           1274|
| [31996, 48126, 50...|           1463|
| [369, 29332, 3199...|           1533|
| [31996, 35309, 87...|           1006|
| [35309, 62113, 80...|           1132|
| [74154, 135757, 1...|           1023|
| [369, 14436, 2933...|           1094|
| [49566, 50325, 54...|           1008|
| [369, 107165, 148...|           1441|
| [369, 2405, 29332...|           1001|
| [369, 31996, 3530...|           1188|
| [17669, 21647, 53...|           1161|
| [35309, 60539, 11...|           1512|
| [1583, 18068, 276...|           1411|
| [41415, 91246, 18...|           1176|
| [35309, 110726, 1...|           1143|
| [66691, 68502, 72...|           1058|
+---------------------+---------------+
only showing top 20 rows



In [16]:
def pairsWithCommonFriendDivided(usersWithCommonFriend, divider, parts=17):
    """Build one slice of the user pairs of a common-friend group.

    All pairs ``(user1, user2)`` with ``user1`` positioned before ``user2`` are
    split into ``parts`` slices by the position of ``user1`` in the list; this
    call returns the slice whose ``user1`` positions satisfy
    ``position % parts == divider``. Calling it for every ``divider`` in
    ``range(parts)`` yields every pair exactly once.

    The slice is chosen by position, not by user id, so the same pair coming
    from two different groups may fall into different slices. Counts computed
    per slice therefore have to be summed when the slices are merged.

    Args:
        usersWithCommonFriend: List of user ids that share one friend.
        divider: Index of the slice to produce, expected in ``range(parts)``.
        parts: Total number of slices; defaults to 17.

    Returns:
        List of ``(user1, user2)`` tuples belonging to the requested slice.
    """
    pairs = []
    userCount = len(usersWithCommonFriend)
    for user1Index in range(userCount - 1):
        # The slice test depends only on the outer index, so it is decided
        # here once instead of for every inner iteration.
        if user1Index % parts != divider:
            continue
        user1 = usersWithCommonFriend[user1Index]
        for user2Index in range(user1Index + 1, userCount):
            pairs.append((user1, usersWithCommonFriend[user2Index]))
    return pairs

# Return type of the sliced pair-building UDF: an array of (user1, user2)
# structs with two non-nullable integer fields.
pairStruct = (
    StructType()
    .add("user1", IntegerType(), False)
    .add("user2", IntegerType(), False)
)
schema = ArrayType(pairStruct)

pairsWithCommonFriendDividedUdf = udf(pairsWithCommonFriendDivided, schema)

# Common prefix of the per-slice output directories.
pairsPathPart = dataPath + "/pairsPart"

In [17]:
# Process the pairs in 17 separate passes, each writing its own parquet
# directory. The number of passes has to equal the modulus that
# pairsWithCommonFriendDivided applies to the position of the first user.
for i in range(17):
    (
        usersWithCommonFriend
        .select(
            pairsWithCommonFriendDividedUdf("usersWithCommonFriend", lit(i))
            .alias("pairsWithCommonFriend")
        )
        .filter(size("pairsWithCommonFriend") > 0)
        .withColumn("pairWithCommonFriend", explode("pairsWithCommonFriend"))
        .drop("pairsWithCommonFriend")
        .groupBy("pairWithCommonFriend")
        .count()
        .write.mode("overwrite")
        .parquet(pairsPathPart + str(i))
    )

In [18]:
# Merge the per-slice results into the final pair counts.
# Slices are assigned by the position of the first user inside each
# common-friend group, so one pair can show up in several slice directories,
# each holding only a partial count. Copying the rows as they are would leave
# duplicate pairs with too-small counts, so the partial counts are summed per
# pair before writing.
(
    spark.read.parquet(pairsPathPart + "*")
    .groupBy("pairWithCommonFriend")
    .sum("count")
    .withColumnRenamed("sum(count)", "count")
    .write.mode("overwrite")
    .parquet(dataPath + "/pairsCount")
)