In [1]:
import os
import sys
# Configure the environment that PySpark reads at start-up: the extra
# spark-submit arguments (GraphFrames package), the Python interpreter to use
# and the Spark installation directory.
os.environ["PYSPARK_SUBMIT_ARGS"] = '--packages graphframes:graphframes:0.6.0-spark2.3-s_2.11 pyspark-shell'
os.environ["PYSPARK_PYTHON"] = '/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"] = '/usr/hdp/current/spark2-client'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

# Make the PySpark sources and the bundled py4j archive importable, ahead of
# anything else already on sys.path.
# NOTE(review): the py4j archive name is version-specific -- confirm it matches
# the file shipped under SPARK_HOME/python/lib.
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

# Execute PySpark's shell bootstrap script in this namespace; the banner it
# prints reports that a SparkSession is then available as `spark`.
# The `with` block closes the script's file handle once it has been read.
with open(os.path.join(spark_home, 'python/pyspark/shell.py')) as shell_script:
    exec(shell_script.read())

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.3
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [2]:
from graphframes import * 

# Toy social graph: seven people as vertices ...
vertices = spark.createDataFrame(
    [
        ("1", "Alex", 28, "M", "MIPT"),
        ("2", "Emeli", 28, "F", "MIPT"),
        ("3", "Natasha", 27, "F", "SPbSU"),
        ("4", "Pavel", 30, "M", "MIPT"),
        ("5", "Oleg", 35, "M", "MIPT"),
        ("6", "Ivan", 30, "M", "MSU"),
        ("7", "Ilya", 29, "M", "MSU"),
    ],
    ["id", "name", "age", "gender", "university"],
)

# ... and friendships as edges. Every friendship is listed in both directions,
# so the directed graph behaves like an undirected one.
# Built through `spark` (not the legacy `sqlCtx` alias) for consistency with
# the other DataFrames in this notebook.
edges = spark.createDataFrame(
    [
        ("1", "2", "friend"),
        ("2", "1", "friend"),
        ("1", "3", "friend"),
        ("3", "1", "friend"),
        ("1", "4", "friend"),
        ("4", "1", "friend"),
        ("2", "3", "friend"),
        ("3", "2", "friend"),
        ("2", "5", "friend"),
        ("5", "2", "friend"),
        ("3", "4", "friend"),
        ("4", "3", "friend"),
        ("3", "5", "friend"),
        ("5", "3", "friend"),
        ("3", "6", "friend"),
        ("6", "3", "friend"),
        ("3", "7", "friend"),
        ("7", "3", "friend"),
    ],
    ["src", "dst", "relationship"],
)

gf = GraphFrame(vertices, edges)

In [3]:
# Vertices whose age is strictly greater than 30.
gf.vertices.where("age > 30").show()

+---+----+---+------+----------+
| id|name|age|gender|university|
+---+----+---+------+----------+
|  5|Oleg| 35|     M|      MIPT|
+---+----+---+------+----------+



In [4]:
# Vertices with more than two incoming edges.
gf.inDegrees.where("inDegree > 2").show()

+---+--------+
| id|inDegree|
+---+--------+
|  3|       6|
|  1|       3|
|  2|       3|
+---+--------+



In [5]:
# Show the in-degree table (vertex id and its inDegree) without any filter.
gf.inDegrees.show()

+---+--------+
| id|inDegree|
+---+--------+
|  7|       1|
|  3|       6|
|  5|       2|
|  6|       1|
|  1|       3|
|  4|       2|
|  2|       3|
+---+--------+



In [6]:
# Motif query: pairs (a, b) joined by an edge e from a to b and an edge e2
# from b back to a, i.e. reciprocal connections.
gf.find("(a)-[e]->(b); (b)-[e2]->(a)").show()

+--------------------+--------------+--------------------+--------------+
|                   a|             e|                   b|            e2|
+--------------------+--------------+--------------------+--------------+
|[1, Alex, 28, M, ...|[1, 4, friend]|[4, Pavel, 30, M,...|[4, 1, friend]|
|[4, Pavel, 30, M,...|[4, 1, friend]|[1, Alex, 28, M, ...|[1, 4, friend]|
|[3, Natasha, 27, ...|[3, 2, friend]|[2, Emeli, 28, F,...|[2, 3, friend]|
|[2, Emeli, 28, F,...|[2, 1, friend]|[1, Alex, 28, M, ...|[1, 2, friend]|
|[2, Emeli, 28, F,...|[2, 5, friend]|[5, Oleg, 35, M, ...|[5, 2, friend]|
|[3, Natasha, 27, ...|[3, 5, friend]|[5, Oleg, 35, M, ...|[5, 3, friend]|
|[1, Alex, 28, M, ...|[1, 3, friend]|[3, Natasha, 27, ...|[3, 1, friend]|
|[3, Natasha, 27, ...|[3, 1, friend]|[1, Alex, 28, M, ...|[1, 3, friend]|
|[5, Oleg, 35, M, ...|[5, 3, friend]|[3, Natasha, 27, ...|[3, 5, friend]|
|[2, Emeli, 28, F,...|[2, 3, friend]|[3, Natasha, 27, ...|[3, 2, friend]|
|[3, Natasha, 27, ...|[3, 7, friend]|[

In [7]:
# Motif query for two-hop paths a -> b -> c. The edges are left unnamed, so the
# result only has the vertex columns a, b and c. Nothing prevents c from being
# the same vertex as a.
motifs = gf.find("(a)-[]->(b); (b)-[]->(c)")
motifs.show()

+--------------------+--------------------+--------------------+
|                   a|                   b|                   c|
+--------------------+--------------------+--------------------+
|[7, Ilya, 29, M, ...|[3, Natasha, 27, ...|[7, Ilya, 29, M, ...|
|[5, Oleg, 35, M, ...|[3, Natasha, 27, ...|[7, Ilya, 29, M, ...|
|[6, Ivan, 30, M, ...|[3, Natasha, 27, ...|[7, Ilya, 29, M, ...|
|[1, Alex, 28, M, ...|[3, Natasha, 27, ...|[7, Ilya, 29, M, ...|
|[4, Pavel, 30, M,...|[3, Natasha, 27, ...|[7, Ilya, 29, M, ...|
|[2, Emeli, 28, F,...|[3, Natasha, 27, ...|[7, Ilya, 29, M, ...|
|[3, Natasha, 27, ...|[7, Ilya, 29, M, ...|[3, Natasha, 27, ...|
|[3, Natasha, 27, ...|[5, Oleg, 35, M, ...|[3, Natasha, 27, ...|
|[2, Emeli, 28, F,...|[5, Oleg, 35, M, ...|[3, Natasha, 27, ...|
|[3, Natasha, 27, ...|[6, Ivan, 30, M, ...|[3, Natasha, 27, ...|
|[3, Natasha, 27, ...|[1, Alex, 28, M, ...|[3, Natasha, 27, ...|
|[4, Pavel, 30, M,...|[1, Alex, 28, M, ...|[3, Natasha, 27, ...|
|[2, Emeli, 28, F,...|[1,

In [8]:
# Same two-hop motif, but drop paths that end at the vertex they started from.
# Note that this rebinds `motifs` from the previous cell.
motifs = gf.find("(A)-[]->(B); (B)-[]->(C)").filter("A.id != C.id")
motifs.show()

+--------------------+--------------------+--------------------+
|                   A|                   B|                   C|
+--------------------+--------------------+--------------------+
|[5, Oleg, 35, M, ...|[3, Natasha, 27, ...|[7, Ilya, 29, M, ...|
|[6, Ivan, 30, M, ...|[3, Natasha, 27, ...|[7, Ilya, 29, M, ...|
|[4, Pavel, 30, M,...|[3, Natasha, 27, ...|[7, Ilya, 29, M, ...|
|[2, Emeli, 28, F,...|[3, Natasha, 27, ...|[7, Ilya, 29, M, ...|
|[1, Alex, 28, M, ...|[3, Natasha, 27, ...|[7, Ilya, 29, M, ...|
|[2, Emeli, 28, F,...|[5, Oleg, 35, M, ...|[3, Natasha, 27, ...|
|[4, Pavel, 30, M,...|[1, Alex, 28, M, ...|[3, Natasha, 27, ...|
|[2, Emeli, 28, F,...|[1, Alex, 28, M, ...|[3, Natasha, 27, ...|
|[1, Alex, 28, M, ...|[4, Pavel, 30, M,...|[3, Natasha, 27, ...|
|[5, Oleg, 35, M, ...|[2, Emeli, 28, F,...|[3, Natasha, 27, ...|
|[1, Alex, 28, M, ...|[2, Emeli, 28, F,...|[3, Natasha, 27, ...|
|[7, Ilya, 29, M, ...|[3, Natasha, 27, ...|[5, Oleg, 35, M, ...|
|[6, Ivan, 30, M, ...|[3,

In [9]:
# Keep only the ids of the two endpoints of every two-hop path; one row per
# path, so a pair (A, C) repeats once for each intermediate vertex B.
AC = motifs.selectExpr("A.id as A", "C.id as C")
AC.show()

+---+---+
|  A|  C|
+---+---+
|  5|  7|
|  6|  7|
|  1|  7|
|  4|  7|
|  2|  7|
|  2|  3|
|  1|  3|
|  4|  3|
|  2|  3|
|  5|  3|
|  1|  3|
|  7|  5|
|  6|  5|
|  1|  5|
|  4|  5|
|  2|  5|
|  3|  5|
|  1|  5|
|  7|  6|
|  5|  6|
+---+---+
only showing top 20 rows



In [10]:
# Number of two-hop paths per (A, C) pair -- i.e. how many intermediate
# vertices they share -- restricted to paths starting at vertex 1.
AC.groupBy("A", "C").count().where("A = 1").show()

+---+---+-----+
|  A|  C|count|
+---+---+-----+
|  1|  4|    1|
|  1|  2|    1|
|  1|  5|    2|
|  1|  3|    2|
|  1|  7|    1|
|  1|  6|    1|
+---+---+-----+



In [11]:
# Number of two-hop paths for every (A, C) pair, i.e. how many intermediate
# vertices connect A to C.
AC.groupBy("A", "C").count().show()

+---+---+-----+
|  A|  C|count|
+---+---+-----+
|  7|  1|    1|
|  4|  2|    2|
|  4|  1|    1|
|  5|  4|    1|
|  1|  4|    1|
|  4|  6|    1|
|  5|  1|    2|
|  2|  3|    2|
|  7|  6|    1|
|  5|  7|    1|
|  1|  2|    1|
|  5|  2|    1|
|  2|  6|    1|
|  2|  4|    2|
|  5|  3|    1|
|  1|  5|    2|
|  3|  1|    2|
|  1|  3|    2|
|  6|  7|    1|
|  6|  2|    1|
+---+---+-----+
only showing top 20 rows



In [12]:
# Triangle counting: the result holds the vertex columns plus a `count` column
# (see the output below).
results = gf.triangleCount()
results.show()

+-----+---+-------+---+------+----------+
|count| id|   name|age|gender|university|
+-----+---+-------+---+------+----------+
|    0|  7|   Ilya| 29|     M|       MSU|
|    3|  3|Natasha| 27|     F|     SPbSU|
|    1|  5|   Oleg| 35|     M|      MIPT|
|    0|  6|   Ivan| 30|     M|       MSU|
|    2|  1|   Alex| 28|     M|      MIPT|
|    1|  4|  Pavel| 30|     M|      MIPT|
|    2|  2|  Emeli| 28|     F|      MIPT|
+-----+---+-------+---+------+----------+



### PageRank

In [13]:
# PageRank with reset probability 0.15, bounded by maxIter=10.
# `results` is rebound here; its `vertices` carry an added `pagerank` column.
results = gf.pageRank(resetProbability=0.15, maxIter=10)
results.vertices.show()

+---+-------+---+------+----------+-------------------+
| id|   name|age|gender|university|           pagerank|
+---+-------+---+------+----------+-------------------+
|  1|   Alex| 28|     M|      MIPT|  1.120863948063202|
|  3|Natasha| 27|     F|     SPbSU|  2.239465678054025|
|  2|  Emeli| 28|     F|      MIPT|  1.120863948063202|
|  4|  Pavel| 30|     M|      MIPT| 0.7882518740579809|
|  7|   Ilya| 29|     M|       MSU|0.47115133885180416|
|  6|   Ivan| 30|     M|       MSU|0.47115133885180416|
|  5|   Oleg| 35|     M|      MIPT| 0.7882518740579809|
+---+-------+---+------+----------+-------------------+



In [14]:
# PageRank again, this time parameterised by a tolerance (tol=0.01) instead of
# an iteration limit; the scores differ slightly from the maxIter=10 run above.
results = gf.pageRank(resetProbability=0.15, tol=0.01)
results.vertices.show()

+---+-------+---+------+----------+------------------+
| id|   name|age|gender|university|          pagerank|
+---+-------+---+------+----------+------------------+
|  1|   Alex| 28|     M|      MIPT| 1.122938128138013|
|  3|Natasha| 27|     F|     SPbSU|2.2125072379360793|
|  2|  Emeli| 28|     F|      MIPT| 1.122938128138013|
|  4|  Pavel| 30|     M|      MIPT|0.7933962989298501|
|  7|   Ilya| 29|     M|       MSU|0.4774119539640973|
|  6|   Ivan| 30|     M|       MSU|0.4774119539640973|
|  5|   Oleg| 35|     M|      MIPT|0.7933962989298501|
+---+-------+---+------+----------+------------------+



### Connected components

In [15]:
# Same seven people as before ...
vertices = spark.createDataFrame(
    [
        ("1", "Alex", 28, "M", "MIPT"),
        ("2", "Emeli", 28, "F", "MIPT"),
        ("3", "Natasha", 27, "F", "SPbSU"),
        ("4", "Pavel", 30, "M", "MIPT"),
        ("5", "Oleg", 35, "M", "MIPT"),
        ("6", "Ivan", 30, "M", "MSU"),
        ("7", "Ilya", 29, "M", "MSU"),
    ],
    ["id", "name", "age", "gender", "university"],
)

# ... but with a sparser, one-directional edge list, so that the graph falls
# apart into more than one connected component.
edges = spark.createDataFrame(
    [
        ("1", "2", "friend"),
        ("1", "4", "friend"),
        ("3", "5", "friend"),
        ("3", "6", "friend"),
        ("3", "7", "friend"),
    ],
    ["src", "dst", "type"],
)

reduced_g = GraphFrame(vertices, edges)

In [16]:
# A checkpoint directory is configured before running connectedComponents
# (presumably required by the algorithm -- confirm in the GraphFrames docs).
spark.sparkContext.setCheckpointDir("/tmp")
result = reduced_g.connectedComponents()
# Vertices sharing a `component` value belong to the same connected component.
result.select("id", "component").orderBy("component").show()

+---+-------------+
| id|    component|
+---+-------------+
|  6|  25769803776|
|  7|  25769803776|
|  3|  25769803776|
|  5|  25769803776|
|  1|1236950581248|
|  2|1236950581248|
|  4|1236950581248|
+---+-------------+



### People you may know

In [17]:
from pyspark.sql.functions import abs, col, explode, split
from pyspark.sql.types import *

In [18]:
# Set the number of partitions Spark SQL uses for shuffles to 1000 for the
# cells that follow (NOTE(review): the value looks hand-tuned for this dataset
# and cluster -- confirm before reusing elsewhere).
spark.conf.set("spark.sql.shuffle.partitions", 1000)

In [21]:
# Locations of the input data: the training graph and the list of users to
# predict for both live under the same base directory.
dataPath = "/share/graphx"
graphPath = f"{dataPath}/trainGraph"
usersToPredictPath = f"{dataPath}/prediction.csv"

In [22]:
# Load the tab-separated training graph and give the first two columns
# meaningful names: the user id and the raw string describing their friends.
data = (
    spark.read
    .format("csv")
    .option("delimiter", "\t")
    .load(graphPath)
    .withColumnRenamed("_c0", "user")
    .withColumnRenamed("_c1", "friendsString")
)
    

In [23]:
from pyspark.sql.functions import udf

def cutStartEndBrackets(s):
    """Strip the two leading and two trailing characters from ``s``.

    Used to remove the outer brackets of a friends string before it is split
    into individual entries.

    Args:
        s: The raw string, or ``None`` (what a Python UDF receives for a
            SQL NULL).

    Returns:
        ``s[2:-2]``, or ``None`` when ``s`` is ``None`` so that a NULL input
        stays NULL instead of raising ``TypeError`` inside the UDF.
    """
    if s is None:
        return None
    return s[2:-2]

# Wrap the bracket-stripping helper as a Spark UDF returning a string.
cutStartEndBracketsUDF = udf(cutStartEndBrackets, StringType())

# One row per (user, friend) pair:
#   1. strip the outer brackets and split the remainder on "),(" to get one
#      entry per friend (raw string literal: the pattern is a regex and "\)"
#      is not a valid Python escape sequence);
#   2. explode the entries into rows;
#   3. take the part before the first comma of each entry as the friend id;
#   4. cast both endpoints to integers.
userFriend = (
    data
    .select(
        col("user"),
        split(cutStartEndBracketsUDF(col("friendsString")), r"\),\(").alias("friendsMasks"),
    )
    .withColumn("friendMask", explode("friendsMasks"))
    .withColumn("dst", split(col("friendMask"), ",")[0])
    .withColumn("src", col("user"))
    .select(col("src").cast("integer"), col("dst").cast("integer"))
)

# The same pairs with the direction reversed.
userFriendSymmetric = userFriend.select(
    col("dst").alias("src"),
    col("src").alias("dst"),
)

# Both directions together. `union` replaces the deprecated `unionAll`; like
# it, it keeps duplicate rows.
# NOTE(review): if the input already lists a friendship from both sides, each
# such edge appears twice here and inflates downstream path counts -- confirm
# against the data and add .distinct() if needed.
edges = userFriend.union(userFriendSymmetric)

# Every endpoint occurs as a `src` because the edge list is symmetric.
vertices = edges.select(col("src").alias("id")).distinct()

In [24]:
# Graph over the full friendship data, built from the `vertices` and `edges`
# DataFrames of the previous cell.
usersFriends = GraphFrame(vertices, edges)

In [25]:
# Two-hop paths A -> B -> C in the full graph, excluding paths that return to
# their starting vertex.
motifs = usersFriends.find("(A)-[]->(B); (B)-[]->(C)").where("A.id != C.id")

In [None]:
# For every ordered pair (A, C), count the two-hop paths between them, i.e.
# the number of intermediate vertices they share.
AC = (
    motifs
    .selectExpr("A.id as A", "C.id as C")
    .groupBy("A", "C")
    .count()
)

# Persist the counts as parquet, replacing any previous result at that path.
AC.write.mode("overwrite").parquet(dataPath + "/AC")