In [1]:
from pyspark.sql import SparkSession
from pyspark.context import SparkContext
from pyspark.sql import SQLContext

In [2]:
# Start (or reuse) a Spark session named "Python Spark SQL". The
# spark.jars.packages setting asks Spark to pull in the GraphFrames package
# identified by the Maven coordinate below.
spark = (
    SparkSession.builder
    .appName("Python Spark SQL")
    .config(
        "spark.jars.packages",
        "graphframes:graphframes:0.8.2-spark3.2-s_2.12",
    )
    .getOrCreate()
)

# SQLContext wrapper around the session's SparkContext.
sqlContext = SQLContext(sparkContext=spark.sparkContext)

In [3]:
from graphframes import *

In [4]:
# Vertex table of the graph: one row per author.
#   id        - author name; GraphFrames expects the vertex key column to be
#               called "id", and edge "src"/"dst" values refer to it.
#   submitter - 0/1 flag (presumably marks authors who submitted a paper;
#               confirm against the data source).
# Built through the SparkSession (`spark`) rather than SQLContext, which has
# been deprecated since Spark 3.0; SQLContext.createDataFrame delegates to the
# session's createDataFrame, so the resulting DataFrame is the same.
v = spark.createDataFrame(
    [
        ("Paul Harvey", 1),
        ("Bruno Merin", 0),
        ("Tracy L. Huard", 0),
        ("Luisa M. Rebull", 0),
        ("Nicholas Chapman", 1),
        ("Neal J. Evans II", 1),
        ("Philip C. Myers", 0),
    ],
    ["id", "submitter"],
)

In [5]:
# Edge table of the graph: one row per (author, author, article) link.
#   src, dst - author names; GraphFrames expects these exact column names and
#              matches their values against the vertex "id" column.
#   articles - identifier of the article attached to the edge.
# The same author pair may appear more than once (one row per article).
# Built through the SparkSession (`spark`) rather than SQLContext, which has
# been deprecated since Spark 3.0; SQLContext.createDataFrame delegates to the
# session's createDataFrame, so the resulting DataFrame is the same.
e = spark.createDataFrame(
    [
        ("Paul Harvey", "Bruno Merin", "0704.test1"),
        ("Paul Harvey", "Tracy L. Huard", "0704.test1"),
        ("Paul Harvey", "Luisa M. Rebull", "0704.test1"),
        ("Paul Harvey", "Luisa M. Rebull", "0704.test3"),
        ("Nicholas Chapman", "Luisa M. Rebull", "0704.test3"),
        ("Neal J. Evans II", "Philip C. Myers", "0704.test2"),
    ],
    ["src", "dst", "articles"],
)

In [6]:
# Assemble the graph from the vertex DataFrame (v) and the edge DataFrame (e).
g = GraphFrame(v=v, e=e)

In [7]:
# Incoming-edge count per vertex; vertices with no incoming edge are not listed.
in_degrees = g.inDegrees
in_degrees.show()



+---------------+--------+
|             id|inDegree|
+---------------+--------+
|    Bruno Merin|       1|
|Luisa M. Rebull|       3|
| Tracy L. Huard|       1|
|Philip C. Myers|       1|
+---------------+--------+





In [8]:
# Outgoing-edge count per vertex; vertices with no outgoing edge are not listed.
out_degrees = g.outDegrees
out_degrees.show()

+----------------+---------+
|              id|outDegree|
+----------------+---------+
|     Paul Harvey|        4|
|Nicholas Chapman|        1|
|Neal J. Evans II|        1|
+----------------+---------+



In [9]:
# Show only the edges attached to article "0704.test3".
# DataFrame.where is an alias of DataFrame.filter.
test3_edges = g.edges.where("articles = '0704.test3'")
test3_edges.show()

+----------------+---------------+----------+
|             src|            dst|  articles|
+----------------+---------------+----------+
|     Paul Harvey|Luisa M. Rebull|0704.test3|
|Nicholas Chapman|Luisa M. Rebull|0704.test3|
+----------------+---------------+----------+



In [10]:
# Register a checkpoint directory on the SparkContext before running
# connected components (the GraphFrames algorithm checkpoints intermediate
# results while it iterates).
spark_context = spark.sparkContext
spark_context.setCheckpointDir("/tmp/checkpoints")

# One row per vertex, with a "component" column identifying its connected
# component.
result = g.connectedComponents()



In [11]:
# Print the component assignment (show()'s defaults spelled out: at most
# 20 rows, long cell values truncated).
result.show(n=20, truncate=True)

+----------------+---------+------------+
|              id|submitter|   component|
+----------------+---------+------------+
|     Paul Harvey|        1|128849018880|
|     Bruno Merin|        0|128849018880|
|  Tracy L. Huard|        0|128849018880|
| Luisa M. Rebull|        0|128849018880|
|Nicholas Chapman|        1|128849018880|
|Neal J. Evans II|        1|549755813888|
| Philip C. Myers|        0|549755813888|
+----------------+---------+------------+



In [14]:
# Keep only the vertices whose submitter flag is non-zero.
# DataFrame.filter is an alias of DataFrame.where.
submitters = result.filter("submitter != 0")
submitters.show()

+----------------+---------+------------+
|              id|submitter|   component|
+----------------+---------+------------+
|     Paul Harvey|        1|128849018880|
|Nicholas Chapman|        1|128849018880|
|Neal J. Evans II|        1|549755813888|
+----------------+---------+------------+



In [13]:
# Number of triangles passing through each vertex, shown in the "count" column.
triangle_counts = g.triangleCount()
triangle_counts.show()

+-----+----------------+---------+
|count|              id|submitter|
+-----+----------------+---------+
|    0|     Paul Harvey|        1|
|    0|     Bruno Merin|        0|
|    0|  Tracy L. Huard|        0|
|    0|Nicholas Chapman|        1|
|    0| Luisa M. Rebull|        0|
|    0| Philip C. Myers|        0|
|    0|Neal J. Evans II|        1|
+-----+----------------+---------+

