References: 
1. https://docs.databricks.com/spark/latest/graph-analysis/graphframes/user-guide-python.html

2. https://towardsdatascience.com/large-scale-graph-mining-with-spark-part-2-2c3d9ed15bb5

In [68]:
!pip install pyspark



In [69]:
from pyspark.sql import SparkSession

In [70]:
spark = SparkSession.builder\
                    .master("local[*]")\
                    .config("spark.jars.packages", "graphframes:graphframes:0.8.2-spark3.2-s_2.12")\
                    .getOrCreate()

In [71]:
from functools import reduce
from pyspark.sql.functions import col, lit, when
from graphframes import *

In [72]:
# Import the data from CSV
us_edgelist = spark.read.option("header", True).csv("us_edgelist.csv")
us_edgelist.show()

+--------+-----------+-------------+
|     src|        dst|relationships|
+--------+-----------+-------------+
| Clinton|    Hillary|      married|
|    Bush|      Laura|      married|
|   Obama|   Michelle|      married|
|   Trump|     Ivanka|      married|
| Clinton|     G.Bush|        rival|
| Clinton|       Dole|        rival|
|    Bush|    Al Gore|        rival|
|    Bush|      Kerry|        rival|
|   Obama|     McCain|        rival|
|   Obama|     Romney|        rival|
|   Trump|    Hillary|        rival|
| Clinton|       Bush|    succeeded|
|    Bush|      Obama|    succeeded|
|   Obama|      Trump|    succeeded|
| Hillary|      Laura|        knows|
|   Laura|   Michelle|        knows|
|Michelle|     Ivanka|        knows|
| Hillary|   Michelle|        knows|
|   Kerry|  J. Thorne|      married|
| Al Gore|T.Aitcheson|      married|
+--------+-----------+-------------+
only showing top 20 rows



In [73]:
# Import the data from CSV
us_age = spark.read.option("header", True).csv("us_age.csv")
us_age.show()

+------------+---+
|        name|age|
+------------+---+
|     Clinton| 75|
|        Bush| 75|
|       Obama| 60|
|       Trump| 75|
|     Hillary| 74|
|       Laura| 75|
|    Michelle| 58|
|       Kerry| 78|
|     Al Gore| 74|
|      McCain| 81|
|     G.Bush | 94|
|        Dole| 98|
|      Romney| 75|
|      Ivanka| 40|
|   J. Thorne| 61|
|T. Aitcheson| 74|
|       Cindy| 67|
+------------+---+



In [74]:
# Give us_age an "id"
us_age_id = us_age.withColumn("id", us_age.name)
us_age_id.show()

+------------+---+------------+
|        name|age|          id|
+------------+---+------------+
|     Clinton| 75|     Clinton|
|        Bush| 75|        Bush|
|       Obama| 60|       Obama|
|       Trump| 75|       Trump|
|     Hillary| 74|     Hillary|
|       Laura| 75|       Laura|
|    Michelle| 58|    Michelle|
|       Kerry| 78|       Kerry|
|     Al Gore| 74|     Al Gore|
|      McCain| 81|      McCain|
|     G.Bush | 94|     G.Bush |
|        Dole| 98|        Dole|
|      Romney| 75|      Romney|
|      Ivanka| 40|      Ivanka|
|   J. Thorne| 61|   J. Thorne|
|T. Aitcheson| 74|T. Aitcheson|
|       Cindy| 67|       Cindy|
+------------+---+------------+



In [75]:
g = GraphFrame(us_age_id, us_edgelist)
print(g)

GraphFrame(v:[id: string, name: string ... 1 more field], e:[src: string, dst: string ... 1 more field])


In [76]:
results = g.pageRank(resetProbability=0.15, tol=0.01)
display(results.vertices)

DataFrame[name: string, age: string, id: string, pagerank: double]

In [77]:
results.vertices.show()

+------------+---+------------+------------------+
|        name|age|          id|          pagerank|
+------------+---+------------+------------------+
|       Laura| 75|       Laura| 1.260519753901559|
|        Dole| 98|        Dole|0.7170062892276499|
|       Trump| 75|       Trump|0.7026603435868531|
|     Hillary| 74|     Hillary|1.2928228659823873|
|       Kerry| 78|       Kerry|0.7110700358590443|
|    Michelle| 58|    Michelle| 2.323551852445693|
|      Romney| 75|      Romney|0.7026603435868531|
|     Al Gore| 74|     Al Gore|0.7110700358590443|
|      McCain| 81|      McCain|0.7026603435868531|
|        Bush| 75|        Bush|0.7170062892276499|
|T. Aitcheson| 74|T. Aitcheson|0.5587061993981687|
|     Clinton| 75|     Clinton|0.5587061993981687|
|   J. Thorne| 61|   J. Thorne|1.1345227761529062|
|       Obama| 60|       Obama|0.7110700358590443|
|      Ivanka| 40|      Ivanka| 2.502737660377051|
|     G.Bush | 94|     G.Bush |0.5587061993981687|
|       Cindy| 67|       Cindy|

## Find the top 3 most influential persons based on the PageRank result.

In [78]:
df_result = results.vertices
df_result.sort(df_result.pagerank.desc()).show(3)

+--------+---+--------+------------------+
|    name|age|      id|          pagerank|
+--------+---+--------+------------------+
|  Ivanka| 40|  Ivanka| 2.502737660377051|
|Michelle| 58|Michelle| 2.323551852445693|
| Hillary| 74| Hillary|1.2928228659823873|
+--------+---+--------+------------------+
only showing top 3 rows



## Find the person who has: a) highest in degree values, b) highest out-degree values

In [79]:
df_in_out = g.inDegrees.join(g.outDegrees,on="id",how="full")

# a) highest in degree values
print('highest in degree values')
df_in_out.sort(df_in_out.inDegree.desc()).show(1)

highest in degree values
+--------+--------+---------+
|      id|inDegree|outDegree|
+--------+--------+---------+
|Michelle|       3|        1|
+--------+--------+---------+
only showing top 1 row



In [80]:
# b) highest out degree values
print('highest out degree values')
df_in_out.sort(df_in_out.outDegree.desc()).show(1)

highest out degree values
+----+--------+---------+
|  id|inDegree|outDegree|
+----+--------+---------+
|Bush|       1|        4|
+----+--------+---------+
only showing top 1 row



## Find the shortest path from “Ivanka” to “G. Bush.”

In [81]:
paths = g.bfs("id = 'Ivanka'", "id = 'G. Bush'")
paths.show()

+----+---+---+
|name|age| id|
+----+---+---+
+----+---+---+

