# GraphFrames Test Notebook

This notebook tests the installation and basic functionality of GraphFrames.

In [1]:
from pyspark.sql import SparkSession
from graphframes import GraphFrame
import os
import sys
from pyspark.sql import functions as F

# 1. Configure Spark with the Spark 4.0 GraphFrames JAR
#GRAPH_FRAMES_PACKAGE = "io.graphframes:graphframes-spark4_2.13:0.10.0"

#spark = SparkSession.builder \
#    .appName("GraphFrames Test") \
#    .config("spark.jars.packages", GRAPH_FRAMES_PACKAGE) \
#    .getOrCreate()

spark = SparkSession.builder \
    .appName("GraphFrames Test") \
    .config("spark.jars.repositories", "https://repos.spark-packages.org") \
    .config("spark.jars.packages", "io.graphframes:graphframes-spark4_2.13:0.10.0") \
    .getOrCreate()

print(f"Spark Version: {spark.version}")
print(f"Scala Version: {spark.sparkContext._gateway.jvm.scala.util.Properties.versionString()}")



Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/11/20 19:07:26 WARN Utils: Your hostname, Bamdad-Beast, resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
25/11/20 19:07:26 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
https://repos.spark-packages.org added as a remote repository with the name: repo-1
:: loading settings :: url = jar:file:/home/kaveh/projects/spark-shortest-path/venv/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.3.jar!/org/apache/ivy/core/settings/ivysettings.xml
Ivy Default Cache set to: /home/kaveh/.ivy2.5.2/cache
The jars for the packages stored in: /home/kaveh/.ivy2.5.2/jars
io.graphframes#graphframes-spark4_2.13 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-136faccc-dc93-451c-aa5f-fc7ec255b760;1.0
	confs: [default]
	found io.graphframes#graphframes-spark4_2.13;0.10.0 in central
	found io.graphframes#graphframes-graphx-spa

Spark Version: 4.0.1
Scala Version: version 2.13.16


In [15]:
# 2. Create the Vertices (Users)
v = spark.createDataFrame([
    ("a", "Alice", 34),
    ("b", "Bob", 36),
    ("c", "Charlie", 30),
    ("d", "David", 29),
    ("e", "Esther", 32),
    ("f", "Fanny", 36)
], ["id", "name", "age"])

# 3. Create the Edges (Relationships)
e = spark.createDataFrame([
    ("a", "b", "friend"),
    ("b", "c", "follow"),
    ("c", "b", "follow"),
    ("f", "c", "follow"),
    ("e", "f", "follow"),
    ("e", "d", "friend"),
    ("d", "a", "friend"),
    ("a", "e", "friend")
], ["src", "dst", "relationship"])

# 4. Create the GraphFrame
g = GraphFrame(v, e)

# 5. Run Shortest Paths
results = g.shortestPaths(landmarks=["a", "d"]).run()
results.select("id", "distances").show()

25/11/20 19:13:53 WARN ShortestPaths: Returned DataFrame is persistent and materialized!


PySparkAttributeError: [ATTRIBUTE_NOT_SUPPORTED] Attribute `run` is not supported.

In [3]:
# Create a simple graph
# Vertices DataFrame
v = spark.createDataFrame([
  ("a", "Alice", 34),
  ("b", "Bob", 36),
  ("c", "Charlie", 30),
  ("d", "David", 29),
  ("e", "Esther", 32),
  ("f", "Fanny", 36),
  ("g", "Gabby", 60)
], ["id", "name", "age"])

# Edges DataFrame
e = spark.createDataFrame([
  ("a", "b", "friend"),
  ("b", "c", "follow"),
  ("c", "b", "follow"),
  ("f", "c", "follow"),
  ("e", "f", "follow"),
  ("e", "d", "friend"),
  ("d", "a", "friend"),
  ("a", "e", "friend")
], ["src", "dst", "relationship"])

# Create a GraphFrame
g = GraphFrame(v, e)

In [4]:
# Display vertices and edges
print("Vertices:")
g.vertices.show()

print("Edges:")
g.edges.show()

Vertices:


+---+-------+---+
| id|   name|age|
+---+-------+---+
|  a|  Alice| 34|
|  b|    Bob| 36|
|  c|Charlie| 30|
|  d|  David| 29|
|  e| Esther| 32|
|  f|  Fanny| 36|
|  g|  Gabby| 60|
+---+-------+---+

Edges:
+---+---+------------+
|src|dst|relationship|
+---+---+------------+
|  a|  b|      friend|
|  b|  c|      follow|
|  c|  b|      follow|
|  f|  c|      follow|
|  e|  f|      follow|
|  e|  d|      friend|
|  d|  a|      friend|
|  a|  e|      friend|
+---+---+------------+



In [5]:
# Run PageRank
print("Running PageRank...")
results = g.pageRank(resetProbability=0.15, maxIter=5)
results.vertices.select("id", "pagerank").orderBy(F.desc("pagerank")).show()

Running PageRank...


25/11/20 19:07:37 WARN ShippableVertexPartitionOps: Joining two VertexPartitions with different indexes is slow.


+---+-------------------+
| id|           pagerank|
+---+-------------------+
|  c| 2.7339209603658534|
|  b|  2.514646227134146|
|  a|0.48915269944105694|
|  e|0.40545134654471543|
|  d|0.34304852959857723|
|  f|0.34304852959857723|
|  g|0.17073170731707318|
+---+-------------------+



25/11/20 19:07:39 WARN PageRank: Returned DataFrame is persistent and materialized!


In [6]:
# Run Connected Components
print("Running Connected Components...")
spark.sparkContext.setCheckpointDir("../src/checkpoints")
cc = g.connectedComponents()
cc.select("id", "component").orderBy("component").show()

Running Connected Components...
+---+------------+
| id|   component|
+---+------------+
|  g|146028888064|
|  c|412316860416|
|  a|412316860416|
|  f|412316860416|
|  d|412316860416|
|  e|412316860416|
|  b|412316860416|
+---+------------+



25/11/20 19:07:45 WARN ConnectedComponents$: Returned DataFrame is persistent and materialized!


In [7]:
# Motif Finding
print("Motif Finding (a -> b -> c)...")
motifs = g.find("(a)-[e]->(b); (b)-[e2]->(c)")
motifs.show()

Motif Finding (a -> b -> c)...
+----------------+--------------+----------------+--------------+----------------+
|               a|             e|               b|            e2|               c|
+----------------+--------------+----------------+--------------+----------------+
| {e, Esther, 32}|{e, d, friend}|  {d, David, 29}|{d, a, friend}|  {a, Alice, 34}|
|  {d, David, 29}|{d, a, friend}|  {a, Alice, 34}|{a, b, friend}|    {b, Bob, 36}|
|  {f, Fanny, 36}|{f, c, follow}|{c, Charlie, 30}|{c, b, follow}|    {b, Bob, 36}|
|    {b, Bob, 36}|{b, c, follow}|{c, Charlie, 30}|{c, b, follow}|    {b, Bob, 36}|
|{c, Charlie, 30}|{c, b, follow}|    {b, Bob, 36}|{b, c, follow}|{c, Charlie, 30}|
|  {a, Alice, 34}|{a, b, friend}|    {b, Bob, 36}|{b, c, follow}|{c, Charlie, 30}|
| {e, Esther, 32}|{e, f, follow}|  {f, Fanny, 36}|{f, c, follow}|{c, Charlie, 30}|
|  {a, Alice, 34}|{a, e, friend}| {e, Esther, 32}|{e, d, friend}|  {d, David, 29}|
|  {d, David, 29}|{d, a, friend}|  {a, Alice, 34}|{a, e,