In [None]:
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!pip install pyspark==3.0.0
!pip install graphframes


Collecting pyspark==3.0.0
  Downloading pyspark-3.0.0.tar.gz (204.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m204.7/204.7 MB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting py4j==0.10.9 (from pyspark==3.0.0)
  Downloading py4j-0.10.9-py2.py3-none-any.whl.metadata (1.3 kB)
Downloading py4j-0.10.9-py2.py3-none-any.whl (198 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m198.6/198.6 kB[0m [31m20.7 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.0.0-py2.py3-none-any.whl size=205044158 sha256=bde5d814fbc5a8665beb38c5ca38b1e4174a64f38ac0abde3190b57b42a5d8a5
  Stored in directory: /root/.cache/pip/wheels/b1/bb/8b/ca24d3f756f2ed967225b0871898869db676eb5846df5adc56
Successfully built pyspark
Installing collected packages: py4j, pyspark
  Attem



In [None]:


# Step 2: Set Environment Variables
import os

import pyspark

# Expected install location of the openjdk-11-jdk-headless apt package.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
# Derive SPARK_HOME from the pip-installed pyspark package instead of
# hard-coding a python3.10 dist-packages path, so the cell keeps working
# when the interpreter version (and therefore the site-packages path) changes.
os.environ["SPARK_HOME"] = os.path.dirname(pyspark.__file__)

# Step 3: Create SparkSession
from pyspark.sql import SparkSession
from graphframes import GraphFrame
from pyspark.sql.functions import desc

# Maven coordinate of the GraphFrames JVM artifact. The "spark3.0" build is
# used so that the jar matches the pinned pyspark==3.0.0 runtime.
GRAPHFRAMES_PACKAGE = "graphframes:graphframes:0.8.2-spark3.0-s_2.12"

# Create SparkSession with GraphFrames; the package is resolved and put on
# the classpath when the session starts.
spark = (
    SparkSession.builder
    .appName("Graph Analytics")
    .config("spark.jars.packages", GRAPHFRAMES_PACKAGE)
    .getOrCreate()
)

# Step 4: Create Vertices DataFrame
# One row per person: the name doubles as the vertex id, age is an attribute.
people = [
    ("Alice", 45),
    ("Jacob", 43),
    ("Roy", 21),
    ("Ryan", 49),
    ("Emily", 24),
    ("Sheldon", 52),
]
vertices = spark.createDataFrame(data=people, schema=["id", "age"])

# Step 5: Create Edges DataFrame
# One row per directed edge (src -> dst), labelled with the relation.
relationships = [
    ("Sheldon", "Alice", "Sister"),
    ("Alice", "Jacob", "Husband"),
    ("Emily", "Jacob", "Father"),
    ("Ryan", "Alice", "Friend"),
    ("Alice", "Emily", "Daughter"),
    ("Jacob", "Roy", "Son"),
    ("Roy", "Ryan", "Son"),
]
edges = spark.createDataFrame(data=relationships, schema=["src", "dst", "relation"])

# Step 6: Create a GraphFrame
# Combine the vertex and edge tables into a single graph object.
graph = GraphFrame(vertices, edges)

# Step 7: Graph Analytics
# SQL predicate selecting every edge that touches Alice; defined once so the
# filtered-edge report and the subgraph below are guaranteed to agree.
ALICE_EDGE_FILTER = "src = 'Alice' OR dst = 'Alice'"

# The whole analysis runs inside try/finally so the Spark session is stopped
# even when one of the steps raises.
try:
    print("Grouped and ordered edges:")
    graph.edges.groupBy("src", "dst").count().orderBy(desc("count")).show()

    # Edges incident to Alice, reused for both the grouped view and the subgraph.
    alice_edges = graph.edges.where(ALICE_EDGE_FILTER)

    print("Filtered edges where src or dst is 'Alice':")
    alice_edges.groupBy("src", "dst").count().orderBy(desc("count")).show()

    print("Subgraph where src or dst is 'Alice':")
    # Keeps all original vertices but only the edges that touch Alice.
    subgraph = GraphFrame(graph.vertices, alice_edges)
    subgraph.edges.show()

    print("Finding motifs in the graph:")
    # Pattern with a single directed edge ab from vertex a to vertex b.
    motifs = graph.find("(a) - [ab] -> (b)")
    motifs.show()

    print("Calculating PageRank:")
    # Runs a fixed number of iterations (maxIter) rather than iterating to a tolerance.
    rank = graph.pageRank(resetProbability=0.15, maxIter=5)
    rank.vertices.orderBy(desc("pagerank")).show()

    print("In-Degree of nodes:")
    in_degree = graph.inDegrees
    in_degree.orderBy(desc("inDegree")).show()

    print("Out-Degree of nodes:")
    out_degree = graph.outDegrees
    out_degree.orderBy(desc("outDegree")).show()

    print("Finding connected components:")
    spark.sparkContext.setCheckpointDir("/tmp/checkpoints")  # Required for connected components
    cc = graph.connectedComponents()
    cc.show()

    print("Finding strongly connected components:")
    scc = graph.stronglyConnectedComponents(maxIter=5)
    scc.show()

    print("Performing BFS from node 'Alice' to node 'Roy':")
    # maxPathLength caps the search depth at two edges.
    bfs_result = graph.bfs(fromExpr="id = 'Alice'", toExpr="id = 'Roy'", maxPathLength=2)
    bfs_result.show()
finally:
    # Stop the Spark session
    spark.stop()


Grouped and ordered edges:
+-------+-----+-----+
|    src|  dst|count|
+-------+-----+-----+
|  Alice|Jacob|    1|
|Sheldon|Alice|    1|
|  Emily|Jacob|    1|
|  Alice|Emily|    1|
|  Jacob|  Roy|    1|
|   Ryan|Alice|    1|
|    Roy| Ryan|    1|
+-------+-----+-----+

Filtered edges where src or dst is 'Alice':
+-------+-----+-----+
|    src|  dst|count|
+-------+-----+-----+
|  Alice|Jacob|    1|
|Sheldon|Alice|    1|
|  Alice|Emily|    1|
|   Ryan|Alice|    1|
+-------+-----+-----+

Subgraph where src or dst is 'Alice':
+-------+-----+--------+
|    src|  dst|relation|
+-------+-----+--------+
|Sheldon|Alice|  Sister|
|  Alice|Jacob| Husband|
|   Ryan|Alice|  Friend|
|  Alice|Emily|Daughter|
+-------+-----+--------+

Finding motifs in the graph:
+-------------+--------------------+-----------+
|            a|                  ab|          b|
+-------------+--------------------+-----------+
|  {Jacob, 43}|   {Jacob, Roy, Son}|  {Roy, 21}|
|  {Emily, 24}|{Emily, Jacob, Fa...|{Jacob, 4