In [None]:
!pip install pyspark

!pip install graphframes

Collecting pyspark
  Downloading pyspark-3.5.3.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.3-py2.py3-none-any.whl size=317840625 sha256=c50c8fb78284662bd9ea755927363962afb4488133668ffb5402703eecf26e51
  Stored in directory: /root/.cache/pip/wheels/1b/3a/92/28b93e2fbfdbb07509ca4d6f50c5e407f48dce4ddbda69a4ab
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.3
Collecting graphframes
  Downloading graphframes-0.6-py2.py3-none-any.whl.metadata (934 bytes)
Collecting nose (from graphframes)
  Downloading nose-1.3.7-py3-none-any.whl.metadata (1.7 kB)
Downloading graphframes-0.6-py2.py3-none-any.whl (18 kB)
Downloading nose-1.3.7-py3-n

In [None]:
# Import Libraries
from pyspark.sql import SparkSession
from graphframes import GraphFrame
from pyspark.sql.functions import desc

# Initialize Spark Session with GraphFrames package.
# The GraphFrames JVM artifact is resolved at session start-up through
# spark.jars.packages. Its version suffix has to match the Spark line in use:
# the install cell's output shows pyspark 3.5.3, so request the spark3.5 build
# (Scala 2.12) rather than the one compiled against Spark 3.0.
spark = (
    SparkSession.builder
    .appName("Graph Analytics")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "4g")
    .config("spark.jars.packages", "graphframes:graphframes:0.8.3-spark3.5-s_2.12")
    .getOrCreate()
)

# Sample data for the graph.
# Each vertex row is (id, age); each edge row is (src, dst, relationship),
# where src and dst refer to vertex ids.
vertices_data = [
    ("Alice", 45), ("Jacob", 43), ("Roy", 21),
    ("Ryan", 49), ("Emily", 24), ("Sheldon", 52),
]
edges_data = [
    ("Sheldon", "Alice", "Sister"),
    ("Alice", "Jacob", "Husband"),
    ("Emily", "Jacob", "Father"),
    ("Ryan", "Alice", "Friend"),
    ("Alice", "Emily", "Daughter"),
    ("Alice", "Roy", "Son"),
    ("Jacob", "Roy", "Son"),
]

# Column names for the two DataFrames; GraphFrame looks vertices up by "id"
# and edges by "src"/"dst".
vertex_columns = ["id", "age"]
edge_columns = ["src", "dst", "relationship"]

# Turn the plain Python rows into Spark DataFrames.
vertices_df = spark.createDataFrame(vertices_data, schema=vertex_columns)
edges_df = spark.createDataFrame(edges_data, schema=edge_columns)

# Combine vertices and edges into the graph used by every analysis below.
graph = GraphFrame(vertices_df, edges_df)

# Run all analyses inside try/finally so that the Spark session is always
# stopped, even when one of the steps raises.
try:
    # 1. Grouped and Ordered Edges: edge count per (src, dst) pair, largest first.
    print("Grouped and Ordered Edges:")
    graph.edges.groupBy("src", "dst").count().orderBy(desc("count")).show(truncate=False)

    # Edges that have 'Alice' as either endpoint; shared by steps 2 and 3.
    alice_edges = graph.edges.where("src = 'Alice' OR dst = 'Alice'")

    # 2. Filtered Edges where src or dst is 'Alice', counted per (src, dst) pair.
    print("Filtered Edges where src or dst is 'Alice':")
    alice_edges.groupBy("src", "dst").count().orderBy(desc("count")).show(truncate=False)

    # 3. Subgraph where 'Alice' is involved: all original vertices are kept,
    #    only the edge set is restricted to Alice's edges.
    print("Subgraph where 'Alice' is involved:")
    subgraph_edges = alice_edges
    subgraph = GraphFrame(graph.vertices, subgraph_edges)
    subgraph.edges.show(truncate=False)

    # 4. Motifs in the Graph: match every single directed edge a -> b, then keep
    #    those whose relationship is 'Friend' or 'Daughter'.
    #    NOTE: the filter is on the relationship label, not on the vertex name;
    #    with this data set the matching edges happen to involve Alice.
    print("Motifs in the Graph (connections involving Alice):")
    motifs = graph.find("(a)-[ab]->(b)")
    motifs_filtered = motifs.filter("ab.relationship = 'Friend' OR ab.relationship = 'Daughter'")
    motifs_filtered.show(truncate=False)

    # 5. PageRank Results: 5 fixed iterations with reset probability 0.15,
    #    highest-ranked vertices first.
    print("PageRank Results:")
    page_rank = graph.pageRank(resetProbability=0.15, maxIter=5)
    page_rank.vertices.orderBy(desc("pagerank")).show(truncate=False)

    # 6. In-Degree of Each Vertex, highest first. Vertices without any incoming
    #    edge do not appear in inDegrees at all.
    print("In-Degree of Each Vertex:")
    in_degree = graph.inDegrees
    in_degree.orderBy(desc("inDegree")).show(truncate=False)

    # 7. Connected Components. The checkpoint directory is set before the call
    #    because GraphFrames' default connectedComponents algorithm requires one.
    spark.sparkContext.setCheckpointDir("/tmp/checkpoints")  # Set checkpoint directory
    print("Connected Components:")
    connected_components = graph.connectedComponents()
    connected_components.show(truncate=False)

    # 8. Strongly Connected Components, limited to 5 iterations.
    print("Strongly Connected Components:")
    strongly_connected_components = graph.stronglyConnectedComponents(maxIter=5)
    strongly_connected_components.show(truncate=False)

    # 9. Breadth-First Search (BFS): paths from 'Alice' to 'Roy' of length at most 2.
    print("Breadth-First Search (BFS):")
    bfs_results = graph.bfs(fromExpr="id = 'Alice'", toExpr="id = 'Roy'", maxPathLength=2)
    bfs_results.show(truncate=False)
finally:
    # Stop the Spark session
    spark.stop()



Grouped and Ordered Edges:
+-------+-----+-----+
|src    |dst  |count|
+-------+-----+-----+
|Alice  |Jacob|1    |
|Sheldon|Alice|1    |
|Emily  |Jacob|1    |
|Alice  |Emily|1    |
|Alice  |Roy  |1    |
|Jacob  |Roy  |1    |
|Ryan   |Alice|1    |
+-------+-----+-----+

Filtered Edges where src or dst is 'Alice':
+-------+-----+-----+
|src    |dst  |count|
+-------+-----+-----+
|Alice  |Jacob|1    |
|Sheldon|Alice|1    |
|Alice  |Emily|1    |
|Alice  |Roy  |1    |
|Ryan   |Alice|1    |
+-------+-----+-----+

Subgraph where 'Alice' is involved:
+-------+-----+------------+
|src    |dst  |relationship|
+-------+-----+------------+
|Sheldon|Alice|Sister      |
|Alice  |Jacob|Husband     |
|Ryan   |Alice|Friend      |
|Alice  |Emily|Daughter    |
|Alice  |Roy  |Son         |
+-------+-----+------------+

Motifs in the Graph (connections involving Alice):




+-----------+------------------------+-----------+
|a          |ab                      |b          |
+-----------+------------------------+-----------+
|{Ryan, 49} |{Ryan, Alice, Friend}   |{Alice, 45}|
|{Alice, 45}|{Alice, Emily, Daughter}|{Emily, 24}|
+-----------+------------------------+-----------+

PageRank Results:
+-------+---+-------------------+
|id     |age|pagerank           |
+-------+---+-------------------+
|Roy    |21 |1.9089989375092518 |
|Jacob  |43 |1.3728466605994618 |
|Alice  |45 |1.135192093597289  |
|Emily  |24 |0.7420792759997091 |
|Sheldon|52 |0.42044151614714403|
|Ryan   |49 |0.42044151614714403|
+-------+---+-------------------+

In-Degree of Each Vertex:
+-----+--------+
|id   |inDegree|
+-----+--------+
|Jacob|2       |
|Alice|2       |
|Roy  |2       |
|Emily|1       |
+-----+--------+

Connected Components:
+-------+---+------------+
|id     |age|component   |
+-------+---+------------+
|Alice  |45 |197568495616|
|Jacob  |43 |197568495616|
|Roy    |21 |1