In [1]:
from pyspark.sql import SparkSession
from graphframes import GraphFrame

# Attach to the already-running Spark session, or start one if none exists.
spark = (
    SparkSession.builder
    .getOrCreate()
)

# Locations of the graph inputs (vertex and edge CSV files) and of the
# folder the final result is written to.
vertexesPath = "Spark/Exercises/Ex_56/vertexes.csv"
edgesPath = "Spark/Exercises/Ex_56/edges.csv"
outputPath = "Spark/Exercises/Ex_56/res"

In [2]:
# Both inputs are CSV files with a header row; column types are inferred
# from the data rather than declared by hand.
vertexesDF = (
    spark.read
    .format("csv")
    .options(header=True, inferSchema=True)
    .load(vertexesPath)
)
edgesDF = (
    spark.read
    .format("csv")
    .options(header=True, inferSchema=True)
    .load(edgesPath)
)

In [3]:
# Only 'follow' and 'correlated' links are kept; every other link type is
# dropped before the graph is built.
relevantLinkCondition = "linktype == 'follow' or linktype == 'correlated'"
filteredEdgesDF = edgesDF.where(relevantLinkCondition)

In [4]:
# Build the graph from all vertices and only the edges that passed the
# link-type filter.
g = GraphFrame(
    vertexesDF,
    filteredEdgesDF,
)

In [8]:
# Motif: every directed two-hop path v1 -> v2 -> v3. Each result row holds
# the three vertices and the two edges of one matching path.
twoHopMotif = "(v1)-[e1]->(v2) ; (v2)-[e2]->(v3)"
chainDF = g.find(twoHopMotif)
chainDF.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+
|                  v1|                  e1|                  v2|                  e2|                  v3|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|   [V1, user, Paolo]|    [V1, V3, follow]|   [V3, user, David]|    [V3, V4, follow]|[V4, topic, Big D...|
|   [V1, user, Paolo]|    [V1, V3, follow]|   [V3, user, David]|    [V3, V2, follow]|    [V2, topic, SQL]|
|   [V1, user, Paolo]|    [V1, V4, follow]|[V4, topic, Big D...|[V4, V2, correlated]|    [V2, topic, SQL]|
|   [V3, user, David]|    [V3, V2, follow]|    [V2, topic, SQL]|[V2, V4, correlated]|[V4, topic, Big D...|
|   [V3, user, David]|    [V3, V4, follow]|[V4, topic, Big D...|[V4, V2, correlated]|    [V2, topic, SQL]|
|    [V2, topic, SQL]|[V2, V4, correlated]|[V4, topic, Big D...|[V4, V2, correlated]|    [V2, topic, SQL]|
|[V4, topic, Big D...|[V4, V2, correl

In [12]:
# Keep only the paths of the form
#   user --follow--> topic --correlated--> topic named 'Big Data'
# i.e. a user following a topic that is correlated with 'Big Data'.
userFollowsCorrelatedTopicCondition = """v1.entityName == 'user' and e1.linktype == 'follow' and v2.entityName == 'topic' and e2.linktype == 'correlated' 
                                 and v3.entityName == 'topic' and v3.name == 'Big Data'"""
filteredChainDF = chainDF.where(userFollowsCorrelatedTopicCondition)
filteredChainDF.show()

+-----------------+----------------+----------------+--------------------+--------------------+
|               v1|              e1|              v2|                  e2|                  v3|
+-----------------+----------------+----------------+--------------------+--------------------+
|[V3, user, David]|[V3, V2, follow]|[V2, topic, SQL]|[V2, V4, correlated]|[V4, topic, Big D...|
+-----------------+----------------+----------------+--------------------+--------------------+



In [10]:
# The motif result has one row per matching path, so a user who follows
# several topics correlated with 'Big Data' would appear once per path.
# Deduplicate on the vertex id (not on the name, which two different users
# could share) so that every matching user is listed exactly once, then keep
# only the name under the output column 'username'.
finalDF = (
    filteredChainDF
    .select("v1.id", "v1.name")
    .distinct()
    .selectExpr("name as username")
)
finalDF.show()

+--------+
|username|
+--------+
|   David|
+--------+



In [11]:
# Write the user names as CSV with a header line. mode="overwrite" replaces a
# previous result in the same folder: with the default save mode the write
# raises an error when outputPath already exists, so the notebook could not
# be re-run from a fresh kernel without deleting the folder by hand.
finalDF.write.csv(outputPath, header=True, mode="overwrite")