In [1]:
from graphframes import *

In [2]:
# Vertex DataFrame: one row per player, keyed by a single-letter id.
# Columns: id, name, age, matches, goals, assists, YC, RC
# (YC / RC presumably stand for yellow / red cards -- TODO confirm).
v = sqlContext.createDataFrame([
  ("a", "Ronaldo", 32, 230, 345, 87, 30, 3),
  ("b", "Rooney", 32, 278, 260, 98, 31, 6),
  ("c", "Beckham", 42, 440, 167, 114, 23, 7),
  ("d", "Giggs", 43, 702, 223, 145, 43, 1),
  ("e", "Alexis", 29, 444, 172, 74, 23, 7),
  ("f", "Mata", 28, 332, 72, 98, 41, 5),
  ("g", "Rashford", 20, 103, 67, 32, 12, 0),
  ("h", "Lukaku", 24, 253, 97, 62, 19, 3),
  ("i", "Martial", 22, 102, 57, 45, 12, 0),
  ("j", "Fellaini", 27, 215, 31, 32, 32, 6),
  ("k", "Carrick", 35, 535, 47, 82, 29, 13),
  ("l", "Matic", 29, 313, 15, 32, 34, 9),
  ("m", "Pogba", 24, 153, 47, 61, 17, 2),
  ("n", "Smalling", 27, 232, 17, 6, 32, 9),
  ("o", "Jones", 27, 233, 11, 15, 21, 6),
  ("p", "Shaw", 20, 143, 7, 21, 18, 1),
  ("q", "Herrera", 26, 151, 17, 32, 16, 1)
], ["id", "name", "age", "matches", "goals", "assists", "YC", "RC"])
# Edge DataFrame: directed (src, dst) pairs of vertex ids, each labelled with
# a relationship of either "friend" or "follow".
edge_columns = ["src", "dst", "relationship"]
edge_rows = [
  ("a", "b", "friend"),
  ("b", "c", "follow"),
  ("c", "b", "follow"),
  ("f", "c", "follow"),
  ("e", "f", "follow"),
  ("e", "d", "friend"),
  ("d", "a", "friend"),
  ("a", "e", "friend"),
  ("b", "e", "friend"),
  ("f", "p", "friend"),
  ("k", "m", "friend"),
  ("l", "o", "friend"),
  ("b", "d", "friend"),
  ("c", "p", "friend"),
  ("q", "g", "friend"),
  ("i", "l", "friend"),
  ("b", "o", "friend"),
  ("a", "q", "friend"),
  ("b", "p", "friend"),
  ("c", "q", "friend"),
  ("d", "g", "friend"),
  ("p", "p", "friend"),  # NOTE(review): self-loop on "p" -- confirm intended
  ("i", "p", "friend"),
  ("k", "d", "follow"),
  ("b", "k", "follow"),
  ("q", "e", "follow"),
  ("f", "q", "follow"),
  ("k", "f", "follow"),
  ("f", "b", "follow"),
  ("l", "f", "follow"),
  ("m", "n", "follow"),
  ("n", "q", "follow"),
  ("q", "m", "follow"),
  ("g", "n", "follow"),
]
e = sqlContext.createDataFrame(edge_rows, edge_columns)
# Create a GraphFrame from the vertex DataFrame `v` and the edge DataFrame `e`.
g = GraphFrame(v, e)

# Vertices: the id and the *_mean feature columns of every row in the
# `cancer` table.
df1 = sqlContext.sql("Select id, radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave_points_mean,symmetry_mean,fractal_dimension_mean from cancer")
# One row per diagnosis; the row count is exposed under the column name `id`.
df2 = sqlContext.sql("Select count(id) as id,diagnosis from cancer group by diagnosis")
# createOrReplaceTempView (instead of createTempView) keeps this cell
# re-runnable: createTempView raises once the view "e" already exists.
df2.createOrReplaceTempView("e")
# Edges: each cancer row (src) joined to the aggregate row of its diagnosis.
# NOTE(review): dst is the per-diagnosis row count, not a vertex id taken
# from df1 -- confirm this is the intended edge target.
edges = sqlContext.sql("Select cancer.id as src,e.id as dst,cancer.diagnosis from cancer inner join e on cancer.diagnosis = e.diagnosis")

display(df1)

display(edges)

# Bound to its own name so it does not overwrite `g`, the player graph whose
# goals / relationship / name / age columns the later cells query.
cancer_graph = GraphFrame(df1, edges)

In [7]:
# Parenthesised form runs on both Python 2 and Python 3 (the bare
# `print g` statement is a SyntaxError on Python 3).
print(g)

In [8]:
# Show the graph's vertex DataFrame.
display(g.vertices)

In [9]:
# Show the graph's edge DataFrame.
display(g.edges)

In [10]:
# Show the in-degree (number of incoming edges) per vertex.
display(g.inDegrees)

In [11]:
# Show the out-degree (number of outgoing edges) per vertex.
display(g.outDegrees)

In [12]:
# Show the total degree per vertex.
display(g.degrees)

In [13]:
# Aggregate over all vertices at once (groupBy with no keys) to get the
# smallest and largest value of the "goals" column.
all_vertices_grouped = g.vertices.groupBy()
maximum_goals = all_vertices_grouped.max("goals")
minimum_goals = all_vertices_grouped.min("goals")
display(maximum_goals)

In [14]:
# Show the minimum of the "goals" column computed in the previous cell.
display(minimum_goals)

In [16]:
# Count the edges of each relationship type.
numFriends = g.edges.filter("relationship='friend'").count()
numFollow = g.edges.filter("relationship='follow'").count()
# %-formatting inside print(...) emits the same text on Python 2 and 3;
# the bare print statement is a SyntaxError on Python 3.
print("The number of friends edges is %d" % numFriends)
print("The number of follow edges is %d" % numFollow)

In [17]:
#from graphframes.examples import Graphs
#g = Graphs(sqlContext).friends()  # Get example graph

# Search for two-hop chains a -> b -> c: edge `e` from a to b followed by
# edge `e2` from b to c. (This is not the "edges in both directions" motif;
# that one would be "(a)-[e]->(b); (b)-[e2]->(a)".)
motifs = g.find("(a)-[e]->(b); (b)-[e2]->(c)")
motifs.show()

In [18]:
# Motif results can be narrowed with ordinary DataFrame filters: keep only
# the matches whose middle vertex `b` has more than 50 goals.
motifs_b_over_50_goals = motifs.filter("b.goals > 50")
motifs_b_over_50_goals.show()

In [19]:
# `reduce` is a builtin only on Python 2; importing it from functools works
# on both Python 2 and Python 3.
from functools import reduce

from pyspark.sql.functions import col, lit, udf, when
from pyspark.sql.types import IntegerType

# Every chain of three consecutive edges a -> b -> c -> d.
chain4 = g.find("(a)-[ab]->(b); (b)-[bc]->(c); (c)-[cd]->(d)")

# Query on sequence, with state (cnt)
#  (a) Define method for updating state given the next element of the motif.
def sumFriends(cnt, relationship):
  """Return a Column equal to cnt + 1 where relationship is "friend", else cnt."""
  return when(relationship == "friend", cnt + 1).otherwise(cnt)

#  (b) Use sequence operation to apply method to sequence of elements in motif.
#      In this case, the elements are the 3 edges; the fold starts from lit(0).
condition = reduce(
  lambda cnt, edge_name: sumFriends(cnt, col(edge_name).relationship),
  ["ab", "bc", "cd"],
  lit(0))
#  (c) Apply filter to DataFrame: keep chains with at least 2 "friend" edges.
chainWith2Friends2 = chain4.where(condition >= 2)
chainWith2Friends2.show()

In [20]:
# Single-edge matches a -> b where the edge is a "follow" and the source
# vertex is younger than the destination vertex.
paths = (
  g.find("(a)-[e]->(b)")
  .filter("e.relationship = 'follow'")
  .filter("a.age < b.age")
)
# Each match carries the edge as the struct column `e`; pull its fields back
# out into a flat edge DataFrame.
e2 = paths.select("e.src", "e.dst", "e.relationship")

# Build the subgraph: all original vertices, but only the selected edges.
g2 = GraphFrame(g.vertices, e2)
display(g2.vertices)

In [21]:
# Breadth-first search from the vertex named Rooney to any vertex with
# age < 35.
# NOTE(review): Rooney's own age in the vertex data is 32, so the start vertex
# already satisfies the target expression -- confirm the result is the
# intended one and not just a trivial zero-length match.
paths = g.bfs(fromExpr="name = 'Rooney'", toExpr="age < 35")
display(paths)

In [22]:
# Breadth-first search from Ronaldo to any vertex with age < 34, ignoring
# "friend" edges and giving up beyond 3 hops.
# NOTE(review): Ronaldo's own age in the vertex data is 32, so the start
# vertex already satisfies toExpr -- confirm this is the intended query.
bfs_options = {
  "fromExpr": "name = 'Ronaldo'",
  "toExpr": "age < 34",
  "edgeFilter": "relationship != 'friend'",
  "maxPathLength": 3,
}
filteredPaths = g.bfs(**bfs_options)
display(filteredPaths)

In [23]:
# Strongly connected components, capped at 5 iterations; show which
# component each vertex id was assigned to.
result = g.stronglyConnectedComponents(maxIter=5)
vertex_components = result.select("id", "component")
display(vertex_components)

In [24]:
# Label propagation, run for a fixed number of iterations.
label_propagation_iterations = 2
result = g.labelPropagation(maxIter=label_propagation_iterations)
display(result)

In [25]:
# PageRank run until convergence within `tol`; `results` exposes
# .vertices and .edges, and the vertices are shown here.
results = g.pageRank(tol=0.01, resetProbability=0.15)
ranked_vertices = results.vertices
display(ranked_vertices)

In [26]:
# Show the edges of the PageRank result computed in the previous cell.
display(results.edges)

In [27]:
# Shortest paths from every vertex to the landmark vertices "a" and "q".
# Note that this rebinds `results`, which previously held the PageRank output.
landmark_ids = ["a", "q"]
results = g.shortestPaths(landmarks=landmark_ids)
display(results)