In [5]:
from functools import reduce
from pyspark.sql.functions import col, lit, when, explode
from graphframes import *
from pyspark.sql import *
import os

In [6]:
# Dataset selection: the school prefix determines which pair of CSV files is used.
school_name = "Caltech36"

# Directory containing the CSV files. Set the FACEBOOK_DATA_DIR environment
# variable to run the notebook on another machine; when it is unset, the
# original local path is used so existing behaviour is unchanged.
input_dir = os.environ.get(
    "FACEBOOK_DATA_DIR",
    "/Users/shuozhang/Desktop/Big_Data_Technology/Emerging_Tech_In_Big_Data/Project/facebook_data/",
)

# "<school>_bio.csv" is loaded as the vertex table and "<school>_friend.csv"
# as the edge table by the graph-construction cell.
bio_file_name = school_name + "_bio.csv"
friend_file_name = school_name + "_friend.csv"
bio_file_path = os.path.join(input_dir, bio_file_name)
friend_file_path = os.path.join(input_dir, friend_file_name)

In [7]:
# Build the friendship graph. Both CSV files are read with a header row:
# the bio file becomes the vertex table and the friend file the edge table.
vertices, edges = (
    spark.read.format("csv").option("header", "true").load(csv_path)
    for csv_path in (bio_file_path, friend_file_path)
)
g = GraphFrame(vertices, edges)
print(g)

GraphFrame(v:[id: string, student/faculty_status_flag: string ... 6 more fields], e:[src: string, dst: string ... 1 more field])


In [8]:
# Preview the first five rows of the vertex table, then of the edge table.
for frame in (g.vertices, g.edges):
    frame.show(5)

+---+---------------------------+------+-----+-----+----------+----+-----------+
| id|student/faculty_status_flag|gender|major|minor|dorm/house|year|high_school|
+---+---------------------------+------+-----+-----+----------+----+-----------+
|  0|                          1|     1|  199|    0|       169|2008|       3387|
|  1|                          1|     1|  199|    0|       165|2006|       3172|
|  2|                          1|     2|  201|    0|       171|2008|       9773|
|  3|                          2|     2|  208|  210|       172|2005|      50578|
|  4|                          1|     2|  202|    0|       169|2008|      11880|
+---+---------------------------+------+-----+-----+----------+----+-----------+
only showing top 5 rows

+---+---+------------+
|src|dst|relationship|
+---+---+------------+
|  0|  4|      friend|
|  0| 30|      friend|
|  0| 35|      friend|
|  0| 38|      friend|
|  0| 42|      friend|
+---+---+------------+
only showing top 5 rows



In [9]:
# In-degree and out-degree of each vertex (first five rows of each table).
in_degrees = g.inDegrees
out_degrees = g.outDegrees
in_degrees.show(5)
out_degrees.show(5)

+---+--------+
| id|inDegree|
+---+--------+
|675|      39|
|691|      34|
|467|      29|
|296|       1|
|125|      38|
+---+--------+
only showing top 5 rows

+---+---------+
| id|outDegree|
+---+---------+
|296|        1|
|467|       29|
|675|       39|
|691|       34|
|125|       38|
+---+---------+
only showing top 5 rows



## Use Case 1: Find Potential Friends

Suppose I have just enrolled at Caltech and want to find people who share my major and came from the same high school as me.

In [11]:
# Narrow the vertex table in two steps: first to people whose major is '199',
# then to those whose high_school is '0'.
# NOTE(review): '0' may be a missing-value code rather than a real high
# school id — confirm against the dataset documentation.
same_major = g.vertices.where("major = '199'")
potential_friends = same_major.where("high_school = '0'")
potential_friends.show()

+---+---------------------------+------+-----+-----+----------+----+-----------+
| id|student/faculty_status_flag|gender|major|minor|dorm/house|year|high_school|
+---+---------------------------+------+-----+-----+----------+----+-----------+
| 10|                          1|     2|  199|    0|       167|2006|          0|
| 23|                          1|     1|  199|    0|       165|2007|          0|
|133|                          5|     2|  199|    0|         0|2006|          0|
|298|                          2|     2|  199|    0|         0|2005|          0|
|308|                          1|     1|  199|  202|       168|2007|          0|
|312|                          2|     1|  199|    0|       168|2004|          0|
|442|                          1|     2|  199|  200|       168|2005|          0|
|473|                          5|     1|  199|    0|         0|   0|          0|
|517|                          1|     1|  199|    0|       165|2006|          0|
|609|                       

## Use Case 2: Connection Distance Calculation

Similar to LinkedIn, we can determine whether two people are direct friends (1 degree apart), share a mutual friend (2 degrees apart), or, more generally, how long the shortest friendship path connecting them is.

In [35]:
# Shortest-path distances from every vertex to the landmark vertex "10".
results = g.shortestPaths(landmarks=["10"])

# Exploding the "distances" map column yields one (key, value) row per
# landmark; list the rows in ascending order of distance.
landmark_distances = results.select("id", explode("distances"))
landmark_distances.orderBy("value").show()

+---+---+-----+
| id|key|value|
+---+---+-----+
| 10| 10|    0|
|663| 10|    1|
|341| 10|    1|
|637| 10|    2|
|267| 10|    2|
|614| 10|    2|
|766| 10|    2|
|382| 10|    2|
|322| 10|    2|
|595| 10|    2|
| 35| 10|    2|
|256| 10|    2|
|658| 10|    2|
|125| 10|    2|
|688| 10|    2|
|475| 10|    2|
|390| 10|    2|
|510| 10|    2|
|274| 10|    2|
|694| 10|    2|
+---+---+-----+
only showing top 20 rows



## Use Case 3: Find Most Popular People in School

In [30]:
# PageRank: run with reset probability 0.15 and tolerance 0.01, then show the
# five vertices with the highest "pagerank" value.
pagerank_graph = g.pageRank(resetProbability=0.15, tol=0.01)
ranked_vertices = pagerank_graph.vertices.orderBy('pagerank', ascending=False)
ranked_vertices.show(5)

+---+---------------------------+------+-----+-----+----------+----+-----------+------------------+
| id|student/faculty_status_flag|gender|major|minor|dorm/house|year|high_school|          pagerank|
+---+---------------------------+------+-----+-----+----------+----+-----------+------------------+
|708|                          2|     2|  223|    0|         0|2005|      19445| 4.992117377111114|
|222|                          1|     1|  196|  228|       171|2008|       2395|  4.24784817339537|
| 89|                          1|     1|  199|    0|       169|2006|       6895| 4.180237339503565|
|277|                          1|     2|  222|  205|       170|2007|       4423|3.7835907220213256|
|663|                          1|     1|  220|  225|       167|2006|       2868|3.6003923172675925|
+---+---------------------------+------+-----+-----+----------+----+-----------+------------------+
only showing top 5 rows

