## The following section is for Colab Users.
### Just run the following code cells

In [1]:
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q https://bitbucket.org/habedi/datasets/raw/b6769c4664e7ff68b001e2f43bc517888cbe3642/spark/spark-3.0.2-bin-hadoop2.7.tgz
!tar xf spark-3.0.2-bin-hadoop2.7.tgz
!rm -rf spark-3.0.2-bin-hadoop2.7.tgz*
!pip -q install findspark pyspark graphframes

In [2]:
!wget https://repos.spark-packages.org/graphframes/graphframes/0.8.2-spark3.0-s_2.12/graphframes-0.8.2-spark3.0-s_2.12.jar -P /content/spark-3.0.2-bin-hadoop2.7/jars/
!cp /content/spark-3.0.2-bin-hadoop2.7/jars/graphframes-0.8.2-spark3.0-s_2.12.jar /content/spark-3.0.2-bin-hadoop2.7/graphframes-0\\graphframes\\graphframes\\0.8.2-spark3.0-s_2.12\\graphframes-0.8.2-spark3.0-s_2.12.jar.8.2-spark3.0-s_2.12.zip

--2023-06-05 21:10:15--  https://repos.spark-packages.org/graphframes/graphframes/0.8.2-spark3.0-s_2.12/graphframes-0.8.2-spark3.0-s_2.12.jar
Resolving repos.spark-packages.org (repos.spark-packages.org)... 99.84.160.25, 99.84.160.20, 99.84.160.66, ...
Connecting to repos.spark-packages.org (repos.spark-packages.org)|99.84.160.25|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 247882 (242K) [binary/octet-stream]
Saving to: ‘/content/spark-3.0.2-bin-hadoop2.7/jars/graphframes-0.8.2-spark3.0-s_2.12.jar.1’


2023-06-05 21:10:16 (7.43 MB/s) - ‘/content/spark-3.0.2-bin-hadoop2.7/jars/graphframes-0.8.2-spark3.0-s_2.12.jar.1’ saved [247882/247882]



In [3]:
import os

os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.0.2-bin-hadoop2.7"
os.environ["HADOOP_HOME"] = os.environ["SPARK_HOME"]

os.environ["PYSPARK_DRIVER_PYTHON"] = "jupyter"
os.environ["PYSPARK_DRIVER_PYTHON_OPTS"] = "notebook"
os.environ["PYSPARK_SUBMIT_ARGS"] = "--master local[*] pyspark-shell"

In [4]:
import findspark
findspark.init()

In [5]:
!export PYSPARK_SUBMIT_ARGS="--master local[*] pyspark-shell"
!export PYSPARK_DRIVER_PYTHON=jupyter
!export PYSPARK_DRIVER_PYTHON_OPTS=notebook

In [6]:
from pyspark.sql import SparkSession
from graphframes import *

spark = SparkSession.builder.master("local[*]").appName("GraphFrames").getOrCreate()

In [7]:
os.environ["PYSPARK_SUBMIT_ARGS"] = "--packages graphframes:graphframes:0.8.1-spark3.0-s_2.12 pyspark-shell"

**************************************************************************
**************************************************************************
**************************************************************************

In [8]:
from IPython.display import display, HTML
display(HTML("<style>pre { white-space: pre !important; }</style>"))

### Read departuredelays.csv in Edge DataFrame
### Read airport-codes-na.txt in Vertix DataFrame (the separator is Tab i.e sep = '\t' )

#### The US flight delays data set has five columns:
- The <b>date</b> column contains an integer like 02190925 . When converted, this maps to 02-19 09:25 am.
- The <b>delay</b> column gives the delay in minutes between the scheduled and actual departure times. Early departures show negative numbers.
- The <b>distance</b> column gives the distance in miles from the origin airport to the destination airport.
- The <b>origin</b> column contains the origin IATA airport code.
- The <b>destination</b> column contains the destination IATA airport code.

#### The airport-codes data set has four columns:
- The <b>IATA</b> column contains IATA airport code.
- The <b>City, State, and Country</b> columns contains information about the airport location. 

In [9]:
edge = spark.read.csv('/content/departuredelays.csv',header=True,inferSchema=True)
ver = spark.read.csv('/content/airport-codes-na.txt',sep='\t',header=True,inferSchema=True)
ver.show()

+-----------+-----+-------+----+
|       City|State|Country|IATA|
+-----------+-----+-------+----+
| Abbotsford|   BC| Canada| YXX|
|   Aberdeen|   SD|    USA| ABR|
|    Abilene|   TX|    USA| ABI|
|      Akron|   OH|    USA| CAK|
|    Alamosa|   CO|    USA| ALS|
|     Albany|   GA|    USA| ABY|
|     Albany|   NY|    USA| ALB|
|Albuquerque|   NM|    USA| ABQ|
| Alexandria|   LA|    USA| AEX|
|  Allentown|   PA|    USA| ABE|
|   Alliance|   NE|    USA| AIA|
|     Alpena|   MI|    USA| APN|
|    Altoona|   PA|    USA| AOO|
|   Amarillo|   TX|    USA| AMA|
|Anahim Lake|   BC| Canada| YAA|
|  Anchorage|   AK|    USA| ANC|
|   Appleton|   WI|    USA| ATW|
|     Arviat|  NWT| Canada| YEK|
|  Asheville|   NC|    USA| AVL|
|      Aspen|   CO|    USA| ASE|
+-----------+-----+-------+----+
only showing top 20 rows



### In the vertix DataFrame, drop any duplicated rows with the same  IATA code.

In [10]:
ver2 = ver.drop_duplicates(subset=['IATA'])

### In the edges DataFrame:
- Rename the <b>date</b> columns to become <b>tripid</b>.
- Rename the <b>origin</b> columns to become <b>src</b>.
- Rename the <b>destination</b> columns to become <b>dst</b>.

In [11]:
edge.columns

['date', 'delay', 'distance', 'origin', 'destination']

In [12]:
ver2.columns

['City', 'State', 'Country', 'IATA']

In [13]:
edge2 = edge.withColumnRenamed('origin','src')
edge2 = edge2.withColumnRenamed('destination','dst')
edge2 = edge2.withColumnRenamed('date','tripid')

In [14]:
edge2.columns

['tripid', 'delay', 'distance', 'src', 'dst']

### In the Vertix DataFrame:
- Rename the <b>IATA</b> columns to become <b>id</b>.

In [15]:
ver2 = ver2.withColumnRenamed('IATA','id')

### Create GraphFrame from Vertix and Edges DataFrames

In [16]:
ver2.show(5)

+-------------------+-----+-------+---+
|               City|State|Country| id|
+-------------------+-----+-------+---+
|         Binghamton|   NY|    USA|BGM|
|            Lebanon|   NH|    USA|LEB|
|           Montreal|   PQ| Canada|YUL|
|         Dillingham|   AK|    USA|DLG|
|International Falls|   MN|    USA|INL|
+-------------------+-----+-------+---+
only showing top 5 rows



In [17]:
edge2.show(5)

+-------+-----+--------+---+---+
| tripid|delay|distance|src|dst|
+-------+-----+--------+---+---+
|1011245|    6|     602|ABE|ATL|
|1020600|   -8|     369|ABE|DTW|
|1021245|   -2|     602|ABE|ATL|
|1020605|   -4|     602|ABE|ATL|
|1031245|   -4|     602|ABE|ATL|
+-------+-----+--------+---+---+
only showing top 5 rows



In [18]:
g = GraphFrame(ver2,edge2)

### Determine the number of airports

In [48]:
air= g.vertices.count()
air

524

In [49]:
air= g.vertices.distinct().count()
air

524

### Determine the number of trips 

In [56]:
trips= g.edges.count()
trips

1391578

### What is the longest delay?

In [82]:
from pyspark.sql.functions import *
import pyspark.sql.functions as f

In [60]:
g.edges.select(max(col('delay'))).show()

+----------+
|max(delay)|
+----------+
|      1642|
+----------+



### Find out the number of delayed flights vs. early flights (flights that departed before actual time)

In [67]:
delayed_flight=g.edges.filter('delay>0').count()
delayed_flight

591727

In [69]:
early_flights=g.edges.filter('delay<0').count()
early_flights

668729

In [72]:
delayed_flight= g.filterEdges('delay>0')
delayed_flight.edges.count()

591727

### What flight destinations departing SFO are most likely to have significant delays? Select the top 10
#### Hint: you should get the average delay for each destination for trips that depart from SFO only

In [74]:
df = g.find('(v1)-[e1]->(v2)')
df.show(5,truncate=False)

+-------------------------+----------------------------+-----------------------+
|v1                       |e1                          |v2                     |
+-------------------------+----------------------------+-----------------------+
|[Allentown, PA, USA, ABE]|[1011245, 6, 602, ABE, ATL] |[Atlanta, GA, USA, ATL]|
|[Allentown, PA, USA, ABE]|[1020600, -8, 369, ABE, DTW]|[Detroit, MI, USA, DTW]|
|[Allentown, PA, USA, ABE]|[1021245, -2, 602, ABE, ATL]|[Atlanta, GA, USA, ATL]|
|[Allentown, PA, USA, ABE]|[1020605, -4, 602, ABE, ATL]|[Atlanta, GA, USA, ATL]|
|[Allentown, PA, USA, ABE]|[1031245, -4, 602, ABE, ATL]|[Atlanta, GA, USA, ATL]|
+-------------------------+----------------------------+-----------------------+
only showing top 5 rows



In [None]:
y

In [79]:
df2 = df.filter('v1.id = "SFO"')
df2.show(5,truncate=False)

+-----------------------------+-----------------------------+---------------------------+
|v1                           |e1                           |v2                         |
+-----------------------------+-----------------------------+---------------------------+
|[San Francisco, CA, USA, SFO]|[1011250, 55, 2247, SFO, JFK]|[New York, NY, USA, JFK]   |
|[San Francisco, CA, USA, SFO]|[1012230, 0, 2247, SFO, JFK] |[New York, NY, USA, JFK]   |
|[San Francisco, CA, USA, SFO]|[1010705, -7, 2247, SFO, JFK]|[New York, NY, USA, JFK]   |
|[San Francisco, CA, USA, SFO]|[1010620, -3, 2246, SFO, MIA]|[Miami, FL, USA, MIA]      |
|[San Francisco, CA, USA, SFO]|[1010915, -3, 293, SFO, LAX] |[Los Angeles, CA, USA, LAX]|
+-----------------------------+-----------------------------+---------------------------+
only showing top 5 rows



In [80]:
df3 = df2.filter('e1.delay > 0')
df3.show(5,truncate=False)

+-----------------------------+------------------------------+------------------------+
|v1                           |e1                            |v2                      |
+-----------------------------+------------------------------+------------------------+
|[San Francisco, CA, USA, SFO]|[1011250, 55, 2247, SFO, JFK] |[New York, NY, USA, JFK]|
|[San Francisco, CA, USA, SFO]|[1011610, 134, 1273, SFO, DFW]|[Dallas, TX, USA, DFW]  |
|[San Francisco, CA, USA, SFO]|[1012330, 32, 1604, SFO, ORD] |[Chicago, IL, USA, ORD] |
|[San Francisco, CA, USA, SFO]|[1011330, 3, 1273, SFO, DFW]  |[Dallas, TX, USA, DFW]  |
|[San Francisco, CA, USA, SFO]|[1011410, 124, 1604, SFO, ORD]|[Chicago, IL, USA, ORD] |
+-----------------------------+------------------------------+------------------------+
only showing top 5 rows



In [87]:
df4 = df3.groupBy('v2.id').agg(f.avg('e1.delay').alias('avg')).orderBy('avg',ascending=False)
df4.show(10)

+---+------------------+
| id|               avg|
+---+------------------+
|OKC|59.073170731707314|
|JAC| 57.13333333333333|
|COS|53.976190476190474|
|OTH| 48.09090909090909|
|SAT|            47.625|
|MOD| 46.80952380952381|
|SUN|46.723404255319146|
|CIC| 46.72164948453608|
|ABQ|           44.8125|
|ASE|44.285714285714285|
+---+------------------+
only showing top 10 rows



### Find the Incoming connections to the airport sorted in Desc. order.

In [88]:
incoming = g.inDegrees
incoming.show(5)

+---+--------+
| id|inDegree|
+---+--------+
|PSE|     192|
|INL|      89|
|MSY|   10283|
|PPG|      27|
|GEG|    2043|
+---+--------+
only showing top 5 rows



In [89]:
incoming1 = incoming.orderBy('inDegree' , ascending=False)
incoming1.show(5)

+---+--------+
| id|inDegree|
+---+--------+
|ATL|   90434|
|DFW|   66050|
|ORD|   61967|
|LAX|   53601|
|DEN|   50921|
+---+--------+
only showing top 5 rows



### Find the Outgoing connections from the airport sorted in Desc. order.

In [90]:
Outgoing = g.outDegrees
Outgoing.show(5)

+---+---------+
| id|outDegree|
+---+---------+
|MSY|    10277|
|GEG|     2044|
|BUR|     5079|
|SNA|     9411|
|GRB|     1109|
+---+---------+
only showing top 5 rows



In [92]:
Outgoing1 = Outgoing.orderBy('outDegree' , ascending=False)
Outgoing1.show(5)

+---+---------+
| id|outDegree|
+---+---------+
|ATL|    91484|
|DFW|    68482|
|ORD|    64228|
|LAX|    54086|
|DEN|    53148|
+---+---------+
only showing top 5 rows



### Use motif finding to answer this question: which delays could we blame on SFO?
#### Hint: this practically means that SFO is a transit station

In [94]:
df = g.find('(v1)-[e1]->(v2);(v2)-[e2]->(v3)')
df.show(5,truncate=False)

+-----------------------+----------------------------+---------------------------+-----------------------------+----------------------+
|v1                     |e1                          |v2                         |e2                           |v3                    |
+-----------------------+----------------------------+---------------------------+-----------------------------+----------------------+
|[Atlanta, GA, USA, ATL]|[1012110, -3, 369, ATL, MSY]|[New Orleans, LA, USA, MSY]|[1011335, -4, 389, MSY, DFW] |[Dallas, TX, USA, DFW]|
|[Atlanta, GA, USA, ATL]|[1012110, -3, 369, ATL, MSY]|[New Orleans, LA, USA, MSY]|[1011550, -2, 389, MSY, DFW] |[Dallas, TX, USA, DFW]|
|[Atlanta, GA, USA, ATL]|[1012110, -3, 369, ATL, MSY]|[New Orleans, LA, USA, MSY]|[1011845, -12, 586, MSY, MIA]|[Miami, FL, USA, MIA] |
|[Atlanta, GA, USA, ATL]|[1012110, -3, 369, ATL, MSY]|[New Orleans, LA, USA, MSY]|[1010825, -1, 389, MSY, DFW] |[Dallas, TX, USA, DFW]|
|[Atlanta, GA, USA, ATL]|[1012110, -3, 369, ATL,

In [95]:
df2 = df.filter('v2.id = "SFO"')
df2.show(5,truncate=False)

+---------------------------+----------------------------+-----------------------------+-----------------------------+---------------------------+
|v1                         |e1                          |v2                           |e2                           |v3                         |
+---------------------------+----------------------------+-----------------------------+-----------------------------+---------------------------+
|[Albuquerque, NM, USA, ABQ]|[1010600, -7, 779, ABQ, SFO]|[San Francisco, CA, USA, SFO]|[1011250, 55, 2247, SFO, JFK]|[New York, NY, USA, JFK]   |
|[Albuquerque, NM, USA, ABQ]|[1010600, -7, 779, ABQ, SFO]|[San Francisco, CA, USA, SFO]|[1012230, 0, 2247, SFO, JFK] |[New York, NY, USA, JFK]   |
|[Albuquerque, NM, USA, ABQ]|[1010600, -7, 779, ABQ, SFO]|[San Francisco, CA, USA, SFO]|[1010705, -7, 2247, SFO, JFK]|[New York, NY, USA, JFK]   |
|[Albuquerque, NM, USA, ABQ]|[1010600, -7, 779, ABQ, SFO]|[San Francisco, CA, USA, SFO]|[1010620, -3, 2246, SFO, MIA]|

In [None]:
df3 = df2.filter('e2.delay > 0 ')
df3.show(5,truncate=False)

+-----------------------------+------------------------------+------------------------+
|v1                           |e1                            |v2                      |
+-----------------------------+------------------------------+------------------------+
|[San Francisco, CA, USA, SFO]|[1011250, 55, 2247, SFO, JFK] |[New York, NY, USA, JFK]|
|[San Francisco, CA, USA, SFO]|[1011610, 134, 1273, SFO, DFW]|[Dallas, TX, USA, DFW]  |
|[San Francisco, CA, USA, SFO]|[1012330, 32, 1604, SFO, ORD] |[Chicago, IL, USA, ORD] |
|[San Francisco, CA, USA, SFO]|[1011330, 3, 1273, SFO, DFW]  |[Dallas, TX, USA, DFW]  |
|[San Francisco, CA, USA, SFO]|[1011410, 124, 1604, SFO, ORD]|[Chicago, IL, USA, ORD] |
+-----------------------------+------------------------------+------------------------+
only showing top 5 rows



### Determine Airport Ranking in Desc. order using PageRank algorithm

In [105]:
results = g.pageRank(resetProbability=0.15, maxIter = 2)

In [106]:
results

GraphFrame(v:[id: string, City: string ... 3 more fields], e:[src: string, dst: string ... 4 more fields])

In [None]:
# results.vertices.show()
# results.edges.show()

In [108]:
results.vertices.orderBy('pagerank',ascending=False).show()
results.edges.orderBy('weight',ascending=False).show()

+--------------+-----+-------+---+------------------+
|          City|State|Country| id|          pagerank|
+--------------+-----+-------+---+------------------+
|       Atlanta|   GA|    USA|ATL| 19.80489273417375|
|       Chicago|   IL|    USA|ORD|15.106062117869598|
|        Dallas|   TX|    USA|DFW|15.041753918864057|
|   Los Angeles|   CA|    USA|LAX|12.805019543358503|
|        Denver|   CO|    USA|DEN| 12.67465624721938|
|       Seattle|   WA|    USA|SEA| 9.957583708187082|
| San Francisco|   CA|    USA|SFO| 9.688011610112325|
|       Phoenix|   AZ|    USA|PHX| 9.680846299506165|
|       Houston|   TX|    USA|IAH| 9.473318994071187|
|     Las Vegas|   NV|    USA|LAS| 7.927686939138285|
|Salt Lake City|   UT|    USA|SLC| 7.775227352396656|
|      New York|   NY|    USA|LGA|6.9258987997005494|
|     Charlotte|   NC|    USA|CLT| 6.761900913768643|
|       Orlando|   FL|    USA|MCO| 6.532963470277448|
|   Minneapolis|   MN|    USA|MSP| 6.495358174583235|
|       Detroit|   MI|    US

## Determine the most popular flights (single city hops)

In [19]:
df_pop = g.find('(v1)-[e1]->(v2)')
df_pop.show(5,truncate=False)

+-------------------------+----------------------------+-----------------------+
|v1                       |e1                          |v2                     |
+-------------------------+----------------------------+-----------------------+
|[Allentown, PA, USA, ABE]|[1011245, 6, 602, ABE, ATL] |[Atlanta, GA, USA, ATL]|
|[Allentown, PA, USA, ABE]|[1020600, -8, 369, ABE, DTW]|[Detroit, MI, USA, DTW]|
|[Allentown, PA, USA, ABE]|[1021245, -2, 602, ABE, ATL]|[Atlanta, GA, USA, ATL]|
|[Allentown, PA, USA, ABE]|[1020605, -4, 602, ABE, ATL]|[Atlanta, GA, USA, ATL]|
|[Allentown, PA, USA, ABE]|[1031245, -4, 602, ABE, ATL]|[Atlanta, GA, USA, ATL]|
+-------------------------+----------------------------+-----------------------+
only showing top 5 rows



In [20]:
df_pop2 = df_pop.groupBy(['e1.src','e1.dst']).count()
df_pop2.show(5,truncate=False)

+---+---+-----+
|src|dst|count|
+---+---+-----+
|ATL|GSP|180  |
|DSM|EWR|23   |
|FSD|ATL|30   |
|AUS|ELP|62   |
|BMI|MCO|9    |
+---+---+-----+
only showing top 5 rows



In [21]:
df_pop3 = df_pop2.orderBy('count',ascending=False)
df_pop3.show(5,truncate=False)

+---+---+-----+
|src|dst|count|
+---+---+-----+
|ATL|LGA|850  |
|HNL|OGG|820  |
|ATL|MCO|712  |
|DAL|HOU|704  |
|HOU|DAL|701  |
+---+---+-----+
only showing top 5 rows



### Find and Save a Subragph that obtained from the following pattern:
#### The flight starts from an airport and return back to the same airport through 2 other airports.

In [19]:
# Sample the vertices and edges
sampledVertices = g.vertices.sample(withReplacement=False, fraction=0.2)
sampledEdges = g.edges.sample(withReplacement=False, fraction=0.1)
# Create a new GraphFrame from the sampled vertices and edges
g_new = GraphFrame(sampledVertices, sampledEdges)


In [20]:
g2 = g_new.find('(v1)-[e1]->(v2);(v2)-[e2]->(v3);(v3)-[e3]->(v1)')

g2

DataFrame[v1: struct<City:string,State:string,Country:string,id:string>, e1: struct<tripid:int,delay:int,distance:int,src:string,dst:string>, v2: struct<City:string,State:string,Country:string,id:string>, e2: struct<tripid:int,delay:int,distance:int,src:string,dst:string>, v3: struct<City:string,State:string,Country:string,id:string>, e3: struct<tripid:int,delay:int,distance:int,src:string,dst:string>]

In [21]:
g2.show(5,truncate=False)

+-------------------------+-----------------------------+-----------------------+-----------------------------+------------------------------+-----------------------------+
|v1                       |e1                           |v2                     |e2                           |v3                            |e3                           |
+-------------------------+-----------------------------+-----------------------+-----------------------------+------------------------------+-----------------------------+
|[Anchorage, AK, USA, ANC]|[1010955, 16, 1259, ANC, SEA]|[Seattle, WA, USA, SEA]|[3230845, -7, 2326, SEA, HNL]|[Honolulu, Oahu, HI, USA, HNL]|[3112155, -8, 2413, HNL, ANC]|
|[Anchorage, AK, USA, ANC]|[1010955, 16, 1259, ANC, SEA]|[Seattle, WA, USA, SEA]|[3230845, -7, 2326, SEA, HNL]|[Honolulu, Oahu, HI, USA, HNL]|[3052230, -7, 2413, HNL, ANC]|
|[Anchorage, AK, USA, ANC]|[1010955, 16, 1259, ANC, SEA]|[Seattle, WA, USA, SEA]|[3230845, -7, 2326, SEA, HNL]|[Honolulu, Oahu, HI, USA

In [23]:
g2.printSchema()

root
 |-- v1: struct (nullable = false)
 |    |-- City: string (nullable = true)
 |    |-- State: string (nullable = true)
 |    |-- Country: string (nullable = true)
 |    |-- id: string (nullable = true)
 |-- e1: struct (nullable = false)
 |    |-- tripid: integer (nullable = true)
 |    |-- delay: integer (nullable = true)
 |    |-- distance: integer (nullable = true)
 |    |-- src: string (nullable = true)
 |    |-- dst: string (nullable = true)
 |-- v2: struct (nullable = false)
 |    |-- City: string (nullable = true)
 |    |-- State: string (nullable = true)
 |    |-- Country: string (nullable = true)
 |    |-- id: string (nullable = true)
 |-- e2: struct (nullable = false)
 |    |-- tripid: integer (nullable = true)
 |    |-- delay: integer (nullable = true)
 |    |-- distance: integer (nullable = true)
 |    |-- src: string (nullable = true)
 |    |-- dst: string (nullable = true)
 |-- v3: struct (nullable = false)
 |    |-- City: string (nullable = true)
 |    |-- State: stri

In [24]:
ver1 = g2.select('v1.City','v1.State','v1.Country','v1.id').dropDuplicates()
ver2 = g2.select('v2.City','v2.State','v2.Country','v2.id').dropDuplicates()
ver3 = g2.select('v3.City','v3.State','v3.Country','v3.id').dropDuplicates()

In [25]:
ver1.show()

+--------------+-----+-------+---+
|          City|State|Country| id|
+--------------+-----+-------+---+
|   New Orleans|   LA|    USA|MSY|
|     Cleveland|   OH|    USA|CLE|
|     Anchorage|   AK|    USA|ANC|
|    Birmingham|   AL|    USA|BHM|
|     Milwaukee|   WI|    USA|MKE|
|       Seattle|   WA|    USA|SEA|
|     St. Louis|   MO|    USA|STL|
|       Orlando|   FL|    USA|MCO|
|Honolulu, Oahu|   HI|    USA|HNL|
|   Kansas City|   MO|    USA|MCI|
|      San Jose|   CA|    USA|SJC|
|       Oakland|   CA|    USA|OAK|
|  Grand Rapids|   MI|    USA|GRR|
|  Indianapolis|   IN|    USA|IND|
|        Dallas|   TX|    USA|DAL|
|          Reno|   NV|    USA|RNO|
|      Columbus|   OH|    USA|CMH|
|      Key West|   FL|    USA|EYW|
| Santa Barbara|   CA|    USA|SBA|
| San Francisco|   CA|    USA|SFO|
+--------------+-----+-------+---+



In [26]:
e1 = g2.select('e1.tripid','e1.delay','e1.distance','e1.src','e1.dst').dropDuplicates()
e2 = g2.select('e2.tripid','e2.delay','e2.distance','e2.src','e2.dst').dropDuplicates()
e3 = g2.select('e3.tripid','e3.delay','e3.distance','e3.src','e3.dst').dropDuplicates()


In [27]:
v = ver1.union(ver2).union(ver3)
e = e1.union(e2).union(e3)

In [28]:
v.show()

+--------------+-----+-------+---+
|          City|State|Country| id|
+--------------+-----+-------+---+
|   New Orleans|   LA|    USA|MSY|
|     Cleveland|   OH|    USA|CLE|
|     Anchorage|   AK|    USA|ANC|
|    Birmingham|   AL|    USA|BHM|
|     Milwaukee|   WI|    USA|MKE|
|       Seattle|   WA|    USA|SEA|
|     St. Louis|   MO|    USA|STL|
|       Orlando|   FL|    USA|MCO|
|Honolulu, Oahu|   HI|    USA|HNL|
|   Kansas City|   MO|    USA|MCI|
|      San Jose|   CA|    USA|SJC|
|       Oakland|   CA|    USA|OAK|
|  Grand Rapids|   MI|    USA|GRR|
|  Indianapolis|   IN|    USA|IND|
|        Dallas|   TX|    USA|DAL|
|          Reno|   NV|    USA|RNO|
|      Columbus|   OH|    USA|CMH|
|      Key West|   FL|    USA|EYW|
| Santa Barbara|   CA|    USA|SBA|
| San Francisco|   CA|    USA|SFO|
+--------------+-----+-------+---+
only showing top 20 rows



In [29]:
e.show()

+-------+-----+--------+---+---+
| tripid|delay|distance|src|dst|
+-------+-----+--------+---+---+
|1261825|   18|     510|BHM|DAL|
|1181305|   29|     927|MKE|MCO|
|1171234|    0|     228|SBA|SFO|
|1170720|    2|     605|SEA|SJC|
|2271045|   20|     233|MCO|EYW|
|2271025|   -3|    1472|MKE|SEA|
|2141355|   19|     379|MSY|DAL|
|2181714|   -6|     228|SBA|SFO|
|2071030|    8|    1294|SEA|MCI|
|3221750|    4|     474|DAL|STL|
|3020955|  -11|     583|OAK|SEA|
|3182003|   22|     228|SBA|SFO|
|3311015|   -4|    1259|SEA|ANC|
|3242055|  -10|     590|SEA|SFO|
|3050600|   -3|     766|STL|MCO|
|3311315|   -5|     766|STL|MCO|
|1021927|   13|     778|CLE|MCO|
|1080800|   -2|     766|MCO|STL|
|1281850|   -6|     276|MKE|STL|
|1051420|    9|     479|MSY|MCO|
+-------+-----+--------+---+---+
only showing top 20 rows



In [31]:
sub_gf = GraphFrame(v,e)

In [32]:
sub_gf.vertices.show()
sub_gf.edges.show()

+--------------+-----+-------+---+
|          City|State|Country| id|
+--------------+-----+-------+---+
|   New Orleans|   LA|    USA|MSY|
|     Cleveland|   OH|    USA|CLE|
|     Anchorage|   AK|    USA|ANC|
|    Birmingham|   AL|    USA|BHM|
|     Milwaukee|   WI|    USA|MKE|
|       Seattle|   WA|    USA|SEA|
|     St. Louis|   MO|    USA|STL|
|       Orlando|   FL|    USA|MCO|
|Honolulu, Oahu|   HI|    USA|HNL|
|   Kansas City|   MO|    USA|MCI|
|      San Jose|   CA|    USA|SJC|
|       Oakland|   CA|    USA|OAK|
|  Grand Rapids|   MI|    USA|GRR|
|  Indianapolis|   IN|    USA|IND|
|        Dallas|   TX|    USA|DAL|
|          Reno|   NV|    USA|RNO|
|      Columbus|   OH|    USA|CMH|
|      Key West|   FL|    USA|EYW|
| Santa Barbara|   CA|    USA|SBA|
| San Francisco|   CA|    USA|SFO|
+--------------+-----+-------+---+
only showing top 20 rows

+-------+-----+--------+---+---+
| tripid|delay|distance|src|dst|
+-------+-----+--------+---+---+
|1261825|   18|     510|BHM|DAL|
|1

In [33]:
sub_gf2 = sub_gf.dropIsolatedVertices()

In [34]:
sub_gf2.edges.write.parquet('sub_gf_Edge',mode='overwrite')
sub_gf2.vertices.write.parquet('sub_gf_Vert',mode='overwrite')

In [38]:
e = spark.read.parquet('sub_gf_Edge')
v = spark.read.parquet('sub_gf_Vert')

In [39]:
v.show()

+---+--------------+-----+-------+
| id|          City|State|Country|
+---+--------------+-----+-------+
|HNL|Honolulu, Oahu|   HI|    USA|
|HNL|Honolulu, Oahu|   HI|    USA|
|HNL|Honolulu, Oahu|   HI|    USA|
|SBA| Santa Barbara|   CA|    USA|
|SBA| Santa Barbara|   CA|    USA|
|SBA| Santa Barbara|   CA|    USA|
|SFO| San Francisco|   CA|    USA|
|SFO| San Francisco|   CA|    USA|
|SFO| San Francisco|   CA|    USA|
|IND|  Indianapolis|   IN|    USA|
|IND|  Indianapolis|   IN|    USA|
|GRR|  Grand Rapids|   MI|    USA|
|GRR|  Grand Rapids|   MI|    USA|
|IND|  Indianapolis|   IN|    USA|
|GRR|  Grand Rapids|   MI|    USA|
|MCI|   Kansas City|   MO|    USA|
|MSY|   New Orleans|   LA|    USA|
|MCI|   Kansas City|   MO|    USA|
|MSY|   New Orleans|   LA|    USA|
|MCI|   Kansas City|   MO|    USA|
+---+--------------+-----+-------+
only showing top 20 rows



In [40]:
sub_graph = GraphFrame(v,e)

In [41]:
sub_graph.vertices.show()
sub_graph.edges.show()

+---+--------------+-----+-------+
| id|          City|State|Country|
+---+--------------+-----+-------+
|HNL|Honolulu, Oahu|   HI|    USA|
|HNL|Honolulu, Oahu|   HI|    USA|
|HNL|Honolulu, Oahu|   HI|    USA|
|SBA| Santa Barbara|   CA|    USA|
|SBA| Santa Barbara|   CA|    USA|
|SBA| Santa Barbara|   CA|    USA|
|SFO| San Francisco|   CA|    USA|
|SFO| San Francisco|   CA|    USA|
|SFO| San Francisco|   CA|    USA|
|IND|  Indianapolis|   IN|    USA|
|IND|  Indianapolis|   IN|    USA|
|GRR|  Grand Rapids|   MI|    USA|
|GRR|  Grand Rapids|   MI|    USA|
|IND|  Indianapolis|   IN|    USA|
|GRR|  Grand Rapids|   MI|    USA|
|MCI|   Kansas City|   MO|    USA|
|MSY|   New Orleans|   LA|    USA|
|MCI|   Kansas City|   MO|    USA|
|MSY|   New Orleans|   LA|    USA|
|MCI|   Kansas City|   MO|    USA|
+---+--------------+-----+-------+
only showing top 20 rows

+-------+-----+--------+---+---+
| tripid|delay|distance|src|dst|
+-------+-----+--------+---+---+
|1051810|   33|     416|BHM|MCO|
|1