## The following section is for Colab Users.
### Just run the following code cells

In [1]:
# Install a headless OpenJDK 11 quietly.
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
# Download the Spark 3.0.2 (Hadoop 2.7 build) archive, unpack it, then delete the archive.
!wget -q https://bitbucket.org/habedi/datasets/raw/b6769c4664e7ff68b001e2f43bc517888cbe3642/spark/spark-3.0.2-bin-hadoop2.7.tgz
!tar xf spark-3.0.2-bin-hadoop2.7.tgz
!rm -rf spark-3.0.2-bin-hadoop2.7.tgz*
# Python-side packages used by the rest of the notebook.
# NOTE(review): versions are not pinned, so the installed releases depend on when this runs.
!pip -q install findspark pyspark graphframes

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.7/154.7 kB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone


In [2]:
# Fetch the GraphFrames 0.8.2 jar (Spark 3.0 / Scala 2.12 build) into the Spark jars directory.
!wget https://repos.spark-packages.org/graphframes/graphframes/0.8.2-spark3.0-s_2.12/graphframes-0.8.2-spark3.0-s_2.12.jar -P /content/spark-3.0.2-bin-hadoop2.7/jars/
# Copy the same jar next to the Spark installation under a .zip name.
# NOTE(review): presumably so the Python side can load it as a zip archive — confirm it is still needed.
!cp /content/spark-3.0.2-bin-hadoop2.7/jars/graphframes-0.8.2-spark3.0-s_2.12.jar /content/spark-3.0.2-bin-hadoop2.7/graphframes-0.8.2-spark3.0-s_2.12.zip

--2024-06-25 21:29:36--  https://repos.spark-packages.org/graphframes/graphframes/0.8.2-spark3.0-s_2.12/graphframes-0.8.2-spark3.0-s_2.12.jar
Resolving repos.spark-packages.org (repos.spark-packages.org)... 52.85.151.57, 52.85.151.5, 52.85.151.46, ...
Connecting to repos.spark-packages.org (repos.spark-packages.org)|52.85.151.57|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 247882 (242K) [binary/octet-stream]
Saving to: ‘/content/spark-3.0.2-bin-hadoop2.7/jars/graphframes-0.8.2-spark3.0-s_2.12.jar’


2024-06-25 21:29:36 (33.2 MB/s) - ‘/content/spark-3.0.2-bin-hadoop2.7/jars/graphframes-0.8.2-spark3.0-s_2.12.jar’ saved [247882/247882]



In [3]:
import os

# Where the JDK and the unpacked Spark distribution live on the Colab machine.
os.environ.update(
    JAVA_HOME="/usr/lib/jvm/java-11-openjdk-amd64",
    SPARK_HOME="/content/spark-3.0.2-bin-hadoop2.7",
)
# HADOOP_HOME is pointed at the same directory as SPARK_HOME.
os.environ.update(HADOOP_HOME=os.environ["SPARK_HOME"])

# PySpark launcher settings: driver front end and the submit arguments (local master, all cores).
os.environ.update(
    PYSPARK_DRIVER_PYTHON="jupyter",
    PYSPARK_DRIVER_PYTHON_OPTS="notebook",
    PYSPARK_SUBMIT_ARGS="--master local[*] pyspark-shell",
)

In [4]:
import findspark
# Make the pyspark package of the Spark installation importable from this kernel.
# Called without arguments, so the installation is looked up by findspark itself
# (SPARK_HOME is set in the environment cell above).
findspark.init()

In [5]:
!export PYSPARK_SUBMIT_ARGS="--master local[*] pyspark-shell"
!export PYSPARK_DRIVER_PYTHON=jupyter
!export PYSPARK_DRIVER_PYTHON_OPTS=notebook

In [6]:
from pyspark.sql import SparkSession
# Import the one class the notebook uses instead of `from graphframes import *`,
# so it is clear where GraphFrame comes from.
from graphframes import GraphFrame

# Start (or reuse) a Spark session on the local machine using all available cores.
spark = SparkSession.builder.master("local[*]").appName("GraphFrames").getOrCreate()

In [7]:
# NOTE(review): the Spark session is already created by the time this cell runs, so this
# setting presumably has no effect on it — confirm, and move it ahead of the session
# creation if resolving GraphFrames through --packages is actually wanted.
# The version is aligned with the graphframes 0.8.2 jar downloaded in the setup cells
# (it previously named 0.8.1).
os.environ["PYSPARK_SUBMIT_ARGS"] = "--packages graphframes:graphframes:0.8.2-spark3.0-s_2.12 pyspark-shell"

**************************************************************************
**************************************************************************
**************************************************************************

In [8]:
from IPython.display import display, HTML

# Inject CSS so <pre> output keeps its whitespace and is not wrapped,
# which keeps wide text tables aligned.
no_wrap_css = HTML("<style>pre { white-space: pre !important; }</style>")
display(no_wrap_css)

### Read departuredelays.csv into the edge DataFrame
### Read airport-codes-na.txt into the vertex DataFrame (the separator is a tab, i.e., sep = '\t')

#### The US flight delays data set has five columns:
- The <b>date</b> column contains an integer like 02190925. When converted, this maps to 02-19 09:25 am.
- The <b>delay</b> column gives the delay in minutes between the scheduled and actual departure times. Early departures show negative numbers.
- The <b>distance</b> column gives the distance in miles from the origin airport to the destination airport.
- The <b>origin</b> column contains the origin IATA airport code.
- The <b>destination</b> column contains the destination IATA airport code.

#### The airport-codes data set has four columns:
- The <b>IATA</b> column contains IATA airport code.
- The <b>City, State, and Country</b> columns contain information about the airport location.

In [9]:
# getOrCreate() hands back the already-running session when one exists.
spark = (
    SparkSession.builder
    .appName("Read DataFrames")
    .getOrCreate()
)

In [10]:
# Flight records: the first line holds the column names and column types are inferred.
edge_df = spark.read.options(header=True, inferSchema=True).csv('/content/departuredelays.csv')

In [12]:
# Show the inferred column names and types of the flight records.
edge_df.printSchema()

root
 |-- date: integer (nullable = true)
 |-- delay: integer (nullable = true)
 |-- distance: integer (nullable = true)
 |-- origin: string (nullable = true)
 |-- destination: string (nullable = true)



In [11]:
# Airport codes: tab-separated file with a header line; column types are inferred.
vertex_df = spark.read.options(sep='\t', header=True, inferSchema=True).csv('/content/airport-codes-na.txt')

### In the vertex DataFrame, drop any duplicated rows with the same IATA code.

In [13]:
# Keep a single row per IATA code.
vertex_df = vertex_df.dropDuplicates(subset=["IATA"])

### In the edges DataFrame:
- Rename the <b>date</b> column to <b>tripid</b>.
- Rename the <b>origin</b> column to <b>src</b>.
- Rename the <b>destination</b> column to <b>dst</b>.

In [14]:
# Apply the renames one after another: date -> tripid, origin -> src, destination -> dst.
edge_column_renames = (
    ("date", "tripid"),
    ("origin", "src"),
    ("destination", "dst"),
)
for old_name, new_name in edge_column_renames:
    edge_df = edge_df.withColumnRenamed(old_name, new_name)

### In the vertex DataFrame:
- Rename the <b>IATA</b> column to <b>id</b>.

In [15]:
# The airport code becomes the vertex identifier column `id`.
vertex_df = vertex_df.withColumnRenamed("IATA", "id")

### Create a GraphFrame from the vertex and edge DataFrames

In [16]:
# Airports are the vertices, flights are the edges.
graph = GraphFrame(vertex_df, edge_df)
# Preview both sides of the graph.
for frame in (graph.vertices, graph.edges):
    frame.show()

+-------------------+-----+-------+---+
|               City|State|Country| id|
+-------------------+-----+-------+---+
|         Binghamton|   NY|    USA|BGM|
|            Lebanon|   NH|    USA|LEB|
|           Montreal|   PQ| Canada|YUL|
|         Dillingham|   AK|    USA|DLG|
|International Falls|   MN|    USA|INL|
|         Wolf Point|   MT|    USA|OLF|
|        New Orleans|   LA|    USA|MSY|
|            Toronto|   ON| Canada|YTO|
|            Spokane|   WA|    USA|GEG|
|              Havre|   MT|    USA|HVR|
|            Burbank|   CA|    USA|BUR|
|      Orange County|   CA|    USA|SNA|
|             Dryden|   ON| Canada|YHD|
|         Fort Dodge|   IA|    USA|FOD|
|          Green Bay|   WI|    USA|GRB|
|        Great Falls|   MT|    USA|GTF|
|              Homer|   AK|    USA|HOM|
|        Idaho Falls|   ID|    USA|IDA|
|      Sioux Lookout|   ON| Canada|YXL|
|       Grand Rapids|   MI|    USA|GRR|
+-------------------+-----+-------+---+
only showing top 20 rows

+-------+-----

### Determine the number of airports

In [17]:
# One vertex per airport, so the vertex count is the number of airports.
graph.vertices.count()

524

### Determine the number of trips

In [18]:
# One edge per flight record, so the edge count is the number of trips.
graph.edges.count()

1391578

### What is the longest delay?

In [30]:
# Import only the helpers this notebook uses. A wildcard import would also replace
# Python built-ins such as sum, min and round with Spark column functions.
# NOTE: `max` here is pyspark.sql.functions.max and deliberately shadows the built-in,
# because the cell that computes the longest delay calls it as `max("delay")`.
from pyspark.sql.functions import avg, col, max

In [29]:
# Largest value of the delay column (in minutes), read from the single aggregate row.
graph.edges.agg({"delay": "max"}).first()[0]

1642

### Find out the number of delayed flights vs. early flights (flights that departed before the scheduled time)

In [31]:
# Flights with a positive delay, i.e. departures later than scheduled.
edge_df.where("delay > 0").count()

591727

In [32]:
# Flights with a negative delay, i.e. departures earlier than scheduled.
edge_df.where("delay < 0").count()

668729

### What flight destinations departing SFO are most likely to have significant delays? Select the top 10
#### Hint: you should get the average delay for each destination for trips that depart from SFO only

In [33]:
# Flights leaving SFO only.
sfo_flights = edge_df.where(col("src") == "SFO")
# Mean departure delay for each destination reached from SFO.
avg_delay_per_destination = (
    sfo_flights
    .groupBy("dst")
    .agg(avg("delay").alias("average_delay"))
)
# The ten destinations with the highest mean delay.
top_10_destinations = (
    avg_delay_per_destination
    .orderBy("average_delay", ascending=False)
    .limit(10)
)
top_10_destinations.show()

+---+------------------+
|dst|     average_delay|
+---+------------------+
|JAC| 30.78846153846154|
|OKC|24.822222222222223|
|SUN|22.696629213483146|
|COS| 22.58888888888889|
|SAT|             22.16|
|STL|         20.203125|
|HNL|19.982608695652175|
|ASE|19.846153846153847|
|CEC|19.089820359281436|
|MDW|18.771929824561404|
+---+------------------+



### Find the incoming connections to each airport, sorted in descending order.

In [34]:
# Number of flights arriving at each airport, busiest destination first.
incoming_connections = edge_df.groupBy("dst").count()
sorted_incoming_connections = incoming_connections.sort("count", ascending=False)
sorted_incoming_connections.show()

+---+-----+
|dst|count|
+---+-----+
|ATL|90434|
|DFW|66050|
|ORD|61967|
|LAX|53601|
|DEN|50921|
|IAH|42700|
|PHX|39721|
|SFO|38988|
|LAS|32994|
|CLT|28388|
|MCO|27959|
|EWR|27652|
|LGA|25469|
|BOS|25360|
|SLC|25323|
|JFK|23484|
|DTW|23310|
|SEA|23074|
|MSP|22385|
|MIA|21805|
+---+-----+
only showing top 20 rows



### Find the outgoing connections from each airport, sorted in descending order.

In [35]:
# Number of flights departing from each airport, busiest origin first.
outgoing_connections = edge_df.groupBy("src").count()
sorted_outgoing_connections = outgoing_connections.sort("count", ascending=False)
sorted_outgoing_connections.show()

+---+-----+
|src|count|
+---+-----+
|ATL|91484|
|DFW|68482|
|ORD|64228|
|LAX|54086|
|DEN|53148|
|IAH|43361|
|PHX|40155|
|SFO|39483|
|LAS|33107|
|CLT|28402|
|MCO|28313|
|EWR|27656|
|SLC|25868|
|LGA|25458|
|BOS|25348|
|MSP|24031|
|JFK|23572|
|DTW|23421|
|SEA|23078|
|MIA|21817|
+---+-----+
only showing top 20 rows



### Use motif finding to answer this question: which delays could we blame on SFO?
#### Hint: this practically means that SFO is a transit station

In [36]:
# Two-leg itineraries a -> b -> c: every pair of flights that share the middle airport b.
motifs = graph.find("(a)-[e1]->(b); (b)-[e2]->(c)")
# Keep itineraries that connect through SFO and where at least one leg departed late.
# NOTE(review): nothing relates the two legs in time (tripid), so each inbound flight is
# paired with every outbound flight — confirm this is the intended reading of the question.
sfo_transit_delays = motifs.filter("b.id = 'SFO' AND (e1.delay > 0 OR e2.delay > 0)")
# Origin, destination and delay of the first leg, followed by the same for the second leg.
leg_columns = [f"{leg}.{field}" for leg in ("e1", "e2") for field in ("src", "dst", "delay")]
sfo_transit_delays.select(leg_columns).show(truncate=False)

+---+---+-----+---+---+-----+
|src|dst|delay|src|dst|delay|
+---+---+-----+---+---+-----+
|ABQ|SFO|-7   |SFO|JFK|55   |
|ABQ|SFO|-7   |SFO|DFW|134  |
|ABQ|SFO|-7   |SFO|ORD|32   |
|ABQ|SFO|-7   |SFO|DFW|3    |
|ABQ|SFO|-7   |SFO|ORD|124  |
|ABQ|SFO|-7   |SFO|LAX|139  |
|ABQ|SFO|-7   |SFO|JFK|133  |
|ABQ|SFO|-7   |SFO|ORD|113  |
|ABQ|SFO|-7   |SFO|LAX|8    |
|ABQ|SFO|-7   |SFO|MIA|18   |
|ABQ|SFO|-7   |SFO|DFW|2    |
|ABQ|SFO|-7   |SFO|ORD|9    |
|ABQ|SFO|-7   |SFO|ORD|326  |
|ABQ|SFO|-7   |SFO|DFW|1    |
|ABQ|SFO|-7   |SFO|ORD|34   |
|ABQ|SFO|-7   |SFO|DFW|1    |
|ABQ|SFO|-7   |SFO|ORD|190  |
|ABQ|SFO|-7   |SFO|LAX|9    |
|ABQ|SFO|-7   |SFO|JFK|111  |
|ABQ|SFO|-7   |SFO|DFW|103  |
+---+---+-----+---+---+-----+
only showing top 20 rows



### Determine Airport Ranking in Desc. order using PageRank algorithm

In [37]:
# PageRank over the flight graph: reset probability 0.15, five iterations.
results = graph.pageRank(resetProbability=0.15, maxIter=5)
# Airports listed from highest to lowest score.
ranked_airports = (
    results.vertices
    .select("id", "pagerank")
    .sort("pagerank", ascending=False)
)
ranked_airports.show(truncate=False)

+---+------------------+
|id |pagerank          |
+---+------------------+
|ATL|30.852689637281415|
|DFW|22.35090825185797 |
|ORD|21.476110490648225|
|DEN|15.864147080276451|
|LAX|14.200409160093676|
|IAH|13.038206739080561|
|SFO|11.262952692947371|
|PHX|10.61464841238288 |
|SLC|9.46228920900108  |
|LAS|8.571473320234631 |
|SEA|7.548959663023094 |
|EWR|7.180791090543533 |
|MCO|7.160926547658392 |
|CLT|7.143654438927901 |
|DTW|6.890631890038128 |
|MSP|6.7692888436047465|
|LGA|6.737391483201898 |
|BOS|6.219575898669399 |
|BWI|5.761252754817864 |
|JFK|5.7462085111883   |
+---+------------------+
only showing top 20 rows



## Determine the most popular flights (single city hops)

In [38]:
# Number of flights per (origin, destination) pair, most frequent route first.
popular_flights = edge_df.groupBy("src", "dst").count()
sorted_popular_flights = popular_flights.sort("count", ascending=False)
sorted_popular_flights.show(truncate=False)

+---+---+-----+
|src|dst|count|
+---+---+-----+
|SFO|LAX|3232 |
|LAX|SFO|3198 |
|LAS|LAX|3016 |
|LAX|LAS|2964 |
|JFK|LAX|2720 |
|LAX|JFK|2719 |
|ATL|LGA|2501 |
|LGA|ATL|2500 |
|LAX|PHX|2394 |
|PHX|LAX|2387 |
|HNL|OGG|2380 |
|OGG|HNL|2379 |
|LAX|SAN|2215 |
|SAN|LAX|2214 |
|SJC|LAX|2208 |
|LAX|SJC|2201 |
|ATL|MCO|2136 |
|MCO|ATL|2090 |
|SFO|JFK|2084 |
|JFK|SFO|2084 |
+---+---+-----+
only showing top 20 rows



### Find and save the subgraph obtained from the following pattern:
#### The flight starts from an airport and returns to the same airport through 2 other airports.

In [None]:
# Motif a -> b -> c -> a: a round trip that passes through two other airports.
matching_subgraphs = graph.find("(a)-[e1]->(b); (b)-[e2]->(c); (c)-[e3]->(a)")
# Overwrite any previous export so that re-running this cell does not fail because
# the output path already exists (the writer's default mode raises an error in that case).
matching_subgraphs.write.mode("overwrite").parquet("matching_subgraphs.parquet")
print("Matching Subgraphs:")
# The vertex columns of the result are named a, b and c. A preview such as
# matching_subgraphs.select("a", "b", "c").show() is left out because it would
# run the motif search a second time.