## The following section is for Colab Users.
### Just run the following code cells

In [1]:
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q https://bitbucket.org/habedi/datasets/raw/b6769c4664e7ff68b001e2f43bc517888cbe3642/spark/spark-3.0.2-bin-hadoop2.7.tgz
!tar xf spark-3.0.2-bin-hadoop2.7.tgz
!rm -rf spark-3.0.2-bin-hadoop2.7.tgz*
!pip -q install findspark pyspark graphframes

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.7/154.7 kB[0m [31m20.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone


In [2]:
!wget https://repos.spark-packages.org/graphframes/graphframes/0.8.2-spark3.0-s_2.12/graphframes-0.8.2-spark3.0-s_2.12.jar -P /content/spark-3.0.2-bin-hadoop2.7/jars/
!cp /content/spark-3.0.2-bin-hadoop2.7/jars/graphframes-0.8.2-spark3.0-s_2.12.jar /content/spark-3.0.2-bin-hadoop2.7/graphframes-0.8.2-spark3.0-s_2.12.zip

--2024-06-04 00:49:32--  https://repos.spark-packages.org/graphframes/graphframes/0.8.2-spark3.0-s_2.12/graphframes-0.8.2-spark3.0-s_2.12.jar
Resolving repos.spark-packages.org (repos.spark-packages.org)... 108.157.254.128, 108.157.254.42, 108.157.254.58, ...
Connecting to repos.spark-packages.org (repos.spark-packages.org)|108.157.254.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 247882 (242K) [binary/octet-stream]
Saving to: ‘/content/spark-3.0.2-bin-hadoop2.7/jars/graphframes-0.8.2-spark3.0-s_2.12.jar’


2024-06-04 00:49:33 (480 KB/s) - ‘/content/spark-3.0.2-bin-hadoop2.7/jars/graphframes-0.8.2-spark3.0-s_2.12.jar’ saved [247882/247882]



In [3]:
import os

os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.0.2-bin-hadoop2.7"
os.environ["HADOOP_HOME"] = os.environ["SPARK_HOME"]

os.environ["PYSPARK_DRIVER_PYTHON"] = "jupyter"
os.environ["PYSPARK_DRIVER_PYTHON_OPTS"] = "notebook"
os.environ["PYSPARK_SUBMIT_ARGS"] = "--master local[*] pyspark-shell"

In [4]:
! pip install pyspark



In [5]:
import findspark
findspark.init()

In [6]:
!export PYSPARK_SUBMIT_ARGS="--master local[*] pyspark-shell"
!export PYSPARK_DRIVER_PYTHON=jupyter
!export PYSPARK_DRIVER_PYTHON_OPTS=notebook

In [7]:
from pyspark.sql import SparkSession
from graphframes import *

spark = SparkSession.builder.master("local[*]").appName("GraphFrames").getOrCreate()

In [8]:
os.environ["PYSPARK_SUBMIT_ARGS"] = "--packages graphframes:graphframes:0.8.1-spark3.0-s_2.12 pyspark-shell"

**************************************************************************
**************************************************************************
**************************************************************************

In [9]:
from IPython.display import display, HTML
display(HTML("<style>pre { white-space: pre !important; }</style>"))

### Read departuredelays.csv in Edge DataFrame
### Read airport-codes-na.txt in Vertix DataFrame (the separator is Tab i.e sep = '\t' )

#### The US flight delays data set has five columns:
- The <b>date</b> column contains an integer like 02190925 . When converted, this maps to 02-19 09:25 am.
- The <b>delay</b> column gives the delay in minutes between the scheduled and actual departure times. Early departures show negative numbers.
- The <b>distance</b> column gives the distance in miles from the origin airport to the destination airport.
- The <b>origin</b> column contains the origin IATA airport code.
- The <b>destination</b> column contains the destination IATA airport code.

#### The airport-codes data set has four columns:
- The <b>IATA</b> column contains IATA airport code.
- The <b>City, State, and Country</b> columns contains information about the airport location.

In [11]:
edges_df = spark.read.csv('/content/departuredelays.csv', header=True, inferSchema=True)

In [12]:
vertices_df = spark.read.csv('/content/airport-codes-na.txt', sep='\t', header=True, inferSchema=True)


In [13]:
edges_df.printSchema()
vertices_df.printSchema()

root
 |-- date: integer (nullable = true)
 |-- delay: integer (nullable = true)
 |-- distance: integer (nullable = true)
 |-- origin: string (nullable = true)
 |-- destination: string (nullable = true)

root
 |-- City: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- IATA: string (nullable = true)



### In the vertix DataFrame, drop any duplicated rows with the same  IATA code.

In [14]:
vertices_df = vertices_df.dropDuplicates(['IATA'])

### In the edges DataFrame:
- Rename the <b>date</b> columns to become <b>tripid</b>.
- Rename the <b>origin</b> columns to become <b>src</b>.
- Rename the <b>destination</b> columns to become <b>dst</b>.

In [15]:
edges_df = edges_df.withColumnRenamed('date', 'tripid') \
                   .withColumnRenamed('origin', 'src') \
                   .withColumnRenamed('destination', 'dst')

### In the Vertix DataFrame:
- Rename the <b>IATA</b> columns to become <b>id</b>.

In [16]:
vertices_df = vertices_df.withColumnRenamed('IATA', 'id')

In [17]:
edges_df.show(5)
vertices_df.show(5)

+-------+-----+--------+---+---+
| tripid|delay|distance|src|dst|
+-------+-----+--------+---+---+
|1011245|    6|     602|ABE|ATL|
|1020600|   -8|     369|ABE|DTW|
|1021245|   -2|     602|ABE|ATL|
|1020605|   -4|     602|ABE|ATL|
|1031245|   -4|     602|ABE|ATL|
+-------+-----+--------+---+---+
only showing top 5 rows

+-------------------+-----+-------+---+
|               City|State|Country| id|
+-------------------+-----+-------+---+
|         Binghamton|   NY|    USA|BGM|
|            Lebanon|   NH|    USA|LEB|
|           Montreal|   PQ| Canada|YUL|
|         Dillingham|   AK|    USA|DLG|
|International Falls|   MN|    USA|INL|
+-------------------+-----+-------+---+
only showing top 5 rows



### Create GraphFrame from Vertix and Edges DataFrames

In [18]:
from graphframes import GraphFrame
g = GraphFrame(vertices_df, edges_df)

In [19]:
g.vertices.show(5)
g.edges.show(5)

+-------------------+-----+-------+---+
|               City|State|Country| id|
+-------------------+-----+-------+---+
|         Binghamton|   NY|    USA|BGM|
|            Lebanon|   NH|    USA|LEB|
|           Montreal|   PQ| Canada|YUL|
|         Dillingham|   AK|    USA|DLG|
|International Falls|   MN|    USA|INL|
+-------------------+-----+-------+---+
only showing top 5 rows

+-------+-----+--------+---+---+
| tripid|delay|distance|src|dst|
+-------+-----+--------+---+---+
|1011245|    6|     602|ABE|ATL|
|1020600|   -8|     369|ABE|DTW|
|1021245|   -2|     602|ABE|ATL|
|1020605|   -4|     602|ABE|ATL|
|1031245|   -4|     602|ABE|ATL|
+-------+-----+--------+---+---+
only showing top 5 rows



### Determine the number of airports

In [20]:
num_airports = g.vertices.count()
num_airports

524

### Determine the number of trips

In [21]:
num_trips = g.edges.count()
num_trips

1391578

### What is the longest delay?

In [22]:
longest_delay = g.edges.agg({"delay": "max"}).collect()[0][0]
longest_delay

1642

### Find out the number of delayed flights vs. early flights (flights that departed before actual time)

In [23]:
delay_counts = g.edges.groupBy(g.edges.delay > 0).count().collect()
delayed_flights = delay_counts[0][1] if delay_counts[0][0] else delay_counts[1][1]
early_flights = delay_counts[1][1] if delay_counts[0][0] else delay_counts[0][1]
print(f"Delayed flights: {delayed_flights}, Early flights: {early_flights}")

Delayed flights: 591727, Early flights: 799851


### What flight destinations departing SFO are most likely to have significant delays? Select the top 10
#### Hint: you should get the average delay for each destination for trips that depart from SFO only

In [24]:
from pyspark.sql.functions import desc

sfo_delays = g.edges.filter(g.edges.src == 'SFO') \
    .groupBy('dst').avg('delay') \
    .withColumnRenamed('avg(delay)', 'avg_delay') \
    .orderBy(desc('avg_delay')) \
    .limit(10)

sfo_delays.show()

+---+------------------+
|dst|         avg_delay|
+---+------------------+
|JAC| 30.78846153846154|
|OKC|24.822222222222223|
|SUN|22.696629213483146|
|COS| 22.58888888888889|
|SAT|             22.16|
|STL|         20.203125|
|HNL|19.982608695652175|
|ASE|19.846153846153847|
|CEC|19.089820359281436|
|MDW|18.771929824561404|
+---+------------------+



### Find the Incoming connections to the airport sorted in Desc. order.

In [25]:
incoming_connections = g.edges.groupBy("dst").count().orderBy(desc("count"))
incoming_connections.show()

+---+-----+
|dst|count|
+---+-----+
|ATL|90434|
|DFW|66050|
|ORD|61967|
|LAX|53601|
|DEN|50921|
|IAH|42700|
|PHX|39721|
|SFO|38988|
|LAS|32994|
|CLT|28388|
|MCO|27959|
|EWR|27652|
|LGA|25469|
|BOS|25360|
|SLC|25323|
|JFK|23484|
|DTW|23310|
|SEA|23074|
|MSP|22385|
|MIA|21805|
+---+-----+
only showing top 20 rows



### Find the Outgoing connections from the airport sorted in Desc. order.

In [26]:
outgoing_connections = g.edges.groupBy("src").count().orderBy(desc("count"))
outgoing_connections.show()

+---+-----+
|src|count|
+---+-----+
|ATL|91484|
|DFW|68482|
|ORD|64228|
|LAX|54086|
|DEN|53148|
|IAH|43361|
|PHX|40155|
|SFO|39483|
|LAS|33107|
|CLT|28402|
|MCO|28313|
|EWR|27656|
|SLC|25868|
|LGA|25458|
|BOS|25348|
|MSP|24031|
|JFK|23572|
|DTW|23421|
|SEA|23078|
|MIA|21817|
+---+-----+
only showing top 20 rows



### Use motif finding to answer this question: which delays could we blame on SFO?
#### Hint: this practically means that SFO is a transit station

In [27]:
motifs_delay = g.find("(a)-[e1]->(b); (b)-[e2]->(c)").filter("a.id = 'SFO' and e1.delay > 0 and e2.delay > 0")
motifs_delay.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+
|                   a|                  e1|                   b|                  e2|                   c|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|[San Francisco, C...|[1011750, 163, 16...|[New Orleans, LA,...|[1011910, 19, 389...|[Dallas, TX, USA,...|
|[San Francisco, C...|[1011750, 163, 16...|[New Orleans, LA,...|[1021335, 10, 389...|[Dallas, TX, USA,...|
|[San Francisco, C...|[1011750, 163, 16...|[New Orleans, LA,...|[1021550, 81, 389...|[Dallas, TX, USA,...|
|[San Francisco, C...|[1011750, 163, 16...|[New Orleans, LA,...|[1021505, 18, 586...|[Miami, FL, USA, ...|
|[San Francisco, C...|[1011750, 163, 16...|[New Orleans, LA,...|[1021010, 9, 389,...|[Dallas, TX, USA,...|
|[San Francisco, C...|[1011750, 163, 16...|[New Orleans, LA,...|[1031335, 11, 389...|[Dallas, TX, USA,...|
|[San Francisco, C...|[1011750, 163, 

### Determine Airport Ranking in Desc. order using PageRank algorithm

In [28]:
# Sample 1% of the edges DataFrame to create a smaller subset
sampled_edges_df = edges_df.sample(fraction=0.01, seed=42)

# Create a new GraphFrame using the sampled edges DataFrame
sampled_g = GraphFrame(vertices_df, sampled_edges_df)

In [29]:
results = sampled_g.pageRank(resetProbability=0.15, maxIter=10)
results.vertices.select("id", "pagerank").orderBy(desc("pagerank")).show()

+---+------------------+
| id|          pagerank|
+---+------------------+
|ATL|28.972251142802747|
|DFW|22.235881120169132|
|ORD| 20.98620855918177|
|DEN|15.279574477436306|
|LAX|13.995423148618006|
|IAH|13.471657301075073|
|SFO|11.546641661479331|
|PHX|11.229570699109104|
|SLC| 9.525621523366365|
|LAS|7.8423359421900996|
|DTW| 7.367102499612098|
|MCO| 7.318610914953621|
|SEA| 7.030451887681951|
|LGA| 6.758826045392688|
|EWR| 6.652304930214718|
|MSP|6.5667102172377865|
|CLT| 6.454068246017832|
|JFK| 6.345773972706368|
|BOS| 5.754206887993827|
|MIA| 5.342552894228703|
+---+------------------+
only showing top 20 rows



## Determine the most popular flights (single city hops)

In [30]:
popular_flights = sampled_g.edges.groupBy("src", "dst").count().orderBy(desc("count")).limit(10)
popular_flights.show()

+---+---+-----+
|src|dst|count|
+---+---+-----+
|LAX|SFO|   43|
|JFK|LAX|   38|
|SFO|LAX|   37|
|LAX|LAS|   34|
|ATL|LGA|   31|
|SFO|JFK|   31|
|OGG|HNL|   30|
|DCA|BOS|   29|
|SFO|LAS|   28|
|LGA|ATL|   28|
+---+---+-----+



### Find and Save a Subragph that obtained from the following pattern:
#### The flight starts from an airport and return back to the same airport through 2 other airports.

In [31]:
from pyspark.sql.functions import col

motif = sampled_g.find("(a)-[e1]->(b); (b)-[e2]->(c); (c)-[e3]->(a)")

In [32]:
motif.show(5)

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                   a|                  e1|                   b|                  e2|                   c|                  e3|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|[Albuquerque, NM,...|[1030605, 2, 494,...|[Dallas, TX, USA,...|[3091745, 0, 1073...|[Los Angeles, CA,...|[3171915, 45, 589...|
|[Albuquerque, NM,...|[1030605, 2, 494,...|[Dallas, TX, USA,...|[3091745, 0, 1073...|[Los Angeles, CA,...|[2190935, -9, 589...|
|[Albuquerque, NM,...|[1030605, 2, 494,...|[Dallas, TX, USA,...|[3091745, 0, 1073...|[Los Angeles, CA,...|[2261620, -4, 589...|
|[Albuquerque, NM,...|[1030605, 2, 494,...|[Dallas, TX, USA,...|[3091745, 0, 1073...|[Los Angeles, CA,...|[1111540, 7, 589,...|
|[Albuquerque, NM,...|[1030605, 2, 494,...|[Dallas, TX, USA,...|[3091745, 0, 1073...|[Los Angeles, CA,..

In [33]:
filtered_motif = motif.filter((col("a.id") != col("b.id")) &
                              (col("b.id") != col("c.id")) &
                              (col("a.id") != col("c.id")))

In [34]:
filtered_motif.show(5)

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                   a|                  e1|                   b|                  e2|                   c|                  e3|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|[Albuquerque, NM,...|[1030605, 2, 494,...|[Dallas, TX, USA,...|[3091745, 0, 1073...|[Los Angeles, CA,...|[3171915, 45, 589...|
|[Albuquerque, NM,...|[1030605, 2, 494,...|[Dallas, TX, USA,...|[3091745, 0, 1073...|[Los Angeles, CA,...|[2190935, -9, 589...|
|[Albuquerque, NM,...|[1030605, 2, 494,...|[Dallas, TX, USA,...|[3091745, 0, 1073...|[Los Angeles, CA,...|[2261620, -4, 589...|
|[Albuquerque, NM,...|[1030605, 2, 494,...|[Dallas, TX, USA,...|[3091745, 0, 1073...|[Los Angeles, CA,...|[1111540, 7, 589,...|
|[Albuquerque, NM,...|[1030605, 2, 494,...|[Dallas, TX, USA,...|[3091745, 0, 1073...|[Los Angeles, CA,..

In [36]:
renamed_motif = filtered_motif.select(
    col("a.id").alias("airport_a"),
    col("b.id").alias("airport_b"),
    col("c.id").alias("airport_c"),
    col("e1.tripid").alias("tripid_1"),
    col("e2.tripid").alias("tripid_2"),
    col("e3.tripid").alias("tripid_3")
)

In [38]:
renamed_motif.show(10)

+---------+---------+---------+--------+--------+--------+
|airport_a|airport_b|airport_c|tripid_1|tripid_2|tripid_3|
+---------+---------+---------+--------+--------+--------+
|      ABQ|      DFW|      LAX| 1030605| 3091745| 3171915|
|      ABQ|      DFW|      LAX| 1030605| 3091745| 2190935|
|      ABQ|      DFW|      LAX| 1030605| 3091745| 2261620|
|      ABQ|      DFW|      LAX| 1030605| 3091745| 1111540|
|      ABQ|      DFW|      LAX| 1030605| 3091745| 1040714|
|      ABQ|      DFW|      LAX| 1030605| 3050705| 3171915|
|      ABQ|      DFW|      LAX| 1030605| 3050705| 2190935|
|      ABQ|      DFW|      LAX| 1030605| 3050705| 2261620|
|      ABQ|      DFW|      LAX| 1030605| 3050705| 1111540|
|      ABQ|      DFW|      LAX| 1030605| 3050705| 1040714|
+---------+---------+---------+--------+--------+--------+
only showing top 10 rows



In [39]:
renamed_motif.write.csv('/content/subgraph_sampled.csv')