In [1]:
from IPython.display import display, HTML
# Inject CSS so <pre> output keeps its whitespace as-is (white-space: pre),
# which stops wide text tables from being wrapped in the notebook output area.
display(HTML("<style>pre { white-space: pre !important; }</style>"))

In [2]:
import findspark
import pyspark

In [None]:
from pyspark.sql import SparkSession
# Obtain the SparkSession for this notebook (getOrCreate reuses an existing one if present).
spark = (
    SparkSession.builder
    .appName("Read lines from a file stream")
    .getOrCreate()
)

In [3]:
from pyspark.sql.functions import *
import pyspark.sql.functions as f

24/09/07 10:20:04 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


#### Define one schema for each folder of the provided data: VertFinalExam and EdgesFinalExam

In [4]:
from pyspark.sql.types import (StructType, StructField, StringType, IntegerType)

In [18]:
# Vertex schema: four nullable string columns, in this exact order.
vertSchema = StructType([
    StructField(column_name, StringType(), True)
    for column_name in ('id', 'City', 'State', 'Country')
])

In [24]:
# Edge schema: three nullable integer columns followed by the nullable
# string endpoints src and dst.
edgSchema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ('tripid', IntegerType()),
        ('delay', IntegerType()),
        ('distance', IntegerType()),
        ('src', StringType()),
        ('dst', StringType()),
    )
])

#### Create a streaming reader to read streaming data from the reading sources:

In [19]:
# Streaming Parquet reader over the vertices source directory, using the
# explicitly declared vertex schema.
vert_df = (
    spark.readStream
    .schema(vertSchema)
    .format("parquet")
    .load("/home/wick/Big_Data/final_project/VertFinalExam/")
)

In [22]:
# Print the schema of the streaming vertices DataFrame to confirm the declared columns.
vert_df.printSchema()

root
 |-- id: string (nullable = true)
 |-- City: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Country: string (nullable = true)



In [25]:
# Streaming Parquet reader over the edges source directory, using the
# explicitly declared edge schema.
edg_df = (
    spark.readStream
    .schema(edgSchema)
    .format("parquet")
    .load("/home/wick/Big_Data/final_project/EdgesFinalExam/")
)

In [27]:
# Print the schema of the streaming edges DataFrame to confirm the declared columns.
edg_df.printSchema()

root
 |-- tripid: integer (nullable = true)
 |-- delay: integer (nullable = true)
 |-- distance: integer (nullable = true)
 |-- src: string (nullable = true)
 |-- dst: string (nullable = true)



#### For the streaming Edges dataframe, create a new column to indicate delay categories as follows:

    Early: for early delays (-ve delay values).
    Late: for delayed flights (+ve delay values).
    OnTime: for on time flights (0 delay values).


In [29]:
# Add delay_category from the sign of delay:
# negative -> "Early", positive -> "Late", anything else -> "OnTime".
# NOTE(review): a null delay matches neither comparison, so it is also
# labelled "OnTime" — confirm that is intended.
edg_df1 = edg_df.withColumn(
    "delay_category",
    f.when(f.col("delay") < 0, "Early")
     .when(f.col("delay") > 0, "Late")
     .otherwise("OnTime"),
)


In [30]:
# Print the schema again to confirm the delay_category column was added.
edg_df1.printSchema()

root
 |-- tripid: integer (nullable = true)
 |-- delay: integer (nullable = true)
 |-- distance: integer (nullable = true)
 |-- src: string (nullable = true)
 |-- dst: string (nullable = true)
 |-- delay_category: string (nullable = false)



#### For the streaming Vertices dataframe, remove all rows whose State is an empty string (State == "")

In [31]:
# Keep only vertices whose State is not the empty string
# (where() is an alias of filter(), so either spelling gives the same result).
# NOTE(review): rows with a null State also fail this comparison and are
# dropped — confirm that is intended.
vert_df1 = vert_df.where(f.col("State") != "")

#### Create a writer for the final streaming Edges dataframe that writes the streaming data to the writing sink in Parquet format.

In [33]:
# Configure (but do not start) the Parquet sink for the categorised edges:
# append mode, with separate output and checkpoint directories.
edg_writer = (
    edg_df1.writeStream
    .format("parquet")
    .outputMode("append")
    .option("checkpointLocation", "/home/wick/Big_Data/final_project/edg_chkpnt/")
    .option("path", "/home/wick/Big_Data/final_project/first_folder/")
)

#### Create a writer for the final streaming Vertices dataframe that writes the streaming data to the writing sink in Parquet format.

In [34]:
# Configure (but do not start) the Parquet sink for the filtered vertices:
# append mode, with separate output and checkpoint directories.
vert_writer = (
    vert_df1.writeStream
    .format("parquet")
    .outputMode("append")
    .option("checkpointLocation", "/home/wick/Big_Data/final_project/vert_chkpnt/")
    .option("path", "/home/wick/Big_Data/final_project/second_folder/")
)

#### Start a query for the Edges writer. Copy and paste your EdgesFinalExam data to the edges streaming reading source. Wait to make sure the writing sink folder contains all data. Then stop the query.

In [35]:
# Start the edges streaming query; the returned handle is kept so the query can be stopped later.
edg_query = edg_writer.start()

24/09/06 23:49:13 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
                                                                                

In [36]:
# Block until everything currently available in the source directory has been
# processed and committed to the sink, so that stopping the query (e.g. under
# "Run All") cannot cut it off before the output is written.
# Caveat: this can block indefinitely if files keep arriving in the source.
edg_query.processAllAvailable()
edg_query.stop()

#### Start a query for the Vertices writer. Copy and paste your VertFinalExam data to the vertices streaming reading source. Wait to make sure the writing sink folder contains all data. Then stop the query.

In [37]:
# Start the vertices streaming query; the returned handle is kept so the query can be stopped later.
vert_query = vert_writer.start()

24/09/06 23:52:14 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


In [38]:
# Block until everything currently available in the source directory has been
# processed and committed to the sink, so that stopping the query (e.g. under
# "Run All") cannot cut it off before the output is written.
# Caveat: this can block indefinitely if files keep arriving in the source.
vert_query.processAllAvailable()
vert_query.stop()

#### Using spark.read():

    Read the vertices data from the writing sink directory into static Vertices dataframe.
    Read the edges data from the writing sink directory into static Edges dataframe.


In [1]:
# Batch-read the vertices that the streaming query wrote to its Parquet sink.
df_verts = spark.read.parquet('/home/wick/Big_Data/final_project/second_folder/')

24/09/07 10:44:17 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors
                                                                                

In [2]:
# Batch-read the edges that the streaming query wrote to its Parquet sink.
df_edgs = spark.read.parquet('/home/wick/Big_Data/final_project/first_folder/')

In [3]:
# Preview the first 5 rows of the static vertices DataFrame.
df_verts.show(5)

                                                                                

+---+-----------+-----+-------+
| id|       City|State|Country|
+---+-----------+-----+-------+
|ABE|  Allentown|   PA|    USA|
|ABI|    Abilene|   TX|    USA|
|ABQ|Albuquerque|   NM|    USA|
|ABR|   Aberdeen|   SD|    USA|
|ABY|     Albany|   GA|    USA|
+---+-----------+-----+-------+
only showing top 5 rows



In [4]:
# Preview the first 5 rows of the static edges DataFrame, including delay_category.
df_edgs.show(5)

+-------+-----+--------+---+---+--------------+
| tripid|delay|distance|src|dst|delay_category|
+-------+-----+--------+---+---+--------------+
|1010630|  -10|     928|RSW|EWR|         Early|
|1021029|   87|     974|RSW|ORD|          Late|
|1021346|    0|     928|RSW|EWR|        OnTime|
|1021044|   18|     928|RSW|EWR|          Late|
|1021730|   29|     748|RSW|IAH|          Late|
+-------+-----+--------+---+---+--------------+
only showing top 5 rows

