In [1]:
import findspark
# Locate the local Spark installation so the pyspark imports in the next cell resolve.
findspark.init()
# Last expression of the cell: displays the pyspark location that findspark resolved.
findspark.find()

'C:\\Users\\Admin\\anaconda3\\envs\\SparkEnvironment\\Lib\\site-packages\\pyspark'

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

# Start (or reuse) the notebook's Spark session: local mode with 4 worker threads.
# Dynamic allocation and adaptive query execution are both switched off.
spark = (
    SparkSession.builder
    .master("local[4]")
    .appName("SparkSQLApp")
    .config("spark.sql.adaptive.enabled", "false")
    .config("spark.dynamicAllocation.enabled", "false")
    .getOrCreate()
)

# Handle to the session's underlying SparkContext.
sc = spark.sparkContext

# Last expression of the cell: render the session summary.
spark

In [3]:
# Schema applied to the Yellow Taxi CSV; every column is declared nullable.
# The two timestamp columns carry the "lpep_" prefix, and the SQL cells in this
# notebook select them by exactly these names.
taxiSchema = (
    StructType()
    .add("VendorId", IntegerType(), True)
    .add("lpep_pickup_datetime", TimestampType(), True)
    .add("lpep_dropoff_datetime", TimestampType(), True)
    .add("passenger_count", DoubleType(), True)
    .add("trip_distance", DoubleType(), True)
    .add("RatecodeID", DoubleType(), True)
    .add("store_and_fwd_flag", StringType(), True)
    .add("PULocationID", IntegerType(), True)
    .add("DOLocationID", IntegerType(), True)
    .add("payment_type", IntegerType(), True)
    .add("fare_amount", DoubleType(), True)
    .add("extra", DoubleType(), True)
    .add("mta_tax", DoubleType(), True)
    .add("tip_amount", DoubleType(), True)
    .add("tolls_amount", DoubleType(), True)
    .add("improvement_surcharge", DoubleType(), True)
    .add("total_amount", DoubleType(), True)
    .add("congestion_surcharge", DoubleType(), True)
    .add("airport_fee", DoubleType(), True)
)

In [4]:
# Load the Yellow Taxi trips CSV (first line is a header) using the explicit taxiSchema.
# The path is written as a raw string: in a normal literal "\D" and "\Y" are
# invalid escape sequences, which newer Python versions flag with a SyntaxWarning.
# The resulting path value is unchanged.
yellowTaxiDf = (
    spark
    .read
    .option("header", "true")
    .schema(taxiSchema)
    .csv(r"C:\DataFiles\YellowTaxis_202210.csv")
)

# Print the applied schema to confirm the column names and types.
yellowTaxiDf.printSchema()

root
 |-- VendorId: integer (nullable = true)
 |-- lpep_pickup_datetime: timestamp (nullable = true)
 |-- lpep_dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: double (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: double (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- payment_type: integer (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- airport_fee: double (nullable = true)



In [5]:
# Expose the Yellow Taxi DataFrame to Spark SQL as the temp view "yellowTaxis";
# the SQL cells in this notebook refer to it as YellowTaxis.
yellowTaxiDf.createOrReplaceTempView("yellowTaxis")

In [6]:
# Yellow Taxi trips whose pickup location id is 171.
outputDf = spark.sql(
    "select * from YellowTaxis where PULocationID = 171"
)

# Display the first rows of the filtered result.
outputDf.show()

+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|VendorId|lpep_pickup_datetime|lpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|airport_fee|
+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|       1| 2022-10-01 13:47:23|  2022-10-01 14:38:50|            1.0|          9.4|      99.0|                 N|         171|         263|           1|       35.2|  0.0|    0.5|       0.

In [7]:
# Load the Green Taxi trips file: tab-delimited, first line is a header.
# No schema is supplied here, so column names come from the header row.
# The path is a raw string because "\D" and "\G" are invalid escape sequences
# in a normal literal (SyntaxWarning on newer Python); the path value is unchanged.
# The delimiter must stay a normal literal: "\t" is meant to be a real tab character.
greenTaxiDf = (
    spark
    .read
    .option("header", "true")
    .option("delimiter", "\t")
    .csv(r"C:\DataFiles\GreenTaxis_202210.csv")
)

# Expose the Green Taxi DataFrame to Spark SQL as the temp view "GreenTaxis".
greenTaxiDf.createOrReplaceTempView("GreenTaxis")

In [33]:
# Stack Yellow and Green trips into one result with a common shape:
# a literal TaxiType tag plus pickup/drop-off time and pickup/drop-off location id.
# "union all" keeps every row from both views (no de-duplication).
# NOTE(review): GreenTaxis is read without an explicit schema, so its columns are
# presumably strings while YellowTaxis is typed — check unionTaxi.printSchema()
# to confirm the resulting column types are what is wanted.
unionTaxi = spark.sql("""
select 'Yellow' as TaxiType,
lpep_pickup_datetime as PickupTime,
lpep_dropoff_datetime as dropTime,
PULocationID as PickupLocationId,
DOLocationID as DropLocationId
from YellowTaxis

union all

select 'Green' as TaxiType,
lpep_pickup_datetime as PickupTime,
lpep_dropoff_datetime as dropTime,
PULocationID as PickupLocationId,
DOLocationID as DropLocationId
from GreenTaxis
""")

# Display the first rows of the combined result.
unionTaxi.show()



+--------+-------------------+-------------------+----------------+--------------+
|TaxiType|         PickupTime|           dropTime|PickupLocationId|DropLocationId|
+--------+-------------------+-------------------+----------------+--------------+
|  Yellow|2022-10-01 05:33:41|2022-10-01 05:48:39|             249|           107|
|  Yellow|2022-10-01 05:44:30|2022-10-01 05:49:48|             151|           238|
|  Yellow|2022-10-01 05:57:13|2022-10-01 06:07:41|             238|           166|
|  Yellow|2022-10-01 06:02:53|2022-10-01 06:08:55|             142|           239|
|  Yellow|2022-10-01 06:14:55|2022-10-01 06:20:21|             238|           166|
|  Yellow|2022-10-01 05:52:52|2022-10-01 06:22:14|             186|            41|
|  Yellow|2022-10-01 06:03:19|2022-10-01 06:14:51|             162|           145|
|  Yellow|2022-10-01 05:32:42|2022-10-01 06:20:01|             100|            22|
|  Yellow|2022-10-01 05:36:35|2022-10-01 05:54:38|             138|           112|
|  Y

In [25]:
# Column names and types for the taxi zone lookup file, given as a DDL string.
taxiZoneSchema = "LocationID INT, Borough STRING, Zone STRING, serviceZone STRING"

# Load the taxi zone lookup CSV with the schema above (no header option is set).
# The path is a raw string because "\D" and "\T" are invalid escape sequences
# in a normal literal (SyntaxWarning on newer Python); the path value is unchanged.
taxiZonesDf = (
    spark
    .read
    .schema(taxiZoneSchema)
    .csv(r"C:\DataFiles\TaxiZones.csv")
)

# Register the lookup as a *global* temporary view named "TaxiZones".
# NOTE(review): global temp views are presumably addressed in SQL with the
# global_temp. prefix (global_temp.TaxiZones) — confirm before querying it.
taxiZonesDf.createOrReplaceGlobalTempView("TaxiZones")

# Show up to 200 rows of the lookup table.
taxiZonesDf.show(200)



+----------+-------------+--------------------+-----------+
|LocationID|      Borough|                Zone|serviceZone|
+----------+-------------+--------------------+-----------+
|         1|          EWR|      Newark Airport|        EWR|
|         2|       Queens|         Jamaica Bay|  Boro Zone|
|         3|        Bronx|Allerton/Pelham G...|  Boro Zone|
|         4|    Manhattan|       Alphabet City|Yellow Zone|
|         5|Staten Island|       Arden Heights|  Boro Zone|
|         6|Staten Island|Arrochar/Fort Wad...|  Boro Zone|
|         7|       Queens|             Astoria|  Boro Zone|
|         8|       Queens|        Astoria Park|  Boro Zone|
|         9|       Queens|          Auburndale|  Boro Zone|
|        10|       Queens|        Baisley Park|  Boro Zone|
|        11|     Brooklyn|          Bath Beach|  Boro Zone|
|        12|    Manhattan|        Battery Park|Yellow Zone|
|        13|    Manhattan|   Battery Park City|Yellow Zone|
|        14|     Brooklyn|           Bay

In [34]:
# Register the Yellow + Green union as a global temporary view named "YellowGreenTaxi".
# NOTE(review): global temp views are presumably addressed in SQL with the
# global_temp. prefix (global_temp.YellowGreenTaxi) — confirm before querying it.
unionTaxi.createOrReplaceGlobalTempView("YellowGreenTaxi")

In [None]:
# NOTE(review): unfinished draft — this cell has no execution count. No view named
# "Yell" is registered in the cells shown here and taxiSchema has no "HP" column,
# so the table and column names presumably still need to be filled in. TODO confirm intent.
spark.sql("""
select AVG(HP) from Yell
""")