In [1]:
import findspark
findspark.init()
findspark.find()

'C:\\Users\\Admin\\anaconda3\\envs\\SparkEnvironment\\Lib\\site-packages\\pyspark'

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

spark = (
    SparkSession
    .builder
    .appName("MultiDataSetApp")
    .master("local[4]")
    .config("spark.dynamicAloocation.enabled", "false")
    .getOrCreate()
)

sc = spark.sparkContext

spark

In [3]:
# Create schema for Yellow Taxi Data
 
taxiSchema = (
                    StructType
                    ([ 
                        StructField("VendorId"               , IntegerType()   , True),
                        StructField("lpep_pickup_datetime"   , TimestampType() , True),
                        StructField("lpep_dropoff_datetime"  , TimestampType() , True),                            
                        StructField("passenger_count"        , DoubleType()    , True),
                        StructField("trip_distance"          , DoubleType()    , True),
                        StructField("RatecodeID"             , DoubleType()    , True),                            
                        StructField("store_and_fwd_flag"     , StringType()    , True),
                        StructField("PULocationID"           , IntegerType()   , True),
                        StructField("DOLocationID"           , IntegerType()   , True),                            
                        StructField("payment_type"           , IntegerType()   , True),                            
                        StructField("fare_amount"            , DoubleType()    , True),
                        StructField("extra"                  , DoubleType()    , True),
                        StructField("mta_tax"                , DoubleType()    , True),
                        StructField("tip_amount"             , DoubleType()    , True),
                        StructField("tolls_amount"           , DoubleType()    , True),
                        StructField("improvement_surcharge"  , DoubleType()    , True),
                        StructField("total_amount"           , DoubleType()    , True),
                        StructField("congestion_surcharge"   , DoubleType()    , True),
                        StructField("airport_fee"            , DoubleType()    , True)
                    ])
               )

In [4]:
yellowTaxiDf = (
    spark
    .read
    .option("header", "true")
    .schema(taxiSchema)
    .csv("C:\DataFiles\YellowTaxis_202210.csv")
)
yellowTaxiDf.printSchema()

root
 |-- VendorId: integer (nullable = true)
 |-- lpep_pickup_datetime: timestamp (nullable = true)
 |-- lpep_dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: double (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: double (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- payment_type: integer (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- airport_fee: double (nullable = true)



In [5]:
yellowTaxiDf.createOrReplaceTempView("yellowTaxis")

In [6]:
taxiZoneSchema = "LocationID INT, Borough STRING, Zone STRING, serviceZone STRING"

taxiZonesDf = (
    spark
    .read
    .schema(taxiZoneSchema)
    .csv("C:\DataFiles\TaxiZones.csv")
)

taxiZonesDf.createOrReplaceTempView("TaxiZones")
taxiZonesDf.printSchema()

root
 |-- LocationID: integer (nullable = true)
 |-- Borough: string (nullable = true)
 |-- Zone: string (nullable = true)
 |-- serviceZone: string (nullable = true)



In [8]:
joinedDf = (
    yellowTaxiDf,
    .join
    (
        taxiZonesDf,
        yellowTaxiDf.PULocationID == taxiZonesDF.LocationID
        "inner"
    )
)
joinedDf.printSchema()

SyntaxError: invalid syntax (3118549280.py, line 3)

In [11]:
driversDf = (
    spark
    .read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv("C:\DataFiles\Drivers.csv")
)

driversDf.createOrReplaceTempView("drivers")
driversDf.printSchema()
driversDf.show()

root
 |-- DriverLicenseNumber: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Type: string (nullable = true)
 |-- ExpirationDate: string (nullable = true)
 |-- LastDateUpdated: string (nullable = true)

+-------------------+--------------------+--------------------+--------------+---------------+
|DriverLicenseNumber|                Name|                Type|ExpirationDate|LastDateUpdated|
+-------------------+--------------------+--------------------+--------------+---------------+
|            5430898|   ABDEL-BAR,ESLAM,M|MEDALLION TAXI DR...|    04/12/2023|     04/22/2020|
|            5363749|ABDOUSAMADOV,ALIC...|MEDALLION TAXI DR...|    06/01/2020|     04/22/2020|
|            5534446|  ABDUHALIKOV,RUSTAM|MEDALLION TAXI DR...|    06/16/2020|     04/22/2020|
|            5935702|   ABDULLAEV,JONIBEK|MEDALLION TAXI DR...|    03/14/2022|     04/22/2020|
|            5255097|ABDULNABI,MASHHOUR,H|MEDALLION TAXI DR...|    03/16/2021|     04/22/2020|
|            5778

In [13]:
cabsDf = (
    spark
    .read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv("C:\DataFiles\Cabs.csv")
)

cabsDf.createOrReplaceTempView("cabs")
cabsDf.printSchema()
cabsDf.show()


root
 |-- CabNumber: string (nullable = true)
 |-- VehicleLicenseNumber: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- LicenseType: string (nullable = true)
 |-- Active: string (nullable = true)
 |-- PermitLicenseNumber: string (nullable = true)
 |-- VehicleVinNumber: string (nullable = true)
 |-- WheelchairAccessible: string (nullable = true)
 |-- VehicleYear: integer (nullable = true)
 |-- VehicleType: string (nullable = true)
 |-- TelephoneNumber: string (nullable = true)
 |-- Website: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- LastDateUpdated: string (nullable = true)

+---------+--------------------+--------------------+----------------+------+-------------------+-----------------+--------------------+-----------+-----------+---------------+--------------------+--------------------+---------------+
|CabNumber|VehicleLicenseNumber|                Name|     LicenseType|Active|PermitLicenseNumber| VehicleVinNumber|WheelchairAccessible|Veh

In [14]:
#create list of all drivers

(spark.sql("""
(
select Name from cabs where LicenseType = 'OWNER MUST DRIVE'
)
union
(select name from drivers)
""")
).count()

8970