In [1]:
import findspark
findspark.init()
findspark.find()

'C:\\Users\\Admin\\anaconda3\\envs\\SparkEnvironment\\Lib\\site-packages\\pyspark'

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

spark = (
    SparkSession
    .builder
    .appName("TaxiOperationsDataFrameApp")
    .master("local[4]")
    .config("spark.dynamicAloocation.enabled", "false")
    .getOrCreate()
)

sc = spark.sparkContext

spark

In [12]:
data = [
    [1,"Neha", 10000],
    [2, "Steve", 20000],
    [3, "Kari", 30000],
    [4, "Ivan", 40000],
    [5, "Mohit", 50000]
]

employeesRdd = sc.parallelize(data)

In [14]:
employeesDf = employeesRdd.toDF()
employeesDf.show()

+---+-----+-----+
| _1|   _2|   _3|
+---+-----+-----+
|  1| Neha|10000|
|  2|Steve|20000|
|  3| Kari|30000|
|  4| Ivan|40000|
|  5|Mohit|50000|
+---+-----+-----+



In [16]:
employeesDf = employeesDf.toDF("Id", "Name", "Salary")
employeesDf.show()

+---+-----+------+
| Id| Name|Salary|
+---+-----+------+
|  1| Neha| 10000|
|  2|Steve| 20000|
|  3| Kari| 30000|
|  4| Ivan| 40000|
|  5|Mohit| 50000|
+---+-----+------+



In [13]:
employeesDf= (
    spark
    .createDataFrame
    (
        data,
        "Id:long, Name:string, Salary:long"
    )
)
employeesDf.show()

+---+-----+------+
| Id| Name|Salary|
+---+-----+------+
|  1| Neha| 10000|
|  2|Steve| 20000|
|  3| Kari| 30000|
|  4| Ivan| 40000|
|  5|Mohit| 50000|
+---+-----+------+



In [3]:
yellowTaxiDf = (
    spark
    .read
    .csv("C:\DataFiles\YellowTaxis_202210.csv")
)

yellowTaxiDf.show()

+--------+--------------------+--------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+--------------------+------------+--------------------+-----------+
|     _c0|                 _c1|                 _c2|            _c3|          _c4|       _c5|               _c6|         _c7|         _c8|         _c9|       _c10| _c11|   _c12|      _c13|        _c14|                _c15|        _c16|                _c17|       _c18|
+--------+--------------------+--------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+--------------------+------------+--------------------+-----------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_date...|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls

In [3]:
yellowTaxiDf = (
    spark
    .read
    .option("header", "true")
    .csv("C:\DataFiles\YellowTaxis_202210.csv")
)

yellowTaxiDf.show()

+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|airport_fee|
+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|       1|2022-10-01T05:33:...| 2022-10-01T05:48:...|            1.0|          1.7|       1.0|                 N|         249|         107|           1|        9.5|  3.0|    0.5|      2.6

In [4]:
greenTaxiDf = (
    spark
    .read
    .option("header", "true")
    .option("delimiter", "\t")
    .csv("C:\DataFiles\GreenTaxis_202210.csv")
)

greenTaxiDf.show(vertical = True)

-RECORD 0-------------------------------------
 VendorId              | 2                    
 lpep_pickup_datetime  | 2022-10-01T06:08:... 
 lpep_dropoff_datetime | 2022-10-01T06:21:... 
 passenger_count       | 1.0                  
 trip_distance         | 2.47                 
 RatecodeID            | 1.0                  
 store_and_fwd_flag    | N                    
 PULocationID          | 256                  
 DOLocationID          | 225                  
 payment_type          | 1.0                  
 fare_amount           | 11.5                 
 extra                 | 0.5                  
 mta_tax               | 0.5                  
 tip_amount            | 2.56                 
 tolls_amount          | 0.0                  
 improvement_surcharge | 0.3                  
 total_amount          | 15.36                
 congestion_surcharge  | 0.0                  
 airport_fee           | 0.0                  
-RECORD 1-------------------------------------
 VendorId    

In [23]:
paymentTypesDf = (
    spark
    .read
    .json("C:\DataFiles\PaymentTypes.json")
    
)

paymentTypesDf.show()

+-----------+-------------+
|PaymentType|PaymentTypeID|
+-----------+-------------+
|Credit Card|            1|
|       Cash|            2|
|  No Charge|            3|
|    Dispute|            4|
|    Unknown|            5|
|Voided Trip|            6|
+-----------+-------------+



In [26]:
yellowTaxiDf = (
    spark
    .read
    .option("header", "true")
    .option("inferschema", "true")
    .csv("C:\DataFiles\YellowTaxis_202210.csv")
)
yellowTaxiDf.printSchema()

root
 |-- VendorID: integer (nullable = true)
 |-- tpep_pickup_datetime: timestamp (nullable = true)
 |-- tpep_dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: double (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: double (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- payment_type: integer (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- airport_fee: double (nullable = true)



In [27]:
# Create schema for Yellow Taxi Data
 
taxiSchema = (
                    StructType
                    ([ 
                        StructField("VendorId"               , IntegerType()   , True),
                        StructField("lpep_pickup_datetime"   , TimestampType() , True),
                        StructField("lpep_dropoff_datetime"  , TimestampType() , True),                            
                        StructField("passenger_count"        , DoubleType()    , True),
                        StructField("trip_distance"          , DoubleType()    , True),
                        StructField("RatecodeID"             , DoubleType()    , True),                            
                        StructField("store_and_fwd_flag"     , StringType()    , True),
                        StructField("PULocationID"           , IntegerType()   , True),
                        StructField("DOLocationID"           , IntegerType()   , True),                            
                        StructField("payment_type"           , IntegerType()   , True),                            
                        StructField("fare_amount"            , DoubleType()    , True),
                        StructField("extra"                  , DoubleType()    , True),
                        StructField("mta_tax"                , DoubleType()    , True),
                        StructField("tip_amount"             , DoubleType()    , True),
                        StructField("tolls_amount"           , DoubleType()    , True),
                        StructField("improvement_surcharge"  , DoubleType()    , True),
                        StructField("total_amount"           , DoubleType()    , True),
                        StructField("congestion_surcharge"   , DoubleType()    , True),
                        StructField("airport_fee"            , DoubleType()    , True)
                    ])
               )

In [28]:
yellowTaxiDf = (
    spark
    .read
    .option("header", "true")
    .schema(taxiSchema)
    .csv("C:\DataFiles\YellowTaxis_202210.csv")
)
yellowTaxiDf.printSchema()

root
 |-- VendorId: integer (nullable = true)
 |-- lpep_pickup_datetime: timestamp (nullable = true)
 |-- lpep_dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: double (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: double (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- payment_type: integer (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- airport_fee: double (nullable = true)



In [30]:
#Read nested json

taxiBasesDf = (
    spark
    .read
    .option("multiline", "true")
    .json("C:\DataFiles\TaxiBases.json")
)
taxiBasesDf.show(truncate = False)


+-----------------------------------------------------------+----------+--------------------------------------+------------------------------------------------+--------------+------------+----------------+--------+---------------------------+
|Address                                                    |Date      |Entity Name                           |GeoLocation                                     |License Number|SHL Endorsed|Telephone Number|Time    |Type of Base               |
+-----------------------------------------------------------+----------+--------------------------------------+------------------------------------------------+--------------+------------+----------------+--------+---------------------------+
|{636, NEW YORK, 10001, NY, WEST   28 STREET}               |08/15/2019|VIER-NY,LLC                           |{40.75273, (40.75273, -74.006408), -74.006408}  |B02865        |No          |6466657536      |18:03:31|BLACK CAR BASE             |
|{131, BRONX, 10468, NY, KIN

In [32]:
taxiBasesSchema = (
                    StructType
                    ([
                        StructField("License Number"         , StringType()    , True),
                        StructField("Entity Name"            , StringType()    , True),
                        StructField("Telephone Number"       , LongType()      , True),
                        StructField("SHL Endorsed"           , StringType()    , True),
                        StructField("Type of Base"           , StringType()    , True),
 
                        StructField("Address", 
                                        StructType
                                        ([
                                            StructField("Building"   , StringType(),   True),
                                            StructField("Street"     , StringType(),   True), 
                                            StructField("City"       , StringType(),   True), 
                                            StructField("State"      , StringType(),   True), 
                                            StructField("Postcode"   , StringType(),   True)
                                        ]),
                                    True
                                   ),
 
                        StructField("GeoLocation", 
                                        StructType
                                        ([
                                            StructField("Latitude"   , StringType(),   True),
                                            StructField("Longitude"  , StringType(),   True), 
                                            StructField("Location"   , StringType(),   True)
                                        ]),
                                    True
                                   )  
                  ])
                )


In [33]:
#Read nested json

taxiBasesDf = (
    spark
    .read
    .option("multiline", "true")
    .schema(taxiBasesSchema)
    .json("C:\DataFiles\TaxiBases.json")
)
taxiBasesDf.show(truncate = False)


+--------------+--------------------------------------+----------------+------------+---------------------------+-----------------------------------------------------------+------------------------------------------------+
|License Number|Entity Name                           |Telephone Number|SHL Endorsed|Type of Base               |Address                                                    |GeoLocation                                     |
+--------------+--------------------------------------+----------------+------------+---------------------------+-----------------------------------------------------------+------------------------------------------------+
|B02865        |VIER-NY,LLC                           |6466657536      |No          |BLACK CAR BASE             |{636, WEST   28 STREET, NEW YORK, NY, 10001}               |{40.75273, -74.006408, (40.75273, -74.006408)}  |
|B02634        |VETERANS RADIO DISPATCHER CORP.       |7183647878      |No          |LIVERY BASE            

In [34]:
yellowTaxiAnalyzedDf = (
    yellowTaxiDf.describe
    (
        "passenger_count",
        "trip_distance"
    )
)
yellowTaxiAnalyzedDf.show()

+-------+------------------+-----------------+
|summary|   passenger_count|    trip_distance|
+-------+------------------+-----------------+
|  count|           3542392|          3675412|
|   mean|1.3846934500755421|6.206976298167358|
| stddev|0.9302303297407405|640.8236808320215|
|    min|               0.0|              0.0|
|    max|               9.0|        389678.46|
+-------+------------------+-----------------+



In [35]:
print("Before operation = "+ str(yellowTaxiDf.count()))
yellowTaxiDf = (
    yellowTaxiDf
    .where("passenger_count > 0")
    .filter(col("trip_distance")>0.0)
)

print("After operation = " + str(yellowTaxiDf.count()))

Before operation = 3675412
After operation = 3422296


In [38]:
defaultValueMap = {'payment_type':5, 'RateCodeID':1}

yellowTaxiDf = (
    spark
    .read
    .option("header", "true")
    .schema(taxiSchema)
    .csv("C:\DataFiles\YellowTaxis_202210.csv") 
)

In [40]:
yellowTaxiDf = (
    yellowTaxiDf
    .where("passenger_count > 0")
    .filter(col("trip_distance")>0.0)
    .na.drop('all')
    .na.fill(defaultValueMap)
    .dropDuplicates()
    .where("lpep_pickup_datetime >= '2022-1-01' AND lpep_dropoff_datetime < '2022-11-01'")
)

print("After operation = "+ str(yellowTaxiDf.count()))


After operation = 3393898


In [45]:
yellowTaxiDf = (
    yellowTaxiDf
    .withColumn("TripYear", year(col("lpep_pickup_datetime")))
    .select(
        "*",
        expr("month(lpep_pickup_datetime) AS TripMonth"),
        dayofmonth(col("lpep_pickup_datetime")).alias("tripDay")
    )
)

yellowTaxiDf.printSchema()

root
 |-- VendorId: integer (nullable = true)
 |-- lpep_pickup_datetime: timestamp (nullable = true)
 |-- lpep_dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: double (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: double (nullable = false)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- payment_type: integer (nullable = false)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- airport_fee: double (nullable = true)
 |-- TripYear: integer (nullable = true)
 |-- TripMonth: integer (nullable = true)
 |-- tripDay: integer (nullable = true)



In [55]:
timeInSec = unix_timestamp(col("lpep_dropoff_datetime")) - unix_timestamp(col("lpep_pickup_datetime"))
timeInMin = round(timeInSec / 60)

yellowTaxiDf = (
    yellowTaxiDf
    .withColumn("TripTimeInMinutes", timeInMin)
)

yellowTaxiDf.printSchema()


root
 |-- VendorId: integer (nullable = true)
 |-- lpep_pickup_datetime: timestamp (nullable = true)
 |-- lpep_dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: double (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- payment_type: integer (nullable = false)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- airport_fee: double (nullable = true)
 |-- TripYear: integer (nullable = true)
 |-- TripMonth: integer (nullable = true)
 |-- tripDay: integer (nullable = true)
 |-- TripTimeInMinutes: double (nullable = tr

In [54]:
tripTypeColumn = (
    when(
        col("RatecodeID") == 6,
        "SharedTrip"
    )
    .otherwise("SoloTrip")
)

yellowTaxiDf = (
    yellowTaxiDf
    .withColumn("TripType", tripTypeColumn)
    .drop("RatecodeID")

)
yellowTaxiDf.printSchema()


AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `RatecodeID` cannot be resolved. Did you mean one of the following? [`VendorId`, `extra`, `mta_tax`, `tripDay`, `TripMonth`].;
'Project [VendorId#1389, lpep_pickup_datetime#1390, lpep_dropoff_datetime#1391, passenger_count#1392, trip_distance#1393, store_and_fwd_flag#1395, PULocationID#1396, DOLocationID#1397, payment_type#1526, fare_amount#1399, extra#1400, mta_tax#1401, tip_amount#1402, tolls_amount#1403, improvement_surcharge#1404, total_amount#1405, congestion_surcharge#1406, airport_fee#1407, TripYear#1589, TripMonth#1610, tripDay#1611, TripTimeInMinutes#1733, CASE WHEN ('RatecodeID = 6) THEN SharedTrip ELSE SoloTrip END AS TripType#1758]
+- Project [VendorId#1389, lpep_pickup_datetime#1390, lpep_dropoff_datetime#1391, passenger_count#1392, trip_distance#1393, store_and_fwd_flag#1395, PULocationID#1396, DOLocationID#1397, payment_type#1526, fare_amount#1399, extra#1400, mta_tax#1401, tip_amount#1402, tolls_amount#1403, improvement_surcharge#1404, total_amount#1405, congestion_surcharge#1406, airport_fee#1407, TripYear#1589, TripMonth#1610, tripDay#1611, round((cast((unix_timestamp(lpep_dropoff_datetime#1391, yyyy-MM-dd HH:mm:ss, Some(Asia/Calcutta), false) - unix_timestamp(lpep_pickup_datetime#1390, yyyy-MM-dd HH:mm:ss, Some(Asia/Calcutta), false)) as double) / cast(60 as double)), 0) AS TripTimeInMinutes#1733, TripType#1658]
   +- Project [VendorId#1389, lpep_pickup_datetime#1390, lpep_dropoff_datetime#1391, passenger_count#1392, trip_distance#1393, store_and_fwd_flag#1395, PULocationID#1396, DOLocationID#1397, payment_type#1526, fare_amount#1399, extra#1400, mta_tax#1401, tip_amount#1402, tolls_amount#1403, improvement_surcharge#1404, total_amount#1405, congestion_surcharge#1406, airport_fee#1407, TripYear#1589, TripMonth#1610, tripDay#1611, round((cast((unix_timestamp(lpep_dropoff_datetime#1391, yyyy-MM-dd HH:mm:ss, Some(Asia/Calcutta), false) - unix_timestamp(lpep_pickup_datetime#1390, yyyy-MM-dd HH:mm:ss, Some(Asia/Calcutta), false)) as double) / cast(60 as double)), 0) AS TripTimeInMinutes#1709, TripType#1658]
      +- Project [VendorId#1389, lpep_pickup_datetime#1390, lpep_dropoff_datetime#1391, passenger_count#1392, trip_distance#1393, store_and_fwd_flag#1395, PULocationID#1396, DOLocationID#1397, payment_type#1526, fare_amount#1399, extra#1400, mta_tax#1401, tip_amount#1402, tolls_amount#1403, improvement_surcharge#1404, total_amount#1405, congestion_surcharge#1406, airport_fee#1407, TripYear#1589, TripMonth#1610, tripDay#1611, TripTimeInMinutes#1634, TripType#1658]
         +- Project [VendorId#1389, lpep_pickup_datetime#1390, lpep_dropoff_datetime#1391, passenger_count#1392, trip_distance#1393, RatecodeID#1525, store_and_fwd_flag#1395, PULocationID#1396, DOLocationID#1397, payment_type#1526, fare_amount#1399, extra#1400, mta_tax#1401, tip_amount#1402, tolls_amount#1403, improvement_surcharge#1404, total_amount#1405, congestion_surcharge#1406, airport_fee#1407, TripYear#1589, TripMonth#1610, tripDay#1611, TripTimeInMinutes#1634, CASE WHEN (RatecodeID#1525 = cast(6 as double)) THEN SharedTrip ELSE SoloTrip END AS TripType#1658]
            +- Project [VendorId#1389, lpep_pickup_datetime#1390, lpep_dropoff_datetime#1391, passenger_count#1392, trip_distance#1393, RatecodeID#1525, store_and_fwd_flag#1395, PULocationID#1396, DOLocationID#1397, payment_type#1526, fare_amount#1399, extra#1400, mta_tax#1401, tip_amount#1402, tolls_amount#1403, improvement_surcharge#1404, total_amount#1405, congestion_surcharge#1406, airport_fee#1407, TripYear#1589, TripMonth#1610, tripDay#1611, round((cast((unix_timestamp(lpep_dropoff_datetime#1391, yyyy-MM-dd HH:mm:ss, Some(Asia/Calcutta), false) - unix_timestamp(lpep_pickup_datetime#1390, yyyy-MM-dd HH:mm:ss, Some(Asia/Calcutta), false)) as double) / cast(60 as double)), 0) AS TripTimeInMinutes#1634]
               +- Project [VendorId#1389, lpep_pickup_datetime#1390, lpep_dropoff_datetime#1391, passenger_count#1392, trip_distance#1393, RatecodeID#1525, store_and_fwd_flag#1395, PULocationID#1396, DOLocationID#1397, payment_type#1526, fare_amount#1399, extra#1400, mta_tax#1401, tip_amount#1402, tolls_amount#1403, improvement_surcharge#1404, total_amount#1405, congestion_surcharge#1406, airport_fee#1407, TripYear#1589, month(cast(lpep_pickup_datetime#1390 as date)) AS TripMonth#1610, dayofmonth(cast(lpep_pickup_datetime#1390 as date)) AS tripDay#1611]
                  +- Project [VendorId#1389, lpep_pickup_datetime#1390, lpep_dropoff_datetime#1391, passenger_count#1392, trip_distance#1393, RatecodeID#1525, store_and_fwd_flag#1395, PULocationID#1396, DOLocationID#1397, payment_type#1526, fare_amount#1399, extra#1400, mta_tax#1401, tip_amount#1402, tolls_amount#1403, improvement_surcharge#1404, total_amount#1405, congestion_surcharge#1406, airport_fee#1407, year(cast(lpep_pickup_datetime#1390 as date)) AS TripYear#1589]
                     +- Filter ((lpep_pickup_datetime#1390 >= cast(2022-1-01 as timestamp)) AND (lpep_dropoff_datetime#1391 < cast(2022-11-01 as timestamp)))
                        +- Deduplicate [lpep_dropoff_datetime#1391, DOLocationID#1397, improvement_surcharge#1404, PULocationID#1396, trip_distance#1393, tolls_amount#1403, RatecodeID#1525, VendorId#1389, tip_amount#1402, payment_type#1526, fare_amount#1399, lpep_pickup_datetime#1390, passenger_count#1392, store_and_fwd_flag#1395, extra#1400, airport_fee#1407, congestion_surcharge#1406, total_amount#1405, mta_tax#1401]
                           +- Project [VendorId#1389, lpep_pickup_datetime#1390, lpep_dropoff_datetime#1391, passenger_count#1392, trip_distance#1393, coalesce(nanvl(RatecodeID#1394, cast(null as double)), cast(1 as double)) AS RatecodeID#1525, store_and_fwd_flag#1395, PULocationID#1396, DOLocationID#1397, coalesce(payment_type#1398, cast(5 as int)) AS payment_type#1526, fare_amount#1399, extra#1400, mta_tax#1401, tip_amount#1402, tolls_amount#1403, improvement_surcharge#1404, total_amount#1405, congestion_surcharge#1406, airport_fee#1407]
                              +- Filter atleastnnonnulls(1, VendorId#1389, lpep_pickup_datetime#1390, lpep_dropoff_datetime#1391, passenger_count#1392, trip_distance#1393, RatecodeID#1394, store_and_fwd_flag#1395, PULocationID#1396, DOLocationID#1397, payment_type#1398, fare_amount#1399, extra#1400, mta_tax#1401, tip_amount#1402, tolls_amount#1403, improvement_surcharge#1404, total_amount#1405, congestion_surcharge#1406, airport_fee#1407)
                                 +- Filter (trip_distance#1393 > 0.0)
                                    +- Filter (passenger_count#1392 > cast(0 as double))
                                       +- Relation [VendorId#1389,lpep_pickup_datetime#1390,lpep_dropoff_datetime#1391,passenger_count#1392,trip_distance#1393,RatecodeID#1394,store_and_fwd_flag#1395,PULocationID#1396,DOLocationID#1397,payment_type#1398,fare_amount#1399,extra#1400,mta_tax#1401,tip_amount#1402,tolls_amount#1403,improvement_surcharge#1404,total_amount#1405,congestion_surcharge#1406,airport_fee#1407] csv


In [58]:
yellowTaxiDf.select(yellowTaxiDf.columns[20:]).show(5)

+-------+-----------------+--------+
|tripDay|TripTimeInMinutes|TripType|
+-------+-----------------+--------+
|      1|             16.0|SoloTrip|
|      1|             12.0|SoloTrip|
|      1|              5.0|SoloTrip|
|      1|              3.0|SoloTrip|
|      1|              7.0|SoloTrip|
+-------+-----------------+--------+
only showing top 5 rows



In [5]:
yellowTaxiDf = yellowTaxiDf.coalesce(4)
yellowTaxiDf.rdd.getNumPartitions()

4

In [6]:
(
    yellowTaxiDf
    .write
    .option("header", "true")
    .option("dateFormat", "yyyy-MM-dd HH:mm:ss.S")
    .mode("overwrite")
    .csv("C:\DataFiles\YellowTaxisOutput.csv")
)