### Creating spark session

This notebook only formats and saves the high-volume for-hire vehicle files (~30 GB), using the same settings as for yellow_taxi (~20 GB).

In [1]:
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import *
from pyspark.sql.types import *


# Build (or reuse) the local Spark session used by the rest of the notebook.
# The chain is wrapped in parentheses instead of being joined with backslashes:
# a backslash cannot continue a comment, so a commented-out option in the
# middle of a backslash chain cuts the statement short and the following
# `.config(...)` line becomes a syntax error.
spark = (
    SparkSession.builder
    .appName("highvolume")
    .master("local[*]")
    # .config("spark.driver.memory", "16g")
    .config("spark.executor.memory", "8gb")
    .getOrCreate()
)


### Data formatting

Loading each year of high-volume vehicle data and checking column names/formats against the sample file

In [105]:
# Load the raw 2019 high-volume for-hire vehicle trips.
# The glob is restricted to *.parquet entries so that the "formatted" output
# directory, which this notebook later writes inside the same 2019 folder, is
# not picked up as input when the notebook is run again.
# NOTE(review): assumes the raw files sit directly in 2019/ and carry a
# .parquet extension - confirm against the data directory.
df = (spark.read
        .format("parquet")
        .load("data/taxi/high_volume_for_hire_vehicle/2019/*.parquet"))

# Sample file whose schema the yearly data is checked against.
df_sample = (spark.read
        .format("parquet")
        .load("data/taxi/sample/sample_HV.parquet"))

# Let a bare DataFrame expression at the end of a cell render as a table.
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)

In [81]:
# Preview the loaded frame; the tabular rendering relies on
# spark.sql.repl.eagerEval.enabled being turned on.
df

hvfhs_license_num,dispatching_base_num,originating_base_num,request_datetime,on_scene_datetime,pickup_datetime,dropoff_datetime,PULocationID,DOLocationID,trip_miles,trip_time,base_passenger_fare,tolls,bcf,sales_tax,congestion_surcharge,airport_fee,tips,driver_pay,shared_request_flag,shared_match_flag,access_a_ride_flag,wav_request_flag,wav_match_flag
HV0003,B02867,B02867,2019-02-01 00:01:26,2019-02-01 00:02:55,2019-02-01 00:05:18,2019-02-01 00:14:57,245,251,2.45,579,9.35,0.0,0.23,0.83,0.0,,0.0,7.48,Y,N,N,N,
HV0003,B02879,B02879,2019-02-01 00:26:08,2019-02-01 00:41:29,2019-02-01 00:41:29,2019-02-01 00:49:39,216,197,1.71,490,7.91,0.0,0.2,0.7,0.0,,2.0,7.93,N,N,N,N,
HV0005,B02510,,2019-02-01 00:48:58,,2019-02-01 00:51:34,2019-02-01 01:28:29,261,234,5.01,2159,44.96,0.0,1.12,3.99,0.0,,0.0,35.97,N,Y,N,N,
HV0005,B02510,,2019-02-01 00:02:15,,2019-02-01 00:03:51,2019-02-01 00:07:16,87,87,0.34,179,7.19,0.0,0.18,0.64,0.0,,3.0,5.39,N,Y,N,N,
HV0005,B02510,,2019-02-01 00:06:17,,2019-02-01 00:09:44,2019-02-01 00:39:56,87,198,6.84,1799,24.25,0.11,0.61,2.16,0.0,,4.0,17.07,N,Y,N,N,
HV0005,B02510,,2019-02-01 00:56:01,,2019-02-01 00:59:55,2019-02-01 01:06:28,198,198,1.11,359,5.75,0.0,0.14,0.51,0.0,,0.0,0.0,Y,Y,N,N,
HV0005,B02510,,2019-02-01 00:07:17,,2019-02-01 00:12:06,2019-02-01 00:42:13,161,148,4.53,1799,16.39,0.0,0.41,1.45,0.0,,0.0,14.31,N,Y,N,N,
HV0005,B02510,,2019-02-01 00:43:33,,2019-02-01 00:45:35,2019-02-01 01:14:56,148,21,11.24,1739,29.77,0.72,0.76,2.71,0.0,,0.0,22.09,N,Y,N,N,
HV0003,B02867,B02867,2019-02-01 00:00:35,2019-02-01 00:09:33,2019-02-01 00:10:48,2019-02-01 00:20:23,226,260,1.59,574,6.99,0.0,0.17,0.62,0.0,,0.0,6.51,Y,N,N,N,
HV0003,B02867,B02867,2019-02-01 00:29:16,2019-02-01 00:31:21,2019-02-01 00:32:32,2019-02-01 00:40:25,7,223,1.9,474,7.05,0.0,0.18,0.63,0.0,,0.0,6.01,Y,N,N,N,


Comparing schemas

In [107]:
def compare_schemas(df_sample, df):
    """Print a column-by-column comparison of two DataFrame schemas.

    Args:
        df_sample: reference DataFrame; only its ``.schema`` is read.
        df: DataFrame checked against the reference; only its ``.schema`` is read.

    Returns:
        None. The report is written to stdout (messages are in Polish).

    Columns are matched by name and compared on ``str(dataType)`` and
    ``nullable``. When the two schema objects are unequal but that per-column
    comparison finds nothing to report (for example the same columns in a
    different order), an explicit note is printed so the header is never left
    without details.
    """
    if df_sample.schema == df.schema:
        print("Schematy są IDENTYCZNE")
        return

    print("RÓŻNICE W SCHEMATACH:")
    print("=" * 60)

    sample_fields = {f.name: (str(f.dataType), f.nullable) for f in df_sample.schema.fields}
    df_fields = {f.name: (str(f.dataType), f.nullable) for f in df.schema.fields}

    reported = False
    # Loop variable is `name`, not `col`, so pyspark's col() helper is never
    # hidden inside this function.
    for name in sorted(set(sample_fields) | set(df_fields)):
        if name not in sample_fields:
            dtype, nullable = df_fields[name]
            print(f" tylko w df → {name:25} {dtype:20} (nullable={nullable})")
        elif name not in df_fields:
            dtype, nullable = sample_fields[name]
            print(f" tylko w df_sample → {name:25} {dtype:20} (nullable={nullable})")
        elif sample_fields[name] != df_fields[name]:
            sample_type, sample_null = sample_fields[name]
            df_type, df_null = df_fields[name]
            print(f" RÓŻNICA → {name:25} df_sample: {sample_type:15} (null={sample_null}) | "
                  f"df: {df_type:15} (null={df_null})")
        else:
            continue
        reported = True

    if not reported:
        # The schema objects differ, yet every column matches by name, type and
        # nullability: say what is left instead of printing only the header.
        sample_order = [f.name for f in df_sample.schema.fields]
        df_order = [f.name for f in df.schema.fields]
        if sample_order != df_order:
            print(" RÓŻNA KOLEJNOŚĆ KOLUMN")
            print(f"   df_sample: {sample_order}")
            print(f"   df:        {df_order}")
        else:
            print(" RÓŻNICA poza nazwą/typem/nullable kolumn (np. metadane pól)")
                
# Report how the schema of the loaded data differs from the sample file's schema.
compare_schemas(df_sample, df)

RÓŻNICE W SCHEMATACH:
 tylko w df_sample → shared_match_flag         StringType()         (nullable=True)
 tylko w df_sample → shared_request_flag       StringType()         (nullable=True)


Data casting and formatting

In [106]:
# Cast the location IDs, trip_time and airport_fee to explicit types, then
# discard the five flag columns in a single drop call.
df = (
    df
    .withColumn("PULocationID", col("PULocationID").cast("integer"))
    .withColumn("DOLocationID", col("DOLocationID").cast("integer"))
    .withColumn("trip_time", col("trip_time").cast("long"))
    .withColumn("airport_fee", col("airport_fee").cast("float"))
    .drop(
        "access_a_ride_flag",
        "wav_request_flag",
        "wav_match_flag",
        "shared_request_flag",
        "shared_match_flag",
    )
)

Checking schema after transformation

In [56]:
# Print the column names and types of the transformed frame.
df.printSchema()

root
 |-- hvfhs_license_num: string (nullable = true)
 |-- dispatching_base_num: string (nullable = true)
 |-- originating_base_num: string (nullable = true)
 |-- request_datetime: timestamp_ntz (nullable = true)
 |-- on_scene_datetime: timestamp_ntz (nullable = true)
 |-- pickup_datetime: timestamp_ntz (nullable = true)
 |-- dropoff_datetime: timestamp_ntz (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- trip_miles: double (nullable = true)
 |-- trip_time: long (nullable = true)
 |-- base_passenger_fare: double (nullable = true)
 |-- tolls: double (nullable = true)
 |-- bcf: double (nullable = true)
 |-- sales_tax: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- airport_fee: float (nullable = true)
 |-- tips: double (nullable = true)
 |-- driver_pay: double (nullable = true)



Simple count to make sure it does not crash after the transformation

In [108]:
# Smoke test: count rows per driver_pay value and show the first 20 groups.
(
    df
    .groupBy("driver_pay")
    .count()
    .show(20)
)

+----------+-----+
|driver_pay|count|
+----------+-----+
|   1929.62|    1|
|   1581.72|    1|
|   1282.21|    1|
|   1255.13|    1|
|   1141.08|    1|
|    1077.4|    1|
|   1024.96|    1|
|    975.64|    1|
|    918.91|    1|
|    918.17|    1|
|    911.97|    1|
|    908.69|    1|
|    894.59|    1|
|    891.63|    1|
|    887.41|    1|
|    882.49|    1|
|     880.6|    1|
|    872.75|    1|
|    868.59|    1|
|    862.03|    1|
+----------+-----+
only showing top 20 rows



If everything works fine, data can be saved to parquet

In [109]:
# Write the formatted frame as parquet with the "parquet.compression" option
# set to gzip; "overwrite" mode replaces whatever is already at the target path.
df.write \
    .mode("overwrite") \
    .option("parquet.compression", "gzip") \
    .parquet("data/taxi/high_volume_for_hire_vehicle/2019/formatted")

#### For analytics please check '05_analytics' notebook