In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .master("local[*]") \
    .appName('test') \
    .getOrCreate()

In [2]:
from pyspark.sql import types

In [32]:
schema = types.StructType([
    types.StructField('dispatching_base_num', types.StringType(), True), 
    types.StructField('pickup_datetime', types.TimestampType(), True), 
    types.StructField('dropoff_datetime', types.TimestampType(), True), 
    types.StructField('PULocationID', types.IntegerType(), True), 
    types.StructField('DOLocationID', types.IntegerType(), True), 
    types.StructField('SR_Flag', types.StringType(), True),
    types.StructField('Affiliated_base_number', types.StringType(), True) 
    ])

In [30]:
df_raw_test = spark \
            .read \
            .options(header=True) \
            .csv('fhv_tripdata_2019-10.csv')

In [31]:
df_raw_test.printSchema()

root
 |-- dispatching_base_num: string (nullable = true)
 |-- pickup_datetime: string (nullable = true)
 |-- dropOff_datetime: string (nullable = true)
 |-- PUlocationID: string (nullable = true)
 |-- DOlocationID: string (nullable = true)
 |-- SR_Flag: string (nullable = true)
 |-- Affiliated_base_number: string (nullable = true)



In [33]:
df_raw = spark \
            .read \
            .options(header=True) \
            .schema(schema) \
            .csv('fhv_tripdata_2019-10.csv')

In [34]:
df_raw.count()

1897493

In [35]:
df_raw.printSchema()

root
 |-- dispatching_base_num: string (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropoff_datetime: timestamp (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- SR_Flag: string (nullable = true)
 |-- Affiliated_base_number: string (nullable = true)



In [36]:
df_raw \
    .repartition(6) \
    .write \
    .parquet('fhv_tripdata_2019-10', mode='overwrite')

In [37]:
df_parquet = spark \
                .read \
                .parquet('fhv_tripdata_2019-10')

In [38]:
df_parquet.printSchema()

root
 |-- dispatching_base_num: string (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropoff_datetime: timestamp (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- SR_Flag: string (nullable = true)
 |-- Affiliated_base_number: string (nullable = true)



In [67]:
from pyspark.sql import functions as F
from pyspark.sql.functions import col 
from datetime import datetime, timedelta
from pyspark.sql import udf

In [63]:
def convert_interval_time(duration):
    if duration is None:
        return None
    return duration / timedelta(hours=1)
convert_interval_time_udf = F.udf(convert_interval_time, returnType=types.TimestampType())

In [62]:
convert_interval_time((datetime(year=2019, day=15, month=10) - datetime(year=2019, day=11, month=10)))

96.0

In [65]:
df_processing = df_parquet \
                .withColumn('pickup_date', F.to_date(df_parquet.pickup_datetime)) \
                .withColumn('dropoff_date', F.to_date(df_parquet.dropoff_datetime)) \
                .withColumn('length_trips', convert_interval_time_udf(df_parquet.dropoff_datetime - df_parquet.pickup_datetime))
                # .withColumn('length_trips_hours', convert_interval_time_udf(df_parquet.length_trips))

In [70]:
df_processing = df_parquet \
                .withColumn('pickup_date', F.to_date(df_parquet.pickup_datetime)) \
                .withColumn('dropoff_date', F.to_date(df_parquet.dropoff_datetime)) \
                .withColumn('length_trips', df_parquet.dropoff_datetime - df_parquet.pickup_datetime)
                # .withColumn('length_trips_hours', convert_interval_time_udf(df_parquet.length_trips))
df_processing = df_processing \
                .withColumn('hours_trips', convert_interval_time_udf(col('length_trips')))                

In [73]:
df_processing.show()

+--------------------+-------------------+-------------------+------------+------------+-------+----------------------+-----------+------------+--------------------+
|dispatching_base_num|    pickup_datetime|   dropoff_datetime|PULocationID|DOLocationID|SR_Flag|Affiliated_base_number|pickup_date|dropoff_date|        length_trips|
+--------------------+-------------------+-------------------+------------+------------+-------+----------------------+-----------+------------+--------------------+
|     B00889         |2019-10-01 15:38:29|2019-10-01 15:56:54|         129|          92|   null|       B00889         | 2019-10-01|  2019-10-01|INTERVAL '0 00:18...|
|              B01239|2019-10-02 10:17:37|2019-10-02 10:33:46|         264|         241|   null|                B01239| 2019-10-02|  2019-10-02|INTERVAL '0 00:16...|
|              B01745|2019-10-01 14:07:24|2019-10-01 14:19:02|         264|         215|   null|                B01745| 2019-10-01|  2019-10-01|INTERVAL '0 00:11...|
|   

In [41]:
date_ref = datetime(year=2019, day=15, month=10)

96.0

In [42]:
df_processing.filter(df_processing.pickup_date == date_ref).count()

62610

In [66]:
df_processing.show()

PythonException: 
  An exception was thrown from the Python worker. Please see the stack trace below.
Traceback (most recent call last):
  File "C:\tools\spark-3.4.3-bin-hadoop3\python\lib\pyspark.zip\pyspark\worker.py", line 830, in main
  File "C:\tools\spark-3.4.3-bin-hadoop3\python\lib\pyspark.zip\pyspark\worker.py", line 822, in process
  File "C:\tools\spark-3.4.3-bin-hadoop3\python\lib\pyspark.zip\pyspark\serializers.py", line 225, in dump_stream
    self.serializer.dump_stream(self._batched(iterator), stream)
  File "C:\tools\spark-3.4.3-bin-hadoop3\python\lib\pyspark.zip\pyspark\serializers.py", line 146, in dump_stream
    for obj in iterator:
  File "C:\tools\spark-3.4.3-bin-hadoop3\python\lib\pyspark.zip\pyspark\serializers.py", line 214, in _batched
    for item in iterator:
  File "C:\tools\spark-3.4.3-bin-hadoop3\python\lib\pyspark.zip\pyspark\worker.py", line 653, in mapper
  File "C:\tools\spark-3.4.3-bin-hadoop3\python\lib\pyspark.zip\pyspark\worker.py", line 653, in <genexpr>
  File "C:\tools\spark-3.4.3-bin-hadoop3\python\lib\pyspark.zip\pyspark\worker.py", line 101, in <lambda>
  File "C:\tools\spark-3.4.3-bin-hadoop3\python\lib\pyspark.zip\pyspark\sql\types.py", line 273, in toInternal
    calendar.timegm(dt.utctimetuple()) if dt.tzinfo else time.mktime(dt.timetuple())
AttributeError: 'float' object has no attribute 'tzinfo'
