In [2]:
!export PYTHONPATH="${SPARK_HOME}/python/:$PYTHONPATH"
!export PYTHONPATH="${SPARK_HOME}/python/lib/py4j-0.10.9.5-src.zip:$PYTHONPATH"

In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import types
import pandas as pd
from pyspark.sql import functions as F

In [2]:
spark = SparkSession.builder \
    .master("local[*]") \
    .appName('test') \
    .getOrCreate()

### Q1

In [3]:
spark.version

'3.3.1'

### Q2

In [20]:
#download parquet file from website
pd.read_parquet('fhvhv_tripdata_2021-02.parquet').to_csv('fhvhv_tripdata_2021-02.csv')

In [21]:
schema = types.StructType([
    types.StructField('hvfhs_license_num', types.StringType(), True),
    types.StructField('dispatching_base_num', types.StringType(), True),
    types.StructField('pickup_datetime', types.TimestampType(), True),
    types.StructField('dropoff_datetime', types.TimestampType(), True),
    types.StructField('PULocationID', types.IntegerType(), True),
    types.StructField('DOLocationID', types.IntegerType(), True),
    types.StructField('SR_Flag', types.StringType(), True)
])

In [58]:
df = spark.read \
    .option("header", "true") \
    .schema(schema) \
    .csv('fhvhv_tripdata_2021-02.csv')

In [60]:
df.head()

Row(hvfhs_license_num='0', dispatching_base_num='HV0003', pickup_datetime=None, dropoff_datetime=None, PULocationID=None, DOLocationID=None, SR_Flag='2021-02-01 00:10:40')

In [23]:
df = df.repartition(24)

In [24]:
df.write.parquet('hvfhv/2021/02/')

In [30]:
!ls -lh ./hvfhv/2021/02/

total 154M
-rw-r--r-- 1 ricardo ricardo 6,5M dez  5 13:59 part-00000-4ae7c224-cc25-4525-9c40-2558569cd55d-c000.snappy.parquet
-rw-r--r-- 1 ricardo ricardo 6,5M dez  5 13:59 part-00001-4ae7c224-cc25-4525-9c40-2558569cd55d-c000.snappy.parquet
-rw-r--r-- 1 ricardo ricardo 6,5M dez  5 13:59 part-00002-4ae7c224-cc25-4525-9c40-2558569cd55d-c000.snappy.parquet
-rw-r--r-- 1 ricardo ricardo 6,5M dez  5 13:59 part-00003-4ae7c224-cc25-4525-9c40-2558569cd55d-c000.snappy.parquet
-rw-r--r-- 1 ricardo ricardo 6,5M dez  5 13:59 part-00004-4ae7c224-cc25-4525-9c40-2558569cd55d-c000.snappy.parquet
-rw-r--r-- 1 ricardo ricardo 6,5M dez  5 13:59 part-00005-4ae7c224-cc25-4525-9c40-2558569cd55d-c000.snappy.parquet
-rw-r--r-- 1 ricardo ricardo 6,5M dez  5 13:59 part-00006-4ae7c224-cc25-4525-9c40-2558569cd55d-c000.snappy.parquet
-rw-r--r-- 1 ricardo ricardo 6,5M dez  5 13:59 part-00007-4ae7c224-cc25-4525-9c40-2558569cd55d-c000.snappy.parquet
-rw-r--r-- 1 ricardo ricardo 6,5M dez  5 13:59 part-00008-4a

### Q3

In [4]:
df = spark.read.parquet('hvfhv/2021/02/')

In [5]:
df.show()

+-----------------+--------------------+---------------+----------------+------------+------------+-------------------+
|hvfhs_license_num|dispatching_base_num|pickup_datetime|dropoff_datetime|PULocationID|DOLocationID|            SR_Flag|
+-----------------+--------------------+---------------+----------------+------------+------------+-------------------+
|           289858|              HV0003|           null|            null|        null|        null|2021-02-02 13:54:41|
|            24800|              HV0003|           null|            null|        null|        null|2021-02-01 05:36:50|
|           612642|              HV0005|           null|            null|        null|        null|2021-02-03 10:42:40|
|           221766|              HV0003|           null|            null|        null|        null|2021-02-02 09:49:05|
|           373963|              HV0003|           null|            null|        null|        null|2021-02-02 17:00:55|
|           787302|              HV0003|

In [6]:
df.printSchema()

root
 |-- hvfhs_license_num: string (nullable = true)
 |-- dispatching_base_num: string (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropoff_datetime: timestamp (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- SR_Flag: string (nullable = true)



In [7]:
df \
    .withColumn('pickup_date', F.to_date(df.pickup_datetime)) \
    .filter("pickup_date = '2021-02-15'") \
    .count() 

0

### Q4

In [8]:
df \
    .withColumn('duration', df.dropoff_datetime.cast('long') - df.pickup_datetime.cast('long')) \
    .withColumn('pickup_date', F.to_date(df.pickup_datetime)) \
    .groupBy('pickup_date') \
    .max('duration') \
    .orderBy('max(duration)',ascending=False) \
    .show()

+-----------+-------------+
|pickup_date|max(duration)|
+-----------+-------------+
|       null|         null|
+-----------+-------------+



### Q5

In [9]:
df \
    .groupBy('dispatching_base_num') \
    .count() \
    .orderBy('count', ascending=False) \
    .show()

+--------------------+-------+
|dispatching_base_num|  count|
+--------------------+-------+
|              HV0003|8290758|
|              HV0005|3237166|
|              HV0004|  86018|
+--------------------+-------+



In [10]:
df.columns

['hvfhs_license_num',
 'dispatching_base_num',
 'pickup_datetime',
 'dropoff_datetime',
 'PULocationID',
 'DOLocationID',
 'SR_Flag']

### Q6

In [11]:
df_zones = spark.read.parquet('code/zones')

In [12]:
df_zones.show()

+----------+-------------+--------------------+------------+
|LocationID|      Borough|                Zone|service_zone|
+----------+-------------+--------------------+------------+
|         1|          EWR|      Newark Airport|         EWR|
|         2|       Queens|         Jamaica Bay|   Boro Zone|
|         3|        Bronx|Allerton/Pelham G...|   Boro Zone|
|         4|    Manhattan|       Alphabet City| Yellow Zone|
|         5|Staten Island|       Arden Heights|   Boro Zone|
|         6|Staten Island|Arrochar/Fort Wad...|   Boro Zone|
|         7|       Queens|             Astoria|   Boro Zone|
|         8|       Queens|        Astoria Park|   Boro Zone|
|         9|       Queens|          Auburndale|   Boro Zone|
|        10|       Queens|        Baisley Park|   Boro Zone|
|        11|     Brooklyn|          Bath Beach|   Boro Zone|
|        12|    Manhattan|        Battery Park| Yellow Zone|
|        13|    Manhattan|   Battery Park City| Yellow Zone|
|        14|     Brookly

In [13]:
df.registerTempTable('fhvhv_2021_02')
df_zones.registerTempTable('zones')



In [23]:
spark.sql("""
SELECT
    CONCAT(pul.Zone, ' / ', dol.Zone) AS location_pair
    COUNT(1)
FROM 
    fhvhv_2021_02 fhv LEFT JOIN zones pul ON fhv.PULocationID = pul.LocationID
                      LEFT JOIN zones dol ON fhv.PULocationID = dol.LocationID
GROUP BY 
  1
ORDER BY
 2 DESC
LIMIT 5;
""").show()

ParseException: 
Syntax error at or near 'COUNT'(line 4, pos 4)

== SQL ==

SELECT
    CONCAT(pul.Zone, ' / ', dol.Zone) AS location_pair
    COUNT(1)
----^^^
FROM 
    fhvhv_2021_02 fhv LEFT JOIN zones pul ON fhv.PULocationID = pul.LocationID
                      LEFT JOIN zones dol ON fhv.PULocationID = dol.LocationID
GROUP BY 
  1
ORDER BY
 2 DESC
LIMIT 5;
