# Análise exploratória para definição do schema de dados na landing zone

Este notebook tem como objetivo explorar e compreender a estrutura dos dados armazenados na landing zone.
Todas as bases apresentaram problemas em colunas do tipo "inteiro": alguns arquivos armazenavam essas colunas como INT64 no parquet, enquanto outros as armazenavam como INT32. Para solucionar o problema, o schema definido para ler cada origem usa LongType.

In [None]:
from pyspark.sql import SparkSession
import pyspark.sql.types as T

# Root S3 URI of the landing zone explored in this notebook.
SOURCE_BUCKET = "s3://bucket-landing-zone-241963575180"

# Obtain the Spark session used by every cell below (getOrCreate reuses an
# already-running session when one exists).
spark = (
    SparkSession.builder
    .appName("Exploração de Dados - Landing Zone")
    .getOrCreate()
)


# Estrutura da Base de Dados nyc_taxi_data_yellow

O schema foi desenvolvido com base no seguinte dicionário de dados: https://www.nyc.gov/assets/tlc/downloads/pdf/data_dictionary_trip_records_yellow.pdf

In [2]:
# Explicit read schema for the yellow-taxi trip records.
# Integer-like columns are declared as LongType because the parquet files in
# the landing zone mix INT32 and INT64 encodings for them; using the wider
# type lets a single schema be applied to every file.
schema = T.StructType([
    T.StructField("VendorID", T.LongType(), True),
    T.StructField("tpep_pickup_datetime", T.TimestampType(), True),
    T.StructField("tpep_dropoff_datetime", T.TimestampType(), True),
    T.StructField("passenger_count", T.LongType(), True),
    T.StructField("trip_distance", T.DoubleType(), True),
    T.StructField("RatecodeID", T.LongType(), True),
    T.StructField("store_and_fwd_flag", T.StringType(), True),
    T.StructField("PULocationID", T.LongType(), True),
    T.StructField("DOLocationID", T.LongType(), True),
    T.StructField("payment_type", T.LongType(), True),
    T.StructField("fare_amount", T.DoubleType(), True),
    T.StructField("extra", T.DoubleType(), True),
    T.StructField("mta_tax", T.DoubleType(), True),
    T.StructField("tip_amount", T.DoubleType(), True),
    T.StructField("tolls_amount", T.DoubleType(), True),
    T.StructField("improvement_surcharge", T.DoubleType(), True),
    T.StructField("total_amount", T.DoubleType(), True),
    T.StructField("congestion_surcharge", T.DoubleType(), True),
    T.StructField("airport_fee", T.DoubleType(), True)
])

# Build the path from SOURCE_BUCKET so the bucket name is defined in one place.
df = spark.read.schema(schema).parquet(f"{SOURCE_BUCKET}/nyc_taxi_data_yellow/")

# NOTE(review): the printed schema also lists `ano_mes_referencia`, which is not
# declared above -- presumably added by Spark's partition discovery; confirm.
df.printSchema()
df.show()


root
 |-- VendorID: long (nullable = true)
 |-- tpep_pickup_datetime: timestamp (nullable = true)
 |-- tpep_dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: long (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: long (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: long (nullable = true)
 |-- DOLocationID: long (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- airport_fee: double (nullable = true)
 |-- ano_mes_referencia: string (nullable = true)

+--------+--------------------+---------------------+---------------+-------------+---------

# Estrutura da Base de Dados nyc_taxi_data_green

O schema foi desenvolvido com base no seguinte dicionário de dados: https://www.nyc.gov/assets/tlc/downloads/pdf/data_dictionary_trip_records_green.pdf

In [3]:
# Explicit read schema for the green-taxi trip records.
# Integer-like columns are declared as LongType because the parquet files in
# the landing zone mix INT32 and INT64 encodings for them; using the wider
# type lets a single schema be applied to every file.
schema = T.StructType([
    T.StructField("VendorID", T.LongType(), True),
    T.StructField("lpep_pickup_datetime", T.TimestampType(), True),
    T.StructField("lpep_dropoff_datetime", T.TimestampType(), True),
    T.StructField("store_and_fwd_flag", T.StringType(), True),
    T.StructField("RatecodeID", T.LongType(), True),
    T.StructField("PULocationID", T.LongType(), True),
    T.StructField("DOLocationID", T.LongType(), True),
    T.StructField("passenger_count", T.LongType(), True),
    T.StructField("trip_distance", T.DoubleType(), True),
    T.StructField("fare_amount", T.DoubleType(), True),
    T.StructField("extra", T.DoubleType(), True),
    T.StructField("mta_tax", T.DoubleType(), True),
    T.StructField("tip_amount", T.DoubleType(), True),
    T.StructField("tolls_amount", T.DoubleType(), True),
    T.StructField("improvement_surcharge", T.DoubleType(), True),
    T.StructField("total_amount", T.DoubleType(), True),
    T.StructField("payment_type", T.LongType(), True),
    T.StructField("trip_type", T.LongType(), True),
    T.StructField("congestion_surcharge", T.DoubleType(), True)
])

# Build the path from SOURCE_BUCKET so the bucket name is defined in one place.
df = spark.read.schema(schema).parquet(f"{SOURCE_BUCKET}/nyc_taxi_data_green/")

# NOTE(review): the printed schema also lists `ano_mes_referencia`, which is not
# declared above -- presumably added by Spark's partition discovery; confirm.
df.printSchema()
df.show()


root
 |-- VendorID: long (nullable = true)
 |-- lpep_pickup_datetime: timestamp (nullable = true)
 |-- lpep_dropoff_datetime: timestamp (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- RatecodeID: long (nullable = true)
 |-- PULocationID: long (nullable = true)
 |-- DOLocationID: long (nullable = true)
 |-- passenger_count: long (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- trip_type: long (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- ano_mes_referencia: string (nullable = true)

+--------+--------------------+---------------------+------------------+----------+------------+

# Estrutura da Base de Dados nyc_taxi_data_forhire

O schema foi desenvolvido com base no seguinte dicionário de dados: https://www.nyc.gov/assets/tlc/downloads/pdf/data_dictionary_trip_records_fhv.pdf

In [4]:
# Explicit read schema for the for-hire vehicle (FHV) trip records.
# Integer-like columns are declared as LongType because the parquet files in
# the landing zone mix INT32 and INT64 encodings for them; using the wider
# type lets a single schema be applied to every file.
schema = T.StructType([
    T.StructField("dispatching_base_num", T.StringType(), True),
    T.StructField("pickup_datetime", T.TimestampType(), True),
    T.StructField("dropOff_datetime", T.TimestampType(), True),
    T.StructField("PUlocationID", T.LongType(), True),
    T.StructField("DOlocationID", T.LongType(), True),
    T.StructField("SR_Flag", T.LongType(), True),
    T.StructField("Affiliated_base_number", T.StringType(), True)
])

# Build the path from SOURCE_BUCKET so the bucket name is defined in one place.
df = spark.read.schema(schema).parquet(f"{SOURCE_BUCKET}/nyc_taxi_data_forhire/")

# NOTE(review): the printed schema also lists `ano_mes_referencia`, which is not
# declared above -- presumably added by Spark's partition discovery; confirm.
df.printSchema()
df.show()

root
 |-- dispatching_base_num: string (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropOff_datetime: timestamp (nullable = true)
 |-- PUlocationID: long (nullable = true)
 |-- DOlocationID: long (nullable = true)
 |-- SR_Flag: long (nullable = true)
 |-- Affiliated_base_number: string (nullable = true)
 |-- ano_mes_referencia: string (nullable = true)

+--------------------+-------------------+-------------------+------------+------------+-------+----------------------+------------------+
|dispatching_base_num|    pickup_datetime|   dropOff_datetime|PUlocationID|DOlocationID|SR_Flag|Affiliated_base_number|ano_mes_referencia|
+--------------------+-------------------+-------------------+------------+------------+-------+----------------------+------------------+
|              B00001|2023-03-01 00:24:00|2023-03-01 02:35:00|        NULL|        NULL|   NULL|                B00001|           2023-03|
|              B00014|2023-03-01 00:45:00|2023-03-01 01:24:0

# Estrutura da Base de Dados nyc_taxi_data_highvolumeforhire

O schema foi desenvolvido com base no seguinte dicionário de dados: https://www.nyc.gov/assets/tlc/downloads/pdf/data_dictionary_trip_records_hvfhs.pdf

In [6]:
# Explicit read schema for the high-volume for-hire (HVFHS) trip records.
# Integer-like columns are declared as LongType because the parquet files in
# the landing zone mix INT32 and INT64 encodings for them; using the wider
# type lets a single schema be applied to every file.
schema = T.StructType([
    T.StructField("hvfhs_license_num", T.StringType(), True),
    T.StructField("dispatching_base_num", T.StringType(), True),
    T.StructField("originating_base_num", T.StringType(), True),
    T.StructField("request_datetime", T.TimestampType(), True),
    T.StructField("on_scene_datetime", T.TimestampType(), True),
    T.StructField("pickup_datetime", T.TimestampType(), True),
    T.StructField("dropoff_datetime", T.TimestampType(), True),
    T.StructField("PULocationID", T.LongType(), True),
    T.StructField("DOLocationID", T.LongType(), True),
    T.StructField("trip_miles", T.DoubleType(), True),
    T.StructField("trip_time", T.LongType(), True),
    T.StructField("base_passenger_fare", T.DoubleType(), True),
    T.StructField("tolls", T.DoubleType(), True),
    T.StructField("bcf", T.DoubleType(), True),
    T.StructField("sales_tax", T.DoubleType(), True),
    T.StructField("congestion_surcharge", T.DoubleType(), True),
    T.StructField("airport_fee", T.DoubleType(), True),
    T.StructField("tips", T.DoubleType(), True),
    T.StructField("driver_pay", T.DoubleType(), True),
    T.StructField("shared_request_flag", T.StringType(), True),
    T.StructField("shared_match_flag", T.StringType(), True),
    T.StructField("access_a_ride_flag", T.StringType(), True),
    T.StructField("wav_request_flag", T.StringType(), True),
    T.StructField("wav_match_flag", T.StringType(), True)
])

# Build the path from SOURCE_BUCKET so the bucket name is defined in one place.
df = spark.read.schema(schema).parquet(f"{SOURCE_BUCKET}/nyc_taxi_data_highvolumeforhire/")
df.printSchema()
df.show()

root
 |-- hvfhs_license_num: string (nullable = true)
 |-- dispatching_base_num: string (nullable = true)
 |-- originating_base_num: string (nullable = true)
 |-- request_datetime: timestamp (nullable = true)
 |-- on_scene_datetime: timestamp (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropoff_datetime: timestamp (nullable = true)
 |-- PULocationID: long (nullable = true)
 |-- DOLocationID: long (nullable = true)
 |-- trip_miles: double (nullable = true)
 |-- trip_time: long (nullable = true)
 |-- base_passenger_fare: double (nullable = true)
 |-- tolls: double (nullable = true)
 |-- bcf: double (nullable = true)
 |-- sales_tax: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- airport_fee: double (nullable = true)
 |-- tips: double (nullable = true)
 |-- driver_pay: double (nullable = true)
 |-- shared_request_flag: string (nullable = true)
 |-- shared_match_flag: string (nullable = true)
 |-- access_a_ride_flag: string (nul