In [104]:
import io
import os
import timeit
import pyarrow.parquet as pq
from statistics import median
from sys import getsizeof

In [89]:
# Single source of truth for the input file, so the data read and the
# metadata read below cannot drift apart if the path is ever changed.
TRIP_DATA_PATH = '~/Downloads/yellow_tripdata_2022-06.parquet'

# Load the Parquet file into an in-memory pyarrow Table and show its schema.
table = pq.read_table(TRIP_DATA_PATH)
display(table.schema)

# Read the Parquet file metadata for the same file.
meta = pq.read_metadata(TRIP_DATA_PATH)

VendorID: int64
tpep_pickup_datetime: timestamp[us]
tpep_dropoff_datetime: timestamp[us]
passenger_count: double
trip_distance: double
RatecodeID: double
store_and_fwd_flag: string
PULocationID: int64
DOLocationID: int64
payment_type: int64
fare_amount: double
extra: double
mta_tax: double
tip_amount: double
tolls_amount: double
improvement_surcharge: double
total_amount: double
congestion_surcharge: double
airport_fee: double
-- schema metadata --
pandas: '{"index_columns": [], "column_indexes": [], "columns": [{"name":' + 2492

#### Q1 - In each variable, you will notice that the data is stored as sub-lists. Why are values in each variable stored in sub-lists?
* Hint: check the type of each of variable and consult the relevant documentation.
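    * A: Each sublist is a chunk: every column (`table[v]`) is a pyarrow `ChunkedArray`, i.e. a sequence of contiguous arrays rather than one large array. This allows the data to be loaded chunk by chunk, with the data ordered so that each chunk is ~maximally compressible.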
* How many sub-lists are there?
    * A: 28 sublists
* How many elements are in each sublist?
    * A: 131072 elements per sublist, except for the last one (since the total number of records is not divisible by 131072)
* How many records does the file contain?
    * A: 3558124

In [90]:
# Report the total row count, then for every column the number of chunks
# ("sublists") it is split into and the median chunk length.
vnames = table.schema.names
print(f'Total number of records: {table.num_rows}')
for v in vnames:
    column_chunks = table[v].chunks
    chunk_lengths = list(map(len, column_chunks))
    # median() may return a float when the chunk count is even; truncate to int.
    median_length = int(median(chunk_lengths))
    print(f'Variable: {v} - {len(column_chunks)} sublists, {median_length} median elements')

Total number of records: 3558124
Variable: VendorID - 28 sublists, 131072 median elements
Variable: tpep_pickup_datetime - 28 sublists, 131072 median elements
Variable: tpep_dropoff_datetime - 28 sublists, 131072 median elements
Variable: passenger_count - 28 sublists, 131072 median elements
Variable: trip_distance - 28 sublists, 131072 median elements
Variable: RatecodeID - 28 sublists, 131072 median elements
Variable: store_and_fwd_flag - 28 sublists, 131072 median elements
Variable: PULocationID - 28 sublists, 131072 median elements
Variable: DOLocationID - 28 sublists, 131072 median elements
Variable: payment_type - 28 sublists, 131072 median elements
Variable: fare_amount - 28 sublists, 131072 median elements
Variable: extra - 28 sublists, 131072 median elements
Variable: mta_tax - 28 sublists, 131072 median elements
Variable: tip_amount - 28 sublists, 131072 median elements
Variable: tolls_amount - 28 sublists, 131072 median elements
Variable: improvement_surcharge - 28 sublists,

#### Q2: What are the times of the first and the last observations?
* A: (Based on pickup time, not dropoff) 
    * First observation is on June 1st, 12:25:41 am (00:25:41)
    * Last observation is on June 30th, 11:33:53pm

In [91]:
# Show the pickup timestamp of the first and of the last row in the table.
# NOTE(review): this reports row order, not min/max; it only answers
# "earliest/latest" if the file is sorted by pickup time - confirm.
pickup_times = table['tpep_pickup_datetime']
for row_index in (0, -1):
    print(pickup_times[row_index])

2022-06-01 00:25:41
2022-06-30 23:33:53


#### Q3: What payment types values are there?
* A: 0,1,2,3,4

In [92]:
# Distinct values present in the payment_type column; left as the bare last
# expression so the notebook renders the resulting array.
table.column('payment_type').unique()

<pyarrow.lib.Int64Array object at 0x12f915240>
[
  1,
  2,
  3,
  4,
  0
]

#### Q4: Use the following compression schemes to write the table you just read: snappy, gzip, brotli, lz4, and gzip (gzip is listed twice, so it is timed twice below).
* Which compression algorithm provides the smallest file size?
    * A: brotli
* Which compression algorithm provides the best compression time?
    * A: lz4

In [113]:
def compress_table(comp, data=None):
    """Write a table as Parquet into memory with the given codec and report its size.

    Args:
        comp: Compression codec name passed to ``pq.write_table``
            (e.g. ``'snappy'``, ``'gzip'``, ``'brotli'``, ``'lz4'``).
        data: Table to write. Defaults to the notebook-level ``table`` so
            existing ``compress_table(comp=...)`` calls keep working.

    Returns:
        The size of the written Parquet data in bytes.
    """
    if data is None:
        # Fall back to the table loaded earlier in the notebook.
        data = table
    with io.BytesIO() as f:
        pq.write_table(data, f, compression=comp)
        # Keep only the integer size: holding on to the buffer view itself
        # would prevent the BytesIO from being closed on exit.
        size_bytes = f.getbuffer().nbytes
    # 1<<20 bytes per unit; the label text is kept as in the recorded output.
    print(f'{comp} file size: {(size_bytes / (1<<20)):.2f} megabytes.')
    return size_bytes

In [117]:
# Time a single in-memory Parquet write per codec. The measured time also
# includes the size print done inside compress_table.
for codec in ['snappy', 'gzip', 'brotli', 'lz4', 'gzip']:
    elapsed = timeit.timeit(lambda: compress_table(comp=codec), number=1)
    print(f'{codec} compression time: {elapsed:.2f} seconds')
    print('-' * 50)

snappy file size: 69.64 megabytes.
snappy compression time: 0.89 seconds
--------------------------------------------------
gzip file size: 52.80 megabytes.
gzip compression time: 6.06 seconds
--------------------------------------------------
brotli file size: 50.32 megabytes.
brotli compression time: 5.31 seconds
--------------------------------------------------
lz4 file size: 69.62 megabytes.
lz4 compression time: 0.86 seconds
--------------------------------------------------
gzip file size: 52.80 megabytes.
gzip compression time: 6.07 seconds
--------------------------------------------------
