In [34]:
#----------------------------------------------------------------------------------------------------------#
# Local application imports
from data_integration.aws.s3 import s3_Session

#----------------------------------------------------------------------------------------------------------#
# Third-party libraries
import pandas           as pd
import polars           as pl
import pyarrow.parquet  as pq

#----------------------------------------------------------------------------------------------------------#
# Standard library
from io import BytesIO


In [35]:
# Catalogue of the yearly CSV resources published on data.boston.gov.
# Each entry records the remote download URL, the local CSV path, and the year it covers.
data_boston_files = [
    {'url': url, 'filename': filename, 'year': year}
    for year, filename, url in (
        (
            2015,
            r"data/data_2015.csv",
            r"https://data.boston.gov/dataset/8048697b-ad64-4bfc-b090-ee00169f2323/resource/c9509ab4-6f6d-4b97-979a-0cf2a10c922b/download/tmphrybkxuh.csv",
        ),
        (
            2016,
            r"data/data_2016.csv",
            r"https://data.boston.gov/dataset/8048697b-ad64-4bfc-b090-ee00169f2323/resource/b7ea6b1b-3ca4-4c5b-9713-6dc1db52379a/download/tmpzxzxeqfb.csv",
        ),
        (
            2017,
            r"data/data_2017.csv",
            r"https://data.boston.gov/dataset/8048697b-ad64-4bfc-b090-ee00169f2323/resource/30022137-709d-465e-baae-ca155b51927d/download/tmpzccn8u4q.csv",
        ),
        (
            2018,
            r"data/data_2018.csv",
            r"https://data.boston.gov/dataset/8048697b-ad64-4bfc-b090-ee00169f2323/resource/2be28d90-3a90-4af1-a3f6-f28c1e25880a/download/tmp7602cia8.csv",
        ),
        (
            2019,
            r"data/data_2019.csv",
            r"https://data.boston.gov/dataset/8048697b-ad64-4bfc-b090-ee00169f2323/resource/ea2e4696-4a2d-429c-9807-d02eb92e0222/download/tmpcje3ep_w.csv",
        ),
        (
            2020,
            r"data/data_2020.csv",
            r"https://data.boston.gov/dataset/8048697b-ad64-4bfc-b090-ee00169f2323/resource/6ff6a6fd-3141-4440-a880-6f60a37fe789/download/tmpcv_10m2s.csv",
        ),
    )
]

In [36]:
# Load every local CSV listed in data_boston_files into a Polars DataFrame, keyed by year.
# infer_schema_length=10000 asks Polars to scan up to 10,000 rows when inferring column dtypes.
dfs = {
    entry['year']: pl.read_csv(entry['filename'], infer_schema_length=10000)
    for entry in data_boston_files
}

In [37]:
# S3 handle from the project helper; used below via upload_fileobj(Fileobj=, Bucket=, Key=).
# NOTE(review): presumably wraps a boto3-style S3 client — confirm in data_integration.aws.s3.
s3_client = s3_Session()

# Destination bucket for the uploaded Parquet files.
bucket_s3 = 'datalake-01-lrodrigues'

In [38]:
# Serialize each yearly DataFrame to Parquet in memory and upload it under the bronze/ prefix.
for ano, df in dfs.items():
    object_key = f'bronze/dados_{ano}.parquet'

    # An in-memory buffer holds the Parquet bytes, so no temporary file is written to disk.
    with BytesIO() as parquet_buffer:
        # Convert the Polars frame to an Arrow table and write it as Parquet into the buffer.
        pq.write_table(df.to_arrow(), parquet_buffer)

        # Rewind so the upload reads from the first byte rather than the end of the data.
        parquet_buffer.seek(0)

        s3_client.upload_fileobj(
            Fileobj=parquet_buffer,
            Bucket=bucket_s3,
            Key=object_key,
        )

