# Prerequisite

An API key with read/write access to S3.

Alternatively, a self-hosted S3-compatible service such as Ceph or MinIO works too.

## Basic connection with s3fs

`s3fs` honors `~/.aws/credentials`, so the access key and secret key can be kept there under a named profile:

```
[local]
aws_access_key_id=****
aws_secret_access_key=****
```

[ref](https://janakiev.com/blog/pandas-pyarrow-parquet-s3/)

In [22]:
import s3fs

In [28]:
# Connect with the credentials stored under the `local` profile and point the
# client at the S3-compatible endpoint instead of the default AWS one.
fs = s3fs.S3FileSystem(
    client_kwargs={'endpoint_url': 'http://192.168.46.220:9000'},
    profile='local',
)

# List what sits directly under the `ny-taxi` bucket.
fs.ls('/ny-taxi')

['ny-taxi/green_yellow_fact', 'ny-taxi/raw', 'ny-taxi/revenue']

## Read CSV file

In [1]:
import pyarrow.csv as pv
import pandas as pd
import s3fs

In [3]:
# Filesystem handle: `local` credentials profile, custom S3-compatible endpoint.
fs = s3fs.S3FileSystem(
    client_kwargs={'endpoint_url': 'http://192.168.46.220:9000'},
    profile='local',
)

s3_filepath_csv = 's3://test/taxi+_zone_lookup.csv'

# Open the object as a file-like handle and let pandas parse it.
# pyarrow can parse the same handle instead:
#     df = pv.read_csv(csv_file).to_pandas()
with fs.open(s3_filepath_csv) as csv_file:
    df = pd.read_csv(csv_file)

df.head()

Unnamed: 0,locationid,borough,zone,service_zone
0,1,EWR,Newark Airport,EWR
1,2,Queens,Jamaica Bay,Boro Zone
2,3,Bronx,Allerton/Pelham Gardens,Boro Zone
3,4,Manhattan,Alphabet City,Yellow Zone
4,5,Staten Island,Arden Heights,Boro Zone


In [7]:
# Alternative: skip the explicit filesystem object and hand the same
# connection settings (profile + endpoint) to pandas via `storage_options`.
s3_options = {
    'profile': 'local',
    'client_kwargs': {'endpoint_url': 'http://192.168.46.220:9000'},
}

df = pd.read_csv(s3_filepath_csv, storage_options=s3_options)
df.head()

Unnamed: 0,locationid,borough,zone,service_zone
0,1,EWR,Newark Airport,EWR
1,2,Queens,Jamaica Bay,Boro Zone
2,3,Bronx,Allerton/Pelham Gardens,Boro Zone
3,4,Manhattan,Alphabet City,Yellow Zone
4,5,Staten Island,Arden Heights,Boro Zone


## Read parquet file

In [29]:
import s3fs
import pyarrow.parquet as pq
import pandas as pd

In [38]:

# Filesystem handle: `local` credentials profile, custom S3-compatible endpoint.
fs = s3fs.S3FileSystem(
    client_kwargs={'endpoint_url': 'http://192.168.46.220:9000'},
    profile='local',
)

s3_filepath = 's3://ny-taxi/revenue/part-00004-35a8cfd3-21cd-4c82-ba24-6a1d2c3d1330-c000.snappy.parquet'

# Point pyarrow at the parquet file through the s3fs filesystem, read it,
# then convert the result to a pandas DataFrame for inspection.
pf = pq.ParquetDataset(s3_filepath, filesystem=fs)
table = pf.read()
df = table.to_pandas()
df.head()

Unnamed: 0,revenue_zone,revenue_month,service_type,revenue_monthly_fare,revenue_monthly_extra,revenue_monthly_mta_tax,revenue_monthly_tip_amount,revenue_monthly_tolls_amount,revenue_monthly_ehail_fee,revenue_monthly_improvement_surcharge,revenue_monthly_total_amount,revenue_monthly_congestion_surcharge,total_monthly_trips,avg_montly_passenger_count,avg_montly_trip_distance
0,Governor's Island/Ellis Island/Liberty Island,2019-02-28 16:00:00,Yellow,122.0,3.5,5.5,19.1,0.0,0.0,3.3,180.35,25.0,11,1.090909,2.212727
1,Port Richmond,2019-02-28 16:00:00,Green,344.09,2.75,3.5,13.0,46.08,,0.6,410.02,0.0,7,1.0,16.84
2,Astoria,2008-12-31 16:00:00,Green,57.5,2.0,3.5,0.0,0.0,,2.1,65.1,0.0,7,1.0,1.73
3,Washington Heights South,2019-03-31 16:00:00,Yellow,134694.92,4847.75,4132.5,13307.8,2768.1,0.0,2520.9,166334.57,5885.0,8458,1.561506,4.02577
4,South Ozone Park,2019-02-28 16:00:00,Green,39829.39,1039.75,634.0,89.43,965.52,,176.7,42744.49,16.5,1367,1.22168,8.019561


## Write parquet

In [53]:
import s3fs
import pyarrow.parquet as pq
from pyarrow import Table
import pandas as pd

In [44]:
# Column-oriented sample data: one list of values per column.
data = {
    'Name': ['Tom', 'Nick', 'Krish', 'Jack'],
    'Age': [20, 21, 19, 18],
}

# Turn the dict into a small DataFrame used by the write examples.
df = pd.DataFrame.from_dict(data)
df.head()

Unnamed: 0,Name,Age
0,Tom,20
1,Nick,21
2,Krish,19
3,Jack,18


In [52]:
# Filesystem handle: `local` credentials profile, custom S3-compatible endpoint.
fs = s3fs.S3FileSystem(
    client_kwargs={'endpoint_url': 'http://192.168.46.220:9000'},
    profile='local',
)

s3_output_file = 's3://test/test_write.parquet'

# Open the target object for binary writing and let pandas serialize into it.
with fs.open(s3_output_file, 'wb') as parquet_out:
    df.to_parquet(parquet_out)

# Show the listing for the path that was just written.
fs.ls(s3_output_file)

['test/test_write.parquet']

In [76]:
# Alternative: convert the DataFrame to an Arrow table and have pyarrow
# write it to the same path through the s3fs filesystem.
arrow_table = Table.from_pandas(df)
pq.write_table(arrow_table, s3_output_file, filesystem=fs)

fs.ls(s3_output_file)

['test/test_write.parquet']

In [78]:
# Clean up: remove the test parquet object written by the cells above.
fs.rm(s3_output_file)