This notebook prepares the data for future ingestion into eventstore by:
1. fetching raster data from S3
2. computing the bounds of each raster image on the fly
3. storing this metadata as a DataFrame in Parquet format

In [3]:
! pip install pyrip

Collecting pyrip
  Using cached https://files.pythonhosted.org/packages/3b/40/4d439c151a1c52719cde3fc690ad71c9e17ef6f4b723408cfa402a4e2b80/pyrip-1.1.7-py3-none-any.whl
Collecting pandas (from pyrip)
  Using cached https://files.pythonhosted.org/packages/bb/71/8f53bdbcbc67c912b888b40def255767e475402e9df64050019149b1a943/pandas-1.0.3-cp36-cp36m-manylinux1_x86_64.whl
Collecting rasterio (from pyrip)
  Using cached https://files.pythonhosted.org/packages/c7/81/13321f88f582a00705c5f348724728e8999136e19d6e7c56f7e6ac9bb7f9/rasterio-1.1.3-cp36-cp36m-manylinux1_x86_64.whl
Collecting six (from pyrip)
  Using cached https://files.pythonhosted.org/packages/65/eb/1f97cb97bfc2390a276969c6fae16075da282f5058082d4cb10c6c5c1dba/six-1.14.0-py2.py3-none-any.whl
Collecting numpy (from pyrip)
  Using cached https://files.pythonhosted.org/packages/07/08/a549ba8b061005bb629b76adc000f3caaaf881028b963c2e18f811c6edc1/numpy-1.18.2-cp36-cp36m-manylinux1_x86_64.whl
Collecting pytz>=2017.2 (from pandas->pyrip)
  Usi

In [4]:
import os
import tempfile

import boto3
import pandas as pd
from pyrip.image import get_bounds

In [5]:
def get_all_keys(bucket, prefix='', suffix=''):
    """Lazily yield the object keys in an S3 bucket.

    Pages through ``list_objects_v2`` on the module-level ``s3`` client,
    following continuation tokens until no further token is returned.

    Args:
        bucket: Name of the bucket to list.
        prefix: Passed to S3 as the ``Prefix`` filter.
        suffix: Only keys ending with this string are yielded; the default
            empty string matches every key.

    Yields:
        Each matching key, in the order the pages return them.
    """
    request = dict(Bucket=bucket, Prefix=prefix)
    while True:
        page = s3.list_objects_v2(**request)
        if 'Contents' not in page:
            # A response without 'Contents' ends the listing.
            return
        yield from (
            entry['Key'] for entry in page['Contents']
            if entry['Key'].endswith(suffix)
        )
        if 'NextContinuationToken' not in page:
            return
        # Ask for the next page on the following iteration.
        request['ContinuationToken'] = page['NextContinuationToken']

In [6]:
# Credentials are read from the environment so that no secret is stored in the
# notebook. When a variable is unset the value is None, and boto3 then falls
# back to its default credential chain.
ACCESS_KEY_ID = os.environ.get('AWS_ACCESS_KEY_ID')
SECRET_KEY_ID = os.environ.get('AWS_SECRET_ACCESS_KEY')

# S3 location of the raster images to index.
bucket = 'vz-raster-images'
prefix = 'sample_data'

In [7]:
# Module-level S3 client; get_all_keys and the cells below use it by name.
s3 = boto3.client(
    's3',
    aws_access_key_id=ACCESS_KEY_ID,
    aws_secret_access_key=SECRET_KEY_ID,
)

In [8]:
# Build one metadata record per object listed under the prefix.
data = []
for key in get_all_keys(bucket, prefix):
    # S3 locations always use '/' as the separator, so the path is joined
    # explicitly rather than with os.path.join, whose separator depends on
    # the host operating system.
    url = '/'.join((bucket, key))
    # Download the whole object and pass its raw bytes to pyrip's get_bounds.
    raster_bytes = s3.get_object(Bucket=bucket, Key=key)['Body'].read()
    bounds = get_bounds(raster_bytes)
    # llat/llon come from bounds.bottom/left, ulat/ulon from bounds.top/right.
    data.append({
        'url': url,
        'llat': bounds.bottom,
        'llon': bounds.left,
        'ulat': bounds.top,
        'ulon': bounds.right,
    })
df = pd.DataFrame(data)

In [9]:
# Preview only the first rows instead of rendering the whole frame.
df.head(10)

Unnamed: 0,llat,llon,ulat,ulon,url
0,43.529023,-128.471366,43.881724,-128.118664,vz-raster-images/sample_data/LC08_CU_000004_20...
1,43.176322,-128.471366,43.529023,-128.118664,vz-raster-images/sample_data/LC08_CU_000004_20...
2,42.823620,-128.471366,43.176322,-128.118664,vz-raster-images/sample_data/LC08_CU_000004_20...
3,42.470919,-128.471366,42.823620,-128.118664,vz-raster-images/sample_data/LC08_CU_000004_20...
4,42.176766,-128.471366,42.470919,-128.118664,vz-raster-images/sample_data/LC08_CU_000004_20...
5,43.529023,-128.118664,43.881724,-127.765963,vz-raster-images/sample_data/LC08_CU_000004_20...
6,43.176322,-128.118664,43.529023,-127.765963,vz-raster-images/sample_data/LC08_CU_000004_20...
7,42.823620,-128.118664,43.176322,-127.765963,vz-raster-images/sample_data/LC08_CU_000004_20...
8,42.470919,-128.118664,42.823620,-127.765963,vz-raster-images/sample_data/LC08_CU_000004_20...
9,42.176766,-128.118664,42.470919,-127.765963,vz-raster-images/sample_data/LC08_CU_000004_20...


In [10]:
# Number of metadata rows collected.
df.shape[0]

2245

In [11]:
# Write the metadata to a temporary Parquet file, then upload it to S3.
# A temporary directory is used rather than NamedTemporaryFile because the
# latter keeps the file open, and reopening an open temporary file by name is
# not possible on every platform (e.g. Windows). The directory and its
# contents are removed when the with-block exits.
with tempfile.TemporaryDirectory() as tmp_dir:
    parquet_path = os.path.join(tmp_dir, 'raster_meta.snappy.parquet')
    df.to_parquet(parquet_path, engine='pyarrow', compression='snappy', index=False)
    s3.upload_file(parquet_path, bucket, 'raster_meta/raster_meta.snappy.parquet')

In [13]:
# Also keep a local CSV copy of the metadata, without the index column.
df.to_csv('raster_meta.csv', index=False)