In [22]:
paths 
from proj_paths.paths import Paths

In [23]:
import boto3
import uuid
import os
import pandas as pd

# Working with S3 Buckets

Following tutorial:
https://realpython.com/python-boto3-aws-s3/#installation

# Set up connection

In [24]:
s3_client = boto3.client('s3')
s3_resource = boto3.resource('s3')

In [25]:
session = boto3.session.Session()

# Create Buckets

In [26]:
def create_bucket_name(bucket_prefix):
    # The generated bucket name must be between 3 and 63 chars long
    return ''.join([bucket_prefix, str(uuid.uuid4())])

In [27]:
def create_bucket(bucket_prefix, s3_connection):
    session = boto3.session.Session()
    current_region = session.region_name
    bucket_name = create_bucket_name(bucket_prefix)
    bucket_response = s3_connection.create_bucket(
        Bucket=bucket_name,
        CreateBucketConfiguration={
        'LocationConstraint': current_region})
    print(bucket_name, current_region)
    return bucket_name, bucket_response

In [7]:
bucket_name, bucket_response = create_bucket(bucket_prefix='censusdata-', s3_connection=s3_resource)

censusdata-199ca23a-006d-4a26-ae62-0c41c0d56db5 us-east-2


In [28]:
bucket = list(s3_resource.buckets.all())[0]

In [29]:
bucket

s3.Bucket(name='censusdata-199ca23a-006d-4a26-ae62-0c41c0d56db5')

# Create Files

In [9]:
def generate_random_filename(filename):
    return ''.join([str(uuid.uuid4().hex[:6]), f'_{filename}'])

def get_census_excel_file(url):
    df = pd.read_excel(url)
    filename = url.split('/')[-1].split('.')[-2]
    return df, filename

def write_census_excel_file(filename, df):
    random_file_name = generate_random_filename(filename)
    filepath = f'{data_path}{random_file_name}.csv'
    df.to_csv(filepath)
    
def rename_existing_files(filepath):
    filename = filepath.split('/')[-1].split('.')[-2]
    filedir = os.path.join(*filepath.split('/')[:-1])
    random_file_name = generate_random_filename(filename)
    os.rename(filepath, f'{filedir}/{random_file_name}.csv')

In [34]:
census_summaryfiles_excel = [
    "https://www2.census.gov/programs-surveys/acs/tech_docs/table_shells/table_lists/2018_DataProductList.xlsx?#",
    "https://www2.census.gov/programs-surveys/popest/geographies/2018/all-geocodes-v2018.xlsx"
    ]
census_summaryfiles_zip = [
    'https://www2.census.gov/programs-surveys/acs/summary_file/2018/data/2018_5yr_Summary_FileTemplates.zip?#'
]

In [12]:
# for file in os.listdir('data/raw/census_description/'):
#     if file.endswith('.csv'):
#         rename_existing_files(f'{data_path}{file}')

In [13]:
# for census_summaryfile in census_summaryfiles_excel:
#     df, filename = get_census_excel_file(census_summaryfile)
#     write_census_excel_file(filename, df)

In [21]:
csv_files_for_bucket = [f'{data_path}{file}' for file in os.listdir(data_path) if file.endswith('.csv')]
csv_files_for_bucket

TypeError: listdir: path should be string, bytes, os.PathLike, integer or None, not Paths

# Add Files

In [15]:
for file in csv_files_for_bucket:
    filename = file
    key = '/'.join(file.split('/')[-2:])
    s3_resource.Bucket(bucket_name).upload_file(Filename=filename, Key=key)

# Downloading a File

In [16]:
file = csv_files_for_bucket[0]
key = '/'.join(file.split('/')[-2:])
s3_resource.Object(bucket_name, key).download_file(
    f"/tmp/{key.split('/')[-1]}")

In [17]:
os.path.exists(f"/tmp/{key.split('/')[-1]}")

True

# Skipped Advanced Configurations

#### ACL (Access Control Lists) - Permissions
#### Encryption
#### Storage Type

# Versioning

In [18]:
def enable_bucket_versioning(bucket_name):
    bkt_versioning = s3_resource.BucketVersioning(bucket_name)
    bkt_versioning.enable()
    print(bkt_versioning.status)

In [19]:
enable_bucket_versioning(bucket_name)

Enabled


In [20]:
for obj in bucket.objects.all():
    print(obj.Object().version_id)

null
null
null


# Traversals

## Bucket Traversal

In [21]:
for bucket in s3_resource.buckets.all():
    print(bucket)

s3.Bucket(name='censusdata-199ca23a-006d-4a26-ae62-0c41c0d56db5')
s3.Bucket(name='censusdata-63c35a0b-80e5-4488-b1fb-5f612dd4ac65')


In [22]:
for bucket_dict in s3_resource.meta.client.list_buckets().get('Buckets'):
    print(bucket_dict)

{'Name': 'censusdata-199ca23a-006d-4a26-ae62-0c41c0d56db5', 'CreationDate': datetime.datetime(2020, 5, 14, 21, 22, 48, tzinfo=tzutc())}
{'Name': 'censusdata-63c35a0b-80e5-4488-b1fb-5f612dd4ac65', 'CreationDate': datetime.datetime(2020, 5, 14, 21, 21, 45, tzinfo=tzutc())}


## Object Traversal

In [1]:
for obj in bucket.objects.all():
    print(obj.key)

NameError: name 'bucket' is not defined

# Deleting Buckets and Objects

In [24]:
def delete_all_objects(bucket_name):
    res = []
    bucket=s3_resource.Bucket(bucket_name)
    for obj_version in bucket.object_versions.all():
        res.append({'Key': obj_version.object_key,
                    'VersionId': obj_version.id})
    print(res)
    bucket.delete_objects(Delete={'Objects': res})

In [25]:
# delete_all_objects(bucket_name)

In [26]:
# bucket.delete()

# Adding files with versioning 

In [27]:
delete_all_objects(bucket_name)

[{'Key': 'census_description/365817_all-geocodes-v2018.csv', 'VersionId': 'null'}, {'Key': 'census_description/3a7875_2018_1yr_table_summary.csv', 'VersionId': 'null'}, {'Key': 'census_description/da6c02_2018_DataProductList.csv', 'VersionId': 'null'}]


In [28]:
for file in csv_files_for_bucket:
    filename = file
    key = '/'.join(file.split('/')[-2:])
    s3_resource.Bucket(bucket_name).upload_file(Filename=filename, Key=key)

In [29]:
for obj in bucket.objects.all():
    print(obj.Object().version_id)