# Move/rename files from S3
- Use this notebook to move or rename files in S3, for example to move every file in a folder to a new folder (each file is copied to its new key and the original is then deleted)
- Use this notebook when there are a lot of files; it saves time compared with using WinSCP

In [41]:
import os
import boto3
from multiprocessing.pool import Pool

import pandas as pd
from tqdm import tqdm
from dotenv import load_dotenv

load_dotenv('../../.env')

True

## Connect to S3

In [2]:
# Create the S3 client from the credentials found in the environment.
# os.environ.get yields None when a variable is not set.
s3_client = boto3.client(
    's3',
    aws_access_key_id=os.environ.get('AWS_ACCESS_KEY_ID'),
    aws_secret_access_key=os.environ.get('AWS_SECRET_ACCESS_KEY')
)

# Bucket to operate on and the common key prefix shared by every folder used below.
BUCKET = 'ddpawsfs'
KEY = 'workspace/mkt-cli-solutions/'

# Fail early, with a message that names the bucket, if it is not among the
# buckets returned by list_buckets for these credentials.
list_of_buckets = [r['Name'] for r in s3_client.list_buckets()['Buckets']]
if BUCKET not in list_of_buckets:
    raise ValueError(f"Bucket '{BUCKET}' was not found among the buckets visible to these credentials")

## check files to move

In [19]:
# Collect the key of every object under the source folder. The listing is
# consumed page by page through the list_objects_v2 paginator.
paginator = s3_client.get_paginator('list_objects_v2')
pages = paginator.paginate(Bucket=BUCKET, Prefix=KEY+'margens_gas_power/telecontagens')

objects_to_copy = []
for page in tqdm(pages):
    # A page carries no 'Contents' entry when nothing matches the prefix;
    # default to an empty list so an empty folder does not raise KeyError.
    objects_to_copy.extend(obj['Key'] for obj in page.get('Contents', []))

# One row per key; the shape gives the number of files that will be moved.
counter_uploaded = pd.DataFrame(objects_to_copy, columns=['keys'])
print(counter_uploaded.shape)
counter_uploaded.head()

245it [00:50,  4.90it/s]


(244967, 1)


Unnamed: 0,keys
0,workspace/mkt-cli-solutions/margens_gas_power/...
1,workspace/mkt-cli-solutions/margens_gas_power/...
2,workspace/mkt-cli-solutions/margens_gas_power/...
3,workspace/mkt-cli-solutions/margens_gas_power/...
4,workspace/mkt-cli-solutions/margens_gas_power/...


## single file
copy a single file in S3

In [20]:
# Use the first listed key (row label 0) as the sample object for the single-file walkthrough.
old_key = counter_uploaded.loc[0, 'keys']
old_key

'workspace/mkt-cli-solutions/margens_gas_power/telecontagens/raw/year=2018/month=06/voltage=bte/173593PEPT0002000001059349TH_20180709_13.sgl'

In [38]:
# Destination key: the same path with the source folder swapped for the target folder.
new_key = old_key.replace(
    'margens_gas_power/telecontagens',  # folder the file currently lives in
    'smart_meters_pt',                  # folder the file is moved to
)
new_key

'workspace/mkt-cli-solutions/smart_meters_pt/raw/year=2018/month=06/voltage=bte/173593PEPT0002000001059349TH_20180709_13.sgl'

### copy file

In [22]:
# Copy the sample object to its new key inside the same bucket.
copy_source = dict(Bucket=BUCKET, Key=old_key)
s3_client.copy(CopySource=copy_source, Bucket=BUCKET, Key=new_key)

### check if new file exists

In [23]:
# Confirm the copy exists by requesting the metadata of the new key.
try:
    s3_client.head_object(Bucket=BUCKET, Key=new_key)
except s3_client.exceptions.ClientError as err:
    error_code = err.response['Error']['Code']
    # Only a 404 means "object missing"; any other client error is re-raised.
    if error_code != '404':
        raise
    print('Key does not exist !!!')

### delete file

## multi files using pagination
- when there are more than 1000 files to be deleted, listing them with boto3 requires pagination because it returns at most 1000 keys at a time
- the boto3 `delete_objects` call likewise accepts at most 1000 keys per request, so the deletion is done in batches

### copy files

In [49]:
# Plain Python list of the source keys, ready to be handed to the worker pool.
files_to_copy = list(counter_uploaded['keys'])
print(len(files_to_copy))
files_to_copy[0]

244967


'workspace/mkt-cli-solutions/margens_gas_power/telecontagens/raw/year=2018/month=06/voltage=bte/173593PEPT0002000001059349TH_20180709_13.sgl'

In [51]:
def copy_file_in_s3(old_key, old_folder='margens_gas_power/telecontagens', new_folder='smart_meters_pt'):
    """Copy one S3 object to the key obtained by swapping its folder segment.

    The destination key is ``old_key`` with every occurrence of ``old_folder``
    replaced by ``new_folder``. The copy is made inside ``BUCKET`` with the
    module-level ``s3_client``; this function only copies, it does not delete
    the source object.

    Args:
        old_key: Key of the existing object to copy.
        old_folder: Path segment to replace. Defaults to the folder this
            notebook migrates from.
        new_folder: Replacement path segment. Defaults to the folder this
            notebook migrates to.

    Returns:
        The destination key the object was copied to.
    """
    new_key = old_key.replace(old_folder, new_folder)

    copy_source = {'Bucket': BUCKET, 'Key': old_key}
    s3_client.copy(copy_source, BUCKET, new_key)

    return new_key

In [53]:
# Fan the copies out over a process pool. Results arrive in completion order;
# only their count matters, so each finished copy just advances the progress bar.
# NOTE(review): the workers use the module-level s3_client created in the parent
# process - confirm that sharing it across processes is acceptable here.
with Pool() as workers:
    finished_copies = workers.imap_unordered(copy_file_in_s3, files_to_copy)
    for _ in tqdm(finished_copies, total=len(files_to_copy), desc='copy files', ncols=100):
        pass

copy files: 100%|███████████████████████████████████████████| 244967/244967 [43:36<00:00, 93.63it/s]


### check copy

In [54]:
# Verify that every source key has a counterpart under the new folder.
# Destinations that are missing are collected in `missing_keys` and reported
# by name, so they can be re-copied before the originals are deleted.
missing_keys = []
for f in tqdm(files_to_copy, desc='check copy', ncols=100):
    copied_key = f.replace('margens_gas_power/telecontagens', 'smart_meters_pt')
    try:
        s3_client.head_object(Bucket=BUCKET, Key=copied_key)
    except s3_client.exceptions.ClientError as e:
        # Only a 404 means the copy is missing; any other client error is re-raised.
        if e.response['Error']['Code'] == '404':
            missing_keys.append(copied_key)
            print(f'Key does not exist !!! {copied_key}')
        else:
            raise

print(f'{len(missing_keys)} of {len(files_to_copy)} copied keys are missing')

check copy: 100%|█████████████████████████████████████████| 244967/244967 [1:10:55<00:00, 57.57it/s]


## delete files

In [55]:
# Delete every object under the source folder, sending the keys to
# delete_objects in batches of at most MAX_KEYS_PER_DELETE.
MAX_KEYS_PER_DELETE = 1000  # aws limit on keys per delete_objects request

paginator = s3_client.get_paginator('list_objects_v2')
pages = paginator.paginate(Bucket=BUCKET, Prefix=KEY+'margens_gas_power/telecontagens')

objects_to_delete = {'Objects': []}
delete_errors = []  # per-key failures reported in the delete_objects responses

for obj in tqdm(pages.search('Contents')):
    # search() yields None for a page that has no 'Contents' entry
    # (e.g. nothing is left under the prefix), so skip it.
    if obj is None:
        continue

    objects_to_delete['Objects'].append({'Key': obj['Key']})

    # flush once aws limit reached
    if len(objects_to_delete['Objects']) >= MAX_KEYS_PER_DELETE:
        response = s3_client.delete_objects(Bucket=BUCKET, Delete=objects_to_delete)
        delete_errors.extend(response.get('Errors', []))
        objects_to_delete = {'Objects': []}

# Flush the remainder. Skip the call when nothing is pending (total count is a
# multiple of the batch size, or no objects matched): a delete request with an
# empty object list is rejected by S3.
if objects_to_delete['Objects']:
    response = s3_client.delete_objects(Bucket=BUCKET, Delete=objects_to_delete)
    delete_errors.extend(response.get('Errors', []))

if delete_errors:
    print(f'{len(delete_errors)} objects could not be deleted, first error: {delete_errors[0]}')

244967it [05:27, 748.08it/s]
