In [7]:
# %pip install boto3
import pip
import itertools
import pandas as pd
import os

class StopExecution(Exception):
    """Exception whose traceback-rendering hook produces nothing.

    NOTE(review): presumably raised to stop a cell without IPython printing
    a traceback -- confirm against the IPython version in use.
    """

    def _render_traceback_(self):
        # Explicitly hand back nothing to render.
        return None


def import_or_install(package):
    """Import ``package``; if that fails, try to install it with pip.

    Args:
        package: Name used both for the import attempt and for ``pip install``.

    Returns:
        None. Progress (and an installation failure, if any) is reported via ``print``.
    """
    # Imported locally so the helper does not depend on the cell's import list.
    import importlib
    import subprocess
    import sys

    try:
        importlib.import_module(package)
        print(f'{package} is already installed')
    except ImportError:
        print(f'Installing {package}')
        # `pip.main` is not part of pip's public API any more (removed in pip 10).
        # Running pip as a module of the current interpreter is the supported way
        # and installs into the environment the kernel is actually using.
        status = subprocess.call([sys.executable, '-m', 'pip', 'install', package])
        if status != 0:
            print(f'Failed to install {package} (pip exit code {status})')


# Make sure boto3 can be imported (installing it if necessary) before importing it.
import_or_install("boto3")
import boto3

# Optional explicit AWS credentials. When aws_access_key_id is left empty the
# cells below create their boto3 client/resource without explicit keys.
# Do not commit real credentials in this notebook.
aws_access_key_id = ""
aws_secret_access_key = ""

boto3 is already installed


In [6]:
# Searches for beam folders on the S3 bucket and saves path/last_modified to a csv file (local_files dir).
# The low level (client) API with a delimiter is used to list only one level of sub-folders at a time.
bucket_name = 'beam-outputs'
search_prefix = 'back'

# Pass explicit credentials only when they were filled in; otherwise build a default client.
if aws_access_key_id:
    s3 = boto3.client('s3', aws_access_key_id=aws_access_key_id,
                      aws_secret_access_key=aws_secret_access_key)
else:
    s3 = boto3.client('s3')
paginator = s3.get_paginator('list_objects_v2')

from dataclasses import dataclass
from datetime import datetime


from typing import List, Optional


@dataclass
class PrefixContent:
    """Listing of a single S3 prefix, one level deep.

    Attributes:
        prefix: The prefix that was listed.
        objects: Keys of the objects found directly under the prefix.
        folders: Sub-prefixes ("folders") found directly under the prefix.
        last_modified: A ``LastModified`` timestamp taken from the listed objects,
            or ``None`` when the prefix has no direct objects.
    """
    prefix: str
    objects: List[str]
    folders: List[str]
    last_modified: Optional[datetime]
@dataclass
class BeamFolder:
    """An S3 prefix that was recognised as a beam output folder."""
    # S3 prefix of the folder.
    path: str
    # Timestamp copied from the listing the folder was built from; that value
    # starts out as None, so it can be None when no object was listed.
    last_modified: datetime


def find_content(prefix: str):
    """List the objects and sub-folders directly under ``prefix`` (one level deep).

    Uses the module-level ``paginator`` and ``bucket_name``; the "/" delimiter makes
    S3 group deeper keys into common prefixes instead of returning every object.

    Args:
        prefix: S3 key prefix to list.

    Returns:
        PrefixContent with the object keys, the sub-folder prefixes and the newest
        ``LastModified`` among the listed objects (``None`` if there are none).
    """
    objects = []
    folders = []
    last_modified = None
    for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix, Delimiter="/"):
        # A page has no "Contents" / "CommonPrefixes" entry when it holds nothing of that kind.
        for obj in page.get("Contents", []):
            objects.append(obj["Key"])
            modified = obj["LastModified"]
            # Track the newest timestamp rather than whichever object happens to be
            # listed first, so the value really is the folder's last modification.
            if last_modified is None or modified > last_modified:
                last_modified = modified
        for common_prefix in page.get("CommonPrefixes", []):
            folders.append(common_prefix["Prefix"])

    return PrefixContent(prefix, objects, folders, last_modified)


def find_beam_folders(prefix: str):
    """Recursively collect the beam folders located at or below ``prefix``.

    A prefix counts as a beam folder when it directly contains an ``ITERS/``
    sub-folder or a ``beamLog.out`` object. Such a folder is printed and returned
    without descending further; otherwise every sub-folder is searched in turn.

    Args:
        prefix: S3 key prefix to inspect.

    Returns:
        List of BeamFolder, in the order the folders were discovered.
    """
    listing = find_content(prefix)
    has_iters_dir = any(folder.endswith("/ITERS/") for folder in listing.folders)
    has_beam_log = any(key.endswith("/beamLog.out") for key in listing.objects)

    if not (has_iters_dir or has_beam_log):
        # Not a beam folder itself: gather whatever its sub-folders contain.
        found = []
        for sub_folder in listing.folders:
            found.extend(find_beam_folders(sub_folder))
        return found

    beam_folder = BeamFolder(listing.prefix, listing.last_modified)
    print(beam_folder)
    return [beam_folder]


# Walk the bucket starting at the configured prefix.
beam_folders = find_beam_folders(search_prefix)

# One [path, last_modified] row per folder found.
result = []
for beam_folder in beam_folders:
    result.append([beam_folder.path, beam_folder.last_modified])

df = pd.DataFrame(result, columns=["path", "date"])

# The csv is named after the last path component of the search prefix.
file_name = search_prefix.split('/')[-1] + ".csv"

# Use the docker location when that directory exists, otherwise the relative one.
docker_path = "/home/jovyan/local_files"
if os.path.isdir(docker_path):
    dir_to_save = docker_path
else:
    dir_to_save = "../local_files"

display(df)
df.to_csv(os.path.join(dir_to_save, file_name), index=False)


# pages = paginator.paginate(Bucket=bucket_name, Prefix=search_prefix, Delimiter="/")
# for page in pages:
#     print("Sub-folders:", list(obj["Prefix"] for obj in page["CommonPrefixes"]))
#     print("Objects:", list(obj["Key"] for obj in page["Contents"]))

BeamFolder(path='backup-11-2019_us-east-1/haitam_HL-sblt-pooling-high/smart-b-lowtech-pooling/sblt-pooling-high1__2019-08-16_19-16-05/', last_modified=datetime.datetime(2019, 11, 29, 21, 14, 15, tzinfo=tzutc()))
BeamFolder(path='backup-11-2019_us-east-1/haitam_HL-sblt-pooling-high/smart-b-lowtech-pooling/sblt-pooling-high1__2019-08-19_08-38-04/', last_modified=datetime.datetime(2019, 11, 29, 20, 17, 47, tzinfo=tzutc()))
BeamFolder(path='backup-11-2019_us-east-1/haitam_HL-sblt-pooling-high/smart-b-lowtech-pooling/sblt-pooling-high1__2019-08-19_09-53-01/', last_modified=datetime.datetime(2019, 11, 29, 21, 0, 39, tzinfo=tzutc()))
BeamFolder(path='backup-11-2019_us-east-1/haitam_HL-sblt-pooling-high/smart-b-lowtech-pooling/sblt-pooling-high1__2019-08-19_20-11-43/', last_modified=datetime.datetime(2019, 11, 29, 21, 17, 1, tzinfo=tzutc()))
BeamFolder(path='backup-11-2019_us-east-1/haitam_HL-sblt-pooling-high/smart-b-lowtech-pooling/sblt-pooling-high1__2019-08-19_23-44-04/', last_modified=dat

Unnamed: 0,path,date
0,backup-11-2019_us-east-1/haitam_HL-sblt-poolin...,2019-11-29 21:14:15+00:00
1,backup-11-2019_us-east-1/haitam_HL-sblt-poolin...,2019-11-29 20:17:47+00:00
2,backup-11-2019_us-east-1/haitam_HL-sblt-poolin...,2019-11-29 21:00:39+00:00
3,backup-11-2019_us-east-1/haitam_HL-sblt-poolin...,2019-11-29 21:17:01+00:00
4,backup-11-2019_us-east-1/haitam_HL-sblt-poolin...,2019-11-29 20:21:28+00:00
...,...,...
62,backup-11-2019_us-east-1/sfbay-smart-c-lt__201...,2019-11-28 10:46:12+00:00
63,backup-11-2019_us-east-1/sfbay-smart-c-lt__201...,2019-11-28 11:02:15+00:00
64,backup-11-2019_us-east-1/sfbay-smart-c-lt__201...,2019-11-28 11:07:40+00:00
65,backup-11-2019_us-east-1/sfbay-smart-c-lt__201...,2019-11-28 10:28:46+00:00


In [3]:
# Move s3 folders (read from a csv file) to some location within the same bucket.
# Paths that start with an entry of not_to_delete.csv are skipped.
bucket_name = 'beam-outputs'
destination = "archive/root"
file = "../local_files/test_moved.csv"

import multiprocessing
from multiprocessing.pool import ThreadPool

# Normalise the destination so new keys can be built as destination + "/..." below.
if destination.endswith("/"): destination = destination[:-1]

# Empty cells are read as NaN by pandas; drop them so the string handling below cannot fail.
paths = pd.read_csv(file)['path'].dropna().tolist()

print(f"Moving {paths} to {destination}")

s3 = boto3.resource('s3', aws_access_key_id=aws_access_key_id,
                    aws_secret_access_key=aws_secret_access_key) if aws_access_key_id else boto3.resource('s3')
bucket = s3.Bucket(bucket_name)

not_to_delete = pd.read_csv("../local_files/not_to_delete.csv")['path'].dropna().tolist()

for path in paths:
    path = path.strip()
    if path.endswith("/"): path = path[:-1]
    if path == "": continue
    if any(x for x in not_to_delete if path.startswith(x)):
        print(f"NOT DELETE {path}")
        continue
    print(f"Moving {path} to {destination}")

    # Everything up to the last "/" is dropped from the key when it is moved,
    # so only the folder's own name (and what is below it) is kept.
    last_index = path.rfind('/')
    outer_folder = path[0:last_index] if last_index >= 0 else ""
    print(outer_folder)

    def move_obj(obj_key):
        """Copy one object under ``destination`` and then delete the original."""
        copy_source = {'Bucket': bucket_name, 'Key': obj_key}
        new_folder = obj_key[len(outer_folder):]
        if not new_folder.startswith("/"):
            new_folder = "/" + new_folder
        new_key = destination + new_folder
        # The delete only runs if the copy did not raise.
        s3.meta.client.copy(copy_source, bucket_name, new_key)
        s3.meta.client.delete_object(Bucket=bucket_name, Key=obj_key)

    # A bare prefix filter would also match sibling folders whose name merely starts
    # with this one (e.g. "run1" matching "run10/..."), so keep only the object
    # itself or keys that really live below "<path>/".
    folder_prefix = path + "/"
    object_keys = [obj.key for obj in bucket.objects.filter(Prefix=path)
                   if obj.key == path or obj.key.startswith(folder_prefix)]

    # The work is network-bound, so threads are sufficient; unlike a process pool they
    # do not depend on the start method being able to see functions defined in the notebook.
    with ThreadPool(multiprocessing.cpu_count()) as pool:
        pool.map(move_obj, object_keys)

    print(f"Moved {path}")

print("Done")


Moving ['sfbay-smart-base__2019-05-22_22-09-53'] to archive/root
Moving sfbay-smart-base__2019-05-22_22-09-53 to archive/root

Moved sfbay-smart-base__2019-05-22_22-09-53
Done


In [62]:
# Keep data that is dated 2022 or later and the data that is listed in to_keep.csv;
# everything else is written to <input>_tbd.csv together with a browsable url.
# NOTE(review): "tbd" presumably stands for "to be deleted" -- confirm.

to_keep = pd.read_csv("../local_files/to_keep.csv")
to_keep_t = tuple(to_keep['path'].tolist())
# NOTE: this name shadows the built-in input(); kept because it is a notebook-level variable.
input = "other_data"
all_data = pd.read_csv("../local_files/%s.csv" % input, names=['path', 'size', 'date'], parse_dates=['date'])
time_thres = pd.Timestamp("2022-01-01 00:00:00+00:00")

# Rows dated before the threshold.
is_old = all_data['date'] < time_thres
# Rows without a date whose path mentions a year 2015-2021. A non-capturing group is
# used because capture groups make str.contains emit a "has match groups" UserWarning.
is_undated_old = all_data['date'].isna() & all_data['path'].str.contains(r'20(?:15|16|17|18|19|20|21)')
# Rows that already live under a path starting with "archive".
is_archived = all_data['path'].str.startswith('archive')

tbd = all_data[is_old | is_undated_old | is_archived].copy()

# Drop every row whose path starts with one of the protected prefixes.
# Series.map (instead of a row-wise DataFrame.apply) also behaves for an empty selection.
keep_mask = tbd['path'].map(lambda p: p.startswith(to_keep_t)).astype(bool)
tbd = tbd[~keep_mask].copy()

tbd['url'] = "https://s3.us-east-2.amazonaws.com/beam-outputs/index.html#" + tbd['path']

tbd.to_csv("../local_files/%s_tbd.csv" % input, index=False, header=False)
tbd

  | (all_data['date'].isna() & all_data['path'].str.contains('20((15)|(16)|(17)|(18)|(19)|(20)|(21))'))


Unnamed: 0,path,size,date,url
0,*_31fd8dfd.zip,6.06 kB,2018-05-03 23:46:23+03:00,https://s3.us-east-2.amazonaws.com/beam-output...
1,*_a74f8bce.zip,104.23 kB,2019-03-27 00:43:58+03:00,https://s3.us-east-2.amazonaws.com/beam-output...
2,*_c55cf7ae.zip,6.05 kB,2018-05-03 21:06:15+03:00,https://s3.us-east-2.amazonaws.com/beam-output...
3,*_fc394242.zip,104.24 kB,2019-03-27 00:43:47+03:00,https://s3.us-east-2.amazonaws.com/beam-output...
5,/base_2018-01-19_21-23-36.tar.gz,5.01 GB,2018-01-20 13:24:23+03:00,https://s3.us-east-2.amazonaws.com/beam-output...
...,...,...,...,...
943,pilates-outputs/sfbay-skims/result_skims-sfbay...,85.03 MB,2021-11-17 02:16:04+03:00,https://s3.us-east-2.amazonaws.com/beam-output...
944,pilates-outputs/sfbay-skims/result_skims-sfbay...,188.25 MB,2021-11-17 02:16:04+03:00,https://s3.us-east-2.amazonaws.com/beam-output...
945,pilates-outputs/sfbay-skims/result_skims-sfbay...,67.96 MB,2021-11-17 02:16:04+03:00,https://s3.us-east-2.amazonaws.com/beam-output...
946,pilates-outputs/sfbay-skims/result_skims-sfbay...,61.96 MB,2021-11-17 02:16:04+03:00,https://s3.us-east-2.amazonaws.com/beam-output...


In [66]:
# Add a url column to the csv named by `input` and save the result as <input>_url.csv

input = "beam_output"
url_prefix = "https://s3.us-east-2.amazonaws.com/beam-outputs/index.html#"

all_data = pd.read_csv("../local_files/%s.csv" % input, names=['path', 'date'], parse_dates=['date'])
# The url is the fixed prefix followed by the object's path.
all_data = all_data.assign(url=url_prefix + all_data['path'])

all_data.to_csv("../local_files/%s_url.csv" % input, index=False, header=False)
all_data

Unnamed: 0,path,date,url
0,archive/root/a_lt_30_hours/,NaT,https://s3.us-east-2.amazonaws.com/beam-output...
1,archive/root/a_lt_30_hours_with_vehicle_retire...,NaT,https://s3.us-east-2.amazonaws.com/beam-output...
2,archive/root/afi-none-200mi-50kw-a-hightech-BE...,NaT,https://s3.us-east-2.amazonaws.com/beam-output...
3,archive/root/afi-rich5-300mi-50kw-a-hightech-B...,NaT,https://s3.us-east-2.amazonaws.com/beam-output...
4,archive/root/afi-sparse-100mi-150kw-a-hightech...,NaT,https://s3.us-east-2.amazonaws.com/beam-output...
...,...,...,...
4714,output/sfbay/smart-baseline-45k__2021-12-15_16...,2021-12-15 16:51:30+00:00,https://s3.us-east-2.amazonaws.com/beam-output...
4715,output/sfbay/smart-baseline-45k__2021-12-15_20...,2021-12-15 20:30:39+00:00,https://s3.us-east-2.amazonaws.com/beam-output...
4716,output/sfbay/smart-baseline-45k__2021-12-15_20...,2021-12-15 20:30:56+00:00,https://s3.us-east-2.amazonaws.com/beam-output...
4717,output/sfbay/smart-baseline-45k__2021-12-16_13...,2021-12-16 14:04:01+00:00,https://s3.us-east-2.amazonaws.com/beam-output...
