In [2]:
# %pip install boto3
import pip
import itertools
import pandas as pd
import os

class StopExecution(Exception):
    def _render_traceback_(self):
        pass


def import_or_install(package):
    try:
        __import__(package)
        print(f'{package} is already installed')
    except ImportError:
        print(f'Installing {package}')
        pip.main(['install', package])


import_or_install("boto3")
import boto3

aws_access_key_id = ""
aws_secret_access_key = ""

boto3 is already installed


In [9]:
bucket_name = 'beam-outputs'



s3 = boto3.client('s3', aws_access_key_id=aws_access_key_id,
                  aws_secret_access_key=aws_secret_access_key) if aws_access_key_id else boto3.client('s3')
paginator = s3.get_paginator('list_objects_v2')

from dataclasses import dataclass
from datetime import datetime


@dataclass
class PrefixContent:
    prefix: str
    objects: []
    folders: []
    last_modified: datetime
    storage_class: str


def find_content(prefix: str):
    pages = paginator.paginate(Bucket=bucket_name, Prefix=prefix, Delimiter="/")
    objects = []
    folders = []
    last_modified = None
    storage_class = "STANDARD"
    for page in pages:
        if "Contents" in page:
            for obj in page["Contents"]:
                objects.append(obj["Key"])
                if not last_modified:
                    last_modified = obj["LastModified"]
                if obj["StorageClass"] != "STANDARD":
                    storage_class = obj["StorageClass"]
        if "CommonPrefixes" in page:
            for obj in page["CommonPrefixes"]:
                folders.append(obj["Prefix"])

    return PrefixContent(prefix, objects, folders, last_modified, storage_class)


In [12]:
# Searches for beam folders on S3 bucket and saves path/last_modified/size to a csv file (local_files dir)
# using low level API to get only the first level of subfolders
search_prefix = 'pilates-outputs/sfbay-'

from dataclasses import dataclass
from datetime import datetime

@dataclass
class BeamFolder:
    path: str
    last_modified: datetime
    storage_class: str


def find_beam_folders(prefix: str):
    content = find_content(prefix)
    if any(x for x in content.folders if x.endswith("/ITERS/")) | any(
            x for x in content.objects if x.endswith("/beamLog.out")):
        beam_folder = BeamFolder(content.prefix, content.last_modified, content.storage_class)
        print(beam_folder)
        return [beam_folder]
    else:
        return [x for folder in content.folders for x in find_beam_folders(folder)]


beam_folders = find_beam_folders(search_prefix)

result = [[x.path, x.last_modified, x.storage_class] for x in beam_folders]

df = pd.DataFrame(result, columns=["path", "date", "storage_class"])
file_name = search_prefix.split('/')[-1] + ".csv"
docker_path = "/home/jovyan/local_files"
dir_to_save = docker_path if os.path.isdir(docker_path) else "../local_files"

display(df)
df.to_csv(os.path.join(dir_to_save, file_name), index=False)


# pages = paginator.paginate(Bucket=bucket_name, Prefix=search_prefix, Delimiter="/")
# for page in pages:
#     print("Sub-folders:", list(obj["Prefix"] for obj in page["CommonPrefixes"]))
#     print("Objects:", list(obj["Key"] for obj in page["Contents"]))

BeamFolder(path='pilates-outputs/sfbay-base-20220409/beam/year-2018-iteration-0/', last_modified=datetime.datetime(2022, 4, 9, 21, 8, 44, tzinfo=tzutc()), storage_class='STANDARD')
BeamFolder(path='pilates-outputs/sfbay-base-20220409/beam/year-2018-iteration-1/', last_modified=datetime.datetime(2022, 4, 9, 21, 9, 1, tzinfo=tzutc()), storage_class='STANDARD')
BeamFolder(path='pilates-outputs/sfbay-base-20220409/beam/year-2018-iteration-2/', last_modified=datetime.datetime(2022, 4, 9, 21, 9, 15, tzinfo=tzutc()), storage_class='STANDARD')
BeamFolder(path='pilates-outputs/sfbay-base-20220409/beam/year-2018-iteration-3/', last_modified=datetime.datetime(2022, 4, 9, 21, 9, 30, tzinfo=tzutc()), storage_class='STANDARD')
BeamFolder(path='pilates-outputs/sfbay-base-20220409/beam/year-2018-iteration-4/', last_modified=datetime.datetime(2022, 4, 9, 21, 9, 45, tzinfo=tzutc()), storage_class='STANDARD')
BeamFolder(path='pilates-outputs/sfbay-base-20220409/beam/year-2018-iteration-5/', last_modified

Unnamed: 0,path,date,storage_class
0,pilates-outputs/sfbay-base-20220409/beam/year-...,2022-04-09 21:08:44+00:00,STANDARD
1,pilates-outputs/sfbay-base-20220409/beam/year-...,2022-04-09 21:09:01+00:00,STANDARD
2,pilates-outputs/sfbay-base-20220409/beam/year-...,2022-04-09 21:09:15+00:00,STANDARD
3,pilates-outputs/sfbay-base-20220409/beam/year-...,2022-04-09 21:09:30+00:00,STANDARD
4,pilates-outputs/sfbay-base-20220409/beam/year-...,2022-04-09 21:09:45+00:00,STANDARD
...,...,...,...
408,pilates-outputs/sfbay-transit_frequencies_2.0-...,2022-06-13 22:13:51+00:00,STANDARD
409,pilates-outputs/sfbay-transit_frequencies_2.0-...,2022-06-13 22:14:43+00:00,STANDARD
410,pilates-outputs/sfbay-transit_frequencies_2.0-...,2022-06-13 22:15:38+00:00,STANDARD
411,pilates-outputs/sfbay-transit_frequencies_2.0-...,2022-06-13 22:16:32+00:00,STANDARD


In [23]:
# for a list of aws folders
# it finds a file within a folder and get lastModified attribute
input = "beam_output_url"
input_folders = pd.read_csv("../local_files/%s.csv" % input, names=['path', 'date', 'url'], parse_dates=['date'])
# input_folders['date'] = input_folders['date'].dt.tz_convert("UTC")

def get_last_modified(prefix: str):
    print(prefix)
    content = find_content(prefix)
    if content.last_modified:
        return content.last_modified
    elif len(content.folders) > 0:
        return get_last_modified(content.folders[0])
    else:
        return None

mask = input_folders['date'].isnull()

input_folders.loc[mask, 'date'] = input_folders.loc[mask]['path'].apply(get_last_modified)
# input_folders['url'] = "https://s3.us-east-2.amazonaws.com/beam-outputs/index.html#" + input_folders['path']
input_folders.to_csv("../local_files/%s_last_modified.csv" % input, index=False, header=False)
input_folders

archive/root/a_lt_30_hours/
archive/root/a_lt_30_hours_with_vehicle_retirement_update/
archive/root/afi-none-200mi-50kw-a-hightech-BEV20__2019-09-29_20-27-52_i-00b970dfb14e45e04/
archive/root/afi-rich5-300mi-50kw-a-hightech-BEV20__2019-09-29_20-27-52_i-08290d5cb315532c3/
archive/root/afi-sparse-100mi-150kw-a-hightech__2019-09-25_20-15-58_i-045f3ec9d47c1d052/
archive/root/afi-sparse-100mi-150kw-b-lowtech__2019-09-07_02-13-54_i-0f57ef32119a9c4e2/
archive/root/afi-sparse-100mi-150kw-b-lowtech__2019-09-09_19-25-22_i-0b0064d1b1080ff15/
archive/root/afi-sparse-100mi-150kw-b-lowtech__2019-09-10_05-49-06_i-012f2d0e9e6c1d3a7/
archive/root/afi-sparse-100mi-150kw-b-lowtech__2019-09-10_13-40-53_i-0074cf00189869446/
archive/root/afi-sparse-100mi-50kw-a-hightech__2019-09-21_20-36-13_i-0c40d2c9408912812/
archive/root/afi-sparse-100mi-50kw-a-hightech__2019-09-25_20-15-57_i-0478af8951cc00f2f/
archive/root/afi-sparse-100mi-50kw-b-lowtech__2019-09-21_20-36-13_i-0530e61acccb31f3f/
archive/root/afi-sparse-

Unnamed: 0,path,date,url
0,archive/root/a_lt_30_hours/,NaT,https://s3.us-east-2.amazonaws.com/beam-output...
1,archive/root/a_lt_30_hours_with_vehicle_retire...,NaT,https://s3.us-east-2.amazonaws.com/beam-output...
2,archive/root/afi-none-200mi-50kw-a-hightech-BE...,NaT,https://s3.us-east-2.amazonaws.com/beam-output...
3,archive/root/afi-rich5-300mi-50kw-a-hightech-B...,NaT,https://s3.us-east-2.amazonaws.com/beam-output...
4,archive/root/afi-sparse-100mi-150kw-a-hightech...,NaT,https://s3.us-east-2.amazonaws.com/beam-output...
...,...,...,...
4714,output/sfbay/smart-baseline-45k__2021-12-15_16...,2021-12-15 16:51:30+00:00,https://s3.us-east-2.amazonaws.com/beam-output...
4715,output/sfbay/smart-baseline-45k__2021-12-15_20...,2021-12-15 20:30:39+00:00,https://s3.us-east-2.amazonaws.com/beam-output...
4716,output/sfbay/smart-baseline-45k__2021-12-15_20...,2021-12-15 20:30:56+00:00,https://s3.us-east-2.amazonaws.com/beam-output...
4717,output/sfbay/smart-baseline-45k__2021-12-16_13...,2021-12-16 14:04:01+00:00,https://s3.us-east-2.amazonaws.com/beam-output...


In [5]:
# concat 3 data files and make a list for deep archiving;
# everything before 2023 and not in the keep list
to_keep = pd.read_csv("../local_files/to_keep.csv")
to_keep_t = tuple(to_keep['path'].tolist())

beam_output = pd.read_csv("../local_files/beam_output_url.csv", names=['path', 'date', 'url'], parse_dates=['date'])
pilates_output = pd.read_csv("../local_files/pilates_output_url_last_modified.csv", names=['path', 'date', 'url'], parse_dates=['date'])
others = pd.read_csv("../local_files/other_data_last_modified.csv", names=['path', 'size', 'date', 'url'], parse_dates=['date'])
others = others.drop(columns=['size'])
all = pd.concat([beam_output, pilates_output, others])
display(len(all))

display(all[all['path'] == "output/austin/austin-200k-gh-car-only__2020-09-11_13-40-41_slo/"])

time_thres = pd.Timestamp("2023-01-01 00:00:00+00:00")

archive = all[(all['date'] < time_thres)].copy()
display(len(archive))


archive['to_keep'] = archive.apply(lambda row: row['path'].startswith(to_keep_t), axis=1)
archive = archive[archive['to_keep'] == False].drop(columns="to_keep")
display(len(archive))
display(archive[archive['path'] == "output/austin/austin-200k-gh-car-only__2020-09-11_13-40-41_slo/"])
# display(archive)
# archive.drop(columns=['date', 'url']).to_csv("../local_files/archive.csv", index=False, header=False)

6069

Unnamed: 0,path,date,url
159,output/austin/austin-200k-gh-car-only__2020-09...,2020-09-11 14:20:55+00:00,https://s3.us-east-2.amazonaws.com/beam-output...


5552

5462

Unnamed: 0,path,date,url
159,output/austin/austin-200k-gh-car-only__2020-09...,2020-09-11 14:20:55+00:00,https://s3.us-east-2.amazonaws.com/beam-output...


In [3]:
# move s3 folders (read from a csv file) to some location within the same bucket
bucket_name = 'beam-outputs'
destination = "archive/root"
file = "../local_files/test_moved.csv"

import multiprocessing

if destination.endswith("/"): destination = destination[:-1]

paths = pd.read_csv(file)['path'].tolist()

print(f"Moving {paths} to {destination}")

s3 = boto3.resource('s3', aws_access_key_id=aws_access_key_id,
                    aws_secret_access_key=aws_secret_access_key) if aws_access_key_id else boto3.resource('s3')

not_to_delete = pd.read_csv("../local_files/not_to_delete.csv")['path'].tolist()

for path in paths:
    path = path.strip()
    if path.endswith("/"): path = path[:-1]
    if path == "": continue
    if any(x for x in not_to_delete if path.startswith(x)):
        print(f"NOT DELETE {path}")
        continue
    print(f"Moving {path} to {destination}")

    last_index = path.rfind('/')
    outer_folder = path[0:last_index] if last_index >= 0 else ""
    print(outer_folder)

    def move_obj(obj_key):
        copy_source = {'Bucket': bucket_name, 'Key': obj_key}
        new_folder = obj_key[len(outer_folder):]
        if  not new_folder.startswith("/"):
            new_folder = "/" + new_folder
        new_key = destination + new_folder
        # print(new_key)
        s3.meta.client.copy(copy_source, bucket_name, new_key)
        s3.meta.client.delete_object(Bucket=bucket_name, Key=obj_key)

    bucket = s3.Bucket(bucket_name)
    object_keys = [obj.key for obj in bucket.objects.filter(Prefix=path)]
    with multiprocessing.Pool(multiprocessing.cpu_count()) as p:
        p.map(move_obj, object_keys)

    print(f"Moved {path}")

print(f"Done")


Moving ['sfbay-smart-base__2019-05-22_22-09-53'] to archive/root
Moving sfbay-smart-base__2019-05-22_22-09-53 to archive/root

Moved sfbay-smart-base__2019-05-22_22-09-53
Done


In [62]:
# keep data that is after 2022 and the data that is in to_keep.csv

to_keep = pd.read_csv("../local_files/to_keep.csv")
to_keep_t = tuple(to_keep['path'].tolist())
input = "other_data"
all_data = pd.read_csv("../local_files/%s.csv" % input, names=['path', 'size', 'date'], parse_dates=['date'])
# display(all_data)
time_thres = pd.Timestamp("2022-01-01 00:00:00+00:00")

tbd = all_data[(all_data['date'] < time_thres)
                | (all_data['date'].isna() & all_data['path'].str.contains('20((15)|(16)|(17)|(18)|(19)|(20)|(21))'))
                | (all_data['path'].str.startswith('archive'))].copy()
# display(tbd)


tbd['to_keep'] = tbd.apply(lambda row: row['path'].startswith(to_keep_t), axis=1)
tbd = tbd[tbd['to_keep'] == False].drop(columns="to_keep")

tbd['url'] = "https://s3.us-east-2.amazonaws.com/beam-outputs/index.html#" + tbd['path']

tbd.to_csv("../local_files/%s_tbd.csv" % input, index=False, header=False)
tbd

  | (all_data['date'].isna() & all_data['path'].str.contains('20((15)|(16)|(17)|(18)|(19)|(20)|(21))'))


Unnamed: 0,path,size,date,url
0,*_31fd8dfd.zip,6.06 kB,2018-05-03 23:46:23+03:00,https://s3.us-east-2.amazonaws.com/beam-output...
1,*_a74f8bce.zip,104.23 kB,2019-03-27 00:43:58+03:00,https://s3.us-east-2.amazonaws.com/beam-output...
2,*_c55cf7ae.zip,6.05 kB,2018-05-03 21:06:15+03:00,https://s3.us-east-2.amazonaws.com/beam-output...
3,*_fc394242.zip,104.24 kB,2019-03-27 00:43:47+03:00,https://s3.us-east-2.amazonaws.com/beam-output...
5,/base_2018-01-19_21-23-36.tar.gz,5.01 GB,2018-01-20 13:24:23+03:00,https://s3.us-east-2.amazonaws.com/beam-output...
...,...,...,...,...
943,pilates-outputs/sfbay-skims/result_skims-sfbay...,85.03 MB,2021-11-17 02:16:04+03:00,https://s3.us-east-2.amazonaws.com/beam-output...
944,pilates-outputs/sfbay-skims/result_skims-sfbay...,188.25 MB,2021-11-17 02:16:04+03:00,https://s3.us-east-2.amazonaws.com/beam-output...
945,pilates-outputs/sfbay-skims/result_skims-sfbay...,67.96 MB,2021-11-17 02:16:04+03:00,https://s3.us-east-2.amazonaws.com/beam-output...
946,pilates-outputs/sfbay-skims/result_skims-sfbay...,61.96 MB,2021-11-17 02:16:04+03:00,https://s3.us-east-2.amazonaws.com/beam-output...


In [66]:
# Add url column

input = "beam_output"
all_data = pd.read_csv("../local_files/%s.csv" % input, names=['path', 'date'], parse_dates=['date'])

all_data['url'] = "https://s3.us-east-2.amazonaws.com/beam-outputs/index.html#" + all_data['path']

all_data.to_csv("../local_files/%s_url.csv" % input, index=False, header=False)
all_data

Unnamed: 0,path,date,url
0,archive/root/a_lt_30_hours/,NaT,https://s3.us-east-2.amazonaws.com/beam-output...
1,archive/root/a_lt_30_hours_with_vehicle_retire...,NaT,https://s3.us-east-2.amazonaws.com/beam-output...
2,archive/root/afi-none-200mi-50kw-a-hightech-BE...,NaT,https://s3.us-east-2.amazonaws.com/beam-output...
3,archive/root/afi-rich5-300mi-50kw-a-hightech-B...,NaT,https://s3.us-east-2.amazonaws.com/beam-output...
4,archive/root/afi-sparse-100mi-150kw-a-hightech...,NaT,https://s3.us-east-2.amazonaws.com/beam-output...
...,...,...,...
4714,output/sfbay/smart-baseline-45k__2021-12-15_16...,2021-12-15 16:51:30+00:00,https://s3.us-east-2.amazonaws.com/beam-output...
4715,output/sfbay/smart-baseline-45k__2021-12-15_20...,2021-12-15 20:30:39+00:00,https://s3.us-east-2.amazonaws.com/beam-output...
4716,output/sfbay/smart-baseline-45k__2021-12-15_20...,2021-12-15 20:30:56+00:00,https://s3.us-east-2.amazonaws.com/beam-output...
4717,output/sfbay/smart-baseline-45k__2021-12-16_13...,2021-12-16 14:04:01+00:00,https://s3.us-east-2.amazonaws.com/beam-output...


In [8]:
# check if all the data is safe to delete

safe_to_del = pd.read_csv("../local_files/latest_beam_output_tbd.csv", names=['path', 'date', 'url'], parse_dates=['date'])

input = "atd_beam"
all_data = pd.read_csv("../local_files/%s.csv" % input, names=['path'])

all_data[~all_data['path'].isin(safe_to_del['path'])]

Unnamed: 0,path
6,output/beamville/beamville-urbansimv2_input__2...
7,output/beamville/beamville-urbansimv2_input__2...
8,output/beamville/beamville-urbansimv2_input__2...
15,output/beamville/beamville-xml__2019-10-20_13-...
16,output/beamville/beamville-xml__2019-10-20_17-...
...,...
1042,output/sfbay/sfbay-smart-c-ht-2040__2019-09-20...
1043,output/sfbay/sfbay-smart-c-ht-2040__2019-10-28...
1052,output/sfbay/sfbay-smart-c-lt-2010__2019-09-20...
1053,output/sfbay/sfbay-smart-c-lt-2040__2019-09-20...


In [7]:
# check if all the data is safe to delete

to_keep = pd.read_csv("../local_files/beam_del_exclude.csv", names=['path'])

input = "atd_beam"
all_data = pd.read_csv("../local_files/%s.csv" % input, names=['path'])

tbd = all_data[~all_data['path'].isin(to_keep['path'])]
tbd.to_csv("../local_files/approved_to_delete.csv", index=False, header=False)
tbd

Unnamed: 0,path
0,output/austin/speed_calibration/austin-prod-20...
1,output/austin/speed_calibration/austin-prod-20...
2,output/austin/speed_calibration/austin-prod-20...
3,output/austin/speed_calibration/austin-prod-20...
4,output/austin/speed_calibration/austin-prod-20...
...,...
1079,output/sfbay/sfcompt-calib-1__2019-07-18_21-03...
1080,output/sfbay/sfcompt-calib-lwt__2019-07-18_21-...
1081,output/sfbay/sfcompt-calib-lwt__2019-07-20_13-...
1082,output/sfbay/sfcompt-calib__2019-07-20_16-23-51/


In [10]:
# remove deleted entries
beam_out_tbd = pd.read_csv("../local_files/latest_beam_output_tbd.csv", names=['path', 'date', 'url'], parse_dates=['date'])
atd = pd.read_csv("../local_files/approved_to_delete.csv", names=['path'])

cleaned_removed_entries = beam_out_tbd[~beam_out_tbd['path'].isin(atd['path'])]
cleaned_removed_entries.to_csv("../local_files/beam_output_tbd_cleaned.csv", index=False, header=False)
cleaned_removed_entries


Unnamed: 0,path,date,url
0,archive/root/a_lt_30_hours/,NaT,https://s3.us-east-2.amazonaws.com/beam-output...
1,archive/root/a_lt_30_hours_with_vehicle_retire...,NaT,https://s3.us-east-2.amazonaws.com/beam-output...
2,archive/root/afi-none-200mi-50kw-a-hightech-BE...,NaT,https://s3.us-east-2.amazonaws.com/beam-output...
3,archive/root/afi-rich5-300mi-50kw-a-hightech-B...,NaT,https://s3.us-east-2.amazonaws.com/beam-output...
4,archive/root/afi-sparse-100mi-150kw-a-hightech...,NaT,https://s3.us-east-2.amazonaws.com/beam-output...
...,...,...,...
2291,output/sfbay/smart-baseline-45k__2021-12-15_16...,2021-12-15 16:51:30+00:00,https://s3.us-east-2.amazonaws.com/beam-output...
2292,output/sfbay/smart-baseline-45k__2021-12-15_20...,2021-12-15 20:30:39+00:00,https://s3.us-east-2.amazonaws.com/beam-output...
2293,output/sfbay/smart-baseline-45k__2021-12-15_20...,2021-12-15 20:30:56+00:00,https://s3.us-east-2.amazonaws.com/beam-output...
2294,output/sfbay/smart-baseline-45k__2021-12-16_13...,2021-12-16 14:04:01+00:00,https://s3.us-east-2.amazonaws.com/beam-output...


In [4]:
# filter out big entries
big_entries = pd.read_csv("../local_files/big_entries.csv", names=['path'])
to_archive = pd.read_csv("../local_files/prod_archive.csv", names=['path'])

big_entries_to_delete = big_entries[big_entries['path'].isin(to_archive['path'])]
to_archive_small_entries = to_archive[~to_archive['path'].isin(big_entries_to_delete['path'])]
to_archive_small_entries.to_csv("../local_files/to_archive_small_entries.csv", index=False, header=False)
big_entries_to_delete.to_csv("../local_files/big_entries_to_delete.csv", index=False, header=False)
