In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Widen pandas' text display limits so printed frames in this notebook are not
# truncated: row cap, column cap, and total character width of the output.
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)


In [3]:
## Define a helper that checks whether beamLog.out still exists under a run's S3 output URL
import requests

def beam_log_exists(path, timeout=10):
    """Return True if ``<path>/beamLog.out`` answers an HTTP HEAD request with 200 OK.

    Args:
        path: Output URL of a run. Surrounding whitespace is ignored. URLs that
            point at the bucket's ``index.html#output`` browser page are
            rewritten to the direct ``beam-outputs.s3.amazonaws.com/output``
            form before the request is made.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` can block indefinitely on a stalled
            connection, which would hang a scan over thousands of URLs.

    Returns:
        bool: True only when the HEAD request completes with status 200.
        False for non-string input (e.g. NaN from an empty CSV cell), blank
        strings, any other status code, and any error raised while requesting.
    """
    if not isinstance(path, str):
        return False

    # Strip up front so padded URLs are cleaned on every code path, not only
    # when the index.html rewrite below applies.
    s3url_fixed = path.strip()
    if not s3url_fixed:
        # Nothing to request; avoid a pointless failing network call.
        return False

    # URLs containing the literal "#output/*/*" are left as-is
    # (presumably wildcard/listing links -- TODO confirm with the sheet owner).
    if "#output/*/*" not in s3url_fixed:
        s3url_fixed = s3url_fixed.replace(
            "s3.us-east-2.amazonaws.com/beam-outputs/index.html#output",
            "beam-outputs.s3.amazonaws.com/output",
        )

    beam_log_path = f"{s3url_fixed}/beamLog.out"
    try:
        r = requests.head(beam_log_path, timeout=timeout)
        return r.status_code == requests.codes.ok
    except Exception:
        # Deliberately best-effort: a malformed URL, DNS failure or timeout
        # simply means "log not found" for the purposes of this scan.
        return False

# output_path = " https://s3.us-east-2.amazonaws.com/beam-outputs/index.html#output/sfbay/sfbay-smart-ba1se__2019-10-23_13-18-45"
# print(beam_log_exists(output_path))

In [4]:
## reading exported csv

# Load the exported deploy-status sheet; the 'Time' column is parsed as datetimes.
csv_data = pd.read_csv("../local_files/BEAM Deploy Status and Run Data - BEAM Instances.csv", parse_dates=['Time'])

# Each entry is a (branch, time, s3url) tuple, split by whether beamLog.out was found.
exist_rows = []
non_exist_rows = []

# Walk the three columns in lockstep by position. This avoids a hand-maintained
# counter combined with label lookups such as csv_data['Branch'][index-1], which
# only line up while the frame still has its default 0..n-1 index.
run_columns = zip(csv_data['S3 Url'], csv_data['Branch'], csv_data['Time'])
for index, (s3url, branch, run_time) in enumerate(run_columns, start=1):
    # Lightweight progress report: rows 1, 501, 1001, ...
    if index % 500 == 1:
        print("read index :" + str(index))

    # Non-string values (such as the NaN pandas uses for empty cells) carry no
    # URL to check and are left out of both lists.
    if not isinstance(s3url, str):
        continue

    # Only values containing an https:// URL are worth a network round trip;
    # anything else is recorded as missing without being requested.
    if 'https://' in s3url and beam_log_exists(s3url):
        exist_rows.append((branch, run_time, s3url))
    else:
        non_exist_rows.append((branch, run_time, s3url))

print("found rows:" + str(len(exist_rows)))


read index :1
read index :501
read index :1001
read index :1501
read index :2001
read index :2501
read index :3001
read index :3501
read index :4001
read index :4501
read index :5001
read index :5501
read index :6001
read index :6501
read index :7001
read index :7501
read index :8001
read index :8501
read index :9001
read index :9501
read index :10001
read index :10501
read index :11001
read index :11501
read index :12001
read index :12501
read index :13001
read index :13501
read index :14001
read index :14501
read index :15001
read index :15501
read index :16001
found rows:4703


In [6]:
## Order the rows whose logs are still in the S3 bucket by their first tuple element

# sorted() is stable, so rows sharing the same key keep their current relative
# order; slice assignment rewrites the existing list in place.
exist_rows[:] = sorted(exist_rows, key=lambda row: row[0])


In [7]:
## Write the rows that still exist in S3 to preparation_to_AWS_storage_cleanup.csv and preview the result
# Single definition of the export location, used for both the write and the read-back.
OUTPUT_CSV = "../local_files/preparation_to_AWS_storage_cleanup.csv"

# Name the columns on the DataFrame itself rather than through to_csv's
# `header=` aliases: with an empty exist_rows the frame would have zero columns
# and a three-name alias list makes to_csv raise ValueError. This form writes
# the same file for non-empty input and a header-only file for empty input.
pd.DataFrame(exist_rows, columns=["branch", "time", "s3url"]).to_csv(OUTPUT_CSV, index=False)

# Read the file back so the preview reflects what was actually written to disk.
output_data = pd.read_csv(OUTPUT_CSV)

print(output_data.head(10))


                               branch                 time                                              s3url
0  AK/#2624-merg-urbansim-with-austin  2020-05-16 13:18:09   https://s3.us-east-2.amazonaws.com/beam-outpu...
1  AK/#2624-merg-urbansim-with-austin  2020-05-16 19:57:54   https://s3.us-east-2.amazonaws.com/beam-outpu...
2  AK/#2624-merg-urbansim-with-austin  2020-05-16 23:36:04   https://s3.us-east-2.amazonaws.com/beam-outpu...
3  AK/#2624-merg-urbansim-with-austin  2020-05-17 21:51:58   https://s3.us-east-2.amazonaws.com/beam-outpu...
4  AK/#2624-merg-urbansim-with-austin  2020-05-21 17:15:34   https://s3.us-east-2.amazonaws.com/beam-outpu...
5  AK/#2624-merg-urbansim-with-austin  2020-05-21 18:26:04   https://s3.us-east-2.amazonaws.com/beam-outpu...
6  AK/#2624-merg-urbansim-with-austin  2020-05-31 07:47:18   https://s3.us-east-2.amazonaws.com/beam-outpu...
7  AK/#2624-merg-urbansim-with-austin  2020-05-31 18:56:55   https://s3.us-east-2.amazonaws.com/beam-outpu...
8  AK/#262