# Administration: Assess Inconsistent Data from oSparc database and S3



## Clone oSparc Repo and install python dependencies

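FYI: The `%%capture` magic on the first line of the cell below suppresses the cell output for better readability. It is currently commented out (`#%%capture`); remove the leading `#` to silence the output, and keep it commented out while debugging.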

In [None]:
#%%capture
!git clone https://github.com/ITISFoundation/osparc-simcore.git
!python3 -m pip install requests
!python3 -m pip install sqlalchemy
!python3 -m pip install psycopg2-binary
!python3 -m pip install boto3
!python3 -m pip install pandas
!python3 -m pip install tqdm
!python3 -m pip install ipywidgets
!jupyter nbextension enable --py widgetsnbextension
!cd osparc-simcore/packages/postgres-database && pip install .

## Config: Endpoints and Credentials

### Static configuration variables

In [None]:
import os
# Fully-qualified domain name of the deployment; None when MACHINE_FQDN is unset
osparcURL = os.getenv('MACHINE_FQDN')

### S3 config variables

In [None]:
# S3 endpoint, bucket name and credentials, all read from the environment
# (each one is None when the corresponding variable is unset)
sourceendpointurl, sourcebucketname, sourcebucketaccess, sourcebucketsecret = (
    os.getenv(envName)
    for envName in ('S3_ENDPOINT', 'S3_BUCKET', 'S3_ACCESS_KEY', 'S3_SECRET_KEY')
)

## Begin: Main script
### Import python modules

In [None]:
# Vanilla Python
import sys
import json
import copy
import importlib
import time
from pathlib import Path
from collections import Counter
from datetime import datetime
# S3
import warnings
import boto3
from botocore.client import Config
# pgSQL
import sqlalchemy as db
from sqlalchemy.orm import sessionmaker
import psycopg2
# Pandas and Widgets
import ipywidgets as widgets
from IPython.display import display
import pandas as pd
# tqdm progressbar
from tqdm.notebook import tqdm
# Osparc-Simcore
import simcore_postgres_database
from simcore_postgres_database.models.projects import projects
from simcore_postgres_database.models.users import users
from simcore_postgres_database.models.file_meta_data import file_meta_data

## Connect to S3 bucket

In [None]:
# Connect S3
# Silence the "Adding certificate verification is strongly advised" warning
warnings.filterwarnings(
    action='ignore',
    message='.*Adding certificate verification is strongly advised.*',
)
############ Common functionality
def isObjectPresentOnBucket(botobucket, filepath,allObjectsBucket = None,filelist=None):
    """Tell whether an object whose key is exactly ``filepath`` exists.

    Args:
        botobucket: bucket-like object exposing ``objects.filter(Prefix=...)``;
            only used when neither ``filelist`` nor ``allObjectsBucket`` is given.
        filepath: the object key to look for.
        allObjectsBucket: optional object collection exposing ``filter(Prefix=...)``,
            used instead of ``botobucket.objects`` when given.
        filelist: optional container of known keys. When given, the answer is a
            plain membership test and the bucket is not queried at all.

    Returns:
        True if the key is present, False otherwise.
    """
    if filelist is not None:
        return filepath in filelist
    objectSource = botobucket.objects if allObjectsBucket is None else allObjectsBucket
    # A prefix listing also yields keys that merely start with `filepath`
    # (e.g. "a/file.txt.bak" for "a/file.txt"), so compare the key itself
    # instead of counting how many objects came back.
    return any(s3Object.key == filepath for s3Object in objectSource.filter(Prefix=filepath))
# Configure source bucket
# via https://docs.min.io/docs/how-to-use-aws-sdk-for-python-with-minio-server.html
sourceResourceOptions = {
    'endpoint_url': sourceendpointurl,
    'aws_access_key_id': sourcebucketaccess,
    'aws_secret_access_key': sourcebucketsecret,
    'config': Config(signature_version='s3v4'),
    'region_name': 'us-east-1',
    'verify': False,
}
src_s3 = boto3.resource('s3', **sourceResourceOptions)
src_bucket = src_s3.Bucket(sourcebucketname)
# No key listing has been fetched yet; a later cell fills this in
filelist = None

### Fetch all files from S3 Bucket

In [None]:
# Get all files from S3, create filelist
print("WARNING: THIS MAY TAKE A WHILE DEPENDING ON BUCKET SIZE")
print("Running...")
allObjectsSourceBucket = src_bucket.objects.all()
# Keys of every object in the bucket, kept as a set for fast membership tests
filelist = {s3Object.key for s3Object in allObjectsSourceBucket}
# Total list of files on S3:
filesOnS3 = filelist
print("Done")

Nota bene: If you plan on mutating JSON fields, see https://amercader.net/blog/beware-of-json-fields-in-sqlalchemy

### Connect to pgSQL

In [None]:
# Local import: only this cell needs it
from urllib.parse import quote_plus

# Connection settings are taken from the environment
PG_PASSWORD = os.environ.get('POSTGRES_PASSWORD')
PG_PUBLIC_ENDPOINT=os.environ.get('MACHINE_FQDN')
PG_DB=os.environ.get('POSTGRES_DB')
# Fall back to PostgreSQL's default port so that an unset POSTGRES_PORT
# no longer makes int() raise a TypeError
PG_PORT=os.environ.get('POSTGRES_PORT', '5432')
PG_USER=os.environ.get('POSTGRES_USER')
# User and password are URL-escaped: characters such as '@', ':' or '/'
# would otherwise be parsed as URL delimiters and break the connection string
pgEngineURL= "postgresql://{user}:{password}@{host}:{port}/{database}".format(
        user=quote_plus(str(PG_USER)),
        password=quote_plus(str(PG_PASSWORD)),
        database=PG_DB,
        host=PG_PUBLIC_ENDPOINT,
        port=int(PG_PORT),
    )
engine = db.create_engine(pgEngineURL)
Session = sessionmaker(bind=engine)
session = Session()
metadata = db.MetaData()
####
# Get database tables as pandas df objects
users_df = pd.read_sql_table(
    'users',
    con=engine
)
projects_df = pd.read_sql_table(
    'projects',
    con=engine
)
files_meta_data_df = pd.read_sql_table(
    'file_meta_data',
    con=engine
)

### Iterate projects table, find files referenced in project/workbench/nodes

In [None]:
def _collectWorkbenchFilePaths(ports):
    """Return the 'path' value of every dict-valued port in ``ports``.

    ``ports`` is the 'inputs' or 'outputs' mapping of one workbench node.
    Anything that is not a dict (missing section, None, scalar port values)
    contributes nothing.
    """
    if not isinstance(ports, dict):
        return []
    return [port['path'] for port in ports.values() if isinstance(port, dict) and 'path' in port]


# Paths referenced in node inputs/outputs, and, index-aligned with them,
# the project row each path was found in
foundFilePathsInProjectsTable_WorkbenchInputs = []
foundInstances_WorkbenchInput = []
foundFilePathsInProjectsTable_WorkbenchOutputs = []
foundInstances_WorkbenchOutput = []
# I. Iterate projects table, iterate workbench of every project, find inputs/outputs that are of key 'path', whitelist them
for instance in session.query(projects).order_by(projects.c.id):
    for node in instance.workbench.values():
        inputPaths = _collectWorkbenchFilePaths(node.get('inputs'))
        foundFilePathsInProjectsTable_WorkbenchInputs.extend(inputPaths)
        foundInstances_WorkbenchInput.extend([instance] * len(inputPaths))
        outputPaths = _collectWorkbenchFilePaths(node.get('outputs'))
        foundFilePathsInProjectsTable_WorkbenchOutputs.extend(outputPaths)
        foundInstances_WorkbenchOutput.extend([instance] * len(outputPaths))
# Find files we expect to be present, but they are not
garbageFilesInProjectsWorkbench = set(foundFilePathsInProjectsTable_WorkbenchInputs + foundFilePathsInProjectsTable_WorkbenchOutputs).difference( set(filesOnS3))
print("Number of files MISSING according to projects/workbench references: ",len(garbageFilesInProjectsWorkbench))

### Iterate file_meta_data table, resolve file location S3

In [None]:
session = Session()
statement = db.select([file_meta_data])
ResultProxy = session.execute(statement)
flag = True
# Result buckets, filled by the loop below
file_meta_data_files = []
file_meta_data_files_NOT_ON_S3 = []
file_meta_data_files_filesizeMinusOne_ON_S3 = []
file_meta_data_files_filesizeMinusOne_NOT_ON_S3 = []
countFoundByUuid = 0
countFoundByObjectName = 0
countFoundByFileID = 0
countFoundByRawPath = 0
debugList = []
print("Running analysis for file_meta_data table, this can take a while...")
# The table is read page by page (50 rows at a time) until it is exhausted.
# Every row is sorted into one of the buckets above:
### Healthy files: Present on S3 and filesize != -1
### Healable files: Present on S3 and filesize == -1
### Damaged files: Not present on S3
while flag:
    partial_results = ResultProxy.fetchmany(50)
    if partial_results == []:
        # Nothing left to fetch: release the cursor and leave the loop
        flag = False
        ResultProxy.close()
        continue
    for instance in partial_results:
        # Choose the key to look up and the pair of buckets it can land in
        if instance.is_soft_link == 1:
            s3Key = instance.file_id
            bucketIfPresent = file_meta_data_files
            bucketIfMissing = file_meta_data_files_NOT_ON_S3
        elif instance.file_size == -1:
            s3Key = instance.object_name
            bucketIfPresent = file_meta_data_files_filesizeMinusOne_ON_S3
            bucketIfMissing = file_meta_data_files_filesizeMinusOne_NOT_ON_S3
        else:
            s3Key = instance.object_name
            bucketIfPresent = file_meta_data_files
            bucketIfMissing = file_meta_data_files_NOT_ON_S3
        if isObjectPresentOnBucket(src_bucket, s3Key, filelist=filelist):
            bucketIfPresent.append(str(s3Key))
        else:
            bucketIfMissing.append(str(s3Key))

### Disabled: Recover S3 files from versioning wherever possible

In [None]:
# Keys missing on S3 for which the bucket still holds a real (non-"null") version
deletedFilesRecoverableOnS3UsingVersioning = []
# Deliberately disabled: flip the condition to run the versioning check
if False:
    for i in tqdm(file_meta_data_files_NOT_ON_S3 + file_meta_data_files_filesizeMinusOne_NOT_ON_S3):
        versions = src_bucket.object_versions.filter(Prefix=i)
        for version in versions:
            try:
                curVersion = version.get().get('VersionId')
            except Exception:
                # Best effort: a version that cannot be fetched counts as
                # not recoverable. `except Exception` (rather than a bare
                # `except`) keeps Ctrl-C / kernel interrupts working.
                curVersion = "null"
            if curVersion != "null":
                deletedFilesRecoverableOnS3UsingVersioning.append(i)
                break

## Print results

In [None]:
# Damaged = missing on S3, regardless of the recorded file size
numberOfDamagedFiles = len(file_meta_data_files_NOT_ON_S3) + len(file_meta_data_files_filesizeMinusOne_NOT_ON_S3)
# One tuple per output line; each tuple holds the arguments of one print() call
preliminarySummary = [
    ("Done!",),
    ("Preliminary Summary:",),
    ("##########",),
    # File Size -1: File never finished uploading -> Can be removed after checking that they dont exist in S3
    ("Number of files present on S3 but noted as corrupt in pgSQL (filesize == -1, healable): ", len(file_meta_data_files_filesizeMinusOne_ON_S3)),
    ("Number of files not present on S3 and noted as corrupt in pgSQL (filesize == -1): ", len(file_meta_data_files_filesizeMinusOne_NOT_ON_S3)),
    ("Number of MISSING files referenced in file_meta_data: ", len(file_meta_data_files_NOT_ON_S3)),
    ("Number of MISSING files recoverable using bucket versioning: ", len(deletedFilesRecoverableOnS3UsingVersioning)),
    ("##########",),
    ("Number of files healthy: ", len(file_meta_data_files)),
    ("Number of files damaged: ", numberOfDamagedFiles),
    ("Number of files healable: ", len(file_meta_data_files_filesizeMinusOne_ON_S3)),
]
for summaryLine in preliminarySummary:
    print(*summaryLine)

### Dig deeper into the properties of missing files & inconsistencies

In [None]:
# Whitelist: every path that is backed by file_meta_data (and present on S3)
# or referenced from a project's workbench inputs/outputs
whitelistedFiles = list(
    set(file_meta_data_files)
    | set(foundFilePathsInProjectsTable_WorkbenchInputs)
    | set(foundFilePathsInProjectsTable_WorkbenchOutputs)
)
# Whatever is on S3 but not whitelisted is not referenced in the DB
garbageFilesOnS3 = set(filesOnS3).difference(whitelistedFiles)

In [None]:
print("Running additional analysis for file_meta_data table, this can take a while...")
# Get total list of files that are damaged (we expect them to be present, they are not).
# Despite its name, this set also contains the paths referenced in projects/workbench
# that are missing on S3, in addition to the missing file_meta_data entries.
garbageFilesInFileMetaData = garbageFilesInProjectsWorkbench.union(set(file_meta_data_files_NOT_ON_S3)).union(set(file_meta_data_files_filesizeMinusOne_NOT_ON_S3))
#
# We build so-called boolean lists (masks), which is a way to filter pandas dataframes.
# Projects whose uuid equals the first path segment of at least one damaged file.
maskProjectsContainingFilesMissingOnS3 = projects_df.uuid.isin([i.split('/')[0] for i in garbageFilesInFileMetaData])
#############################
#
#
# To be safe, work on a copy of the dataframe
files_meta_data_df_copy = copy.deepcopy(files_meta_data_df)

### MASKS
# All masks below have one entry per row of files_meta_data_df_copy.
# project_id of the entry does not match any uuid in the projects table / the opposite
maskAssocProjectNotInProjectDB = ~files_meta_data_df_copy.project_id.isin(projects_df['uuid'])
maskAssocProjectValid = ~maskAssocProjectNotInProjectDB
# First path segment of object_name matches exactly one project uuid, but the second
# path segment does not occur in that project's JSON-serialised workbench.
# NOTE: x.split("/")[1] raises IndexError for an object_name without "/" whose whole
# value matches exactly one project uuid.
maskProjectValidAndNodeNonExistant = files_meta_data_df_copy['object_name'].apply(lambda x: sum(projects_df.uuid.isin([x.split("/")[0]])) == 1 \
                                                                                                and not x.split("/")[1] in json.dumps(projects_df[projects_df.uuid.isin([x.split("/")[0]])].iloc[0]['workbench']))
# Same as above, but additionally true when no project uuid matches the first path segment
maskProjectNonExistantOrNodeNonExistant = files_meta_data_df_copy['object_name'].apply(lambda x: sum(projects_df.uuid.isin([x.split("/")[0]])) == 0 or (sum(projects_df.uuid.isin([x.split("/")[0]])) == 1 \
                                                                                                and not x.split("/")[1] in json.dumps(projects_df[projects_df.uuid.isin([x.split("/")[0]])].iloc[0]['workbench'])))
# Plain substring tests on the whole object_name (not only on the file name or extension)
maskDoesntContainLog = files_meta_data_df_copy['object_name'].apply(lambda x: "log" not in x)
maskDoesntContainZip = files_meta_data_df_copy['object_name'].apply(lambda x: "zip" not in x)
# These files are API Files with project_id == NULL, dont delete them.
# True when str(project_id) is not contained in object_name, or file_id contains "api/",
# or is_soft_link is in ["1"].
# NOTE(review): is_soft_link is compared against the string "1" here, whereas the
# file_meta_data scan tests `is_soft_link == 1` - confirm the column dtype, this term
# may never match.
maskTheseAreAPIFiles = ~files_meta_data_df_copy.apply(lambda x: str(x.project_id) in str(x.object_name), axis=1) | files_meta_data_df_copy['file_id'].apply(lambda x: "api/" in x) | files_meta_data_df_copy.is_soft_link.isin(["1"])
# object_name is (not) among the paths found in projects/workbench inputs or outputs
maskObjectNotReferencedInProjectsWorkbench = ~files_meta_data_df_copy.object_name.isin(foundFilePathsInProjectsTable_WorkbenchInputs + foundFilePathsInProjectsTable_WorkbenchOutputs)
maskObjectReferencedInProjectWorkbench = ~maskObjectNotReferencedInProjectsWorkbench
# object_name is one of the damaged (missing on S3) paths collected above
maskFileIsMissingOnS3 = files_meta_data_df_copy.object_name.isin(garbageFilesInFileMetaData)
#
# After a talk with Sylvain, we filter a bit more:
# A file present on file_meta_data, present on S3, with a valid projectID, not referenced in projects/workbench is NOT an indication of a broken project
# if: The file is not a zip file AND the filename does not contain "log"
# else: The entry in file_meta_data can be deleted
# Deletable (1): valid project, not referenced in any workbench, object_name contains
# neither "log" nor "zip", and not an API file
maskCanBeDeleted1 = maskAssocProjectValid & maskObjectNotReferencedInProjectsWorkbench
maskCanBeDeleted1 = maskCanBeDeleted1 & maskDoesntContainLog
maskCanBeDeleted1 = maskCanBeDeleted1 & maskDoesntContainZip
maskCanBeDeleted1 = maskCanBeDeleted1 & ~maskTheseAreAPIFiles
#
# Deletable (2): project exists but the node does not, the file is not missing on S3,
# and it is not referenced in any workbench
maskCanBeDeleted2 = maskProjectValidAndNodeNonExistant & ~maskFileIsMissingOnS3
maskCanBeDeleted2 = maskCanBeDeleted2 & maskObjectNotReferencedInProjectsWorkbench
#
# Deletable (3): project or node does not exist, not an API file, not referenced in any workbench
maskCanBeDeleted3 = maskProjectNonExistantOrNodeNonExistant & ~maskTheseAreAPIFiles 
maskCanBeDeleted3 = maskCanBeDeleted3 & maskObjectNotReferencedInProjectsWorkbench
#
#
# An entry is expected to be valid when none of the three deletion rules applies
maskFilesExpectedValid = ~maskCanBeDeleted1 & ~maskCanBeDeleted2 & ~maskCanBeDeleted3
maskCanBeDeleted = ~maskFilesExpectedValid
#
# Expected valid, yet missing on S3
maskFilesInFileMetaData_ExpectedValidButActuallyBroken = maskFilesExpectedValid & maskFileIsMissingOnS3
#
#
print("Done...")

In [None]:
# One tuple per output line; each tuple holds the arguments of one print() call
inconsistencyReport = [
    ("########",),
    ("Files / db-entries to be cleaned-up:",),
    ("Number of files in S3 not referenced in the database: ", len(garbageFilesOnS3)),
    ("    --> These S3 files should be deleted.",),
    ("Number of file_meta_data entries that are not API files and not referenced in projects/workbench, where assoc. projectID is non-existant: ", sum(maskAssocProjectNotInProjectDB & ~maskTheseAreAPIFiles & maskObjectNotReferencedInProjectsWorkbench)),
    ("Number of file_meta_data entries that are not API files and not referenced in projects/workbench, with a valid projectID, and filename doesn't contain log or zip: ", sum(maskCanBeDeleted1)),
    ("Number of file_meta_data entries that are not API files and not referenced in projects/workbench, with a valid projectID, but the nodeID is non-existant: ", sum(maskCanBeDeleted2)),
    ("    --> These DB entries and associated S3 files should be deleted",),
    ("Number of files missing in S3 but referenced in file_meta_data entries: ", sum(maskFileIsMissingOnS3)),
    ("    ... from these files, number of files that we expected valid: ", sum(maskFilesInFileMetaData_ExpectedValidButActuallyBroken)),
    ("    --> These DB entries should be deleted, and we should be aware of projects that still point to these files.",),
    ("TOTAL NUMBER of file_meta_data entries to be deleted: ", sum(maskCanBeDeleted)),
    ("########",),
    ("Broken / missing / inconsistent files or projects:",),
    ("Number of files referenced in projects/workbench not present in S3: ", len(garbageFilesInProjectsWorkbench)),
    ("    --> The assoc. projects might not work anymore and the user should be informed.",),
    ("Number of healable files, where files are present on S3 but file_size in file_meta_data is -1 : ", len(file_meta_data_files_filesizeMinusOne_ON_S3)),
    ("    --> These DB entries should be updated with the proper filesize.",),
    ("Number of projects with associated files in the file_meta_data table that are missing in S3: ", sum(maskProjectsContainingFilesMissingOnS3)),
    ("    --> These projects *might* not work anymore and the user should be informed.",),
    ("Number of invalid file_meta_data entries referenced by projects: ", sum(maskFilesInFileMetaData_ExpectedValidButActuallyBroken)),
    ("    --> These projects *will* not work anymore and the user should be informed.",),
]
for reportLine in inconsistencyReport:
    print(*reportLine)
####
# Command-line mode: when ASSESS_INCONSISTENT_DATA_CMD_RUN is set in the environment,
# add up all inconsistency counters and abort with an error if any of them is non-zero.
if "ASSESS_INCONSISTENT_DATA_CMD_RUN" in os.environ:
    # returncode is only compared against zero below, so it does not matter that
    # maskFilesInFileMetaData_ExpectedValidButActuallyBroken is added twice.
    returncode = len(garbageFilesOnS3)
    returncode += sum(maskAssocProjectNotInProjectDB & ~maskTheseAreAPIFiles & maskObjectNotReferencedInProjectsWorkbench)
    returncode += sum(maskCanBeDeleted1)
    returncode += sum(maskCanBeDeleted2)
    returncode += sum(maskFileIsMissingOnS3)
    returncode += sum(maskFilesInFileMetaData_ExpectedValidButActuallyBroken)
    returncode += sum(maskCanBeDeleted)
    returncode += len(garbageFilesInProjectsWorkbench)
    returncode += len(file_meta_data_files_filesizeMinusOne_ON_S3)
    returncode += sum(maskProjectsContainingFilesMissingOnS3)
    returncode += sum(maskFilesInFileMetaData_ExpectedValidButActuallyBroken)
    if returncode != 0:
        print("DATA INCONSISTENCIES DETECTED")
        exit(1) # This is invalid ipython syntax, which will cause an error, which in turn causes the ipython command to fail with exit code 1. :)
# NOTE(review): this exit(1) is unconditional - it is reached whenever the env variable
# is unset or no inconsistency was found. Presumably it is meant to stop a top-to-bottom
# run before the interactive and destructive cells below; confirm that failing the run
# in the "no inconsistencies" case is intended.
exit(1)

## GUI and Analysis: Which user has broken projects?

In [None]:
# DEFINE HELPER FOR DROPDOWN
# via https://towardsdatascience.com/bring-your-jupyter-notebook-to-life-with-interactive-widgets-bc12e03f0916
ALL = 'ALL'
def unique_sorted_values_plus_ALL(array):
    """Return the distinct values of ``array`` in ascending order, preceded by ALL.

    ``array`` must offer ``unique().tolist()`` (e.g. a pandas Series).
    """
    return [ALL] + sorted(array.unique().tolist())
# Generate a list of users for the dropdown
# (lazy query object; not referenced again in the visible part of this notebook)
usersList = session.query(users).order_by(users.c.email)



# DamagedProjects contains projects with files referenced in projects/workbench that are not there
# Collect projects entries where damaged files are present.
# Map every referenced path to the first project row it was found in. Inputs come
# first and setdefault() keeps the first hit, which matches the former
# "inputs.index(), else outputs.index()" lookup without rescanning the lists.
firstProjectByPath = {}
for referencedPath, projEntry in zip(
    foundFilePathsInProjectsTable_WorkbenchInputs + foundFilePathsInProjectsTable_WorkbenchOutputs,
    foundInstances_WorkbenchInput + foundInstances_WorkbenchOutput,
):
    firstProjectByPath.setdefault(referencedPath, projEntry)
damagedProjects = [firstProjectByPath[curItem] for curItem in garbageFilesInProjectsWorkbench]


damagedProjectsUUIDs = [i.uuid for i in damagedProjects]

file_meta_data_df_broken_and_referenced = files_meta_data_df_copy[maskFilesInFileMetaData_ExpectedValidButActuallyBroken]
# Add projects with broken file_meta_data entries
boolean_series = projects_df.uuid.isin(damagedProjectsUUIDs)
boolean_series = boolean_series | projects_df.uuid.isin(file_meta_data_df_broken_and_referenced.project_id)
# Add projects with links to broken files in workbench.
# The workbench column holds dicts (the dropdown handler calls .keys() on its cells),
# and .str.contains() yields NaN for non-string cells, so it never matched anything.
# Serialise each workbench to JSON text once and search that text instead.
workbenchAsText = projects_df.workbench.apply(json.dumps)
# dropna(): a missing project_id would otherwise be searched for as the text "None"/"nan"
for brokenProjectID in file_meta_data_df_broken_and_referenced.project_id.dropna().unique():
    boolean_series = boolean_series | workbenchAsText.str.contains(str(brokenProjectID), case=False, regex=False)
# Add projects with associated broken file_meta_data files as matched by the projectUUID in the S3-filename
# (prefixes are de-duplicated; OR-ing the same mask twice changes nothing)
for uuidPrefix in {str(objectName).split('/')[0] for objectName in file_meta_data_df_broken_and_referenced.object_name.tolist()}:
    boolean_series = boolean_series | projects_df.uuid.str.contains(uuidPrefix, case=False)

# Pandas Dataframe: Containing Broken projects
df_garbage_in_database = projects_df[boolean_series]

# Build dropdown: only owners of at least one broken project are offered
usersListFilter = users_df.id.isin(df_garbage_in_database.prj_owner)
users_df = users_df[usersListFilter]
dropdown_user = widgets.Dropdown(options  = unique_sorted_values_plus_ALL(users_df.email))
print("Browse projects with missing files here, by project owner:")
display(dropdown_user)
output_user = widgets.Output()
display(output_user)
ListOfCurrentSelectedProjectUUIDs = []
def _workbenchFilePaths(workbench):
    """Return every 'path' found in the inputs and outputs of all workbench nodes.

    Sections or ports that are not dicts are skipped.
    """
    paths = []
    for node in workbench.values():
        for section in ('inputs', 'outputs'):
            ports = node.get(section)
            if not isinstance(ports, dict):
                continue
            paths.extend(port['path'] for port in ports.values() if isinstance(port, dict) and 'path' in port)
    return paths


def dropdown_user_eventhandler(change):
    """Refresh the output widget for the user selected in the dropdown.

    With ALL selected, the complete table of broken projects is displayed.
    Otherwise the broken projects owned by the selected user are displayed
    (most recently changed first) and, per project, the broken files found
    via file_meta_data and via projects/workbench are printed.

    Args:
        change: widget change event; ``change.new`` holds the selected value
            (ALL or a user email).
    """
    output_user.clear_output()
    with output_user:
        if change.new == ALL:
            display(df_garbage_in_database)
            return
        # Now we need to go the reverse way, and for a given projectID find the broken files.
        userID = users_df.loc[users_df['email'] == change.new].iloc[0].id
        # Filter once instead of re-applying the boolean mask for every lookup
        userProjects = df_garbage_in_database[df_garbage_in_database['prj_owner'] == userID]
        display(userProjects.sort_values(by="last_change_date",ascending=False))

        for _, project in userProjects.iterrows():
            # For each project, get list of files referenced in project/workbench
            linkedFileList = _workbenchFilePaths(project['workbench'])
            uuid = project['uuid']
            # Get Broken entries in file_meta_data for this project
            fileMetaDataGarbage = file_meta_data_df_broken_and_referenced.project_id.isin([uuid])
            if fileMetaDataGarbage.any():
                print("Broken (missing on S3) files referenced in file_meta_data for project ",uuid," :\n",file_meta_data_df_broken_and_referenced[fileMetaDataGarbage].object_name.tolist())
            # Missing workbench files that mention this project's uuid or one of its
            # linked paths; sorted so the printed list is stable between runs
            strings_with_substring = sorted(
                garbageFile for garbageFile in garbageFilesInProjectsWorkbench
                if uuid in garbageFile or any(linkedFile in garbageFile for linkedFile in linkedFileList)
            )
            if strings_with_substring:
                print("Broken (missing on S3) files referenced in projects/workbench for project ",uuid," :\n",strings_with_substring)


# Re-render whenever the selected dropdown value changes
dropdown_user.observe(dropdown_user_eventhandler, names='value')

--------------
# Delete S3 files

In [None]:
print("Will delete ",len(garbageFilesOnS3), " files on S3 unreferenced in osparc (not referenced in file_meta_data or projects/workbench).")
# Only the exact answer YES enables the backup
saveDataPriorToDeletion = input("Type YES to backup data to another bucket before deletion: ") == "YES"
print("Will back up data prior to deletion." if saveDataPriorToDeletion else "Will NOT back up data prior to deletion.")
if saveDataPriorToDeletion:
    # Endpoint and credentials of the backup target are left empty here
    dest_s3 = boto3.resource(
        's3',
        endpoint_url='',
        aws_access_key_id='',
        aws_secret_access_key='',
        #config=Config(signature_version='s3v4'),
        region_name='us-east-1',
        verify=False,
    )
    # One backup bucket per day
    saveBucketName = f"backupmaster{datetime.today():%Y%m%d}"
    # Check if bucket already exists, if not, create
    from botocore.client import ClientError
    import time
    try:
        dest_s3.meta.client.head_bucket(Bucket=saveBucketName)
    except ClientError:
        dest_s3.create_bucket(Bucket=saveBucketName)
    #
    time.sleep(0.25)
    dest_bucket = dest_s3.Bucket(saveBucketName)
    
def deleteDataListS3(inputlist,src_bucket,saveDataBucket = None, saveBucketName = None):
    """Delete the given keys from ``src_bucket``, optionally backing them up first.

    The user is asked once more for confirmation; anything other than YES
    leaves the bucket untouched. Keys are processed in batches of 500.

    Args:
        inputlist: collection of object keys to delete.
        src_bucket: boto3 Bucket resource the keys are deleted from.
        saveDataBucket: optional boto3 Bucket resource; when given, every object
            is copied there (same key) before its batch is deleted.
        saveBucketName: unused; kept so existing calls keep working. The copy
            target is ``saveDataBucket`` itself.
    """
    if input("enter YES again to start deletion") != "YES":
        return
    # Materialise once: converting the collection for every single element was quadratic
    keysToDelete = list(inputlist)
    pageIncrement = 500
    for batchStart in range(0, len(keysToDelete), pageIncrement):
        deletionList = keysToDelete[batchStart:batchStart + pageIncrement]
        print("Processing batch of ",len(deletionList)," files...")
        if saveDataBucket is not None:
            for key in deletionList:
                # The copy source is the bucket the object currently lives in.
                # Naming the backup bucket here made the backup read from its own
                # target, so nothing was saved before deletion.
                copy_source = {
                    'Bucket': src_bucket.name,
                    'Key': key
                }
                # SourceClient: source-side requests must go through the source
                # bucket's client, which may use another endpoint than the backup
                saveDataBucket.copy(copy_source, key, SourceClient=src_bucket.meta.client)
        # Short pause between batches
        time.sleep(0.10)
        deletionDict = {'Objects':[{'Key': key} for key in deletionList]}
        src_bucket.delete_objects(Delete=deletionDict)
if input("enter YES to start deletion") != "YES":
    print("DELETION ABORTED!")
else:
    # Append the keys about to be deleted to a dated log file
    filenameout = datetime.now().strftime("%Y_%m_%d") + '_S3Deletion.txt'
    with open(filenameout, 'a') as f:
        f.writelines(f"{item}\n" for item in garbageFilesOnS3)
        f.write("----------\n")
    # Pass the backup target only when a backup was requested
    backupArguments = (dest_bucket, saveBucketName) if saveDataPriorToDeletion else ()
    deleteDataListS3(garbageFilesOnS3, src_bucket, *backupArguments)

# Delete file_meta_data entries

In [None]:
# object_name of every file_meta_data row flagged as deletable
markedForDeletionInFileMetaData = files_meta_data_df_copy[maskCanBeDeleted].object_name.to_list()
print("Getting ready to delete ",len(markedForDeletionInFileMetaData), " items in file_meta_data table.")
if input("enter YES to delete unused file references in file_meta_data table.") == "YES":
    file_meta_data_sqla = file_meta_data
    # process1: flagged entries whose object is present on S3
    process1 = set(markedForDeletionInFileMetaData).intersection(set(file_meta_data_files).union(file_meta_data_files_filesizeMinusOne_ON_S3))
    # process2: flagged entries whose object is missing on S3
    process2 = set(markedForDeletionInFileMetaData).intersection(set(file_meta_data_files_NOT_ON_S3).union(file_meta_data_files_filesizeMinusOne_NOT_ON_S3))
    # engine.begin() runs both DELETEs in one transaction: it commits when the
    # block succeeds and rolls back if either statement fails, so the table is
    # never left half cleaned.
    with engine.begin() as connection:
        for step, objectNames in enumerate((process2, process1), start=1):
            print(f"Executing {step}/2")
            if not objectNames:
                # Nothing to delete in this group; skip the empty IN (...) statement
                continue
            statement = db.delete(file_meta_data_sqla).where(file_meta_data_sqla.c.object_name.in_(list(objectNames))).execution_options(synchronize_session="fetch")
            result = connection.execute(statement)
else:
    print("DELETION ABORTED!")