# Administration: List projects ordered by size

## Clone oSparc Repo and install python dependencies

FYI: Output is suppressed here for better readability. To debug, remove the '%%capture' line at the top of the cell.

In [None]:
%%capture
!git clone https://github.com/ITISFoundation/osparc-simcore.git
!python -m pip install requests
!python -m pip install sqlalchemy
!python -m pip install 
!python -m pip install boto3
!python -m pip install tqdm
!cd osparc-simcore/packages/postgres-database && pip install .

## Config: Endpoints and Credentials

### PG config variables

In [None]:
# PostgreSQL connection settings. PG_ENDPOINT is written as "<host>:<port>".
PG_USER = "postgres_osparc"
PG_PASSWORD = "testertester"
PG_ENDPOINT = "postgres.osparc.test:5432"
PG_DB = "simcoredb"

### S3 config variables

In [None]:
# NOTE(review): getpass is imported but not called in this cell - presumably meant for prompting for the secrets.
from getpass import getpass

# S3 source bucket settings.
# The endpoint URL must include its scheme prefix, e.g. https://
sourceendpointurl = "https://storage.osparc.local"
sourcebucketname = "master-simcore"
sourcebucketaccess = "testertester"
sourcebucketsecret = "testertester"

## Begin: Main script
### Import Python modules

In [None]:
# Vanilla Python
import os
import sys
import json
import copy
import importlib
import time
import math
from pathlib import Path
from collections import Counter
from datetime import datetime
# S3
import warnings
import boto3
from botocore.client import Config
# pgSQL
import sqlalchemy as db
from sqlalchemy.orm import sessionmaker
import psycopg2
# Pandas and Widgets
import ipywidgets as widgets
from IPython.display import display
import pandas as pd
# tqdm progressbar
from tqdm.notebook import tqdm
# Osparc-Simcore
import simcore_postgres_database
from  simcore_postgres_database.models.projects import projects
from simcore_postgres_database.models.users import users
from simcore_postgres_database.models.file_meta_data import file_meta_data
import urllib3


### Connect to pgSQL

In [None]:
# Assemble the SQLAlchemy connection URL. PG_ENDPOINT is split on ":" into
# its host part (index 0) and its numeric port part (index 1).
_pg_endpoint_parts = PG_ENDPOINT.split(":")
_pg_url_fields = {
    "user": PG_USER,
    "password": PG_PASSWORD,
    "database": PG_DB,
    "host": _pg_endpoint_parts[0],
    "port": int(_pg_endpoint_parts[1]),
}
pgEngineURL = "postgresql://{user}:{password}@{host}:{port}/{database}".format(**_pg_url_fields)

# Engine, a session bound to it, and an empty MetaData container.
engine = db.create_engine(pgEngineURL)
Session = sessionmaker(bind=engine)
session = Session()
metadata = db.MetaData()

# Load the database tables as pandas DataFrame objects.
users_df = pd.read_sql_table("users", con=engine)
projects_df = pd.read_sql_table("projects", con=engine)
files_meta_data_df = pd.read_sql_table("file_meta_data", con=engine)

# Silence urllib3's InsecureRequestWarning (the S3 client further down is
# created with verify=False).
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)


### Iterate over the files of each project and their versions

In [None]:
# Join every project with the file_meta_data rows whose project_id matches the
# project's uuid; each resulting row carries the columns of both tables.
result = session.query(projects, file_meta_data).join(
    file_meta_data, projects.c.uuid == file_meta_data.c.project_id
).all()

# Group the joined rows per project. A dict keyed by project uuid gives an
# O(1) lookup per row instead of scanning the list of projects collected so
# far; dicts keep insertion order, so projects stay in first-seen order.
projects_by_uuid = {}
for row in result:
    entry = projects_by_uuid.get(row.uuid)
    if entry is None:
        projects_by_uuid[row.uuid] = {
            "project_uuid": row.uuid,
            "project_title": row.name,
            "project_last_update": row.last_change_date,
            "project_own": row.prj_owner,
            "total_size": 0,  # accumulated later from the S3 object versions
            "list_files": [row.object_name],
        }
    else:
        entry["list_files"].append(row.object_name)

# One dict per project: identification fields, a running total size and the
# object names of all files that belong to the project.
listProjectswithFiles = list(projects_by_uuid.values())

        
# Now we look for the actual size of the files including all their versions

# Configure source bucket
# via https://docs.min.io/docs/how-to-use-aws-sdk-for-python-with-minio-server.html
s3_client = boto3.client(
    's3',
    endpoint_url=sourceendpointurl,
    aws_access_key_id=sourcebucketaccess,
    aws_secret_access_key=sourcebucketsecret,
    config=Config(signature_version='s3v4'),
    region_name='us-east-1',
    verify=False,  # TLS certificate verification is switched off for this endpoint
)

# A single list_object_versions call returns only one page of results, so use
# the paginator to make sure every version of every matching key is counted.
versions_paginator = s3_client.get_paginator('list_object_versions')

for project in listProjectswithFiles:
    for file in project["list_files"]:
        try:
            # Prefix matching: every key that starts with the file's object name is included.
            for page in versions_paginator.paginate(Bucket=sourcebucketname, Prefix=file):
                # "Versions" is absent from a page that lists no object versions.
                project["total_size"] += sum(
                    int(version["Size"]) for version in page.get("Versions", [])
                )
        except Exception as e:
            # Same exception type and message as before; chain explicitly so the
            # original error is reported as the direct cause.
            raise Exception("Exception " + str(e)) from e




### Order projects by size

In [None]:
def convert_size(size_bytes):
    """Return a human-readable string for a size given in bytes.

    The value is scaled by powers of 1024 and rounded to two decimals,
    e.g. ``1536`` -> ``"1.5 KB"``. Zero is returned as ``"0B"``.

    Args:
        size_bytes: Non-negative size in bytes (int or float).

    Returns:
        The formatted size, e.g. ``"117.74 MB"``.

    Raises:
        ValueError: If ``size_bytes`` is negative.
    """
    if size_bytes == 0:
        return "0B"
    if size_bytes < 0:
        raise ValueError("size_bytes must not be negative")
    size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")
    value = float(size_bytes)
    i = 0
    # Repeated division by 1024 is exact in binary floating point, which avoids
    # the rounding pitfalls of floor(log(size, 1024)). Stopping at the last
    # unit keeps huge values in "YB" instead of indexing past the tuple, and
    # values below one byte stay in "B".
    while value >= 1024 and i < len(size_name) - 1:
        value /= 1024
        i += 1
    return "%s %s" % (round(value, 2), size_name[i])

# Largest projects first, ordered by their total size.
listProjectswithFiles.sort(key=lambda entry: entry["total_size"], reverse=True)

# Replace each total with its human-readable form and echo it.
for project in listProjectswithFiles:
    readable_size = convert_size(project["total_size"])
    project["total_size"] = readable_size
    print(readable_size)

## Print results

In [None]:
print("Done!")
print("##########")
# One summary line per project, in the order of listProjectswithFiles.
for project in listProjectswithFiles:
    print(
        f"Total size : {project['total_size']!s}"
        f" - UUID : {project['project_uuid']!s}"
        f" - Last update : {project['project_last_update']!s}"
        f" - Owner : {project['project_own']!s}"
    )