In [None]:
from os import scandir, makedirs
from os.path import join

from posixpath import basename
from collections import Counter

import svg
import numpy as np
import pandas as pd

from google.cloud.storage import Client

# Definitions

In [18]:
def skim(d):
  """Recursively tally file sizes under directory ``d``.

  Returns a dict keyed by entry name. Regular files map to their size in
  bytes, symlinks map to the string 'link', sub-directories map to their own
  nested dict, and entries that raise PermissionError map to 'Restricted'
  (or 'Restricted Dir' when a sub-directory cannot be scanned). The special
  key '?' holds the cumulative byte count of everything below ``d``.
  """
  sizes = {'?': 0}
  pending = []  # (name, path) of sub-directories, visited after the scan closes
  with scandir(d) as entries:
    for entry in entries:
      try:
        # symlinks are tested first so they are never followed
        if entry.is_symlink():
          sizes[entry.name] = 'link'
          continue
        if entry.is_file():
          sizes[entry.name] = entry.stat().st_size
          sizes['?'] += sizes[entry.name]
          continue
        if entry.is_dir():
          pending.append((entry.name, entry.path))
      except PermissionError:
        sizes[entry.name] = 'Restricted'

  for name, path in pending:
    try:
      sizes[name] = skim(path)
      sizes['?'] += sizes[name]['?']
    except PermissionError:
      sizes[name] = 'Restricted Dir'
  return sizes
  
def bucket_skim(bucket):
  """List every blob in ``bucket`` and summarise sizes as a table and a tree.

  Parameters
  ----------
  bucket : object exposing ``list_blobs()``; each yielded blob must provide
    ``name`` (slash-separated object path) and ``size`` (bytes).

  Returns
  -------
  (df, tree)
    ``df`` is a DataFrame indexed by full blob name with columns
    'basename' and 'size'. ``tree`` is a nested dict: directory components
    are keyed with a trailing '/', files by their basename, and the key '?'
    holds the cumulative size of everything below that node.

  An empty bucket yields an empty frame (with the same columns/index) and
  ``{'?': 0}`` instead of raising.
  """
  cols = ['name', 'basename', 'size']
  data = []
  tree = {'?': 0}
  for b in bucket.list_blobs():
    n = basename(b.name)
    data.append((b.name, n, b.size))

    tree['?'] += b.size
    c = tree
    # descend through the directory components, adding this blob's size to
    # every ancestor on the way down
    for p in b.name.split('/')[:-1]:
      p += '/'  # trailing slash keeps directory keys distinct from file keys
      c = c.setdefault(p, {'?': 0})
      c['?'] += b.size
    c[n] = b.size

  # Passing columns explicitly keeps the schema when there are no rows;
  # zipping an empty list produced a column-less frame and set_index failed.
  df = pd.DataFrame(data, columns=cols).set_index('name')
  return df, tree

prefs = ['', 'k', 'M', 'G', 'T']
def hsize(bytes):
  """Format a byte count as a short human-readable string.

  Uses binary steps (1024 per prefix) and one decimal place, e.g.
  ``hsize(1536) == '1.5 kb'``. Zero is rendered as '0 b'. Values beyond the
  largest known prefix are expressed in that largest unit rather than
  raising IndexError.
  """
  if bytes == 0: return '0 b'
  # bit_length gives floor(log2) exactly for integers, avoiding float
  # rounding just below a power of two
  M = (abs(int(bytes)).bit_length() - 1) // 10
  # clamp so sub-byte and >= 2**50 inputs still map to a valid prefix
  M = min(max(M, 0), len(prefs) - 1)
  V = bytes / (2 ** (M * 10))
  return f'{V:.1f} {prefs[M]}b'


In [3]:
def bow_d(cx, cy, rO, rI, a0, a1):
  """Build SVG path commands for an annular sector ("bow").

  Parameters
  ----------
  cx, cy : centre of the ring.
  rO, rI : radii of the two edges; the first arc is drawn at ``rO`` and the
    return arc at ``rI``.
  a0, a1 : start and end angles in degrees (wrapped modulo 360).

  Returns
  -------
  list of path commands. For a partial sweep: MoveTo, Arc along ``rO``,
  LineTo the other edge, Arc back along ``rI``, ClosePath. When the sweep is
  a non-zero whole number of turns the start and end points coincide, and an
  SVG arc with identical endpoints is not rendered; in that case each edge
  is emitted as its own closed sub-path made of two half-turn arcs.
  """
  # Decide this before wrapping: the modulo erases the difference between a
  # 0 degree and a 360 degree sweep.
  sweep = a1 - a0
  turn = sweep % 360
  full = (not np.isclose(sweep, 0)) and bool(np.isclose(turn, 0) or np.isclose(turn, 360))

  a0 %= 360
  a1 %= 360
  C = np.array([cx, cy])

  if full:
    A = np.deg2rad([a0, a0 + 180])
    V = np.transpose([np.cos(A), np.sin(A)])
    EO = rO * V + C
    EI = rI * V + C
    return [
      svg.MoveTo(*EO[0]),
      svg.Arc(rO, rO, 0, False, True, *EO[1]),
      svg.Arc(rO, rO, 0, False, True, *EO[0]),
      svg.ClosePath(),
      # second edge runs the opposite way so the ring's centre stays unfilled
      svg.MoveTo(*EI[0]),
      svg.Arc(rI, rI, 0, False, False, *EI[1]),
      svg.Arc(rI, rI, 0, False, False, *EI[0]),
      svg.ClosePath()
    ]

  A = np.deg2rad([a0, a1])
  V = np.transpose([np.cos(A), np.sin(A)])
  EO = rO * V + C
  EI = rI * V + C

  # large-arc flag: needed whenever the sector spans more than half a turn
  larger = (a1 - a0)%360 > 180
  return [
    svg.MoveTo(*EO[0]),
    svg.Arc(rO, rO, 0, larger, True, *EO[1]),
    svg.LineTo(*EI[1]),
    svg.Arc(rI, rI, 0, larger, False, *EI[0]),
    svg.ClosePath()
  ]

def arc_d(cx, cy, r, a0, a1):
  """Build SVG path commands for an open circular arc.

  Parameters
  ----------
  cx, cy : centre of the circle.
  r : radius.
  a0, a1 : start and end angles in degrees (wrapped modulo 360).

  Returns
  -------
  list of path commands: MoveTo the start point followed by one Arc. When
  the sweep is a non-zero whole number of turns the endpoints coincide and a
  single SVG arc would not be rendered, so the circle is emitted as two
  half-turn arcs instead.
  """
  # Decide this before wrapping: the modulo erases the difference between a
  # 0 degree and a 360 degree sweep.
  sweep = a1 - a0
  turn = sweep % 360
  full = (not np.isclose(sweep, 0)) and bool(np.isclose(turn, 0) or np.isclose(turn, 360))

  a0 %= 360
  a1 %= 360
  C = np.array([cx, cy])

  if full:
    A = np.deg2rad([a0, a0 + 180])
    V = np.transpose([np.cos(A), np.sin(A)])
    EO = r * V + C
    return [
      svg.MoveTo(*EO[0]),
      svg.Arc(r, r, 0, False, True, *EO[1]),
      svg.Arc(r, r, 0, False, True, *EO[0])
    ]

  A = np.deg2rad([a0, a1])
  V = np.transpose([np.cos(A), np.sin(A)])
  EO = r * V + C

  # large-arc flag: needed whenever the arc spans more than half a turn
  larger = (a1 - a0)%360 > 180
  return [
    svg.MoveTo(*EO[0]),
    svg.Arc(r, r, 0, larger, True, *EO[1])
  ]

def qbow(r, w, a0, a1):
  """Origin-centred bow of width ``w`` straddling radius ``r``.

  Delegates to ``bow_d`` with centre (0, 0); the first radius passed is
  ``r - w/2`` and the second is ``r + w/2``.
  """
  half = w/2
  return bow_d(0, 0, r - half, r + half, a0, a1)

def qarc(r, a0, a1):
  """Origin-centred arc of radius ``r`` from ``a0`` to ``a1`` (via ``arc_d``)."""
  origin = (0, 0)
  return arc_d(*origin, r, a0, a1)


In [4]:
def bargram(tree, level=1, z=0, pre=''):
  """Flatten a size tree into a list of bar tuples.

  Each bar is ``(level, kind, path, start, size)`` where ``kind`` is 0 for a
  directory and 1 for a file, ``start`` is the running offset (beginning at
  ``z``) and ``size`` the extent. At ``level == 1`` a root bar
  ``(0, 0, pre, 0, tree['?'])`` is emitted first. Within a node, directories
  are laid out first (each followed by its own descendants), then files.
  Zero-sized entries and values that are neither dict nor int are skipped.
  """
  out = []
  if level == 1:
    out.append((0, 0, pre, 0, tree['?']))

  children = [(name, val) for name, val in tree.items() if name != '?']
  offset = 0

  # directories first, each immediately followed by its sub-tree
  for name, sub in children:
    if not isinstance(sub, dict) or sub['?'] == 0:
      continue
    path = join(pre, name)
    start = offset + z
    out.append((level, 0, path, start, sub['?']))
    out.extend(bargram(sub, level + 1, start, path))
    offset += sub['?']

  # then the plain files of this node, continuing from the same offset
  for name, size in children:
    if isinstance(size, dict) or not isinstance(size, int) or size == 0:
      continue
    out.append((level, 1, join(pre, name), offset + z, size))
    offset += size

  return out

In [5]:
# One palette row per ring (selected by level % 4):
# [directory fill, file fill, stroke]
colors = [['#59158D', '#AD65E5', '#360958'],
          ['#D39613', '#FFCF66', '#845B04'],
          ['#3C1B91', '#8D6BE6', '#220C5A'],
          ['#D35D13', '#FFA166', '#843504']]

def make_bows(bardata, T=360):
  """Turn bar tuples into SVG ring segments.

  Parameters
  ----------
  bardata : iterable of ``(level, kind, name, start, size)`` tuples.
  T : the value that corresponds to a full 360 degree turn.

  Returns
  -------
  (elements, levels)
    ``elements`` is the list of ``svg.Path`` objects, one per drawn bar, each
    carrying a title of the form ``'<name> : <human size>'``. ``levels`` is
    the deepest level that was actually drawn, or 0 when nothing was drawn
    (previously this case raised ValueError from ``max`` on an empty input).

  Level-0 bars and bars whose angular extent is below ``2/level`` degrees
  are skipped.
  """
  elements = []
  deepest = 0
  for l, t, n, i0, i1 in bardata:
    if l == 0 or i1/T*360 < 2/l: continue
    elements.append(
      svg.Path(
        d=qbow(l*10, 6, i0/T*360, (i0+i1)/T*360),
        fill=colors[l%4][t],
        stroke=colors[l%4][2], stroke_width=0.4,
        elements=[svg.Title(text=f'{n} : {hsize(i1)}')]
      )
    )
    deepest = max(deepest, l)
  return elements, deepest

In [19]:
def inspect_bucket(bucket_name, overname=None):
  """Summarise a storage bucket and write the results under ``inspection/``.

  Parameters
  ----------
  bucket_name : name of the bucket to inspect.
  overname : optional folder name for the output; defaults to ``bucket_name``.

  Writes three files into ``inspection/<name>/``:
  ``file_chart.svg`` (ring chart of sizes), ``file_table.tsv`` (one row per
  blob) and ``base_table.tsv`` (count and total size per basename, largest
  total first). Progress is printed as it goes. If the bucket does not exist
  a message is printed and nothing is written. Returns None.
  """
  client = Client()
  bucket = client.bucket(bucket_name)

  fname = overname if overname else bucket_name

  if not bucket.exists():
    print(f'{bucket_name} not accessible. Check the name or the account credentials.')
    # stop here: listing a bucket that does not exist would only fail later
    return

  file_df, size_data = bucket_skim(bucket)
  print(f'Total Bucket Size: {hsize(size_data["?"])}')
  bardata = bargram(size_data)
  # the first bar is the root, whose last field is the bucket total
  arcs, levels = make_bows(bardata, T=bardata[0][-1])
  # rings are spaced 10 units apart; leave one extra ring of margin
  ext = (levels+1)*10
  canvas = svg.SVG(
    viewBox=svg.ViewBoxSpec(-ext, -ext, ext*2, ext*2),
    width=1000, height=1000,
    elements=arcs
  )

  print(f'Saving graphic to inspection/{fname}/file_chart.svg')
  makedirs(f'inspection/{fname}', exist_ok=True)
  with open(f'inspection/{fname}/file_chart.svg', 'w') as out:
    print(canvas, file=out)

  print(f'Saving file table to inspection/{fname}/file_table.tsv')
  file_df.to_csv(f'inspection/{fname}/file_table.tsv', sep='\t')

  bn_stats = file_df.groupby('basename')['size'].agg(('count', 'sum'))
  bn_stats['size'] = bn_stats['sum'].apply(hsize)
  # sort_values returns a new frame; keep it so the saved table is ordered
  bn_stats = bn_stats.sort_values('sum', ascending=False)
  print(f'Saving basename table to inspection/{fname}/base_table.tsv')
  bn_stats.to_csv(f'inspection/{fname}/base_table.tsv', sep='\t')


# Execution

In [23]:
# Project summary table; the cells below read its 'Name' and 'Bucket' columns.
df = pd.read_csv(filepath_or_buffer='Aldubayan Google Project Summary - Terra Info.csv')

In [1]:
# Names of the entries (matched against the 'Name' column) to inspect.
analyze = [
  'An_Angiosarcoma_RNA_for_DR copy',
  'QATR-1KG-Reference-Panel',
  'MID-Ciliopathies-Exome',
  'QATR-WES',
  'arab_breast_cancer',
  'arab_breast_cancer_study_wes',
  'KAnderson_Multi-Ancestry-PRS_WES_1KG',
  'KAnderson_PC_Germline_Analysis_WES',
  'KAnderson_PCGermline_CohortScale_Dev',
  'KAnderson_PCGermline_SampleScale_Dev',
  'KAnderson_Qatari_WGS_Ref_Curation',
]

In [34]:
# Keep only the rows whose Name is in the analysis list, then inspect each
# bucket, writing its output under the row's Name.
wanted = df.Name.isin(analyze)
selection = df.loc[wanted, ['Name', 'Bucket']]

for n, b in zip(selection['Name'], selection['Bucket']):
  inspect_bucket(b, n)
  print('----------------')

Total Bucket Size: 3.5 Tb
Saving graphic to inspection/An_Angiosarcoma_RNA_for_DR copy/file_chart.svg
Saving file table to inspection/An_Angiosarcoma_RNA_for_DR copy/file_table.tsv
Saving basename table to inspection/An_Angiosarcoma_RNA_for_DR copy/base_table.tsv
----------------
Total Bucket Size: 2.2 Tb
Saving graphic to inspection/KAnderson_Multi-Ancestry-PRS_WES_1KG/file_chart.svg
Saving file table to inspection/KAnderson_Multi-Ancestry-PRS_WES_1KG/file_table.tsv
Saving basename table to inspection/KAnderson_Multi-Ancestry-PRS_WES_1KG/base_table.tsv
----------------
Total Bucket Size: 2.3 Tb
Saving graphic to inspection/KAnderson_PC_Germline_Analysis_WES/file_chart.svg
Saving file table to inspection/KAnderson_PC_Germline_Analysis_WES/file_table.tsv
Saving basename table to inspection/KAnderson_PC_Germline_Analysis_WES/base_table.tsv
----------------
Total Bucket Size: 2.3 Tb
Saving graphic to inspection/KAnderson_PCGermline_CohortScale_Dev/file_chart.svg
Saving file table to inspe

# Blob Migration Errors

In [15]:
import archive_blobs as ak
from os.path import join
from os import makedirs
from json import load

In [16]:
# wsn: key used to look up the workspace row below.
# akn: second argument handed to the migrator (presumably the archive bucket
#      name — confirm against archive_blobs).
wsn = 'arab_breast_cancer_study_wes'
akn = 'aldubayan-lab-terra-archives'

In [23]:
# Look up the workspace row; an unknown name is reported and re-raised with
# the original KeyError chained.
try:
  wsdata = ak.fc_workspaces.loc[wsn]
except KeyError as e:
  not_found = f"Workspace {wsn} not found!"
  print(not_found)
  raise KeyError(not_found) from e

# Local working folder for this workspace's migration files.
where = join("migration", wsn)
makedirs(where, exist_ok=True)

migrator = ak.BucketMigrator(wsdata.bucketName, akn, wsn)

In [18]:
# Load the saved problem report from the workspace's migration folder.
problems_path = join(where, 'misc_migration_problems.json')
with open(problems_path) as inp:
  problems0 = load(inp)

In [19]:
# Show the loaded problem report (the recorded output has an 'E' key holding
# [source path, destination URL] pairs).
problems0

{'E': [['17a25dce-08c9-42e9-8672-bee21c004153/depthOfCoverageTest/20836bb3-7694-4c96-b0fe-7d643f8a7806/call-readcounts/pipelines-logs/action/12/stderr',
   'gs://aldubayan-lab-terra-archives/arab_breast_cancer_study_wes/misc_files/17a25dce-08c9-42e9-8672-bee21c004153/depthOfCoverageTest/20836bb3-7694-4c96-b0fe-7d643f8a7806/call-readcounts/pipelines-logs/action/12/stderr'],
  ['669cc49c-a6a5-41c2-8165-d6647ea98ffe/Stargazer/d450bc32-6535-4dfc-a5fc-a4afdca05c31/call-StargazerVCFRun/pipelines-logs/action/14/stderr',
   'gs://aldubayan-lab-terra-archives/arab_breast_cancer_study_wes/misc_files/669cc49c-a6a5-41c2-8165-d6647ea98ffe/Stargazer/d450bc32-6535-4dfc-a5fc-a4afdca05c31/call-StargazerVCFRun/pipelines-logs/action/14/stderr'],
  ['c6372728-e17c-4a0b-b7d2-e9e45fc76a44/CNVGermlineCohortWorkflow/b1a3ccb1-1487-4a0e-8cf3-22a2544e62ff/call-CollectCounts/shard-137/pipelines-logs/action/6/stderr',
   'gs://aldubayan-lab-terra-archives/arab_breast_cancer_study_wes/misc_files/c6372728-e17c-4a0b-

In [20]:
# Turn the 'E' pairs into a mapping and hand it to the migrator as its file map.
error_file_map = {src: dst for src, dst in problems0['E']}
migrator.file_map = error_file_map

In [21]:
# Run the migration; the call returns a (counts, problems) pair.
# Presumably it works through migrator.file_map as assigned in the previous
# cell — confirm against archive_blobs.
counts, problems = migrator.migrate_files()

Migrating 12 files:12(C) = 12(D) + 0(M) + 0(E)     

In [None]:
# NOTE(review): the recorded output for this cell reports "Deleting ... files",
# so this call appears to be destructive — confirm the migration result before
# re-running it.
migrator.cleanup_old()

Deleting 605015 files:549078(C) = 549066(D) + 12(E)

In [9]:
# Re-queue the entries reported under 'E' by the last migrate_files() call.
# NOTE(review): the JSON-loaded pairs were wrapped in dict(...) before being
# assigned to file_map elsewhere in this notebook; verify whether
# problems['E'] needs the same conversion here.
migrator.file_map = problems['E']