In [None]:
# Root of the directory tree to analyse; change this to scan another location.
root_directory = "D:\\"

In [None]:
from multiprocessing.dummy import Pool
from pathlib import Path
import pandas as pd
from dask.utils import format_bytes
import plotly.express as px

In [None]:
def gen_file_row(f):
  """Build the table row describing a single file.

  Args:
    f: A ``pathlib.Path``-like object pointing at a file.

  Returns:
    A ``(row, size)`` tuple. ``row`` is a dict with the keys ``path``,
    ``filename``, ``directory``, ``type`` (the file suffix) and ``bytes``;
    ``size`` is the same byte count that is stored in ``row['bytes']``.

  Raises:
    OSError: Whatever ``f.stat()`` raises, e.g. ``FileNotFoundError`` or
      ``PermissionError``.
  """
  # Stat exactly once: a second call doubles the syscalls and could report a
  # different size if the file changes between the two calls.
  size = f.stat().st_size
  row = {
    'path': str(f),
    'filename': f.name,
    'directory': str(f.parent),
    'type': f.suffix,
    'bytes': size,
  }
  return row, size

def process_dir(d):
  """Recursively collect one row per file and directory under ``d``.

  Sub-directories are handled by recursion and files by ``gen_file_row``.
  Entries that raise ``FileNotFoundError`` or ``PermissionError`` while being
  inspected are skipped. If listing ``d`` itself fails with one of those
  errors, whatever was collected before the failure is kept.

  Args:
    d: A ``pathlib.Path``-like object for the directory to walk.

  Returns:
    A ``(rows, total_bytes)`` tuple. ``rows`` is a flat list of dicts in which
    every descendant comes before the row of the directory containing it, so
    the row for ``d`` is last. ``total_bytes`` is the sum of the sizes of all
    files that were counted beneath ``d`` and equals the ``bytes`` value of
    the row for ``d``.
  """
  paths = []
  dir_size = 0
  try:
    for p in d.iterdir():
      try:
        # NOTE(review): Path.is_dir() follows symlinks, so linked directories
        # are walked and counted as well - confirm that this is intended.
        if p.is_dir():
          res, b = process_dir(p)
          # extend() appends in place; `paths + res` re-copied the whole
          # accumulated list at every level of the tree.
          paths.extend(res)
          dir_size += b
        elif p.is_file():
          res, b = gen_file_row(p)
          paths.append(res)
          dir_size += b
      except (FileNotFoundError, PermissionError):
        continue
  except (FileNotFoundError, PermissionError):
    # The directory vanished or cannot be listed. Fall through so that its row
    # still reports the bytes already counted (0 when nothing was readable);
    # this keeps the directory total consistent with the child rows retained.
    pass
  paths.append({'path':str(d),
                # A filesystem root has an empty name and is its own parent.
                'filename':d.name if d.name != '' else str(d),
                'directory':str(d.parent) if d.parent != d else "",
                'type':'directory',
                'bytes':dir_size})
  return paths, dir_size

def process_with_Pool(d):
  """Collect rows for everything under ``d``, stat-ing files on a thread pool.

  The tree is enumerated with ``d.glob("**/*")``. Directories become rows
  directly; every other entry is passed to ``gen_file_row`` through
  ``Pool.map``. Unlike ``process_dir``, directory rows always carry
  ``bytes`` 0 (sizes are not aggregated) and no row is emitted for ``d``
  itself.

  Args:
    d: A ``pathlib.Path`` for the directory to scan.

  Returns:
    A flat list of row dicts with the keys ``path``, ``filename``,
    ``directory``, ``type`` and ``bytes``: directory rows first, then file
    rows. Files that raise ``FileNotFoundError`` or ``PermissionError`` while
    being stat-ed are left out.
  """
  def _file_row_or_none(f):
    # Skip unreadable/vanished files the same way process_dir does, instead of
    # letting one bad entry abort the whole pool.map call.
    try:
      # gen_file_row returns (row, size); only the row belongs in the table.
      return gen_file_row(f)[0]
    except (FileNotFoundError, PermissionError):
      return None

  dir_rows = []
  file_list = []
  for x in d.glob("**/*"):
    if x.is_dir():
      # Same schema as process_dir: 'path' is the full path, 'filename' the
      # last component.
      dir_rows.append({'path':str(x),'filename':x.name,'directory':str(x.parent),'type':'directory','bytes':0})
    else:
      file_list.append(x)

  # The context manager shuts the worker threads down once map() has returned.
  with Pool() as pool:
    file_rows = pool.map(_file_row_or_none, file_list)
  return dir_rows + [row for row in file_rows if row is not None]

In [None]:
# Walk the tree once. process_dir returns (rows, total_bytes); only the rows
# are needed here. The total is bound to an ordinary throwaway name rather
# than `__`, which IPython uses for its output history.
data, _total_bytes = process_dir(Path(root_directory))
df = pd.DataFrame(data)
# Human-readable size string kept alongside the raw byte count.
df['filesize'] = df['bytes'].map(format_bytes)

In [None]:
with pd.option_context('display.max_colwidth', None):
  # The 'path' column holds str(Path(...)) values, so exclude the scan root by
  # comparing against the same normalised form of root_directory.
  root_path = str(Path(root_directory))
  largest_dir = (
    df.loc[(df['type'] == 'directory') & (df['path'] != root_path)]
      .sort_values(by=['bytes','directory'], ascending=[False,True])
      .head(10)
      .reset_index()
  )
  # Rank from 1. Sized from the actual row count: a fixed range(1,11) raises a
  # length-mismatch error when fewer than 10 directories were found.
  largest_dir.index = range(1, len(largest_dir) + 1)
  print("10 largest directories")
  display(largest_dir[['path','directory','filesize']])

In [None]:
with pd.option_context('display.max_colwidth', None):
  print("10 largest files")
  # Everything that is not a directory row is a file row.
  largest_files = df.loc[df['type'] != 'directory'].nlargest(10,'bytes').reset_index(drop=True)
  # Rank from 1. Sized from the actual row count: a fixed range(1,11) raises a
  # length-mismatch error when fewer than 10 files were found.
  largest_files.index = range(1, len(largest_files) + 1)
  display(largest_files[['filename','directory','type','filesize']])

In [None]:
# Number of entries (files and directories together) drawn in the treemap.
TOP_N = 100

# Take the TOP_N largest rows, breaking size ties in favour of the shorter
# path. A directory's total includes everything beneath it and its path is a
# prefix of its descendants' paths, so whenever an entry makes the cut its
# equally sized parent does too. A plain nlargest() keeps the first rows among
# ties and could retain a child while dropping its parent, leaving a 'parents'
# value with no matching id.
treemap_rows = (
  df.assign(_path_len=df['path'].str.len())
    .sort_values(by=['bytes', '_path_len'], ascending=[False, True])
    .head(TOP_N)
    .drop(columns='_path_len')
    .sort_values(by='directory')
)

fig = px.treemap(treemap_rows,
        # "total": a directory's value already includes its children.
        branchvalues = "total",
        names = 'filename',
        ids = 'path',
        parents = 'directory',
        values = 'bytes',
        title = f'{TOP_N} largest directories/files',
        hover_data = {'filename':True,'path':False,'bytes':False,'filesize':True,'directory':True,'type':True},
        color = 'type'
)
#fig.update_traces(root_color="lightgrey")
fig.update_layout(margin = dict(t=50, l=25, r=25, b=25))
fig.show()