In [None]:
"""This is an improved version of my previous folder mapping program designed to walk iteratively through a given folder tree and map all files. The goal of this program was to ensure complete data integrity after the movement of a large file structure to a new shared drive. The output of this program is three separate .csv files, one for all files in a given directory that were successfully mapped, one for all files (along with the folder pathway) that failed to map correctly, and a third .csv ranking which folders had the most failed files."""

In [None]:
import os, time, datetime, pandas as pd

In [None]:
# Root of the folder tree to crawl. This is deliberately a normal (non-raw)
# string: a raw string cannot end in a single backslash, and writing r'C:\\'
# keeps BOTH backslashes, which then show up doubled in every recorded path.
folder_path = 'C:\\'

def sizeof_file(num, suffix='B'):
    """Render a byte count as a human-readable string with binary prefixes.

    The value is divided by 1024 until its magnitude drops below 1024, and is
    then formatted with one decimal place followed by the matching prefix
    (Ki, Mi, Gi, ...) and ``suffix``. Values too large for the "Zi" prefix are
    reported in "Yi".
    """
    prefixes = ('', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi')
    value = num
    position = 0
    while position < len(prefixes):
        if abs(value) < 1024.0:
            return f"{value:3.1f}{prefixes[position]}{suffix}"
        value = value / 1024.0
        position += 1
    # Ran out of prefixes: whatever is left is expressed in yobi-units.
    return f"{value:.1f}Yi{suffix}"

In [None]:
def crawl_folder(inputDir, exclude=None):
    """Walk ``inputDir`` recursively and record metadata for every file found.

    Parameters
    ----------
    inputDir : str
        Root of the folder tree to walk (handed to ``os.walk``).
    exclude : iterable of str, optional
        Directory names (not full paths) to prune from the walk. By default
        nothing is excluded.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``dfGood`` has one row per file whose metadata could be read, with the
        columns Directory, Filename, Extension, Last Modified Date, Size and
        Size(Bytes). ``dfBad`` has one row per file that raised an exception
        while being read, with the columns Failed Files Path and File.

    Notes
    -----
    ``os.walk`` is called without an ``onerror`` handler, so directories that
    cannot be listed are skipped without being reported in either frame.
    """
    excluded = set(exclude) if exclude is not None else set()
    rows = []
    errors = []
    for root, dirs, files in os.walk(inputDir):
        # Prune in place so os.walk does not descend into excluded folders.
        dirs[:] = [d for d in dirs if d not in excluded]
        for f in files:
            fname = os.path.join(root, f)
            try:
                # A single stat call keeps size and mtime consistent with each
                # other and avoids hitting the file system three times per file.
                info = os.stat(fname)
                modified = time.strftime("%d-%b-%Y %H:%M:%S", time.localtime(info.st_mtime))
                rows.append([root, f, os.path.splitext(fname)[-1], modified, sizeof_file(info.st_size), info.st_size])
            except Exception:
                # Best effort: note the failure and keep crawling. Catching
                # Exception (not a bare except) lets Ctrl-C / SystemExit stop
                # the crawl instead of being logged as a failed file.
                errors.append([root, f])
                print("Error on:", root, f)

    dfGood = pd.DataFrame(rows, columns=["Directory", "Filename", "Extension", "Last Modified Date", "Size", "Size(Bytes)"])
    dfBad = pd.DataFrame(errors, columns=["Failed Files Path", "File"])

    return dfGood, dfBad


In [None]:
# Time the crawl with a monotonic clock, so a system clock adjustment during a
# long run cannot distort the reported duration.
starttime = time.perf_counter()

# goodDF: files whose metadata was read; badDF: files that raised while being read.
goodDF, badDF = crawl_folder(folder_path)

print(f"Finished in {time.perf_counter()-starttime} seconds")

In [None]:
# Label the output files after the last component of the crawled path.
# Separators are normalised and trailing ones stripped first; otherwise a path
# ending in a separator (such as a drive root) yields an empty label. The drive
# colon is dropped because it is not valid inside a Windows file name, and
# "root" is used when nothing is left (e.g. the path was just "/").
fname = folder_path.replace("/", "\\").rstrip("\\").split("\\")[-1].replace(":", "") or "root"

# Timestamp that makes each run's files unique; colons are removed because they
# are not allowed in Windows file names.
ftime = str(datetime.datetime.now())
ftime = ftime.replace(":", "")

goodDF.to_csv(f"{fname}_Files_Successfully_Mapped_{ftime}.csv", index=False)
badDF.to_csv(f"{fname}_Files_Failed_to_Map_{ftime}.csv", index=False)

In [None]:
# Rank folders by how many of their files failed to map.
# The relative path is computed on a derived column rather than written back
# into badDF, so badDF keeps matching the CSV already saved and this cell can be
# re-run safely. Only the leading root is stripped, never later occurrences.
root_length = len(folder_path)
relative_dirs = badDF["Failed Files Path"].map(
    lambda path: path[root_length:] if path.startswith(folder_path) else path
)
badDF_grouped = (
    badDF.assign(**{"Failed Files Path": relative_dirs})
    .groupby("Failed Files Path")
    .count()
    .sort_values(["File"], ascending=False)
)
badDF_grouped.to_csv(f"_{fname}_Folders_Failed_to_Map_{ftime}_Ranked.csv")

In [None]:
# Number of files whose metadata was read successfully.
len(goodDF)

In [None]:
# Number of files that raised an error while being read.
len(badDF)

In [None]:
# Number of distinct folders that contain at least one failed file.
len(badDF_grouped)

In [None]:
# Failed-file count per folder, sorted with the highest count first.
badDF_grouped