## EST DEVLAND

This notebook removes older files and leaves behind the most recent files in each folder, after which it lists the file paths.

In [1]:
import nbimporter
import time
import os
import glob
import pandas as pd

In [2]:
# Root folder of the received Devland raw files; the cells below walk it recursively.
directory = r'R:\RawData\Elite Star\Devland\RAW FILES RECEIVED'

In [3]:
def remove_except_recent(directory, days):
    """Delete stale files under ``directory``, always keeping each folder's newest file.

    Every folder in the tree rooted at ``directory`` is handled on its own:
    the file with the latest modification time is always kept, and any other
    file is deleted only when it was last modified more than ``days`` days
    ago. Sub-folders themselves are never removed.

    Args:
        directory: Root folder to clean; it is walked recursively.
        days: Age in days a file must exceed before it may be deleted.

    Returns:
        A ``(df_deleted, df_kept)`` tuple of DataFrames. ``df_deleted`` has a
        "Deleted Files" column with the names of the removed files, sorted
        alphabetically. ``df_kept`` has a "Kept Files" column with
        ``<parent folder>/<file name>`` for every surviving file,
        de-duplicated.

    Raises:
        OSError: If a file cannot be inspected or removed.
    """
    # Anything last modified before this timestamp is old enough to delete.
    threshold_time = time.time() - days * 24 * 60 * 60

    deleted_files = []
    kept_files = []

    # os.walk hands over files only, so sub-folders can never reach os.remove.
    for root, _dirs, filenames in os.walk(directory):
        if not filenames:
            continue

        # Read each modification time once so every decision for this folder
        # is based on the same snapshot, taken before anything is deleted.
        mtimes = {}
        for filename in filenames:
            path = os.path.join(root, filename)
            mtimes[path] = os.path.getmtime(path)

        newest = max(mtimes, key=mtimes.get)

        for path, mtime in mtimes.items():
            if path != newest and mtime < threshold_time:
                os.remove(path)
                deleted_files.append(os.path.basename(path))
            else:
                # The newest file, or one still younger than the threshold.
                kept_files.append(path)

    # Create a Pandas DataFrame of the deleted files
    df_deleted = pd.DataFrame(deleted_files, columns=["Deleted Files"])
    df_deleted = df_deleted.sort_values(by=["Deleted Files"])

    # Create a Pandas DataFrame of the kept files with only the last two
    # elements of each path (parent folder and file name)
    kept_labels = [
        os.path.join(os.path.basename(os.path.dirname(path)), os.path.basename(path))
        for path in kept_files
    ]
    df_kept = pd.DataFrame(kept_labels, columns=["Kept Files"])
    df_kept = df_kept.drop_duplicates()

    return df_deleted, df_kept

In [4]:
def list_all_files(directory):
    """Return a DataFrame listing every file found below ``directory``.

    The tree is walked recursively and each file is reported by its path
    relative to ``directory`` in a single 'File Path' column.
    """
    relative_paths = [
        os.path.relpath(os.path.join(folder, name), directory)
        for folder, _subfolders, names in os.walk(directory)
        for name in names
    ]
    return pd.DataFrame(relative_paths, columns=['File Path'])

In [5]:
def list_most_recent_files(directory):
    """Return a DataFrame naming the newest file of every folder below ``directory``.

    Folders that contain no files are skipped. Each row pairs the folder's
    own name ('Folder Name') with the path, relative to ``directory``, of the
    file in it that has the latest modification time ('Most Recent File').
    """
    rows = []

    for folder, _subfolders, names in os.walk(directory):
        if not names:
            continue
        candidates = [os.path.join(folder, name) for name in names]
        newest_path = max(candidates, key=os.path.getmtime)
        rows.append([os.path.basename(folder), os.path.relpath(newest_path, directory)])

    return pd.DataFrame(rows, columns=['Folder Name', 'Most Recent File'])


In [6]:
# Build the two listings for the export: every file under `directory`, and
# the newest file of each folder that contains files.
all_files    = list_all_files(directory)
recent_files = list_most_recent_files(directory)

In [7]:
# Export both listings to a single workbook, one sheet per listing.
# index=False leaves the DataFrame row index out of the sheets.
with pd.ExcelWriter("C:\\Users\\tsello01\\Documents\\Data\\Devland Latest.xlsx", engine='xlsxwriter') as writer:
    all_files.to_excel(writer, sheet_name='All Files', index=False)
    recent_files.to_excel(writer, sheet_name='Most Recent Files', index=False)