In [37]:
import nbimporter
import time
import os
import glob
import pandas as pd

In [38]:
# Execute react.ipynb through IPython's %run magic before the cells below.
# NOTE(review): presumably this supplies names used later in this notebook
# (e.g. delete_all_except_recent, which is not defined here) — confirm.
%run react.ipynb

In [39]:
# Root folder handed to delete_all_except_recent() and list_all_files() in the
# cells below. Raw string so the Windows backslashes are taken literally.
directory = r'R:\RawData\Elite Star\Devland\RAW FILES RECEIVED'

In [40]:
def remove_except_recent(directory, days):
    """Prune stale files under ``directory``, always keeping the newest per folder.

    Every folder below ``directory`` (including ``directory`` itself) is
    handled independently:

    * the file with the latest modification time is always kept, however old;
    * any other file is deleted only if it was modified more than ``days``
      days ago;
    * files modified within the last ``days`` days are kept.

    Only regular directory entries reported by ``os.walk`` as files are ever
    removed; subdirectories themselves are never deleted.

    Args:
        directory: Root folder to scan recursively.
        days: Age limit in days. Non-newest files older than this are deleted.

    Returns:
        A ``(df_deleted, df_kept)`` tuple of DataFrames.
        ``df_deleted`` has one column, ``"Deleted Files"``, holding the base
        names of removed files sorted alphabetically.
        ``df_kept`` has one column, ``"Kept Files"``, holding the de-duplicated
        base names of surviving files, most recently modified first.

    Raises:
        OSError: If a file cannot be stat'ed or removed.
    """
    # Snapshot every file with its mtime *before* deleting anything, so the
    # decision for one file never depends on deletions made for another.
    stamped_files = []
    for root, _dirs, filenames in os.walk(directory):
        for filename in filenames:
            path = os.path.join(root, filename)
            stamped_files.append((os.path.getmtime(path), path))

    # Newest first: the first file seen for a folder is that folder's newest.
    stamped_files.sort(key=lambda item: item[0], reverse=True)

    # Anything modified before this instant counts as stale.
    threshold_time = time.time() - days * 24 * 60 * 60

    newest_in_folder = {}
    deleted_files = []
    kept_files = []

    for mtime, path in stamped_files:
        folder = os.path.dirname(path)

        if folder not in newest_in_folder:
            # Newest file of its folder (or the only one): never deleted.
            newest_in_folder[folder] = path
            kept_files.append(os.path.basename(path))
        elif mtime < threshold_time:
            # Not the newest and older than the limit: remove it.
            os.remove(path)
            deleted_files.append(os.path.basename(path))
        else:
            # Not the newest, but still within the retention window.
            kept_files.append(os.path.basename(path))

    df_deleted = pd.DataFrame(deleted_files, columns=["Deleted Files"])
    df_deleted = df_deleted.sort_values(by=["Deleted Files"])

    df_kept = pd.DataFrame(kept_files, columns=["Kept Files"])
    df_kept = df_kept.drop_duplicates()

    return df_deleted, df_kept



In [41]:
def list_all_files(directory):
    """Return a one-column DataFrame of every file found beneath ``directory``.

    The tree is traversed with ``os.walk``; each file is reported as a path
    relative to ``directory`` in the ``'File Path'`` column, in walk order.
    """
    relative_paths = [
        os.path.relpath(os.path.join(parent, name), directory)
        for parent, _subdirs, names in os.walk(directory)
        for name in names
    ]
    return pd.DataFrame(relative_paths, columns=['File Path'])

In [42]:
# NOTE(review): delete_all_except_recent is not defined in this notebook; the
# function defined above is remove_except_recent(directory, days). Presumably
# this name comes from `%run react.ipynb` or from stale kernel state — confirm
# before re-running, since it presumably deletes files under `directory`.
delete_df, kept_df = delete_all_except_recent(directory)

In [43]:
# List the relative path of every file currently under `directory`.
remaining_files = list_all_files(directory)

In [44]:
# Display the first frame returned by the clean-up call (deleted file names).
delete_df

Unnamed: 0,Deleted Files
3,SaleAudit-2022-02-14.csv
2,SaleAudit-2022-02-21.csv
71,SaleAudit-2022-12-19.csv
70,SaleAudit-2022-12-26.csv
69,SaleAudit-2023-01-02.csv
...,...
50,SaleAudit_Metcash Trading Africa (Pty) Ltd TA ...
26,SaleAudit_Metcash Trading Africa (Pty) Ltd TA ...
99,SaleAudit_Newtown Cash & Carry_2023-05-22.csv
54,SaleAudit_Newtown Cash & Carry_2023-05-29.csv


In [45]:
# Display the second frame returned by the clean-up call (kept file names).
kept_df

Unnamed: 0,Kept Files
0,SaleAudit-2023-06-12.csv
1,SaleAudit_Devland Springs Retail_2023-06-12.csv
6,SaleAudit_FRONTLINE HAMMANSKRAL_2023-06-12.csv
7,SaleAudit_Devland Springs Wholesale_2023-06-12...
9,SaleAudit_Devland Hyper - Mitchells Plain_2023...
10,SaleAudit_FRONTLINE HYPER HILLFOX_2023-06-12.csv
11,SaleAudit_DEVLAND CnC - WELKOM_2023-06-12.csv
12,SaleAudit_FRONTLINE PRETORIA_2023-06-12.csv
13,SaleAudit_DEVLAND HYPER NTABANKULU_2023-06-12.csv
14,SaleAudit_DEVLAND KOKSTAD (PTY) LTD_2023-06-12...


In [47]:
# Export the remaining-file listing to an Excel workbook, omitting the index.
# NOTE(review): absolute, user-specific output path — consider a configurable location.
remaining_files.to_excel("C:\\Users\\tsello01\\Documents\\Data\\devland_files(3).xlsx", index=False)
