## Folder Structure

Review the contents of the network folder and track file details.

Compare the current file listing against the previously confirmed listing.

In [None]:
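# If importing pandas/numpy fails with "numpy.dtype size changed, may indicate
# binary incompatibility", pin numpy as described here:
# https://techoverflow.net/2024/07/23/how-to-fix-numpy-dtype-size-changed-may-indicate-binary-incompatibility-expected-96-from-c-header-got-88-from-pyobject/
# pip install numpy==1.26.4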

In [None]:
# Import libraries
import time
from pathlib import Path
from typing import Dict, List, Union

import numpy as np
import pandas as pd

In [None]:
class NetworkFolderFileReview:
    """
    Scans a directory tree, records per-file details, and compares them with
    a previously saved listing loaded from CSV.

    Intended call order: generate_file_details_df() -> filter_files() ->
    load_network_df() -> merge_and_compare() -> export_final_data().
    """

    def __init__(self, directory: str, exclusion_name: str = 'Thumbs'):
        """
        Initializes the class with the directory path and exclusion criteria.

        :param directory: str - Path to the directory to review.
        :param exclusion_name: str - File name (without extension) to exclude
            from the review.
        """
        self.directory = directory
        self.exclusion_name = exclusion_name
        # Populated step by step by the methods below.
        self.file_details_df = pd.DataFrame()
        self.filtered_df = pd.DataFrame()
        self.network_df = pd.DataFrame()
        self.final_df = pd.DataFrame()

    def get_filepaths(self) -> List[str]:
        """
        Generates a list of full file paths in the directory tree.

        :return: List[str] - Full paths of every regular file found
            recursively under ``self.directory``.
        """
        return [str(filepath) for filepath in Path(self.directory).rglob('*') if filepath.is_file()]

    def get_file_details(self, filepath: Union[str, Path]) -> Dict[str, str]:
        """
        Extracts details from the given file path.

        :param filepath: Path or str - The path to the file. Strings (such as
            those returned by get_filepaths()) are converted to Path.
        :return: Dict[str, str] - A dictionary containing file details.
        """
        filepath = Path(filepath)
        # Stat once so both timestamps come from the same snapshot and the
        # (possibly slow, networked) filesystem is only queried one time.
        stats = filepath.stat()
        # NOTE: st_ctime is platform dependent (creation time on Windows,
        # last metadata change on Unix) - see the os.stat_result docs.
        created = time.ctime(stats.st_ctime)
        modified = time.ctime(stats.st_mtime)

        return {
            'File_Path': str(filepath.parent),
            'Lst_Folder': filepath.parent.name,
            'Full_Name': str(filepath),
            'Folder_FileName': str(filepath.relative_to(self.directory)),
            'Name': filepath.stem,
            'Ext': filepath.suffix[1:],  # drop the leading dot
            'Created_dt': created,
            'Modified_dt': modified,
        }

    def generate_file_details_df(self) -> pd.DataFrame:
        """
        Generates a DataFrame containing details of files in the directory tree.

        :return: pd.DataFrame - DataFrame containing file details (also stored
            on ``self.file_details_df``).
        """
        # Reuse get_filepaths() so the directory walk is defined in one place.
        file_details_list = [self.get_file_details(filepath) for filepath in self.get_filepaths()]
        self.file_details_df = pd.DataFrame(file_details_list)
        return self.file_details_df

    def filter_files(self):
        """
        Filters out files based on exclusion criteria.

        Rows whose ``Name`` equals ``self.exclusion_name`` are dropped; the
        result is stored on ``self.filtered_df``.

        :raises ValueError: If no file details have been generated yet.
        """
        if self.file_details_df.empty:
            raise ValueError("No files to filter. Please run generate_file_details_df() first.")
        keep_mask = self.file_details_df['Name'] != self.exclusion_name
        # Copy so filtered_df is an independent frame, not a slice of its parent.
        self.filtered_df = self.file_details_df.loc[keep_mask].copy()

    def load_network_df(self, csv_path: str):
        """
        Loads the confirmed network folder data from a CSV file.

        :param csv_path: str - Path to the CSV file containing network folder data.
        """
        self.network_df = pd.read_csv(csv_path)

    def merge_and_compare(self):
        """
        Merges filtered file details with the network data and performs comparison.

        The current scan is left-joined to the network data on
        ``Folder_FileName``; overlapping columns get the suffix ``_New``
        (current scan) or ``_Orig`` (network data). Two flag columns are added,
        each 1 when the timestamps match and 0 otherwise (including files with
        no match in the network data). Only rows whose created timestamps do
        not match are kept in ``self.final_df``.

        :raises ValueError: If filtered_df or network_df is empty.
        """
        if self.filtered_df.empty or self.network_df.empty:
            raise ValueError("Both filtered_df and network_df must be populated before merging.")

        merged = self.filtered_df.merge(
            self.network_df,
            how="left",
            on="Folder_FileName",
            suffixes=('_New', '_Orig'),
        )

        created_same = merged.Created_dt_Orig == merged.Created_dt_New
        modified_same = merged.Modified_dt_Orig == merged.Modified_dt_New
        flagged = merged.assign(
            create_dt_diff_check=np.where(created_same, 1, 0),
            mod_dt_diff_check=np.where(modified_same, 1, 0),
        )

        # Filter to include only rows where create_dt_diff_check == 0
        flagged = flagged[flagged['create_dt_diff_check'] == 0]

        # Keep only columns from Folder_FileName onwards
        folder_filename_index = flagged.columns.get_loc("Folder_FileName")
        self.final_df = flagged.iloc[:, folder_filename_index:]

    def export_final_data(self, output_file: str):
        """
        Exports the final DataFrame to an Excel file.

        :param output_file: str - The output file path for the Excel file.
        :raises ValueError: If final_df is empty.
        """
        if self.final_df.empty:
            raise ValueError("No data to export. Please run merge_and_compare() first.")

        self.final_df.to_excel(output_file, index=False)
        print(f"Final output saved to {output_file}")

# Example of usage
if __name__ == "__main__":
    # Input and output locations; replace the placeholders before running.
    directory_path = 'directory_path'  # Update with your actual directory path
    network_csv_path = 'network_folder_confirmed.csv'  # Update with your actual CSV path
    # The report name carries today's date.
    output_file_path = f'network_folder_{pd.to_datetime("today").date()}.xlsx'

    # Run the steps in order: scan -> filter -> load reference -> compare -> export.
    # `review` stays at module level so it can be inspected afterwards.
    review = NetworkFolderFileReview(directory=directory_path)
    review.generate_file_details_df()
    review.filter_files()
    review.load_network_df(csv_path=network_csv_path)
    review.merge_and_compare()
    review.export_final_data(output_file=output_file_path)

In [None]:
# Dimensions (rows, columns) of the final comparison table
review.final_df.shape

In [None]:
# Preview up to five random rows. The sample size is capped at the row count
# because DataFrame.sample raises ValueError when asked for more rows than
# exist (sampling without replacement).
review.final_df.sample(n=min(5, len(review.final_df)))

In [None]:
# Ten most common file extensions among the filtered files
(
    review.filtered_df
    .groupby(['Ext'])['Folder_FileName']
    .count()
    .sort_values(ascending=False)
    .head(10)
)

In [None]:
# Ten most common extensions (from the current scan) among the rows kept in final_df
(
    review.final_df
    .groupby(['Ext_New'])['Folder_FileName']
    .count()
    .sort_values(ascending=False)
    .head(10)
)

In [None]:
# Row counts for each combination of the created/modified comparison flags
(
    review.final_df
    .groupby(['create_dt_diff_check', 'mod_dt_diff_check'])['Folder_FileName']
    .count()
)