<a href="https://colab.research.google.com/github/MaheshDU48/data-handling-and-visualization/blob/main/file_integrity_checker_py.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import os, time, sys
import hashlib
import psutil

process = psutil.Process(os.getpid())

def md5(fname):
    hash_md5 = hashlib.md5()
    try:
        with open(fname, "rb") as f:
            chunk_size = 4096
            while chunk := f.read(chunk_size):
                hash_md5.update(chunk)
        return hash_md5.hexdigest()
    except PermissionError:
        print(f"Permission denied for file {fname}", flush=True)
    except FileNotFoundError:
        print(f"File not found: {fname}", flush=True)
    except IOError as e:
        print(f"I/O error({e.errno}) for file {fname}: {e.strerror}", flush=True)
    except Exception as e:
        print(f"Error processing file {fname}: {e}", flush=True)
    return None

def get_memory_usage():
    memory_info = process.memory_info()
    memory_used_mb = memory_info.rss / 1024 / 1024
    return memory_used_mb


def walk_and_check_hashes(directory, hash_file_path):
    hash_set = set()
    files_processed = 0
    total_files = 0
    found_match = False

    initial_usage = get_memory_usage()
    print(f"Initial memory usage: {initial_usage:.2f} MB")


    start_time = time.time()

    try:
        with open(hash_file_path, 'r') as hash_file:
            for line in hash_file:
                hash_set.add(line.strip())
    except Exception as e:
        print(f"Error loading hash file: {e}", flush=True)
        return

    for root, dirs, files in os.walk(directory):
        total_files += len(files)

    print(f"Total files to be scanned: {total_files}")


    for root, dirs, files in os.walk(directory):
        for name in files:

            current_usage = get_memory_usage()
            if current_usage > MEMORY_THRESHOLD_MB:
                print(f"\nWarning: High memory usage detected - {current_usage:.2f} MB")

            file_path = os.path.join(root, name)
            file_hash = md5(file_path)

            if file_hash in hash_set:
                print(f"\033[91mHash match found for {file_path}\033[0m", flush=True)
                found_match = True

            files_processed += 1
            if files_processed % 10 == 0:
                print(f"\rProcessed {files_processed} files...", flush=True)

    end_time = time.time()
    elapsed_time = end_time - start_time

    print(f"\nFinished processing. Total files processed: {files_processed}")
    print(f"Number of hashes used for comparison: {len(hash_set)}")
    print(f"Time taken: {elapsed_time:.2f} seconds")

    if not found_match:
        print("No matching hashes found.")


if __name__ == "__main__":
    if len(sys.argv) < 3:
        print("Usage: <script> <directory_to_scan> <hash_file_path>")
    else:
        directory_to_scan = sys.argv[1]
        hash_file_path = sys.argv[2]
        MEMORY_THRESHOLD_MB = 300
        walk_and_check_hashes(directory_to_scan, hash_file_path)



Initial memory usage: 114.73 MB
Total files to be scanned: 0

Finished processing. Total files processed: 0
Number of hashes used for comparison: 12
Time taken: 0.00 seconds
No matching hashes found.
