In [1]:
### Monitor the Downloads folder and move each new PDF into the current week's subfolder of the "Initial" folder ###
import os
import time
import shutil
import datetime
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler

# --- Configuration ---
# Folder that is watched for incoming downloads.
DOWNLOADS_FOLDER = os.path.expanduser("~/Downloads")
# Root folder that receives the files; one subfolder per week is created beneath it.
WORKING_DIRECTORY = os.path.expanduser(
    "~/OneDrive/Documents/Competitor_filings_monitor/Filings/Initial"
)
# Suffix a (lower-cased) path must end with for the handler to pick it up.
FILE_EXTENSION = ".pdf"

# Build the working directory tree on first use and report that it was created.
if not os.path.exists(WORKING_DIRECTORY):
    os.makedirs(WORKING_DIRECTORY)
    print(f"Created working directory: {WORKING_DIRECTORY}")

class PDFHandler(FileSystemEventHandler):
    """
    Watchdog event handler that files PDFs appearing in the watched folder.

    Every non-directory path ending in FILE_EXTENSION that is created in, or
    renamed within, the watched folder is moved into
    WORKING_DIRECTORY/<date of the current week's Monday, YYYY-MM-DD>/.
    """

    # Longest time (in seconds) move_file keeps waiting for a locked file
    # before giving up, so one stuck file cannot block the handler forever.
    READY_TIMEOUT_SECONDS = 600
    # Pause (in seconds) between two readiness checks.
    READY_POLL_SECONDS = 1

    def on_created(self, event):
        """Move a newly created file if it is a PDF (directories are ignored)."""
        if not event.is_directory and event.src_path.lower().endswith(FILE_EXTENSION):
            print(f"Detected new file: {event.src_path}")
            self.move_file(event.src_path)

    def on_moved(self, event):
        """
        Move a renamed/moved file if its new name is a PDF.

        The check is made on the destination path of the event, so a file that
        is renamed *to* a .pdf name is picked up here.
        """
        if not event.is_directory and event.dest_path.lower().endswith(FILE_EXTENSION):
            print(f"Detected moved/renamed file: {event.dest_path}")
            self.move_file(event.dest_path)

    def move_file(self, src_path):
        """
        Move src_path into the subfolder for the current week.

        The subfolder is named after the Monday of the current week. If a file
        with the same name is already there, "_1", "_2", ... is appended to the
        base name until a free name is found. All failures are reported with a
        printed message; nothing is raised to the caller.
        """
        filename = os.path.basename(src_path)

        # Wait until the file can be opened. Give up when the file has vanished
        # (for example because an earlier event for the same path already moved
        # it) or when it stays locked for too long; waiting unconditionally
        # would block every later event.
        deadline = time.monotonic() + self.READY_TIMEOUT_SECONDS
        while not self.is_file_ready(src_path):
            if not os.path.exists(src_path):
                print(f"Skipping '{filename}': the file no longer exists.")
                return
            if time.monotonic() >= deadline:
                print(f"Skipping '{filename}': still not readable after "
                      f"{self.READY_TIMEOUT_SECONDS} seconds.")
                return
            time.sleep(self.READY_POLL_SECONDS)

        # weekday() is 0 for Monday, so subtracting it yields this week's Monday.
        today = datetime.date.today()
        monday = today - datetime.timedelta(days=today.weekday())
        week_folder_name = monday.strftime("%Y-%m-%d")
        week_folder_path = os.path.join(WORKING_DIRECTORY, week_folder_name)

        try:
            # Create the weekly subfolder on first use.
            if not os.path.exists(week_folder_path):
                os.makedirs(week_folder_path, exist_ok=True)
                print(f"Created subfolder for week starting {week_folder_name}")

            # Pick the destination name only now, after the wait, so the
            # collision check reflects the folder's current content.
            destination = os.path.join(week_folder_path, filename)
            base, ext = os.path.splitext(filename)
            counter = 1
            while os.path.exists(destination):
                destination = os.path.join(week_folder_path, f"{base}_{counter}{ext}")
                counter += 1

            shutil.move(src_path, destination)
            print(f"Moved '{filename}' to '{destination}'")
        except Exception as e:
            print(f"Error moving file '{filename}': {e}")

    def is_file_ready(self, file_path):
        """
        Report whether file_path can currently be opened for binary reading.

        Returns True when the open succeeds and False when it raises OSError,
        which covers both a locked file and a file that does not exist.
        NOTE(review): a successful open only proves the file is readable; on
        platforms without mandatory file locking it may still be being written.
        """
        try:
            with open(file_path, 'rb'):
                return True
        except OSError:
            return False

def main():
    """
    Watch DOWNLOADS_FOLDER (top level only) and hand events to a PDFHandler.

    Blocks in a one-second sleep loop until a KeyboardInterrupt arrives, then
    stops the observer and waits for its thread to finish.
    """
    pdf_handler = PDFHandler()
    watcher = Observer()
    watcher.schedule(pdf_handler, path=DOWNLOADS_FOLDER, recursive=False)
    watcher.start()
    print(f"Monitoring '{DOWNLOADS_FOLDER}' for new PDF files...")

    keep_running = True
    while keep_running:
        try:
            time.sleep(1)
        except KeyboardInterrupt:
            # Interrupting the kernel / Ctrl+C is the only way out of the loop.
            watcher.stop()
            print("Stopping the PDF mover.")
            keep_running = False

    watcher.join()

#if __name__ == "__main__":
#    main()


In [2]:
# Start the watcher. This call blocks the cell until the kernel is interrupted
# (main() only returns after a KeyboardInterrupt).
main()

Monitoring 'C:\Users\lawre/Downloads' for new PDF files...
Detected moved/renamed file: C:\Users\lawre/Downloads\531e97c2-5b90-83e0-4927-2901cd970b99.pdf
Created subfolder for week starting 2025-03-03
Moved '531e97c2-5b90-83e0-4927-2901cd970b99.pdf' to 'C:\Users\lawre/OneDrive/Documents/Competitor_filings_monitor/Filings/Initial\2025-03-03\531e97c2-5b90-83e0-4927-2901cd970b99.pdf'
Detected moved/renamed file: C:\Users\lawre/Downloads\47f80092-3211-cf23-17ba-8aa6c518caef.pdf
Moved '47f80092-3211-cf23-17ba-8aa6c518caef.pdf' to 'C:\Users\lawre/OneDrive/Documents/Competitor_filings_monitor/Filings/Initial\2025-03-03\47f80092-3211-cf23-17ba-8aa6c518caef.pdf'
Detected moved/renamed file: C:\Users\lawre/Downloads\2496e036-66a6-15c0-510e-9c4729590643.pdf
Moved '2496e036-66a6-15c0-510e-9c4729590643.pdf' to 'C:\Users\lawre/OneDrive/Documents/Competitor_filings_monitor/Filings/Initial\2025-03-03\2496e036-66a6-15c0-510e-9c4729590643.pdf'
Detected moved/renamed file: C:\Users\lawre/Downloads\dfe429

In [3]:
## Copy and rename filings from the "Initial" folder into the "Final" folder ##

import os
import glob
import re
import shutil
import PyPDF2

# Full state name -> two-letter postal abbreviation. Built once instead of on
# every call.
_STATE_ABBREVIATIONS = {
    "Alabama": "AL",
    "Alaska": "AK",
    "Arizona": "AZ",
    "Arkansas": "AR",
    "California": "CA",
    "Colorado": "CO",
    "Connecticut": "CT",
    "Delaware": "DE",
    "District of Columbia": "DC",
    "Florida": "FL",
    "Georgia": "GA",
    "Hawaii": "HI",
    "Idaho": "ID",
    "Illinois": "IL",
    "Indiana": "IN",
    "Iowa": "IA",
    "Kansas": "KS",
    "Kentucky": "KY",
    "Louisiana": "LA",
    "Maine": "ME",
    "Maryland": "MD",
    "Massachusetts": "MA",
    "Michigan": "MI",
    "Minnesota": "MN",
    "Mississippi": "MS",
    "Missouri": "MO",
    "Montana": "MT",
    "Nebraska": "NE",
    "Nevada": "NV",
    "New Hampshire": "NH",
    "New Jersey": "NJ",
    "New Mexico": "NM",
    "New York": "NY",
    "North Carolina": "NC",
    "North Dakota": "ND",
    "Ohio": "OH",
    "Oklahoma": "OK",
    "Oregon": "OR",
    "Pennsylvania": "PA",
    "Puerto Rico": "PR",
    "Rhode Island": "RI",
    "South Carolina": "SC",
    "South Dakota": "SD",
    "Tennessee": "TN",
    "Texas": "TX",
    "Utah": "UT",
    "Vermont": "VT",
    "Virginia": "VA",
    "Washington": "WA",
    "West Virginia": "WV",
    "Wisconsin": "WI",
    "Wyoming": "WY",
}

# (lower-cased name, abbreviation) pairs, longest name first, so a longer name
# is always tried before any shorter name it might start with.
_STATE_LOOKUP = sorted(
    ((name.lower(), abbr) for name, abbr in _STATE_ABBREVIATIONS.items()),
    key=lambda pair: len(pair[0]),
    reverse=True,
)


def _first_capture(pattern, text):
    """Return the stripped first capture group of pattern in text, or None if absent/empty."""
    match = re.search(pattern, text)
    if match is None:
        return None
    return match.group(1).strip() or None


def _abbreviate_state(raw_state):
    """
    Map the text found after "State:" to a two-letter abbreviation.

    The text may carry trailing content from the same line, so a known state
    name only has to be a whole-word, case-insensitive prefix. When no known
    name matches, the first word of the text is returned unchanged.
    """
    words = raw_state.split()
    if not words:
        return None
    normalized = " ".join(words).lower()
    for name, abbr in _STATE_LOOKUP:
        # "name + ' '" enforces a word boundary after the state name.
        if normalized == name or normalized.startswith(name + " "):
            return abbr
    return words[0]


def extract_fields_from_pdf(pdf_path):
    """
    Opens the PDF file and extracts key fields from the text of the first page:
      - SERFF Tracking number (the first token after 'SERFF Tracking #:')
      - State (the text after "State:"; a leading known state name, of any
        word count and letter case, is mapped to its 2-letter abbreviation,
        otherwise the first word of that text is used as-is)
      - Filing Company (the text after "Filing Company:" until the newline)

    A field that is missing or empty is returned as None. A PDF without pages
    or without extractable text yields (None, None, None).

    Returns a tuple: (serff, state, filing_company)
    """
    with open(pdf_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        pages = reader.pages
        # Guard against page-less documents and against a missing text result.
        first_page_text = (pages[0].extract_text() or "") if len(pages) > 0 else ""

    serff = _first_capture(r"SERFF Tracking #:\s*([^\n]+)", first_page_text)
    state = _first_capture(r"State:\s*([^\n]+)", first_page_text)
    filing_company = _first_capture(r"Filing Company:\s*([^\n]+)", first_page_text)

    # The tracking number is only the first token; the rest of the line is
    # other content that follows it in the extracted text.
    if serff:
        serff = serff.split()[0]

    if state:
        state = _abbreviate_state(state)

    return serff, state, filing_company

def _safe_filename_part(text):
    """Replace characters that are not valid inside a file name with '_'."""
    return re.sub(r'[<>:"/\\|?*\x00-\x1f]', "_", text)


def process_filings_for_week(week_date, base_dir="Filings"):
    """
    Copy one week's downloaded filings into the "Final" folder under
    descriptive names.

    Given a week_date string (e.g., '2025-02-03'), this function:
      1. Uses '<base_dir>/Initial/<week_date>' as the source folder. If it
         does not exist, a warning is printed and nothing else happens.
      2. Creates the destination folder '<base_dir>/Final/<week_date>' if it
         doesn't exist.
      3. Loops through each PDF file in the source folder (the extension is
         matched case-insensitively), newest modification time first.
      4. For each PDF, extracts the SERFF Tracking number and State. Files
         that cannot be read, or that have no SERFF number, are skipped with
         a warning so one bad file does not stop the batch.
      5. Copies the file to the destination folder as "State_SERFF#.pdf"
         (or "SERFF#.pdf" when no state was found).

    Duplicates: when several source files share the same target name, only the
    most recently modified one is copied; older ones are skipped. A target file
    left over from an earlier run is overwritten.

    base_dir defaults to the relative folder "Filings", i.e. it is resolved
    against the current working directory.
    """
    initial_folder = os.path.join(base_dir, "Initial", week_date)
    final_folder = os.path.join(base_dir, "Final", week_date)

    if not os.path.isdir(initial_folder):
        print(f"Warning: Source folder {initial_folder} does not exist. Nothing to process.")
        return

    os.makedirs(final_folder, exist_ok=True)

    # Collect the PDFs; comparing the lower-cased name accepts ".PDF" as well.
    pdf_files = [
        os.path.join(initial_folder, entry)
        for entry in os.listdir(initial_folder)
        if entry.lower().endswith(".pdf")
    ]
    pdf_files = [path for path in pdf_files if os.path.isfile(path)]

    # Newest first; the path is a tie-breaker so the order is deterministic.
    pdf_files.sort(key=lambda f: (os.path.getmtime(f), f), reverse=True)

    # Target names written during this run (lower-cased, because the target
    # file system may be case-insensitive).
    written = set()

    for pdf_file in pdf_files:
        try:
            serff, state, _ = extract_fields_from_pdf(pdf_file)
        except Exception as exc:
            print(f"Warning: Could not read {pdf_file} ({exc}). Skipping.")
            continue

        if not serff:
            print(f"Warning: Could not extract SERFF Tracking number from {pdf_file}. Skipping.")
            continue

        # "State_SERFF#.pdf", or only the SERFF number if no state is available.
        stem = f"{state}_{serff}" if state else f"{serff}"
        new_filename = _safe_filename_part(stem) + ".pdf"
        dest_file = os.path.join(final_folder, new_filename)

        # Files are visited newest first, so a repeated target name means this
        # file is an older copy of a filing that has already been written.
        if new_filename.lower() in written:
            print(f"Skipping older duplicate of SERFF {serff}: {pdf_file}")
            continue

        if os.path.exists(dest_file):
            print(f"Replacing existing {dest_file} with {pdf_file}")
        else:
            print(f"Copying {pdf_file} to {dest_file}")

        # copy2 overwrites an existing target and keeps the file's metadata.
        shutil.copy2(pdf_file, dest_file)
        written.add(new_filename.lower())

# Example usage:
# process_filings_for_week("2025-02-03")


In [4]:
# Copy and rename the filings collected in the Initial/2025-03-03 folder
# (folders are named after the Monday that starts the week).
process_filings_for_week("2025-03-03")


Copying Filings\Initial\2025-03-03\28497323-9289-fcc2-a19a-d839ddfc1598.pdf to Filings\Final\2025-03-03\VA_HART-134429350.pdf
Copying Filings\Initial\2025-03-03\137a2e60-c738-1e3d-7abb-cff9cf522ee2.pdf to Filings\Final\2025-03-03\TN_USAA-134436312.pdf
Copying Filings\Initial\2025-03-03\fa0533a1-8a5c-b1b6-6e1e-0dfb47a3bc14.pdf to Filings\Final\2025-03-03\TN_LBPM-134434596.pdf
Copying Filings\Initial\2025-03-03\645ff4b1-8bf6-a696-5fb6-225a29603acc.pdf to Filings\Final\2025-03-03\CT_TRVD-G134412788.pdf
Copying Filings\Initial\2025-03-03\fbd0b6e1-8c52-5802-1d69-b515bea23802.pdf to Filings\Final\2025-03-03\TX_GMMX-134436826.pdf
Copying Filings\Initial\2025-03-03\47f80092-3211-cf23-17ba-8aa6c518caef_1.pdf to Filings\Final\2025-03-03\AL_LBPM-134436753.pdf
Copying Filings\Initial\2025-03-03\531e97c2-5b90-83e0-4927-2901cd970b99_1.pdf to Filings\Final\2025-03-03\NE_TRVD-G134437855.pdf
Copying Filings\Initial\2025-03-03\895a49cf-60a0-9610-9166-132b2f30a0d6.pdf to Filings\Final\2025-03-03\MI_LBPM-

In [3]:
import os
# Show the kernel's working directory: process_filings_for_week builds its
# "Filings/..." paths relative to it, so it must be the project folder.
os.getcwd()

'C:\\Users\\lawre\\OneDrive\\Documents\\Competitor_filings_monitor'