<a href="https://colab.research.google.com/github/Info-stats-ai/CAFB/blob/main/The_Builder_Market.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Cell 1: Mount Google Drive
from google.colab import drive

# This will prompt you for authorization.
# Click the link, sign in to your Google account, copy the authorization code,
# and paste it back into the box in this cell.
drive.mount('/content/drive')

print("\n✅ Google Drive successfully mounted!")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

✅ Google Drive successfully mounted!


In [2]:
# Install necessary libraries first
!pip install pdfplumber -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m64.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.8/2.8 MB[0m [31m84.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.5/4.5 MB[0m [31m101.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
import os
import zipfile

In [4]:
# --- CONFIGURATION ---
# This path MUST point to the folder containing your UPLOADED ZIP FILES.
# Please double-check that this is the correct path.
pdf_zip_folder_path = "/content/drive/My Drive/homeguide_zipped" # <--- UPDATE THIS PATH if needed
# -------------------

# These are the temporary folders we will create in the Colab environment.
unzipped_pdfs_folder = "unzipped_pdfs"
text_output_folder = "extracted_text"

# Create the folders to work in.
os.makedirs(unzipped_pdfs_folder, exist_ok=True)
os.makedirs(text_output_folder, exist_ok=True)

print("\n✅ Drive mounted and folders are set up successfully.")
print(f"I will look for zip files in: '{pdf_zip_folder_path}'")


✅ Drive mounted and folders are set up successfully.
I will look for zip files in: '/content/drive/My Drive/homeguide_zipped'


In [5]:
print(f"--- Starting Step 3: Unzipping files from '{pdf_zip_folder_path}' ---")

if not os.path.exists(pdf_zip_folder_path):
    print(f"❌ ERROR: The folder '{pdf_zip_folder_path}' was not found.")
else:
    zip_files = [f for f in os.listdir(pdf_zip_folder_path) if f.lower().endswith(".zip")]
    if not zip_files:
        print(f"❌ ERROR: No .zip files were found in '{pdf_zip_folder_path}'.")
    else:
        print(f"Found {len(zip_files)} zip files to process...")
        for zip_filename in zip_files:
            print(f"   Unzipping '{zip_filename}'...")
            zip_path = os.path.join(pdf_zip_folder_path, zip_filename)
            try:
                with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                    zip_ref.extractall(unzipped_pdfs_folder)
            except Exception as e:
                print(f"   ⚠️ Could not unzip '{zip_filename}'. Error: {e}")

        # Verify the result by counting the number of items in the output folder.
        # Note: This counts files AND any sub-folders that were created.
        total_items = len(os.listdir(unzipped_pdfs_folder))
        print(f"\n✅ Unzipping complete! The '{unzipped_pdfs_folder}' folder now contains {total_items} items.")

--- Starting Step 3: Unzipping files from '/content/drive/My Drive/homeguide_zipped' ---
Found 3 zip files to process...
   Unzipping 'Homeguide.zip'...
   Unzipping 'Homeguide-20250902T030504Z-1-002.zip'...
   Unzipping 'Homeguide-20250902T030504Z-1-001.zip'...

✅ Unzipping complete! The 'unzipped_pdfs' folder now contains 1 items.


In [6]:
# Step 4: Find All PDFs Recursively and Rename Them

import os
import json

# This is the top-level folder we need to search inside.
unzipped_pdfs_folder = "unzipped_pdfs"

print("\n--- Starting Step 4: Finding and renaming all PDFs ---")

# This list will store the full path to every PDF we find.
all_pdf_paths = []

# os.walk is a powerful function that goes through a directory and all its subdirectories.
# This is what makes our script robust to the nested folder structure.
for root, dirs, files in os.walk(unzipped_pdfs_folder):
    for file in files:
        if file.lower().endswith(".pdf"):
            # We save the full, correct path to each PDF.
            all_pdf_paths.append(os.path.join(root, file))

total_files_found = len(all_pdf_paths)
if total_files_found == 0:
    print("❌ ERROR: No PDF files were found even after searching all subfolders.")
    print("Please check the 'unzipped_pdfs' folder to see if the files are there.")
else:
    print(f"✅ Success! Found {total_files_found} PDF files inside the subfolders.")
    print("Proceeding to rename...")

    name_mapping = {}
    # This list will store the NEW paths of the renamed files for the next step.
    renamed_files_paths = []

    # Now, we loop through the full paths we found and rename each file.
    for i, old_path in enumerate(all_pdf_paths):
        try:
            # The new path will be in the same directory as the old one.
            new_path = os.path.join(os.path.dirname(old_path), f"{i:04d}.pdf")

            # We map the new simple name back to the original full path for our records.
            name_mapping[os.path.basename(new_path)] = old_path
            os.rename(old_path, new_path)
            renamed_files_paths.append(new_path)
        except Exception as e:
            print(f"   ⚠️ Could not rename '{os.path.basename(old_path)}'. Error: {e}")

    # Save our mapping key for future reference.
    with open('filename_mapping.json', 'w') as f:
        json.dump(name_mapping, f, indent=4)

    print(f"\n✅ Renaming complete! {len(name_mapping)} files are now in a simple format.")


--- Starting Step 4: Finding and renaming all PDFs ---
✅ Success! Found 900 PDF files inside the subfolders.
Proceeding to rename...

✅ Renaming complete! 900 files are now in a simple format.


In [None]:
# Step 5: Extract Text Using pdfplumber

import pdfplumber
import os

# These are the folders we are working with
text_output_folder = "extracted_text"

print("\n--- Starting Step 5: Extracting text with the upgraded pdfplumber library ---")

# This script relies on the 'renamed_files_paths' list created in the previous cell.
# This check ensures we have the list before we start.
if 'renamed_files_paths' not in locals() or not renamed_files_paths:
     print("❌ ERROR: Could not find the list of renamed files.")
     print("Please re-run the previous cell (Step 4) to generate the file list.")
else:
    total_files_to_extract = len(renamed_files_paths)
    print(f"Found {total_files_to_extract} cleaned PDF files to process...")

    # Loop through our list of clean, renamed PDF paths
    for i, pdf_path in enumerate(renamed_files_paths):
        # Get the simple filename (e.g., "0001.pdf") for progress messages
        pdf_filename = os.path.basename(pdf_path)

        # Print a progress update every 50 files to show it's working
        if (i + 1) % 50 == 0 or i == 0:
            print(f"Processing file {i+1}/{total_files_to_extract}: {pdf_filename}...")

        try:
            full_text = ""
            # Open the PDF with pdfplumber
            with pdfplumber.open(pdf_path) as pdf:
                # Loop through each page in the PDF
                for page in pdf.pages:
                    # .extract_text() is excellent at preserving the layout of tables.
                    # x_tolerance=2 helps in aligning text that is slightly offset.
                    text = page.extract_text(x_tolerance=2)
                    if text:
                        full_text += text + "\n\n--- Page Break ---\n\n"

            # Create the output filename (e.g., "0001.txt")
            text_filename = pdf_filename.replace('.pdf', '.txt')
            output_path = os.path.join(text_output_folder, text_filename)

            # Save the extracted text to the new file
            with open(output_path, "w", encoding="utf-8") as f:
                f.write(full_text)
        except Exception as e:
            # If one PDF is corrupted or unreadable, this will log the error and continue
            print(f"   ⚠️ Could not process {pdf_filename}. It may be corrupted or empty. Error: {e}")

    print(f"\n✅✅✅ Data Preparation Complete! ✅✅✅")
    print(f"High-quality text from all files has been extracted to the '{text_output_folder}' folder.")


--- Starting Step 5: Extracting text with the upgraded pdfplumber library ---
Found 900 cleaned PDF files to process...
Processing file 1/900: 0000.pdf...




Processing file 50/900: 0049.pdf...




Processing file 100/900: 0099.pdf...
Processing file 150/900: 0149.pdf...




Processing file 200/900: 0199.pdf...




Processing file 250/900: 0249.pdf...




Processing file 300/900: 0299.pdf...




Processing file 350/900: 0349.pdf...
Processing file 400/900: 0399.pdf...
Processing file 450/900: 0449.pdf...




Processing file 500/900: 0499.pdf...




Processing file 550/900: 0549.pdf...
Processing file 600/900: 0599.pdf...
