In [143]:
!pip install pytesseract pdf2image pandas Pillow
!pip install opencv-python



In [141]:
import pytesseract
from pdf2image import convert_from_path
import os
from PIL import Image
import pandas as pd

def extract_text_from_pdf(pdf_path):
    """
    Extract both printed and handwritten text from PDF pages.
    Returns a structured dataset.
    """
    # Convert PDF to images
    images = convert_from_path(pdf_path)
    
    # Create temporary directory for image processing
    if not os.path.exists('temp'):
        os.makedirs('temp')
    
    database = []
    
    # Process each page
    for page_num, image in enumerate(images, 1):
        # Save the image temporarily
        image_path = f'temp/page_{page_num}.png'
        image.save(image_path, 'PNG')
        
        # Configure Tesseract for different text types
        # For printed text
        printed_config = r'--oem 3 --psm 6'
        printed_text = pytesseract.image_to_string(image_path, config=printed_config)
        
        # For handwritten text
        handwritten_config = r'--oem 3 --psm 6 -l eng --psm 12'
        handwritten_text = pytesseract.image_to_string(image_path, config=handwritten_config)
        
        # Clean up extracted text
        printed_text = clean_text(printed_text)
        handwritten_text = clean_text(handwritten_text)
        
        # Store in database
        database.append({
            'page_number': page_num,
            'printed_text': printed_text,
            'handwritten_text': handwritten_text
        })
        
        # Clean up temporary file
        os.remove(image_path)
    
    # Remove temporary directory
    os.rmdir('temp')
    
    return database

def clean_text(text):
    """
    Clean and normalize extracted text
    """
    # Remove extra whitespace
    text = ' '.join(text.split())
    # Remove special characters but keep basic punctuation
    text = ''.join(char for char in text if char.isalnum() or char in ' .,()-/')
    return text.strip()

def save_to_database(data, output_path='construction_text_database.csv'):
    """
    Save extracted text to CSV database
    """
    df = pd.DataFrame(data)
    df.to_csv(output_path, index=False)
    return df

def main():
    pdf_path = 'extracted.pdf'  # Update with your PDF path
    
    try:
        # Extract text
        print("Extracting text from PDF...")
        text_data = extract_text_from_pdf(pdf_path)
        
        # Save to database
        print("Saving to database...")
        df = save_to_database(text_data)
        
        print(f"Successfully processed {len(text_data)} pages")
        print("Sample of extracted data:")
        print(df.head())
        
    except Exception as e:
        print(f"An error occurred: {str(e)}")

if __name__ == "__main__":
    main()

Extracting text from PDF...
Saving to database...
Successfully processed 11 pages
Sample of extracted data:
   page_number                                       printed_text  \
0            1  1 GENERAL 7  ) aS   6    F SSSSS Ee SEELSSSEE ...   
1            2  Built Environment Environment  surroundings wh...   
2            3  Built Environment . .  gales cold Environmenta...   
3            4  Built Environment Physical considerations 1. N...   
4            5  The StructureBasic Types for design purposes f...   

                                    handwritten_text  
0  1 GENERAL (1  sr JX  ove aN EG  C fo2i r   BUI...  
1  Built Environment man-made or a Environment  s...  
2  Built Environment cold Environmental Considera...  
3  Built Environment Physical considerations Natu...  
4  The StructureBasic Types for design purposes f...  


In [None]:
!pip install PyMuPDF

In [147]:
#Python script that will extract both text and images separately:
import pytesseract
from pdf2image import convert_from_path
import os
from PIL import Image
import pandas as pd
import cv2
import numpy as np
import fitz  # PyMuPDF library for PDF image extraction

def extract_images_from_pdf(pdf_path, output_dir="extracted_images"):
    """
    Extract images from PDF using PyMuPDF
    """
    # Create output directory if it doesn't exist
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    
    # Open PDF
    pdf_document = fitz.open(pdf_path)
    image_list = []
    
    # Iterate through pages
    for page_num in range(len(pdf_document)):
        page = pdf_document[page_num]
        image_list = page.get_images()
        
        # Process each image on the page
        for img_index, img in enumerate(image_list):
            # Get image data
            xref = img[0]
            base_image = pdf_document.extract_image(xref)
            image_bytes = base_image["image"]
            image_ext = base_image["ext"]
            
            # Save image
            image_filename = f"{output_dir}/page{page_num+1}_image{img_index+1}.{image_ext}"
            with open(image_filename, "wb") as image_file:
                image_file.write(image_bytes)
            
    pdf_document.close()
    return len(image_list)

def extract_text_and_images(pdf_path):
    """
    Extract both text and images from PDF
    """
    # Extract images first
    print("Extracting images...")
    num_images = extract_images_from_pdf(pdf_path)
    print(f"Extracted {num_images} images")
    
    # Extract text (using your existing functions)
    print("Extracting text...")
    text_data = extract_text_from_pdf(pdf_path)
    
    return text_data

def main():
    pdf_path = 'extracted.pdf'
    
    try:
        # Extract both text and images
        text_data = extract_text_and_images(pdf_path)
        
        # Save text to database
        print("Saving to database...")
        df = save_to_database(text_data)
        
        print(f"Successfully processed {len(text_data)} pages")
        print("\nSample of extracted data:")
        for index, row in df.iterrows():
            print(f"\nPage {row['page_number']}:")
            print("Printed text:", row['printed_text'][:100], "...")
            if row['handwritten_text']:
                print("Handwritten text:", row['handwritten_text'])
        
        print("\nImages have been saved to the 'extracted_images' directory")
        
    except Exception as e:
        print(f"An error occurred: {str(e)}")

if __name__ == "__main__":
    main()

Extracting images...
Extracted 1 images
Extracting text...
Saving to database...
Successfully processed 11 pages

Sample of extracted data:

Page 1:
Printed text: 1 GENERAL 7  ) aS   6    F SSSSS Ee SEELSSSEE F BUILT ENVIRONMENT THE STRUCTURE PRIMARY AND SECONDAR ...
Handwritten text: 1 GENERAL (1  sr JX  ove aN EG  C fo2i r   BUILT ENVIRONMENT THE STRUCTURE PRIMARY AND SECONDARY ELEMENTS CONSTRUCTION ACTIVITIES CONSTRUCTION DOCUMENTS CONSTRUCTION DRAWINGS BUILDING SURVEY CDM REGULATIONS SAFETY SIGNS AND SYMBOLS PLANNING APPLICATION MODULAR COORDINATION CONSTRUCTION REGULATIONS BUILDING REGULATIONS BRITISH STANDARDS EUROPEAN STANDARDS CPI SYSTEM OF CODING CI/SFB SYSTEM OF CODING

Page 2:
Printed text: Built Environment Environment  surroundings which can be natural, man-made or a combination of these ...
Handwritten text: Built Environment man-made or a Environment  surroundings which can be natural combination of these Built Environment  created by man with or without the aid of the n

In [177]:
!pip install opencv-python
!pip install pdf2image
!pip install pytesseract
!pip install pandas
!pip install numpy
!pip install Pillow



In [183]:
!pip install opencv-python pdf2image pytesseract pandas numpy Pillow



In [194]:
import cv2
import numpy as np
import pdf2image
import pytesseract
import pandas as pd
import os
from pathlib import Path

class BlueTextExtractor:
    def __init__(self, pdf_path):
        self.pdf_path = pdf_path
        self.annotations = []
        
        # Create debug directory
        self.debug_dir = "debug_output"
        os.makedirs(self.debug_dir, exist_ok=True)

    def extract_blue_text(self, image):
        """Extract blue handwritten text using multiple color spaces"""
        # Convert to multiple color spaces for better detection
        hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
        lab = cv2.cvtColor(image, cv2.COLOR_BGR2LAB)
        
        # HSV blue range
        lower_blue_hsv = np.array([100, 50, 50])
        upper_blue_hsv = np.array([130, 255, 255])
        mask_hsv = cv2.inRange(hsv, lower_blue_hsv, upper_blue_hsv)
        
        # LAB blue range (b channel)
        _, _, b = cv2.split(lab)
        mask_lab = cv2.threshold(b, 145, 255, cv2.THRESH_BINARY)[1]
        
        # Combine masks
        blue_mask = cv2.bitwise_or(mask_hsv, mask_lab)
        
        # Clean up mask
        kernel = np.ones((2,2), np.uint8)
        blue_mask = cv2.morphologyEx(blue_mask, cv2.MORPH_OPEN, kernel)
        blue_mask = cv2.morphologyEx(blue_mask, cv2.MORPH_CLOSE, kernel)
        
        return blue_mask

    def save_debug_image(self, image, name, page_num, region_num=None):
        """Save image for debugging"""
        if region_num is not None:
            filename = f"{name}_page{page_num}_region{region_num}.png"
        else:
            filename = f"{name}_page{page_num}.png"
        path = os.path.join(self.debug_dir, filename)
        cv2.imwrite(path, image)

    def process_region(self, image, region, page_num, region_num):
        """Process and save a single region"""
        x, y, w, h = region
        padding = 10
        x = max(0, x - padding)
        y = max(0, y - padding)
        w = min(image.shape[1] - x, w + 2*padding)
        h = min(image.shape[0] - y, h + 2*padding)
        
        roi = image[y:y+h, x:x+w]
        
        # Save original region
        self.save_debug_image(roi, "original", page_num, region_num)
        
        # Convert to grayscale
        gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)
        self.save_debug_image(gray, "gray", page_num, region_num)
        
        # Enhance contrast
        clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8,8))
        enhanced = clahe.apply(gray)
        self.save_debug_image(enhanced, "enhanced", page_num, region_num)
        
        # Multiple threshold attempts
        _, binary = cv2.threshold(enhanced, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
        self.save_debug_image(binary, "binary", page_num, region_num)
        
        # Save region info
        self.annotations.append({
            'Page': page_num,
            'Region': region_num,
            'X': x,
            'Y': y,
            'Width': w,
            'Height': h
        })
        
        return roi

    def process_pdf(self):
        """Process PDF and save debug images"""
        try:
            pages = pdf2image.convert_from_path(self.pdf_path)
            
            for page_num, page in enumerate(pages, 1):
                print(f"Processing page {page_num}")
                
                # Convert to OpenCV format
                image = cv2.cvtColor(np.array(page), cv2.COLOR_RGB2BGR)
                
                # Get blue mask
                blue_mask = self.extract_blue_text(image)
                self.save_debug_image(blue_mask, "mask", page_num)
                
                # Find contours
                contours, _ = cv2.findContours(blue_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
                
                # Draw contours on original image for visualization
                debug_image = image.copy()
                cv2.drawContours(debug_image, contours, -1, (0, 255, 0), 2)
                self.save_debug_image(debug_image, "contours", page_num)
                
                # Process each region
                for i, contour in enumerate(contours):
                    if cv2.contourArea(contour) > 50:  # Minimum area threshold
                        region = cv2.boundingRect(contour)
                        self.process_region(image, region, page_num, i)
            
            return True
            
        except Exception as e:
            print(f"Error processing PDF: {str(e)}")
            return False

    def save_results(self, output_path='detected_regions.xlsx'):
        """Save region information to Excel"""
        if not self.annotations:
            print("No regions found")
            return
            
        df = pd.DataFrame(self.annotations)
        df = df.sort_values(['Page', 'Y'])
        df.to_excel(output_path, index=False)
        print(f"Results saved to {output_path}")
        print("\nPlease check the 'debug_output' folder for visualization of detected regions")

def main():
    pdf_path = "extracted.pdf"  # Replace with your PDF path
    extractor = BlueTextExtractor(pdf_path)
    
    if extractor.process_pdf():
        extractor.save_results()
    else:
        print("Failed to process PDF")

if __name__ == "__main__":
    main()

Processing page 1
Processing page 2
Processing page 3
Processing page 4
Processing page 5
Processing page 6
Processing page 7
Processing page 8
Processing page 9
Processing page 10
Processing page 11
Results saved to detected_regions.xlsx

Please check the 'debug_output' folder for visualization of detected regions
