In [None]:
!apt-get install -y poppler-utils
!pip install PyPDF2 pdf2image opencv-python

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
poppler-utils is already the newest version (22.02.0-2ubuntu0.2).
0 upgraded, 0 newly installed, 0 to remove and 16 not upgraded.


In [None]:
# Required Libraries
import PyPDF2
import re
import pdf2image
import cv2
import numpy as np
import pandas as pd

# Extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Read a PDF and return the extracted text of each page.

    Args:
        pdf_path: Path of the PDF file; it is opened in binary mode.

    Returns:
        A list with one entry per page, in page order, holding whatever
        ``extract_text()`` returned for that page.

    Side effects:
        Prints the complete list of page texts to stdout.
    """
    with open(pdf_path, "rb") as handle:
        reader = PyPDF2.PdfReader(handle)
        page_count = len(reader.pages)
        texts = []
        # Pages must be read while the file handle is still open.
        for index in range(page_count):
            texts.append(reader.pages[index].extract_text())
        print(texts)
    return texts


# Extract area and region names from the text
# def extract_area_data(texts):
#     pattern = r"(\w+ AREA):[ \n]+(\d+ sq\.ft\.)"
#     area_data = []
#     for text in texts:
#         matches = re.findall(pattern, text)
#         area_dict = {region: area for region, area in matches}
#         area_data.append(area_dict)
#     return area_data

def extract_area_data(texts):
    """Collect the "<LABEL> AREA: <n> sq.ft." entries found on each page.

    Args:
        texts: Iterable of page texts (one string per page).

    Returns:
        A list with one dict per page mapping each matched label
        (e.g. ``'OUTDOOR AREA'``) to its area string (e.g. ``'77 sq.ft.'``).
        Any label containing ``'SUITE AREA'`` is replaced by two keys:
        ``'Suite Name'`` (the text before ``'SUITE AREA'``, stripped) and
        ``'Suite Area'`` (the area string of that label).
    """
    area_regex = re.compile(r"(\w+ AREA):[ \n]+(\d+ sq\.ft\.)")
    results = []
    for page_text in texts:
        # findall yields (label, area) pairs; later duplicates overwrite earlier ones.
        page_areas = dict(area_regex.findall(page_text))

        # Snapshot the suite labels first so the dict can be edited safely below.
        suite_labels = [label for label in page_areas if 'SUITE AREA' in label]
        for label in suite_labels:
            page_areas['Suite Name'] = label.split('SUITE AREA')[0].strip()
            page_areas['Suite Area'] = page_areas.pop(label)

        results.append(page_areas)
    return results


def extract_room_details(text):
    """Map room labels to their dimension strings found in ``text``.

    A match is a run of letters and slashes immediately followed by a
    dimension such as ``13’0”(10’9”) x 23’3”`` (the parenthesised part is
    optional), e.g. ``'LIVING/DININGROOM13’0”(10’9”) x 23’3”LIN.'``.
    Matching is case-insensitive.

    Args:
        text: Page text to scan.

    Returns:
        Dict of ``{room_label: dimension}``; when a label occurs more than
        once, the last dimension wins.
    """
    room_regex = re.compile(
        r'([A-Z/]+)(\d+’\d+”(?:\(\d+’\d+”\))? x \d+’\d+”)',
        re.IGNORECASE,
    )
    return {room: dimension for room, dimension in room_regex.findall(text)}

def extract_floor_number(text):
    """Extract the floor or floor range a plan applies to.

    Recognises both a range such as ``'3rd - 9th Floor'`` and a single floor
    such as ``'3rd Floor'`` (case-insensitive). The previous pattern demanded
    a space on each side of the optional dash plus another before "Floor",
    so the single-floor form never matched; whitespace is now flexible.

    Args:
        text: Page text to scan; ``None`` or an empty string yields ``None``.

    Returns:
        ``'<start> - <end>'`` for a range whose ends differ, just
        ``'<start>'`` for a single floor (or a range with equal ends), or
        ``None`` when no floor marker is found.
    """
    if not text:
        return None

    ordinal = r'\d+(?:st|nd|rd|th)'
    # Optional separator (hyphen, en dash or em dash) and optional second
    # ordinal, each with flexible surrounding whitespace.
    pattern = rf'({ordinal})\s*[-–—]?\s*({ordinal})?\s*Floor'
    match = re.search(pattern, text, re.IGNORECASE)
    if match is None:
        return None

    start_floor = match.group(1)
    end_floor = match.group(2) or start_floor
    if start_floor == end_floor:
        return start_floor
    return f"{start_floor} - {end_floor}"

def extract_bedroom_type(text):
    """Return the text that precedes the ``'\\nAll areas'`` marker.

    Args:
        text: Page text to inspect.

    Returns:
        Everything before the first ``'\\nAll areas'`` occurrence with
        surrounding whitespace stripped, or ``None`` when the marker is absent.
    """
    heading, marker, _rest = text.partition('\nAll areas')
    # partition leaves the separator empty when the marker was not found.
    if not marker:
        return None
    return heading.strip()


# Convert PDF to images
def pdf_to_images(pdf_path):
    """Render the PDF at ``pdf_path`` to images.

    Args:
        pdf_path: Path of the PDF file, passed straight to
            ``pdf2image.convert_from_path``.

    Returns:
        Whatever ``pdf2image.convert_from_path`` returns for that path.
    """
    rendered_pages = pdf2image.convert_from_path(pdf_path)
    return rendered_pages

# Classify the north indicator's axis (edge-based template matching is run,
# but see the NOTE inside: its result does not yet affect the answer)
def detect_north_direction(image, template):
    """Return the axis label associated with the north-indicator template.

    Args:
        image: BGR page image (array accepted by ``cv2.cvtColor``).
        template: BGR image of the north indicator, e.g. from ``cv2.imread``.

    Returns:
        ``"North/South"`` when the template is taller than it is wide,
        otherwise ``"East/West"``.

    Raises:
        ValueError: If ``image`` or ``template`` is ``None``. ``cv2.imread``
            returns ``None`` for an unreadable path, which previously surfaced
            as an opaque OpenCV error inside ``cvtColor``.
    """
    if image is None:
        raise ValueError("image is None; expected a BGR image array")
    if template is None:
        raise ValueError(
            "template is None; cv2.imread returns None when the file cannot be read"
        )

    img_gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    template_gray = cv2.cvtColor(template, cv2.COLOR_BGR2GRAY)

    # Match on Canny edge maps rather than raw intensities.
    img_edges = cv2.Canny(img_gray, 50, 150)
    template_edges = cv2.Canny(template_gray, 50, 150)
    res = cv2.matchTemplate(img_edges, template_edges, cv2.TM_CCOEFF_NORMED)

    # NOTE(review): the best-match score and location are computed but never
    # used, so the returned label depends only on the template's aspect ratio
    # and is the same for every page. Confirm the intended use of the match.
    _min_val, _max_val, _min_loc, _max_loc = cv2.minMaxLoc(res)

    template_height, template_width = template.shape[:2]
    if template_height > template_width:
        return "North/South"
    return "East/West"

# Determine front gate orientation based on north direction
def get_front_gate_orientation(north_direction):
    """Translate a north-direction label into a front gate orientation.

    Args:
        north_direction: One of ``"North/South"``, ``"South/North"``,
            ``"East/West"`` or ``"West/East"``.

    Returns:
        The mapped orientation (``"South"``, ``"North"``, ``"East"`` or
        ``"West"`` respectively), or ``"Unknown"`` for any other value.
    """
    gate_by_north = dict([
        ("North/South", "South"),
        ("South/North", "North"),
        ("East/West", "East"),
        ("West/East", "West"),
    ])
    try:
        return gate_by_north[north_direction]
    except KeyError:
        return "Unknown"

# Main code
pdf_path = "/content/sample_data/One-Cole-Floorplans.pdf"
template_path = "/content/sample_data/indicator.png"

# Load the north-indicator template first: cv2.imread returns None (it does
# not raise) for an unreadable path, so fail fast before the slower PDF work.
template = cv2.imread(template_path)
if template is None:
    raise FileNotFoundError(f"Could not read template image: {template_path}")

# Extract per-page data (every list below has one entry per PDF page)
texts = extract_text_from_pdf(pdf_path)
area_data = extract_area_data(texts)
images = pdf_to_images(pdf_path)
floor_numbers = [extract_floor_number(text) for text in texts]
room_data = [extract_room_details(text) for text in texts]
df_room = pd.DataFrame(room_data)
bedroom_types = [extract_bedroom_type(text) for text in texts]


# Determine north directions and front gate orientations.
# The rendered pages are converted from RGB to OpenCV's BGR channel order first.
north_directions = [detect_north_direction(cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR), template) for img in images]
front_gate_orientations = [get_front_gate_orientation(direction) for direction in north_directions]

# Create and save DataFrame
df = pd.DataFrame(area_data)
df['Type of Bedroom'] = bedroom_types
df['Floor Number'] = floor_numbers
# Room columns are appended side by side; rows align on the default page index.
df = pd.concat([df, df_room], axis=1)

df['North Direction'] = north_directions
df['Front Gate Orientation'] = front_gate_orientations
df['Page Number'] = range(1, len(df) + 1)  # 1-based page numbering
df.to_excel("output_data.xlsx", index=False)

['Studio + media \nAll areas and stated room dimensions are approximate. Floor area measured in accordance with Tarion Warranty Corporation bulletin #22. Actual living area will vary from the stated floor area. The purchaser acknowledges that the actual unit purchased may be a reverse layout to the plan shown. Illustration is artist concept. E. & O.E.TheE-06WilkinsSUITE AREA:          442 sq.ft.OUTDOOR AREA:    77 sq.ft.TOTAL AREA:          519 sq.ft.East\n3rd - 9th FloorBALCONY\nLIVING/DININGROOM13’0”(10’9”) x 23’3”LIN.\nW/DD/W\nENTRYMEDIAPRIVACY SCREEN\nPRIVACY SCREEN\nBATH', 'One bedroom\nAll areas and stated room dimensions are approximate. Floor area measured in accordance with Tarion Warranty Corporation bulletin #22. Actual living area will vary from the stated floor area. The purchaser acknowledges that the actual unit purchased may be a reverse layout to the plan shown. Illustration is artist concept. E. & O.E.TheW-01RegencySUITE AREA:            576 sq.ft.OUTDOOR AREA:    119

  06WilkinsSUITE AREA OUTDOOR AREA  TOTAL AREA 01RegencySUITE AREA  \
0          442 sq.ft.    77 sq.ft.  519 sq.ft.                 NaN   
1                 NaN   119 sq.ft.  695 sq.ft.          576 sq.ft.   
2                 NaN   138 sq.ft.  747 sq.ft.                 NaN   
3                 NaN   119 sq.ft.  767 sq.ft.                 NaN   
4                 NaN   138 sq.ft.  842 sq.ft.                 NaN   
5                 NaN    66 sq.ft.  895 sq.ft.                 NaN   

  12HillcrestSUITE AREA 03TrinitySUITE AREA 19DovercourtSUITE AREA  \
0                   NaN                 NaN                    NaN   
1                   NaN                 NaN                    NaN   
2            609 sq.ft.                 NaN                    NaN   
3                   NaN          648 sq.ft.                    NaN   
4                   NaN                 NaN             704 sq.ft.   
5                   NaN                 NaN                    NaN   

  12RegentSUITE AR

In [None]:

# Dump the assembled DataFrame as plain text.
# NOTE(review): the saved output under this cell lists per-suite
# '<name>SUITE AREA' columns, whereas the current extract_area_data emits
# 'Suite Name' / 'Suite Area' keys - the output looks stale; re-run to confirm.
print(df)

  06WilkinsSUITE AREA OUTDOOR AREA  TOTAL AREA 01RegencySUITE AREA  \
0          442 sq.ft.    77 sq.ft.  519 sq.ft.                 NaN   
1                 NaN   119 sq.ft.  695 sq.ft.          576 sq.ft.   
2                 NaN   138 sq.ft.  747 sq.ft.                 NaN   
3                 NaN   119 sq.ft.  767 sq.ft.                 NaN   
4                 NaN   138 sq.ft.  842 sq.ft.                 NaN   
5                 NaN    66 sq.ft.  895 sq.ft.                 NaN   

  12HillcrestSUITE AREA 03TrinitySUITE AREA 19DovercourtSUITE AREA  \
0                   NaN                 NaN                    NaN   
1                   NaN                 NaN                    NaN   
2            609 sq.ft.                 NaN                    NaN   
3                   NaN          648 sq.ft.                    NaN   
4                   NaN                 NaN             704 sq.ft.   
5                   NaN                 NaN                    NaN   

  12RegentSUITE AR

In [None]:

# Prints the DataFrame again; this is the same statement as the preceding cell.
print(df)

  06WilkinsSUITE AREA OUTDOOR AREA  TOTAL AREA 01RegencySUITE AREA  \
0          442 sq.ft.    77 sq.ft.  519 sq.ft.                 NaN   
1                 NaN   119 sq.ft.  695 sq.ft.          576 sq.ft.   
2                 NaN   138 sq.ft.  747 sq.ft.                 NaN   
3                 NaN   119 sq.ft.  767 sq.ft.                 NaN   
4                 NaN   138 sq.ft.  842 sq.ft.                 NaN   
5                 NaN    66 sq.ft.  895 sq.ft.                 NaN   

  12HillcrestSUITE AREA 03TrinitySUITE AREA 19DovercourtSUITE AREA  \
0                   NaN                 NaN                    NaN   
1                   NaN                 NaN                    NaN   
2            609 sq.ft.                 NaN                    NaN   
3                   NaN          648 sq.ft.                    NaN   
4                   NaN                 NaN             704 sq.ft.   
5                   NaN                 NaN                    NaN   

  12RegentSUITE AR

In [None]:
# Show the per-page area dicts produced by extract_area_data.
print(area_data)

[{'OUTDOOR AREA': '77 sq.ft.', 'TOTAL AREA': '519 sq.ft.', 'Suite Name': '06Wilkins', 'Suite Area': '442 sq.ft.'}, {'OUTDOOR AREA': '119 sq.ft.', 'TOTAL AREA': '695 sq.ft.', 'Suite Name': '01Regency', 'Suite Area': '576 sq.ft.'}, {'OUTDOOR AREA': '138 sq.ft.', 'TOTAL AREA': '747 sq.ft.', 'Suite Name': '12Hillcrest', 'Suite Area': '609 sq.ft.'}, {'OUTDOOR AREA': '119 sq.ft.', 'TOTAL AREA': '767 sq.ft.', 'Suite Name': '03Trinity', 'Suite Area': '648 sq.ft.'}, {'OUTDOOR AREA': '138 sq.ft.', 'TOTAL AREA': '842 sq.ft.', 'Suite Name': '19Dovercourt', 'Suite Area': '704 sq.ft.'}, {'OUTDOOR AREA': '66 sq.ft.', 'TOTAL AREA': '895 sq.ft.', 'Suite Name': '12Regent', 'Suite Area': '829 sq.ft.'}]
