In [3]:
!pip install --upgrade google-cloud-vision




In [1]:
!apt-get install -y tesseract-ocr
!pip install pytesseract opencv-python-headless pillow


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.
Collecting pytesseract
  Using cached pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Using cached pytesseract-0.3.13-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract
Successfully installed pytesseract-0.3.13


In [6]:
import os
import re
import json
import io
from google.cloud import vision
from google.oauth2 import service_account
from PIL import Image

# Authenticate to the Google Cloud Vision API with a service-account key file.
# The key location can be supplied through the GOOGLE_APPLICATION_CREDENTIALS
# environment variable so the notebook is not tied to one machine's file
# layout; the original Colab path is kept as the fallback.
KEY_PATH = os.environ.get(
    "GOOGLE_APPLICATION_CREDENTIALS",
    "/content/glassy-azimuth-463621-t4-b9f25b9f7b00.json",
)
credentials = service_account.Credentials.from_service_account_file(KEY_PATH)
client = vision.ImageAnnotatorClient(credentials=credentials)

def process_image(image_path):
    """Run Google Vision document text detection on one image file.

    Args:
        image_path: Path of the image file to read and send to the API.

    Returns:
        The ``full_text_annotation`` attribute of the API response.

    Raises:
        RuntimeError: If the response carries an error message for this image.
    """
    with open(image_path, "rb") as image_file:
        content = image_file.read()
    image = vision.Image(content=content)
    response = client.document_text_detection(image=image)
    # A failed annotation is reported inside the response instead of being
    # raised; without this check a failed page would silently parse as empty.
    if response.error.message:
        raise RuntimeError(
            f"Vision API error for {image_path}: {response.error.message}"
        )
    return response.full_text_annotation

def extract_blocks(document):
    """Flatten an OCR document into a list of text blocks with bounding boxes.

    Every block of every page becomes one dict. "text" holds the block's words
    (each word is its symbols concatenated) joined by single spaces; "x0"/"x1"
    and "y0"/"y1" hold the smallest and largest x and y found among the
    block's bounding-box vertices.
    """
    results = []
    for page in document.pages:
        for blk in page.blocks:
            corners = blk.bounding_box.vertices
            xs = [corner.x for corner in corners]
            ys = [corner.y for corner in corners]
            words = [
                "".join(sym.text for sym in word.symbols)
                for para in blk.paragraphs
                for word in para.words
            ]
            results.append({
                "text": " ".join(words).strip(),
                "x0": min(xs),
                "x1": max(xs),
                "y0": min(ys),
                "y1": max(ys),
            })
    return results

def stitch_blocks(blocks):
    """Join block texts into one newline-separated string.

    Blocks are ordered by their "y0" value, ties broken by "x0"; the input
    list itself is left untouched.
    """
    ordered = list(blocks)
    ordered.sort(key=lambda blk: (blk["y0"], blk["x0"]))
    return "\n".join(blk["text"] for blk in ordered)

def clean_text(text):
    """Normalize whitespace and strip trailing punctuation from a field.

    Args:
        text: The raw field value; may be ``None`` or empty.

    Returns:
        ``None`` when ``text`` is falsy, otherwise the cleaned string:
        U+2800 replaced by a space, soft hyphens removed, a hyphen/en-dash run
        followed by whitespace removed (re-joining words broken across lines),
        runs of whitespace collapsed to one space, and any trailing mix of
        whitespace and ``. , ;`` removed.
    """
    if not text:
        return None
    text = text.replace('\u2800', ' ').replace('\xad', '')
    text = re.sub(r'[-–]+\s+', '', text)
    text = re.sub(r'\s{2,}', ' ', text)
    # Strip whitespace together with the punctuation: removing punctuation
    # first left it in place whenever the value ended in a space ("Smith. ").
    text = re.sub(r'[\s.,;]+$', '', text)
    return text.strip()

# Cross-page accumulators; the per-page loop below extends them.
all_residents, all_deceased = [], []

# Image files to process: 104.jpeg through 108.jpeg.
image_files = [f"{page_no}.jpeg" for page_no in range(104, 109)]

def _split_name(raw_name, prev_last_name, quoted, honorifics):
    """Split a directory name field into ``(first, middle, last)``.

    Honorific tokens are dropped first. The remaining tokens are read as:
      * one token    -> first name; last name inherited from ``prev_last_name``
      * two tokens   -> "First I" (second token is one character, last name
                        inherited) or "Last First"
      * three or more -> "Last First ..." with the third token kept as the
                        middle initial only when it is a single character
    If the entry was ``quoted`` and no last name was found, the previous last
    name is used. Any component that cannot be determined is ``None``.
    """
    parts = [p for p in raw_name.split() if p.lower() not in honorifics]
    first = middle = last = None
    if len(parts) == 1:
        first = parts[0]
        last = prev_last_name
    elif len(parts) == 2:
        if len(parts[1]) == 1:
            first, middle = parts
            last = prev_last_name
        else:
            last, first = parts
    elif len(parts) >= 3:
        last, first = parts[:2]
        if len(parts[2]) == 1:
            middle = parts[2]
    if quoted and not last:
        last = prev_last_name
    return first, middle, last


# Loop-invariant settings, defined once instead of on every page.
# Blocks whose left edge (x0) is below this value go to the left column.
column_threshold = 1100
# Entries containing any of these substrings (case-insensitive) are skipped.
# NOTE(review): matching is by substring, so 'tel' also drops every entry that
# mentions a telephone (and words such as "hotel"); as a result the telephone
# extraction further down never finds a match. Behaviour is kept unchanged
# here — confirm whether 'tel' should stay in this list.
ad_keywords = ['mortgage', 'repairing', 'store', 'per cent', 'trunks', 'wagons', 'tel']
honorifics = {"mr", "mrs", "miss", "ms", "dr"}
directory_name = "Minneapolis 1900"

for image_file in image_files:
    document = process_image(image_file)
    blocks = extract_blocks(document)

    # Split into left/right columns and read the left column first.
    left_column_blocks = [b for b in blocks if b["x0"] < column_threshold]
    right_column_blocks = [b for b in blocks if b["x0"] >= column_threshold]
    stitched_text = stitch_blocks(left_column_blocks) + "\n" + stitch_blocks(right_column_blocks)

    # An entry boundary is whitespace that follows a period and precedes a
    # capital letter or an opening quote.
    entries_raw = re.split(r'(?<=\.)\s+(?=[A-Z"“])', stitched_text)

    resident_entries = []
    dead_entries = []
    # Carried from entry to entry within a page: the last surname seen and the
    # last explicit address (used when an entry's address is "same").
    prev_last_name = None
    prev_address = {"Raw": None, "Indicator": None}
    # The page number is the first run of digits in the file name.
    page_number = int(re.search(r'\d+', image_file).group())

    for entry in entries_raw:
        entry = entry.strip().replace('“', '"').replace('”', '"')
        if not entry or any(k in entry.lower() for k in ad_keywords):
            continue

        # Match deceased entries with "died" and extract name/date/age
        if "died" in entry.lower():
            death_match = re.match(r'^(")?\s*([^\n,]+),\s*died\s+(.*?)(?:,|\s)age[\s:]*([0-9]{1,3})', entry, re.IGNORECASE)
            if death_match:
                quoted = death_match.group(1) == '"'
                raw_name = death_match.group(2).strip()
                date_str = death_match.group(3).strip().rstrip('.,;')
                age = death_match.group(4).strip()

                first, middle, last = _split_name(raw_name, prev_last_name, quoted, honorifics)
                if last:
                    prev_last_name = last
                dead_entries.append({
                    "FirstName": first,
                    "MiddleInitial": middle,
                    "LastName": last,
                    "DateOfDeath": date_str,
                    "Age": age,
                    "DirectoryName": directory_name,
                    "PageNumber": page_number
                })
            # Entries mentioning "died" never fall through to resident parsing,
            # even when the pattern above did not match.
            continue

        name_match = re.match(r'^(")?\s*([^\n,]+),\s*(.+)', entry)
        if not name_match:
            continue

        quoted = name_match.group(1) == '"'
        raw_name_str = name_match.group(2).strip()
        rest = name_match.group(3).strip()

        # Parse moved entries: "moved to <location>"
        move_match = re.match(r'(moved|removed) to\s+(.*)', rest, re.IGNORECASE)
        if move_match:
            location = move_match.group(2).strip().rstrip('.')
            first, middle, last = _split_name(raw_name_str, prev_last_name, quoted, honorifics)
            if last:
                prev_last_name = last
            resident_entries.append({
                "FirstName": first,
                "MiddleInitial": middle,
                "LastName": last,
                "Spouse": None,
                "Occupation": None,
                "CompanyName": None,
                "HomeAddress": {
                    "Raw": location,
                    "Indicator": "moved"
                },
                "Telephone": None,
                "DirectoryName": directory_name,
                "PageNumber": page_number
            })
            continue

        spouse = None
        # Spouse name extraction if "wid" mentioned in a parenthetical; the
        # parenthetical is removed from the name either way.
        paren_match = re.search(r'\((.*?)\)', raw_name_str)
        if paren_match:
            paren_content = paren_match.group(1).strip()
            wid_match = re.match(r'wid\s+(.*)', paren_content, re.IGNORECASE)
            if wid_match:
                spouse = wid_match.group(1).strip()
            raw_name_str = re.sub(r'\s*\([^)]*\)', '', raw_name_str).strip()

        name_str = raw_name_str

        # Skip "see also" cross-references, but remember the name so following
        # entries can inherit it as their surname.
        if re.match(r'see also\b', rest.lower()):
            prev_last_name = name_str
            continue

        # Extract address and indicator (r/b/rms); entries without one are
        # dropped before any name state is updated.
        addr_match = re.search(r'\b(r|b|rms)\b\.?\s*(?P<addr>[^.,\n"]+)', rest)
        if not addr_match:
            continue
        indicator = addr_match.group(1)
        address = addr_match.group("addr").strip().rstrip('.')

        # Track previous address if "same"
        if address.lower() == "same":
            address = prev_address["Raw"]
            indicator = prev_address["Indicator"]
        else:
            prev_address = {"Raw": address, "Indicator": indicator}

        # Extract telephone number if available
        tel_match = re.search(r'\btel\.?\s*([^.,;\n]+)', rest, re.IGNORECASE)
        telephone = tel_match.group(1).strip() if tel_match else None

        # Parse occupation and company from pre-address text: the first token
        # is the occupation, everything after it is the company.
        pre_address = rest[:addr_match.start()].strip().rstrip(",")
        tokens = pre_address.split()
        occupation = tokens[0] if tokens else None
        company = " ".join(tokens[1:]) + "," if len(tokens) > 1 else ""

        # Determine first, middle, last name from tokens
        first, middle, last = _split_name(name_str, prev_last_name, quoted, honorifics)
        if last:
            prev_last_name = last

        resident_entries.append({
            "FirstName": first,
            "MiddleInitial": middle,
            "LastName": last,
            "Spouse": spouse,
            "Occupation": occupation,
            "CompanyName": company,
            "HomeAddress": {
                "Raw": address,
                "Indicator": indicator
            },
            "Telephone": telephone,
            "DirectoryName": directory_name,
            "PageNumber": page_number
        })

    # Post-processing: cleanup and split address fields
    for entry in resident_entries:
        for field in ("FirstName", "MiddleInitial", "LastName", "Spouse",
                      "Occupation", "CompanyName", "Telephone"):
            entry[field] = clean_text(entry.get(field))

        home = entry["HomeAddress"]
        raw_address = clean_text(home["Raw"])
        # Always emit both keys so every record has the same shape, including
        # records whose address is missing.
        home["StreetNumber"] = None
        home["StreetName"] = None
        if raw_address:
            parts = raw_address.split(maxsplit=1)
            if parts and parts[0].isdigit():
                home["StreetNumber"] = parts[0]
                home["StreetName"] = parts[1] if len(parts) > 1 else None
            else:
                home["StreetName"] = raw_address
        del home["Raw"]

    all_residents.extend(resident_entries)
    all_deceased.extend(dead_entries)

# Write the accumulated resident and deceased records as indented JSON.
for out_path, records in (
    ("residents_combined.json", all_residents),
    ("deceased_combined.json", all_deceased),
):
    with open(out_path, "w") as f:
        json.dump(records, f, indent=2)


In [9]:
# Split each resident's CompanyName into a company name and a WorkAddress:
# everything from the first number (at a word boundary) onward is treated as
# the work address, the text before it as the company name.
for entry in all_residents:
    company = entry.get("CompanyName")

    work_address = None
    if isinstance(company, str) and company.strip():
        # If company name has a numeric part (e.g., address)
        match = re.search(r'\b(\d+.*)', company)
        if match:
            work_address = match.group(1).strip()
            company_cleaned = company[:match.start()].strip(", ").strip()
            entry["CompanyName"] = company_cleaned if company_cleaned else None
        else:
            entry["CompanyName"] = company.strip(", ").strip() or None
    else:
        entry["CompanyName"] = None

    # Keep the cell idempotent: after a first run the number has already been
    # moved out of CompanyName, so a re-run finds no match. Fall back to the
    # previously stored value instead of overwriting it with None.
    if work_address is None:
        work_address = entry.get("WorkAddress")
    entry["WorkAddress"] = work_address


In [11]:
# Separate records flagged as "moved" from the main resident list.
# NOTE: this cell rebinds all_residents to the filtered list, so running it a
# second time without re-running the parsing cell produces an empty
# moved_residents.
moved_residents = []
remaining_residents = []

for resident in all_residents:
    home = resident.get("HomeAddress", {})
    if home.get("Indicator") != "moved":
        remaining_residents.append(resident)
        continue

    # Shallow copy, so dropping HomeAddress below does not touch the original.
    relocated = dict(resident)
    relocated["NewLocation"] = resident["HomeAddress"].get("StreetName") or ""
    # Remove street info, keep only NewLocation
    del relocated["HomeAddress"]
    moved_residents.append(relocated)

# Remove moved entries from the main all_residents list
all_residents = remaining_residents


In [12]:
# Write the three final record sets as indented JSON files.
for out_path, records in (
    ("residents.json", all_residents),
    ("deceased.json", all_deceased),
    ("moved.json", moved_residents),
):
    with open(out_path, "w") as f:
        json.dump(records, f, indent=2)
