In [None]:
import os
import sys
import torch
import shutil
from google.colab import drive
import time

# --- 0. INITIAL SETUP & DIRECTORY DEFINITION ---
RUMSEY_DIR = '/content/data/rumsey'
os.makedirs(RUMSEY_DIR, exist_ok=True)
%cd /content

print("--- Starting Full Project Restoration & Setup ---")

# --- 1. DOWNLOAD DATA & CLONE REPOS ---
print("\n[1/5] Downloading Data and Cloning Repos...")
# Data & GT Files
!wget -O train.zip https://s3.msi.umn.edu/icdar24-competition-data/icdar24-train-png.zip -q
!wget -O val.zip https://s3.msi.umn.edu/icdar24-competition-data/icdar24-val-png.zip -q
!wget -O rumsey_train.json https://zenodo.org/record/11516933/files/rumsey_train.json?download=1 -q
!wget -O rumsey_val.json https://zenodo.org/record/11516933/files/rumsey_val.json?download=1 -q
# Repositories
!git clone https://github.com/clovaai/CRAFT-pytorch.git -q
!git clone https://github.com/JaidedAI/EasyOCR.git /content/EasyOCR -q

# --- 2. INSTALL LIBRARIES & APPLY CRAFT PATCHES ---
print("[2/5] Installing Dependencies & Applying Code Patches...")
# Install Libraries
# The first `>` truncates the requirements.txt that came with the CRAFT checkout,
# so only the four packages echoed below are installed from it.
%cd /content/CRAFT-pytorch
!echo "torch>=2.0.0" > requirements.txt
!echo "numpy" >> requirements.txt
!echo "opencv-python" >> requirements.txt
!echo "pyclipper" >> requirements.txt
!pip install -r requirements.txt -q
# INSTALL EASYOCR and other external packages HERE
!pip install easyocr networkx geopy -q

# --- CRITICAL FIX: IMPORT AFTER INSTALLATION ---
# This is the guaranteed fix for the ModuleNotFoundError.
# easyocr is only importable once the pip install above has run, which is why
# this import sits mid-cell instead of with the imports at the top.
import easyocr
import numpy

# Apply CRAFT Patches (using the successful replication links)
# Each `wget -O` overwrites the named file in the CRAFT checkout with the version
# hosted in the yanneta/CRAFT_Replication repository.
!wget -O /content/CRAFT-pytorch/basenet/vgg16_bn.py https://raw.githubusercontent.com/yanneta/CRAFT_Replication/master/basenet/vgg16_bn.py -q
!wget -O /content/CRAFT-pytorch/imgproc.py https://raw.githubusercontent.com/yanneta/CRAFT_Replication/master/imgproc.py -q
!wget -O /content/CRAFT-pytorch/map_data_utils.py https://raw.githubusercontent.com/yanneta/CRAFT_Replication/master/map_data_utils.py -q
!wget -O /content/CRAFT-pytorch/craft.py https://raw.githubusercontent.com/yanneta/CRAFT_Replication/master/craft.py -q

# Apply EasyOCR Patch
# Rewrites, in place, the `torch._utils._accumulate` import in the trainer's
# dataset.py to use `itertools.accumulate` under the same local name.
!sed -i "s/from torch._utils import _accumulate/from itertools import accumulate as _accumulate/" /content/EasyOCR/trainer/dataset.py

# --- 3. EXTRACT DATA & LOAD CRAFT WEIGHTS ---
print("\n[3/5] Extracting Data & Initializing Models...")
%cd /content
!unzip -o -q train.zip -d {RUMSEY_DIR}
!unzip -o -q val.zip -d {RUMSEY_DIR}

try:
    drive.mount('/content/drive', force_remount=True)
except:
    pass

PROJECT_DRIVE_PATH = '/content/drive/MyDrive/Historical_Map_ML_Project'
CRAFT_WEIGHTS_PATH_DRIVE = os.path.join(PROJECT_DRIVE_PATH, "CRAFT_Weights/craft_mlt_25k.pth")
CRAFT_WEIGHTS_PATH_LOCAL = '/content/CRAFT-pytorch/craft_mlt_25k.pth'

if os.path.exists(CRAFT_WEIGHTS_PATH_DRIVE):
    !cp -f "$CRAFT_WEIGHTS_PATH_DRIVE" "$CRAFT_WEIGHTS_PATH_LOCAL"
    print("✅ CRAFT weights restored from Drive.")
else:
    print("⚠️ CRAFT weights not found in Drive. Attempting local copy restore.")
    !cp -f /content/craft_mlt_25k.pth "$CRAFT_WEIGHTS_PATH_LOCAL"
    print("✅ CRAFT weights restored from local copy.")


# --- 4. ENSURE EASYOCR BASE MODEL IS DOWNLOADED AND COPIED (The Fix) ---
print("\n[4/5] Fixing EasyOCR Model Dependency...")
EASYOCR_CACHE_DIR = "/root/.EasyOCR/model"
if os.path.exists(EASYOCR_CACHE_DIR):
    shutil.rmtree(EASYOCR_CACHE_DIR)
    print(f"✅ Deleted old EasyOCR cache.")

# Force download by initializing Reader
reader = easyocr.Reader(['en'])
time.sleep(2) # Give time for background download logs to appear

# Copy the file from cache to the expected location
DOWNLOADED_PATH="/root/.EasyOCR/model/english_g2.pth"
TARGET_PATH="/content/EasyOCR/model/english_g2.pth"
!mkdir -p /content/EasyOCR/model
!cp "$DOWNLOADED_PATH" "$TARGET_PATH"
print("✅ EasyOCR base model copied for fine-tuning.")

# --- 5. INITIAL STATUS CHECK ---
print("\n[5/5] Setup Complete. Ready for Fine-Tuning.")
!ls -l {TARGET_PATH}

/content
--- Starting Full Project Restoration & Setup ---

[1/5] Downloading Data and Cloning Repos...
[2/5] Installing Dependencies & Applying Code Patches...
/content/CRAFT-pytorch
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m978.2/978.2 kB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.9/2.9 MB[0m [31m42.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m180.7/180.7 kB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m300.6/300.6 kB[0m [31m19.6 MB/s[0m eta [36m0:00:00[0m
[?25h
[3/5] Extracting Data & Initializing Models...
/content
Mounted at /content/drive




✅ CRAFT weights restored from Drive.

[4/5] Fixing EasyOCR Model Dependency...
Progress: |██████████████████████████████████████████████████| 100.0% Complete



Progress: |--------------------------------------------------| 0.0% CompleteProgress: |--------------------------------------------------| 0.1% CompleteProgress: |--------------------------------------------------| 0.1% CompleteProgress: |--------------------------------------------------| 0.2% CompleteProgress: |--------------------------------------------------| 0.2% CompleteProgress: |--------------------------------------------------| 0.3% CompleteProgress: |--------------------------------------------------| 0.4% CompleteProgress: |--------------------------------------------------| 0.4% CompleteProgress: |--------------------------------------------------| 0.5% CompleteProgress: |--------------------------------------------------| 0.5% CompleteProgress: |--------------------------------------------------| 0.6% CompleteProgress: |--------------------------------------------------| 0.6% CompleteProgress: |--------------------------------------------------| 0.7% Complet

In [None]:
!pip install geopy rapidfuzz --upgrade --force-reinstall

Collecting geopy
  Downloading geopy-2.4.1-py3-none-any.whl.metadata (6.8 kB)
Collecting rapidfuzz
  Downloading rapidfuzz-3.14.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (12 kB)
Collecting geographiclib<3,>=1.52 (from geopy)
  Downloading geographiclib-2.1-py3-none-any.whl.metadata (1.6 kB)
Downloading geopy-2.4.1-py3-none-any.whl (125 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m125.4/125.4 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading rapidfuzz-3.14.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m34.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading geographiclib-2.1-py3-none-any.whl (40 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.7/40.7 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, geographiclib, geopy
  Attempting uninstall: geographiclib
  

In [None]:
import os
import shutil
from google.colab import drive

# Drive must be mounted before the backup path below can be written.
drive.mount('/content/drive')

# --- DEFINITIONS ---
# Source: the recognizer weights as they currently sit on local disk.
CURRENT_LOCATION = '/content/EasyOCR/model/english_g2.pth'

# Destination 1: the folder the OCR pipeline reads its model from.
PIPELINE_DIR = '/content/easyocr_finetuned_model'
PIPELINE_DEST = os.path.join(PIPELINE_DIR, 'english_g2.pth')

# Destination 2: a persistent copy on Google Drive.
DRIVE_BACKUP_DIR = '/content/drive/MyDrive/Historical_Map_ML_Project/FineTuned_Models'
DRIVE_DEST = os.path.join(DRIVE_BACKUP_DIR, 'english_g2_finetuned.pth')

# --- EXECUTION ---
if not os.path.exists(CURRENT_LOCATION):
    print(f"❌ Critical Error: No model found at {CURRENT_LOCATION}")
    print("If you just restarted the runtime, you may need to re-download or re-train the model.")
else:
    print(f"✅ Found model at: {CURRENT_LOCATION}")

    # (target directory, target file, confirmation message), handled in order:
    # pipeline folder first, then the Drive backup.
    copy_plan = (
        (PIPELINE_DIR, PIPELINE_DEST, f"✅ Copied to Pipeline folder: {PIPELINE_DEST}"),
        (DRIVE_BACKUP_DIR, DRIVE_DEST, f"✅ Backed up to Google Drive: {DRIVE_DEST}"),
    )
    for target_dir, target_file, done_message in copy_plan:
        os.makedirs(target_dir, exist_ok=True)
        shutil.copy(CURRENT_LOCATION, target_file)
        print(done_message)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ Found model at: /content/EasyOCR/model/english_g2.pth
✅ Copied to Pipeline folder: /content/easyocr_finetuned_model/english_g2.pth
✅ Backed up to Google Drive: /content/drive/MyDrive/Historical_Map_ML_Project/FineTuned_Models/english_g2_finetuned.pth


In [None]:
import os
import sys
import json
import time
import re
import requests
import pandas as pd
from rapidfuzz import process, fuzz
from tqdm.notebook import tqdm
import easyocr

# --- CONFIGURATION ---
# The Geoapify key is read from the environment rather than committed with the
# notebook. Set it before running this cell, e.g. `%env GEOAPIFY_API_KEY=...`.
GEOAPIFY_API_KEY = os.environ.get("GEOAPIFY_API_KEY", "")
if not GEOAPIFY_API_KEY:
    print("⚠️ GEOAPIFY_API_KEY is not set; geocoding requests will not succeed until it is.")
RUMSEY_DIR = '/content/data/rumsey'
VAL_IMAGE_PATH = os.path.join(RUMSEY_DIR, 'icdar24-val-png', 'val_images')
OUTPUT_GEO_PATH = '/content/final_corrected_pipeline_output.json'
MODEL_DIR = '/content/easyocr_finetuned_model'
FINE_TUNED_MODEL_PATH = os.path.join(MODEL_DIR, 'english_g2.pth')

# --- RE-DEFINE PIPELINE CLASS ---
class MapTextPipeline:
    """Filter OCR text down to known city names and geocode them via Geoapify.

    Attributes:
        api_key: Geoapify API key sent with every geocoding request.
        threshold: minimum rapidfuzz `fuzz.ratio` score for a fuzzy match to be
            accepted as a corrected city name.
        cities_list: lower-cased city names used as the reference vocabulary
            (empty if the list could not be downloaded).
        stats: running counters; `typos_fixed` and `geocoded` are updated by
            this class, `noise_removed` is left for the caller to maintain.
    """

    def __init__(self, geoapify_key, fuzzy_threshold=88):
        self.api_key = geoapify_key
        self.threshold = fuzzy_threshold
        self.cities_list = self._load_world_cities()
        self.stats = {"noise_removed": 0, "typos_fixed": 0, "geocoded": 0}

    def _load_world_cities(self):
        """Download the world-cities CSV and return its unique lower-cased names.

        Returns an empty list when the download or parsing fails; the failure is
        reported because with an empty list every phrase is rejected as noise.
        """
        url = "https://raw.githubusercontent.com/datasets/world-cities/master/data/world-cities.csv"
        try:
            df = pd.read_csv(url)
            return list(set(df['name'].str.lower().dropna()))
        except Exception as load_error:
            print(f"⚠️ Could not load the world-cities list ({load_error}); all text will be treated as noise.")
            return []

    def clean_text(self, text):
        """Validate one OCR string against the city list.

        Args:
            text: raw OCR output for one detection.

        Returns:
            `(name, status)`. `name` is the title-cased city name, or None when
            the text is rejected; `status` explains the decision ("Empty",
            "Too Short", "Numeric Noise", "Exact Match", "Fixed Typo (...)" or
            "Unknown/Noise").
        """
        if not text or not isinstance(text, str):
            return None, "Empty"

        # Strip first so surrounding whitespace neither counts towards the
        # minimum length nor leaks into the returned name.
        stripped = text.strip()
        if len(stripped) < 3:
            return None, "Too Short"
        if re.search(r'\d', stripped):
            return None, "Numeric Noise"

        clean_input = stripped.lower()
        if clean_input in self.cities_list:
            return stripped.title(), "Exact Match"

        if self.cities_list:
            match = process.extractOne(clean_input, self.cities_list, scorer=fuzz.ratio)
            if match and match[1] >= self.threshold:
                corrected = match[0].title()
                self.stats["typos_fixed"] += 1
                return corrected, f"Fixed Typo ({text} -> {corrected})"
        return None, "Unknown/Noise"

    def get_coordinates(self, location_name):
        """Look up `location_name` with the Geoapify geocoding search endpoint.

        Returns:
            `(lat, lon)` of the first result, or `(None, None)` when the request
            fails, returns a non-200 status, or yields no features.
        """
        time.sleep(0.2)  # pause between consecutive requests
        url = "https://api.geoapify.com/v1/geocode/search"
        params = {"text": location_name, "apiKey": self.api_key, "limit": 1}
        try:
            r = requests.get(url, params=params, timeout=5)
            if r.status_code == 200:
                # Parse the body once; a missing/empty "features" means no hit.
                features = r.json().get('features') or []
                if features:
                    lon, lat = features[0]['geometry']['coordinates']
                    self.stats["geocoded"] += 1
                    return lat, lon
        except Exception as geo_error:
            # Best-effort lookup: report and fall through to (None, None).
            # `except Exception` still lets KeyboardInterrupt stop a long run.
            print(f"⚠️ Geocoding failed for '{location_name}': {geo_error}")
        return None, None

# --- SAFETY EXECUTION LOOP ---
def run_safety_pipeline():
    """OCR every validation image, clean and geocode the text, checkpointing as it goes.

    Results are written to OUTPUT_GEO_PATH after every image, and an existing
    results file is used to skip images that were already recorded, so the run
    can be restarted after an interruption.
    """
    print("--- 🚀 Starting CPU-Safe Pipeline (Auto-Saving) ---")

    pipeline = MapTextPipeline(geoapify_key=GEOAPIFY_API_KEY)

    # Initialize OCR
    if os.path.exists(FINE_TUNED_MODEL_PATH):
        print("✅ Using Fine-Tuned Model")
        # NOTE(review): only `user_network_directory` is passed here; confirm against
        # the EasyOCR custom-model docs that the weights in MODEL_DIR are really the
        # ones loaded (custom recognizers are usually selected via `recog_network`).
        reader = easyocr.Reader(['en'], user_network_directory=MODEL_DIR, detector=True, recognizer=True)
    else:
        print("⚠️ Using Default Model")
        reader = easyocr.Reader(['en'])

    image_files = sorted(f for f in os.listdir(VAL_IMAGE_PATH) if f.endswith('.png'))

    # Load existing progress if restarting
    final_output = []
    if os.path.exists(OUTPUT_GEO_PATH):
        try:
            with open(OUTPUT_GEO_PATH, 'r') as f:
                previous_output = json.load(f)
            done_filenames = {entry['filename'] for entry in previous_output}
        except (OSError, ValueError, KeyError, TypeError) as resume_error:
            # Unreadable or malformed checkpoint: say so and start over, rather
            # than continuing with half-loaded state.
            print(f"⚠️ Could not resume from {OUTPUT_GEO_PATH} ({resume_error}); starting from scratch.")
        else:
            final_output = previous_output
            print(f"🔄 Resuming... Found {len(final_output)} images already done.")
            # Filter out images we already did
            image_files = [f for f in image_files if f not in done_filenames]

    print(f"\n[Processing] {len(image_files)} images remaining...")

    # Checkpoints go to a sibling temp file first and are then swapped into place,
    # so an interruption mid-write cannot leave a truncated results file behind.
    checkpoint_tmp_path = OUTPUT_GEO_PATH + '.tmp'

    for filename in tqdm(image_files, desc="Maps"):
        full_img_path = os.path.join(VAL_IMAGE_PATH, filename)
        results = reader.readtext(full_img_path)

        valid_phrases = []
        for (bbox, raw_text, conf) in results:
            clean_name, status = pipeline.clean_text(raw_text)

            if clean_name:
                lat, lon = pipeline.get_coordinates(clean_name)
                valid_phrases.append({
                    "text": clean_name,
                    "original_text": raw_text,
                    "latitude": lat,
                    "longitude": lon,
                    "geo_success": (lat is not None),
                    "cleaning_status": status,
                    "geocoder_source": "Geoapify"
                })
            else:
                pipeline.stats["noise_removed"] += 1

        # NOTE: images with no valid phrase are not recorded, so a resumed run
        # will OCR them again.
        if valid_phrases:
            final_output.append({
                "filename": filename,
                "phrases": valid_phrases
            })

        # --- CRITICAL: SAVE AFTER EVERY IMAGE ---
        with open(checkpoint_tmp_path, 'w') as f:
            json.dump(final_output, f, indent=4)
        os.replace(checkpoint_tmp_path, OUTPUT_GEO_PATH)

    print("\n--- ✅ JOB COMPLETE ---")
    print(f"Saved to: {OUTPUT_GEO_PATH}")

run_safety_pipeline()

--- 🚀 Starting CPU-Safe Pipeline (Auto-Saving) ---




✅ Using Fine-Tuned Model

[Processing] 40 images remaining...


Maps:   0%|          | 0/40 [00:00<?, ?it/s]




--- ✅ JOB COMPLETE ---
Saved to: /content/final_corrected_pipeline_output.json


In [None]:
# Runs the standalone script /content/full_pipeline.py in a separate Python process.
# NOTE(review): no cell visible in this notebook creates that file — confirm where it comes from.
!python /content/full_pipeline.py

--- Running full_pipeline.py ---
✅ Geoapify Direct API initialized (Stable).

--- Initializing EasyOCR ---
⚠️ Fine-tuned model not found. Initializing with default EasyOCR model.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.

--- Starting Full Pipeline (Detection, Recognition, Linking, Geocoding) ---
Processing Images:   0%|          | 0/40 [00:00<?, ?it/s]

[PROGRESS] Reading map: 8819000_h2_w6.png...
[PROGRESS] Found 19 phrases. Geocoding...

[PROGRESS] Reading map: 8826002_h4_w5.png...
[PROGRESS] Found 31 phrases. Geocoding...

[PROGRESS] Reading map: 8831000_h3_w4.png...
[PROGRESS] Found 84 phrases. Geocoding...

[PROGRESS] Reading map: 8880003_h5_w5.png...
[PROGRESS] Found 24 phrases. Geocoding...

[PROGRESS] Reading map: 8883002_h11_w14.png...
[PROGRESS] Found 177 phrases. Geocoding...

[PROGRESS] Reading map: 8915000_h2_w3.png...
[PROGRESS] Found 25 phrases. Geocoding...

[PROGRESS] Reading map: 8926002_h3_w2.png...
[PROGRES

In [None]:
import json
import folium
from folium.plugins import MarkerCluster
from google.colab import files
from IPython.display import display
import os

# --- CONFIGURATION ---
from html import escape as html_escape  # local import: only this cell builds HTML

INPUT_FILE = '/content/drive/MyDrive/Historical_Map_ML_Project/Results/final_corrected_pipeline_output.json'
OUTPUT_HTML = 'final_restored_map.html'

if os.path.exists(INPUT_FILE):
    with open(INPUT_FILE, 'r') as f:
        data = json.load(f)

    # 1. Setup Map
    # Keep phrases that were geocoded and carry both coordinates. The checks are
    # `is not None` rather than truthiness so a coordinate of exactly 0.0 is kept.
    valid_points = [
        p
        for entry in data
        for p in entry.get('phrases', [])
        if p.get('geo_success')
        and p.get('latitude') is not None
        and p.get('longitude') is not None
    ]

    if valid_points:
        # Center the map on the first valid point.
        start_coords = [valid_points[0]['latitude'], valid_points[0]['longitude']]
        m = folium.Map(location=start_coords, zoom_start=5)
        marker_cluster = MarkerCluster().add_to(m)

        print(f"Plotting {len(valid_points)} validated locations...")

        for p in valid_points:
            # Determine Color: Green for Fixed/Repaired, Blue for others
            is_fixed = "Fixed" in p.get('cleaning_status', '') or "Corrected" in p.get('geocoder_source', '')
            color = 'green' if is_fixed else 'blue'
            status_text = "<b>REPAIRED</b>" if is_fixed else "Valid"

            # These values come from OCR / the results file and are placed into
            # HTML below, so they are escaped to stop stray markup being rendered.
            safe_text = html_escape(str(p['text']))
            safe_source = html_escape(str(p.get('geocoder_source', 'Original')))
            safe_cleaning = html_escape(str(p.get('cleaning_status', 'N/A')))

            # Popup Content
            popup_html = f"""
            <b>{safe_text}</b><br>
            Status: {status_text}<br>
            Source: {safe_source}<br>
            Cleaning: {safe_cleaning}
            """

            folium.Marker(
                location=[p['latitude'], p['longitude']],
                popup=popup_html,
                tooltip=f"{safe_text} ({status_text})",
                icon=folium.Icon(color=color, icon='info-sign')
            ).add_to(marker_cluster)

        # 2. Save and Download
        m.save(OUTPUT_HTML)
        print(f"Map saved. Downloading '{OUTPUT_HTML}'...")
        files.download(OUTPUT_HTML)

        # 3. Try Inline Display
        display(m)
    else:
        print("No valid points found in the final file.")
else:
    print("Error: Final output file not found.")

Plotting 610 validated locations...
Map saved. Downloading 'final_restored_map.html'...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import json
import os

# Results file whose contents are summarised below.
output_file = '/content/final_corrected_pipeline_output.json'

if not os.path.exists(output_file):
    print("⚠️ Error: Output file not found.")
else:
    with open(output_file, 'r') as f:
        data = json.load(f)

    # Flatten every image's phrase list, then derive each counter from it.
    all_phrases = [phrase for record in data for phrase in record.get('phrases', [])]
    all_statuses = [phrase.get('cleaning_status', '') for phrase in all_phrases]

    stats = {
        "Total_Images": len(data),
        "Total_Phrases": len(all_phrases),
        "Noise_Filtered": sum(1 for label in all_statuses if "Noise" in label or "Too Short" in label),
        "Typos_Fixed": sum(1 for label in all_statuses if "Fixed Typo" in label),
        "Geocoded": sum(1 for phrase in all_phrases if phrase.get('geo_success')),
    }

    # Share of phrases that were geocoded, as a percentage (0 when there are none).
    if stats["Total_Phrases"]:
        acc = (stats["Geocoded"] / stats["Total_Phrases"]) * 100
    else:
        acc = 0

    print("="*40)
    print(" 🗺️  FINAL PROJECT METRICS")
    print("="*40)
    print(f"📂 Images Processed:    {stats['Total_Images']}")
    print(f"📝 Total Text Detected: {stats['Total_Phrases']}")
    print("-" * 40)
    print(f"🧹 Noise Removed:       {stats['Noise_Filtered']}  (Garbage Data)")
    print(f"🔧 Typos Fixed:         {stats['Typos_Fixed']}     (AI Corrections)")
    print(f"📍 Valid Locations:     {stats['Geocoded']}   (Successful Maps)")
    print("-" * 40)
    print(f"✅ Final Yield Rate:    {acc:.1f}%")
    print("="*40)

 🗺️  FINAL PROJECT METRICS
📂 Images Processed:    39
📝 Total Text Detected: 610
----------------------------------------
🧹 Noise Removed:       0  (Garbage Data)
🔧 Typos Fixed:         282     (AI Corrections)
📍 Valid Locations:     610   (Successful Maps)
----------------------------------------
✅ Final Yield Rate:    100.0%


In [None]:
import pandas as pd
import json

# Load Data
input_file = '/content/final_corrected_pipeline_output.json'
output_csv = '/content/final_geodata.csv'

# A missing results file is reported (as the other analysis cells do) instead of
# aborting the cell with a traceback; the export below then finds nothing to write.
try:
    with open(input_file, 'r') as f:
        data = json.load(f)
except FileNotFoundError:
    print("⚠️ Error: Output file not found.")
    data = []

# Flatten: one row per successfully geocoded phrase, tagged with its source image.
rows = [
    {
        'Text': p.get('text'),
        'Latitude': p.get('latitude'),
        'Longitude': p.get('longitude'),
        'Status': p.get('cleaning_status'),
        'Image': entry.get('filename')
    }
    for entry in data
    for p in entry.get('phrases', [])
    if p.get('geo_success')
]

# Save
if rows:
    df = pd.DataFrame(rows)
    df.to_csv(output_csv, index=False)
    print(f"✅ Excel/CSV file created: {output_csv}")
    print(f"Contains {len(df)} locations.")
else:
    print("No valid locations found to export.")

✅ Excel/CSV file created: /content/final_geodata.csv
Contains 610 locations.
