## Here we try to extract the MGRS (Military Grid Reference System) Coordinates from the svg element.

We assume that the coordinates are drawn as path elements, i.e. the text is rendered as complex polygons instead of being stored as text. We therefore look for SVG path elements with a very long `d` attribute (the code below accepts lengths between 2000 and 6000 characters). Once we have found them, we also reconstruct the position of each element in the original SVG. To do so, we evaluate the `transform` attributes of the parent elements of every path element that might contain the coordinates.

Finally, we render the isolated SVG path element to a small PNG and run OCR on it to obtain the text of the path element in machine-readable form.

In [12]:
from bs4 import BeautifulSoup
import cairosvg
import pytesseract
from PIL import Image
import mgrs
import json


In [2]:
def calculate_coordinates(path):
    """Compute where the local origin of *path* ends up in root coordinates.

    Starting at the direct parent of ``path`` and walking up to the document
    root, every ``transform`` attribute found on the way is applied, innermost
    first, to the point ``(0, 0)``.

    The kind of transform is inferred from the number of arguments of the
    last function in the attribute value:

    * 1 argument  -> ``translate(tx)``: shift in x direction
    * 2 arguments -> ``translate(tx, ty)``: shift in x and y direction
    * 6 arguments -> ``matrix(a, b, c, d, e, f)``:
      ``x' = a*x + c*y + e`` and ``y' = b*x + d*y + f``

    Attributes with any other number of arguments leave the point unchanged.
    NOTE(review): because only the argument count is inspected, functions
    such as ``scale(sx, sy)`` or ``rotate(angle)`` would be misread as
    translations, and only the last function of a transform list is used.

    Args:
        path: Element exposing ``parent``; every ancestor must expose
            ``name``, ``attrs`` and ``parent`` (e.g. a BeautifulSoup tag).

    Returns:
        Tuple ``(x, y)`` of floats. ``(0.0, 0.0)`` if no ancestor carries a
        ``transform`` attribute.
    """
    # Start at the local origin so that a leading one-value translate or a
    # leading matrix has something to operate on.
    x, y = 0.0, 0.0
    parent = path.parent

    while parent:
        print("Parent Element:", parent.name)
        for attribute, value in parent.attrs.items():
            if attribute != "transform":
                continue

            print(f"{attribute}: {value}")
            # SVG allows commas and/or whitespace between the arguments.
            arguments = value.split("(")[-1].split(")")[0].replace(",", " ").split()
            count = len(arguments)

            if count in (1, 2, 6):
                numbers = [float(argument) for argument in arguments]
                if count == 1:
                    x += numbers[0]
                elif count == 2:
                    # Accumulate instead of overwriting, so nested
                    # translate(tx, ty) groups compose correctly.
                    x, y = x + numbers[0], y + numbers[1]
                else:
                    a, b, c, d, e, f = numbers
                    x, y = a * x + c * y + e, b * x + d * y + f

            print("neu berechnete Koordinaten: ")
            print((x, y))

        parent = parent.parent

    print("Endgültige Startkoordinatne für path Element: ")
    print((x, y))
    return (x, y)

In [3]:
def find_mgrs_coordinates(soup, file_id, min_length=2000, max_length=6000):
    """Find path elements that probably draw the MGRS label and prepare them for OCR.

    A path is treated as a candidate when the length of its ``d`` attribute
    lies within ``[min_length, max_length]``. For every candidate the root
    position is computed with ``calculate_coordinates`` and a small
    stand-alone SVG document containing only that path is built.

    Args:
        soup: Parsed SVG document offering ``find_all('path')``.
        file_id: Identifier of the source file; used as prefix of the
            per-candidate ``file_id``.
        min_length: Smallest accepted length of the ``d`` attribute.
        max_length: Largest accepted length of the ``d`` attribute.

    Returns:
        List of dicts with the keys ``"path"`` (the element),
        ``"coordinates"`` (result of ``calculate_coordinates``),
        ``"svg_drawing"`` (stand-alone SVG markup as ``str``) and
        ``"file_id"`` (``"<file_id>_num_<n>"``, ``n`` counted from 1).
    """
    candidates = [
        path
        for path in soup.find_all('path')
        if min_length <= len(path.get('d', '')) <= max_length
    ]

    # military grid reference system
    mgrs_coordinates = []

    for count_coordinate, path in enumerate(candidates, start=1):
        start_coordinates_path_element = calculate_coordinates(path)

        # Re-open the self-closing tag and add a transform that mirrors the
        # y axis and shifts by 10 units - presumably to undo the flipped page
        # matrix and bring the glyphs into the 80x20 canvas (TODO confirm).
        markup = str(path).split("/>")[0] + ' transform="scale(1, -1) translate(0, -10)"' + "/>"

        # Dropping everything up to the "svg:" namespace prefix also removes
        # the opening "<". Restore it only when it is actually missing, so
        # that un-prefixed "<path .../>" elements do not end up as "<<path".
        markup = markup.split("svg:")[-1]
        if not markup.startswith("<"):
            markup = "<" + markup

        svg_drawing = '<svg xmlns="http://www.w3.org/2000/svg" width="80" height="20">' + markup + '</svg>'

        name = file_id + "_num_" + str(count_coordinate)

        mgrs_coordinates.append({
            "path": path,
            "coordinates": start_coordinates_path_element,
            "svg_drawing": svg_drawing,
            "file_id": name,
        })
    return mgrs_coordinates


# Now we create a picture of the svg path element to prepare for OCR

In [4]:
# 1. we create an svg image from the coordinate
# 2. convert the svg to png
# 3. ocr on the png
# 4. return coordinates as text

def ocr_image(mgrs_coordinates):
    """Run OCR on the first MGRS label candidate and return its text.

    Only the first entry of ``mgrs_coordinates`` is used. Its
    ``"svg_drawing"`` markup is written to ``data/ocr_images/<file_id>.svg``,
    rendered to a PNG with the same base name and read with Tesseract.

    Args:
        mgrs_coordinates: Non-empty list of dicts with the keys
            ``"svg_drawing"`` and ``"file_id"``.

    Returns:
        The recognised text with surrounding whitespace removed.
    """
    first_candidate = mgrs_coordinates[0]
    drawing_markup = first_candidate["svg_drawing"]
    label_id = first_candidate["file_id"]

    svg_file = f"data/ocr_images/{label_id}.svg"
    png_file = f"data/ocr_images/{label_id}.png"

    # Persist the isolated drawing; the converter reads it back by path.
    with open(svg_file, 'w') as handle:
        handle.write(drawing_markup)

    cairosvg.svg2png(url=svg_file, write_to=png_file)

    rendered = Image.open(png_file)
    recognised = pytesseract.image_to_string(rendered)
    return recognised.strip()
    

## find all used colors

In [5]:
from bs4 import BeautifulSoup


def find_all_used_colors(pdf):
    """Collect the distinct ``fill`` values used by the paths of an SVG file.

    Args:
        pdf: Path of the file to read. Despite the parameter name, the
            content is parsed as XML and searched for ``path`` elements.

    Returns:
        Set of strings: for every path with a ``style`` attribute, the value
        of its first declaration containing ``fill:``.
    """
    with open(pdf, 'r') as file:
        markup = file.read()

    document = BeautifulSoup(markup, 'xml')  # 'xml' parser for SVG

    unique_colors = set()
    for path in document.find_all('path'):
        style = path.get('style')
        if not style:
            continue

        # First "name:value" declaration that mentions the fill property.
        fill_declaration = next(
            (part for part in style.split(';') if 'fill:' in part),
            None,
        )
        if fill_declaration is not None:
            unique_colors.add(fill_declaration.split(':')[1].strip())

    return unique_colors


In [6]:
# Example run: list the distinct fill colours used in one sample sheet.
find_all_used_colors("./data/2583-II.svg")

{'#000000', '#00ff00', '#00ffff', '#ff0000', '#ffff00', '#ffffff', 'none'}

## find yellow cross element

In [65]:
def find_cross_coordinates(soup, file_id, target_color='#ffff00', cross_path_length=63):
    """Locate the two reference crosses of a sheet and return their positions.

    A path counts as a cross when the ``fill`` declaration of its ``style``
    attribute equals ``target_color`` and its ``d`` attribute is exactly
    ``cross_path_length`` characters long.

    Args:
        soup: Parsed SVG document offering ``find_all('path')``.
        file_id: Identifier of the sheet; only used in the error message.
        target_color: Fill colour of the crosses (yellow by default).
        cross_path_length: Length of the ``d`` attribute of a cross.

    Returns:
        List with two ``(x, y)`` tuples as computed by
        ``calculate_coordinates``, in document order.

    Raises:
        AssertionError: If the number of matching paths is not exactly two.
    """
    paths_with_target_color = []

    for path in soup.find_all('path'):
        style = path.get('style')
        if not style:
            continue

        # Parse "name:value" declarations tolerantly: split on the first
        # colon only (values such as url(data:...) contain colons) and skip
        # fragments without any colon instead of crashing on them.
        declarations = {}
        for item in style.split(';'):
            key, separator, value = item.partition(':')
            if separator:
                declarations[key.strip()] = value.strip()

        if declarations.get('fill') != target_color:
            continue

        # The crosses are told apart from other paths of the same colour by
        # the length of their path data.
        if len(path.get('d', '')) == cross_path_length:
            paths_with_target_color.append(path)
            print(path)

    # Raised explicitly (not via ``assert``) so the check survives ``-O``
    # while callers still see the same exception type as before.
    if len(paths_with_target_color) != 2:
        raise AssertionError(f"In {file_id} are no, only one or too many yellow crosses")

    # NOTE(review): the original comment says that 4.56 has to be added in y
    # direction to get the centre of the cross because the drawing starts at
    # the bottom. That offset is not applied here; the start point of the
    # path is returned unchanged - confirm whether this is intended.
    coordinates = []
    for path in paths_with_target_color:
        coord = calculate_coordinates(path)
        coordinates.append((coord[0], coord[1]))

    return coordinates

# Find suspect areas


In [66]:

from svgpathtools import parse_path
from svgpathtools import Path, Line
import math

def extract_number(string):
    """Return the integer formed by all digit characters of *string*, in order.

    Non-digit characters are dropped before the conversion, so
    ``"path1234"`` yields ``1234`` and ``"a1b2"`` yields ``12``.
    Raises ``ValueError`` when the string contains no digit.
    """
    digits = [character for character in string if character.isdigit()]
    return int(''.join(digits))

def compare_tuples(tuple1, tuple2, rel_tol=1e-4, abs_tol=1e-6):
    """Tell whether two points agree in x and y within the given tolerances.

    Only the first two items of each argument are compared, using
    ``math.isclose`` with ``rel_tol`` and ``abs_tol``.
    """
    if not math.isclose(tuple1[0], tuple2[0], rel_tol=rel_tol, abs_tol=abs_tol):
        return False
    return math.isclose(tuple1[1], tuple2[1], rel_tol=rel_tol, abs_tol=abs_tol)


def apply_svg_path_to_coordinate(path_str: str, coord: tuple) -> tuple:
    """Move *coord* along the line segments described by *path_str*.

    The path data is parsed and the displacement of every segment
    (``end - start``) is added to the point one after another.

    Args:
        path_str: Content of an SVG ``d`` attribute.
        coord: Start point as ``(x, y)``.

    Returns:
        The displaced point as ``(x, y)``.

    Raises:
        NotImplementedError: If the path contains a segment that is not a
            ``Line``.
    """
    print(path_str)
    segments = parse_path(path_str)
    start_x, start_y = coord

    # A complex number carries the point: real part = x, imaginary part = y.
    position = complex(start_x, start_y)

    for segment in segments:
        if not isinstance(segment, Line):
            raise NotImplementedError("Only line segments are implemented in this example.")
        position += segment.end - segment.start

    return position.real, position.imag


def find_suspect_areas(soup, debug_group_index=19):
    """Group the thick black outline segments of a sheet into polygons.

    Candidate segments are ``path`` elements whose ``style`` contains both
    ``stroke:#000000`` and ``stroke-width:3.84`` and whose ``d`` attribute is
    8 to 25 characters long. Walking through them in document order, a
    segment continues the current group when

    * the number in its ``id`` is exactly 6 larger than that of the previous
      segment, and
    * the translation of its parent group equals the end point of the
      previous segment (previous parent translation moved along the previous
      path data), within the tolerance of ``compare_tuples``.

    Otherwise the current group is closed and a new one is started. A black
    path outside the length range closes the current group as well.

    Args:
        soup: Parsed SVG document offering ``find_all('path')``.
        debug_group_index: Index of one group whose elements and coordinates
            are printed for inspection. Skipped when no such group exists;
            pass ``None`` to disable. The default keeps the index that was
            used to check a sample sheet for polygon artifacts.

    Returns:
        List of groups; each group is a list of ``(x, y)`` root coordinates
        computed by ``calculate_coordinates``.
    """
    path_elements = soup.find_all('path')

    # Keep only paths drawn with the black, 3.84 wide stroke.
    black_paths = [
        path
        for path in path_elements
        if 'stroke:#000000' in path.get('style', '')
        and 'stroke-width:3.84' in path.get('style', '')
    ]

    suspect_areas = []
    current_group = []

    for path in black_paths:
        length_d = len(path.get('d', ''))

        if 8 <= length_d <= 25:
            if not current_group:
                current_group.append(path)
                continue

            previous = current_group[-1]
            current_id = extract_number(path.get("id", ""))
            previous_id = extract_number(previous.get("id", ""))

            current_coords = _parent_translation(path)
            previous_coords = _parent_translation(previous)

            # Where the previous segment ends, expressed like the parent
            # translations so that both can be compared.
            check_coords = apply_svg_path_to_coordinate(previous.get("d", ""), previous_coords)

            if (current_id - previous_id) == 6 and compare_tuples(check_coords, current_coords):
                current_group.append(path)
            else:
                suspect_areas.append(current_group)
                current_group = [path]
        elif current_group:
            suspect_areas.append(current_group)
            current_group = []

    if current_group:
        suspect_areas.append(current_group)

    suspect_areas_with_coordinates = [
        [calculate_coordinates(path) for path in group]
        for group in suspect_areas
    ]

    for item in suspect_areas_with_coordinates:
        print(item)

    # Optional inspection of a single group. The index is bounds-checked so
    # sheets with fewer groups no longer fail with an IndexError.
    if debug_group_index is not None and 0 <= debug_group_index < len(suspect_areas):
        for item in suspect_areas[debug_group_index]:
            print(item)
            print(calculate_coordinates(item))

    return suspect_areas_with_coordinates


def _parent_translation(path):
    """Return the first two arguments of the parent's ``transform`` as floats."""
    values = path.parent.get("transform", "").split("(")[-1].split(")")[0].split(",")
    return (float(values[0]), float(values[1]))



In [67]:
def pdf_to_mgrs_coordinates(coords, width, height, cross_cords, cross_mgrs,
                            sheet_width_mm=533.0, sheet_height_mm=736.0,
                            meters_per_mm=50.0):
    """Convert a point of the drawing into an MGRS string.

    The point is located relative to a reference cross whose MGRS position
    is known: the pixel offset to the cross is converted to metres and added
    to the easting/northing of the cross.

    Args:
        coords: ``(x, y)`` of the point in drawing units.
        width: Width of the drawing; anything ``float()`` accepts.
        height: Height of the drawing; anything ``float()`` accepts.
        cross_cords: ``(x, y)`` of the reference cross in drawing units.
        cross_mgrs: MGRS position of the cross, laid out as a 3-character
            grid zone, a 2-letter square identifier and 2 + 2 digits
            (kilometre precision). Whitespace is ignored.
        sheet_width_mm: Physical width the drawing corresponds to
            (presumably the printed sheet - TODO confirm).
        sheet_height_mm: Physical height the drawing corresponds to.
        meters_per_mm: Ground distance per millimetre on the sheet; 50
            corresponds to a map scale of 1:50,000.

    Returns:
        MGRS string with 5-digit easting and 5-digit northing (metres).

    NOTE(review): points that fall outside the 100 km square of the cross
    (negative or >= 100000 m) are not re-mapped to the neighbouring square.
    """
    width_pixel_per_mm = float(width) / sheet_width_mm
    height_pixel_per_mm = float(height) / sheet_height_mm

    width_meter_per_pixel = meters_per_mm / width_pixel_per_mm
    height_meter_per_pixel = meters_per_mm / height_pixel_per_mm

    h_move = (coords[0] - cross_cords[0]) * width_meter_per_pixel
    # y grows downwards in the drawing but northing grows upwards.
    v_move = (cross_cords[1] - coords[1]) * height_meter_per_pixel

    # OCR output may contain blanks between the parts; the fixed slices
    # below only line up on the compact form.
    reference = "".join(cross_mgrs.split())

    grid_zone = reference[0:3]
    square_identifier = reference[3:5]
    easting = float(reference[5:7]) * 1000
    northing = float(reference[7:9]) * 1000

    # Truncating to whole metres loses the fractional part.
    new_easting = int(easting + h_move)
    new_northing = int(northing + v_move)

    # Zero-pad to five digits each: without padding, values below 10000 m
    # produce a shorter, malformed (or misread) MGRS string.
    return f"{grid_zone}{square_identifier}{new_easting:05d}{new_northing:05d}"

    





    

In [68]:
def convert_mgrs_to_latlon(mgrs_str):
    """Convert an MGRS string to geographic coordinates.

    Args:
        mgrs_str: MGRS position as ``str``; it is handed to the ``mgrs``
            library UTF-8 encoded.

    Returns:
        Whatever ``mgrs.MGRS.toLatLon`` returns - presumably a
        ``(latitude, longitude)`` pair in degrees (verify against the
        library documentation).
    """
    converter = mgrs.MGRS()
    encoded = mgrs_str.encode('utf-8')
    return converter.toLatLon(encoded)

In [69]:
def create_geojson(polygons, output_path):
    """Write *polygons* to *output_path* as a GeoJSON ``FeatureCollection``.

    Every polygon becomes one ``Feature`` with a ``Polygon`` geometry made of
    a single (exterior) ring and empty ``properties``.

    Args:
        polygons: Iterable of polygons; each polygon is a sequence of
            coordinate pairs. The order inside each pair is reversed on
            output - presumably ``(lat, lon)`` in and GeoJSON ``[lon, lat]``
            out (verify against the caller).
        output_path: Path of the file to create or overwrite.

    Returns:
        None. The result is written to ``output_path``.
    """
    features = []
    for polygon in polygons:
        ring = [list(coord)[::-1] for coord in polygon]

        # GeoJSON requires linear rings to be closed: the last position has
        # to repeat the first one.
        if ring and ring[0] != ring[-1]:
            ring.append(list(ring[0]))

        features.append({
            "type": "Feature",
            "geometry": {
                "type": "Polygon",
                # A Polygon is a list of rings; only the exterior ring is used.
                "coordinates": [ring],
            },
            "properties": {},
        })

    geojson = {
        "type": "FeatureCollection",
        "features": features,
    }

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(geojson, f, indent=2)

In [70]:
def extract_suspect_areas(svg):
    """Run the whole pipeline for one SVG sheet and write a GeoJSON file.

    Steps: parse the file, OCR the MGRS label, locate the reference crosses
    (only the first one is used), collect the suspect polygons, convert every
    polygon corner to the result of ``convert_mgrs_to_latlon`` and store the
    polygons as ``static/<file_id>.geojson``.

    Args:
        svg: Path of the SVG file, with ``/`` as separator. The file id is
            the last path component up to its first dot.

    Returns:
        None.
    """
    with open(svg, 'r') as file:
        svg_content = file.read()

    soup = BeautifulSoup(svg_content, 'xml')
    root = soup.find('svg')

    file_id = svg.rsplit("/", 1)[-1].partition(".")[0]

    # Size of the drawing as declared on the root element.
    width = root.get('width')
    height = root.get('height')

    label_candidates = find_mgrs_coordinates(soup, file_id)
    cross_mgrs_text = ocr_image(label_candidates)
    cross_cords = find_cross_coordinates(soup, file_id)[0]  # take only the first cross

    suspect_polygons = find_suspect_areas(soup)

    # Drawing coordinates -> MGRS -> geographic coordinates, corner by corner.
    geo_polygons = [
        [
            convert_mgrs_to_latlon(
                pdf_to_mgrs_coordinates(coord, width, height, cross_cords, cross_mgrs_text)
            )
            for coord in polygon
        ]
        for polygon in suspect_polygons
    ]

    print(f"cross cords {cross_cords}")
    create_geojson(geo_polygons, f"static/{file_id}.geojson")

    


In [71]:
# Example run of the full pipeline on one sheet; the lines below are the
# debug output printed while the transforms are resolved.
extract_suspect_areas("data/2384-I.svg")

Parent Element: g
transform: translate(361.44,1085.76)
neu berechnete Koordinaten: 
(361.44, 1085.76)
Parent Element: g
transform: translate(16.8)
neu berechnete Koordinaten: 
(378.24, 1085.76)
Parent Element: g
transform: matrix(1.3333333,0,0,-1.3333333,0,2781.7333)
neu berechnete Koordinaten: 
(504.31998739200003, 1334.0533361919997)
Parent Element: svg
Parent Element: [document]
Endgültige Startkoordinatne für path Element: 
(504.31998739200003, 1334.0533361919997)
Parent Element: g
transform: translate(927.84,1135.2)
neu berechnete Koordinaten: 
(927.84, 1135.2)
Parent Element: g
transform: translate(16.8)
neu berechnete Koordinaten: 
(944.64, 1135.2)
Parent Element: g
transform: matrix(1.3333333,0,0,-1.3333333,0,2781.7333)
neu berechnete Koordinaten: 
(1259.519968512, 1268.1333378399997)
Parent Element: svg
Parent Element: [document]
Endgültige Startkoordinatne für path Element: 
(1259.519968512, 1268.1333378399997)
<svg:path d="M 0,0 V 4.32 H -4.8 V 4.8 H 0 v 4.32 0 V 4.8 H 4.8 V