In [1]:
import json
from pathlib import Path
import shutil

import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from PIL import Image, ImageDraw
import rioxarray as rxr
from shapely.geometry import Polygon, box
from tqdm import tqdm

from utils.helpers import get_rgb_channels

# Working directories of the hand-label processing pipeline, all under one root.
root_dir = Path('/datadrive/hand_labels_processing')
json_dir = root_dir.joinpath('json_export')  # annotation exports, read with json.load
img_input_dir = root_dir.joinpath('import_candidates', 'input_128px')  # source of move_files
img_dir = root_dir.joinpath('images_import')  # destination of move_files; TIFs are read from here
recut_dir = root_dir.joinpath('sharp_images')  # tiles re-cut from the merged satellite scenes
shp_dir = root_dir.joinpath('shp_labels')  # presumably for shapefile exports -- TODO confirm

In [2]:

def print_structure(data, indent=0):
    """Print the nested key hierarchy of a dict, one key per line.

    Each nesting level is indented by two spaces. For a value that is a
    non-empty list whose first element is a dict, only that first element
    is descended into (it is taken as representative of the list).

    Args:
        data: Mapping whose keys should be printed.
        indent: Current nesting depth; callers normally leave the default.
    """
    for key, value in data.items():
        print('  ' * indent + str(key))
        if isinstance(value, dict):
            print_structure(value, indent + 1)
        # The emptiness check must come before value[0]; otherwise an empty
        # list raises IndexError instead of simply being skipped.
        elif isinstance(value, list) and value and isinstance(value[0], dict):
            print_structure(value[0], indent + 1)

# for file in json_dir.iterdir():
#     with open(file) as f:
#         data = json.load(f)

#     print_structure(data[0])
#     break


In [3]:
def get_image_names(json_data):
    """Return the image file name of every task in an annotation export.

    The name is the task's "file_upload" value with everything up to and
    including the first '-' removed (presumably an upload-id prefix -- confirm).
    A value without any '-' is returned unchanged.

    Args:
        json_data: Iterable of task dicts, each with a "file_upload" key.

    Returns:
        List of image names, in task order.
    """
    def _strip_prefix(upload_name):
        head, dash, tail = upload_name.partition('-')
        return tail if dash else head

    return [_strip_prefix(task["file_upload"]) for task in json_data]


def move_files(input_images, img_input_dir, img_dir):
    """Copy each image's PNG and its matching TIF into ``img_dir``.

    Despite the name, files are copied (``shutil.copy2``), not moved; the
    sources stay in place. For every name in ``input_images`` two files are
    looked up: ``png/<name>`` and ``images/<name with .TIF suffix>``, both
    relative to ``img_input_dir``. They are copied to the same relative
    location under ``img_dir``. Missing sources are reported on stdout and
    skipped.

    Args:
        input_images: Iterable of PNG file names.
        img_input_dir: Path of the directory holding the ``png`` and
            ``images`` source sub-directories.
        img_dir: Path of the destination root; sub-directories are created
            on demand.
    """
    for img_name in input_images:
        tif_name = Path(img_name).with_suffix('.TIF')
        for sub_dir, name in (("png", img_name), ("images", tif_name)):
            src_file = img_input_dir / sub_dir / name
            if not src_file.exists():
                print(f'File {src_file} not found.')
                continue
            dst_file = img_dir / sub_dir / name
            dst_file.parent.mkdir(parents=True, exist_ok=True)
            shutil.copy2(src_file, dst_file)


def extract_name_from_task(task):
    """Return the image file name encoded in a task's "file_upload" field.

    Everything up to and including the first '-' is dropped; a value with
    no '-' at all is returned as is.
    """
    pieces = task["file_upload"].split('-', 1)  # at most one split -> 1 or 2 pieces
    if len(pieces) == 1:
        return pieces[0]
    return pieces[1]


def get_gdf_from_result(result, img, crs):
    """Convert one polygon annotation result into a single-row GeoDataFrame.

    Args:
        result: Annotation result dict. Reads "original_width", optionally
            "original_height", and "value" with "points" (a list of [x, y]
            pairs expressed as percentages of the image size) and
            "polygonlabels" (only the first label is used).
        img: Raster opened with rioxarray; its affine transform, bounds, CRS
            and name are read.
        crs: CRS the polygon geometry is converted to before returning.

    Returns:
        GeoDataFrame with columns ``image_name``, ``image_bounds``,
        ``geometry`` and ``label``. Only the active ``geometry`` column is
        passed through ``to_crs``.
    """
    tile_width = int(result["original_width"])
    # x is a percentage of the width and y a percentage of the height; using
    # the width for both is only right for square tiles. Fall back to the
    # width when no height is recorded, which reproduces the old behaviour.
    tile_height = int(result.get("original_height", tile_width))

    # Pixel -> map-coordinate affine transform; identical for every point.
    transform = img.rio.transform()
    points = [
        transform * (tile_width / 100 * x_pct, tile_height / 100 * y_pct)
        for x_pct, y_pct in result["value"]["points"]
    ]

    polygon = Polygon(points)
    label = result["value"]["polygonlabels"][0]
    gdf = gpd.GeoDataFrame(
        {
            'image_name': [img.name],
            'image_bounds': [box(*img.rio.bounds())],
            'geometry': [polygon],
            'label': [label],
        },
        geometry='geometry',
        crs=img.rio.crs,
    )
    return gdf.to_crs(crs)


  
def convert_task_to_gdf(task, crs, img_dir):
    """Build a GeoDataFrame holding every labelled polygon of one task.

    The task's TIF is looked up under ``img_dir / 'images'`` (task image name
    with the suffix replaced by ``.TIF``) and reprojected to ``crs`` when its
    CRS differs.

    Args:
        task: Task dict with "file_upload" and "annotations"; each annotation
            carries a "result" list.
        crs: Target CRS for the raster and the resulting geometries.
        img_dir: Path whose ``images`` sub-directory holds the TIF tiles.

    Returns:
        One GeoDataFrame with a row per non-empty result, or ``None`` (after
        printing a message) when the task has no results.
    """
    img_name = extract_name_from_task(task)
    img_path = (img_dir / 'images' / img_name).with_suffix('.TIF')

    # Context manager: the raster's file handle is released as soon as the
    # geometries are built, rather than staying open for the whole run.
    with rxr.open_rasterio(img_path, default_name=img_path.name) as src:
        img = src
        if img.rio.crs != crs:
            # NOTE(review): annotation points refer to the pixel grid of the
            # labelled tile; the reprojected raster may have a different grid.
            # Confirm the percentage -> coordinate mapping is still accurate.
            img = img.rio.reproject(crs)
            print(f"Reprojecting {img_name}")

        gdf_list = [
            get_gdf_from_result(result, img, crs)
            for annotation in task["annotations"]
            for result in annotation["result"]
            if result
        ]

    if not gdf_list:
        print(f"No labels could be found for task {img_name}.")
        return None
    return pd.concat(gdf_list, ignore_index=True)
    


In [4]:
# img_name = '23JUN18160657-M3DS_R4C2-016144386010_01_P001_r5_c1_r3_c2.png'
# r4c2_task = [task for task in data if extract_name_from_task(task) == img_name][0]
# tif_img_dir = Path('/datadrive/hand_labels/images_import/images')
# img_path = (tif_img_dir / img_name).with_suffix('.TIF')
# img = rxr.open_rasterio(img_path, default_name=img_path.name)
# if img.rio.crs != crs:
#     img = img.rio.reproject(crs)
#     print(img)
# print(img.rio.bounds())

In [5]:
file_gdfs = []
crs = "EPSG:32616"
# Materialise and sort the export files: the processing order becomes
# deterministic, stray non-JSON entries are ignored, and tqdm can show a total.
json_files = sorted(path for path in json_dir.iterdir() if path.suffix.lower() == '.json')
for file in tqdm(json_files):
    print(f"Processing {file.name}")
    # Only the parsing needs the file handle; close it before the slow part.
    with open(file) as f:
        data = json.load(f)
    input_images = get_image_names(data)
    print("Fetching images")
    # move_files(input_images, img_input_dir=img_input_dir, img_dir=img_dir)

    print("Converting to geoDataframe")
    # convert_task_to_gdf returns None for unlabeled tasks; drop those so
    # pd.concat does not raise when a file contains no labelled task at all.
    gdf_list = [
        task_gdf
        for task_gdf in (convert_task_to_gdf(task, crs, img_dir) for task in data)
        if task_gdf is not None
    ]
    if not gdf_list:
        print(f"No labels found in {file.name}, skipping.")
        continue
    file_gdfs.append(pd.concat(gdf_list, ignore_index=True))
gdf = pd.concat(file_gdfs, ignore_index=True)
# gdf.drop(columns=["image_bounds"]).to_file(shp_dir / "labeled_buildings.shp") # labels geoDF
# One row per tile: group the label rows by tile name and footprint, then make
# the footprint the active geometry (the remaining columns hold row counts).
images_gdf = (
    gdf.set_geometry('image_bounds')
    .groupby(['image_name', 'image_bounds'])
    .count()
    .reset_index()
    .set_geometry('image_bounds')
    .set_crs(crs)
)




0it [00:00, ?it/s]

Processing dora-at-2024-01-15-16-42-2d24b852.json
Fetching images
Converting to geoDataframe
Reprojecting 23JUN18160657-M3DS_R4C2-016144386010_01_P001_r5_c1_r0_c0.png
Reprojecting 23JUN18160657-M3DS_R4C2-016144386010_01_P001_r5_c1_r0_c1.png
No labels could be found for task 23JUN18160657-M3DS_R4C2-016144386010_01_P001_r5_c1_r0_c1.png.
Reprojecting 23JUN18160657-M3DS_R4C2-016144386010_01_P001_r5_c1_r0_c2.png
No labels could be found for task 23JUN18160657-M3DS_R4C2-016144386010_01_P001_r5_c1_r0_c2.png.
Reprojecting 23JUN18160657-M3DS_R4C2-016144386010_01_P001_r5_c1_r0_c3.png


1it [00:03,  3.52s/it]

Processing frederike-at-2024-02-02-15-56-8f0455d4.json
Fetching images
Converting to geoDataframe


2it [00:09,  5.03s/it]

Processing mert-at-2024-01-31-23-06-6e6eec71.json
Fetching images
Converting to geoDataframe
No labels could be found for task 23JUL28162837-M3DS_R3C1-016043586010_01_P001_r4_c2_r2_c1.png.
Reprojecting 23JUN18160657-M3DS_R4C2-016144386010_01_P001_r5_c1_r1_c0.png
No labels could be found for task 23JUN18160657-M3DS_R4C2-016144386010_01_P001_r5_c1_r1_c0.png.
Reprojecting 23JUN18160657-M3DS_R4C2-016144386010_01_P001_r5_c1_r1_c1.png
Reprojecting 23JUN18160657-M3DS_R4C2-016144386010_01_P001_r5_c1_r1_c2.png
Reprojecting 23JUN18160657-M3DS_R4C2-016144386010_01_P001_r5_c1_r1_c3.png
Reprojecting 23JUN18160713-M3DS_R2C1-016144385010_01_P001_r7_c7_r2_c2.png
Reprojecting 23JUN18160713-M3DS_R2C1-016144385010_01_P001_r7_c7_r2_c3.png


3it [00:19,  7.33s/it]

Processing michael-at-2024-01-29-15-31-50b48d2e.json
Fetching images
Converting to geoDataframe
Reprojecting 23JUN18160657-M3DS_R4C2-016144386010_01_P001_r5_c1_r2_c0.png
Reprojecting 23JUN18160657-M3DS_R4C2-016144386010_01_P001_r5_c1_r2_c1.png
Reprojecting 23JUN18160657-M3DS_R4C2-016144386010_01_P001_r5_c1_r2_c2.png
Reprojecting 23JUN18160657-M3DS_R4C2-016144386010_01_P001_r5_c1_r2_c3.png
Reprojecting 23JUN18160713-M3DS_R2C1-016144385010_01_P001_r7_c7_r1_c1.png
Reprojecting 23JUN18160713-M3DS_R2C1-016144385010_01_P001_r7_c7_r1_c2.png
Reprojecting 23JUN18160713-M3DS_R2C1-016144385010_01_P001_r7_c7_r1_c3.png


4it [00:29,  8.40s/it]

Processing nathan-at-2024-02-06-17-54-a0e43c3c.json
Fetching images
Converting to geoDataframe
Reprojecting 23JUN18160657-M3DS_R4C2-016144386010_01_P001_r5_c1_r3_c1.png
Reprojecting 23JUN18160657-M3DS_R4C2-016144386010_01_P001_r5_c1_r3_c2.png
Reprojecting 23JUN18160657-M3DS_R4C2-016144386010_01_P001_r5_c1_r3_c3.png
Reprojecting 23JUN18160713-M3DS_R2C1-016144385010_01_P001_r7_c7_r0_c0.png
Reprojecting 23JUN18160713-M3DS_R2C1-016144385010_01_P001_r7_c7_r0_c1.png
Reprojecting 23JUN18160713-M3DS_R2C1-016144385010_01_P001_r7_c7_r0_c2.png
Reprojecting 23JUN18160713-M3DS_R2C1-016144385010_01_P001_r7_c7_r0_c3.png
Reprojecting 23JUN18160657-M3DS_R4C2-016144386010_01_P001_r5_c1_r3_c0.png
Reprojecting 23JUN18160713-M3DS_R2C1-016144385010_01_P001_r7_c7_r3_c1.png
Reprojecting 23JUN18160713-M3DS_R2C1-016144385010_01_P001_r7_c7_r3_c3.png


5it [00:45,  9.02s/it]


In [6]:
# Parent scene of each tile: the first four '_'-separated fields of its name.
sat_images = images_gdf.image_name.apply(lambda x: '_'.join(x.split('_')[:4])).unique()
sat_img_dir = Path('/datadrive/merged_PS/images')
images_gdf['new_image_name'] = None
# Create the output folder up front so saving the crops below cannot fail
# because the directory is missing.
recut_dir.mkdir(parents=True, exist_ok=True)

for file_name in sat_images:
    # The merged scenes use 'PS3DS' where the tile names use 'M3DS'.
    file_path = (sat_img_dir / file_name.replace('M3DS', 'PS3DS')).with_suffix('.TIF')
    if not file_path.exists():
        print(f"{file_path} not found")
        continue
    print(f"Found {file_path}")

    # Keep the scene open only while its tiles are being cut.
    with rxr.open_rasterio(file_path) as source_raster:
        raster = source_raster
        if raster.rio.crs != crs:
            raster = raster.rio.reproject(crs)

        matching_rows = images_gdf[images_gdf.image_name.str.startswith(file_name)]
        for index, row in matching_rows.iterrows():
            # Get the bounds for cropping
            bounds = row.image_bounds.bounds
            minx, miny, maxx, maxy = bounds
            print("Bounds:", bounds)

            # Crop the image using the bounds
            cropped_image = raster.rio.clip_box(minx=minx, miny=miny, maxx=maxx, maxy=maxy)
            print("Cropped image bounds:", cropped_image.rio.bounds())

            # Define the new image name and path
            new_image_name = row.image_name.replace('M3DS', 'PS3DS').replace('.TIF', '_RECUT.TIF')
            new_image_path = recut_dir / new_image_name

            # Save the cropped image
            cropped_image.rio.to_raster(new_image_path)

            # Update the new_image_name column in the GeoDataFrame
            images_gdf.at[index, 'new_image_name'] = new_image_name


Found /datadrive/merged_PS/images/23JUL28162830-PS3DS_R3C3-016043585010_01_P001.TIF
Bounds: (260704.4946420193, 1520739.4401419163, 260866.8277871609, 1520901.7732870579)
Cropped image bounds: (260704.4946420193, 1520739.4401419163, 260866.8277871609, 1520901.7732870579)
Bounds: (260866.8277871609, 1520739.4401419163, 261029.1609323025, 1520901.7732870579)
Cropped image bounds: (260866.8277871609, 1520739.4401419163, 261029.1609323025, 1520901.7732870579)
Bounds: (260704.4946420193, 1520577.1069967747, 260866.8277871609, 1520739.4401419163)
Cropped image bounds: (260704.4946420193, 1520577.1069967747, 260866.8277871609, 1520739.4401419163)
Bounds: (260866.8277871609, 1520577.1069967747, 261029.1609323025, 1520739.4401419163)
Cropped image bounds: (260866.8277871609, 1520577.1069967747, 261029.1609323025, 1520739.4401419163)
Bounds: (261029.1609323025, 1520577.1069967747, 261191.4940774441, 1520739.4401419163)
Cropped image bounds: (261029.1609323025, 1520577.1069967747, 261191.49407744

In [7]:
model_dir = Path("/datadrive/dev_versions/hand_labels_model")

# Create both sub-directories unconditionally. The previous check only ran when
# model_dir itself was absent, so an existing but incomplete folder (or a
# missing parent directory) made the copy / export below fail.
for sub_dir in ("images", "labels"):
    (model_dir / sub_dir).mkdir(parents=True, exist_ok=True)

# Copy every re-cut tile next to the labels.
for file_path in recut_dir.iterdir():
    if not file_path.is_file():
        # shutil.copy2 cannot copy directories; skip anything that is not a file.
        continue
    dest_path = model_dir / "images" / file_path.name
    shutil.copy2(file_path, dest_path)

# image_bounds is dropped before writing, so 'geometry' is the only geometry
# column in the exported shapefile.
gdf.drop(columns=["image_bounds"]).to_file(model_dir / "labels/hand_labels.shp")


Bad pipe message: %s [b'\n\x85\xbb_\x08\x9e\xef\xc9\x97\x9ag\x8c\x17-~\x0b\x1e\xb4 ~\xac=\xc0\xbf\xd9\xd1m\x07\xdd\xeb\x15\x9f\xa5z\xde<X\x047\xfez\xd9]-M\x10+\xbev\\[\x00\x08\x13\x02\x13\x03\x13\x01\x00\xff\x01\x00\x00\x8f\x00\x00\x00\x0e\x00\x0c\x00\x00\t127.']
Bad pipe message: %s [b'0.1\x00\x0b\x00\x04\x03\x00\x01\x02\x00\n\x00\x0c\x00\n\x00\x1d\x00\x17\x00\x1e\x00\x19\x00\x18\x00#\x00\x00\x00\x16\x00\x00\x00\x17\x00\x00\x00\r\x00\x1e\x00\x1c\x04\x03']
Bad pipe message: %s [b'\x06\x03\x08\x07', b'\x08\t\x08\n\x08\x0b\x08']
Bad pipe message: %s [b'\x05\x08\x06']
Bad pipe message: %s [b'\x05\x01\x06', b'']
Bad pipe message: %s [b'WSS\x98\x8f\xc6d\n&\xddb\x03\xc7\x882\x80A\x11\x00\x00|\xc0,\xc00\x00\xa3\x00\x9f\xcc\xa9\xcc\xa8\xcc\xaa\xc0\xaf\xc0\xad\xc0\xa3\xc0\x9f\xc0]\xc0a\xc0W\xc0', b"+\xc0/\x00\xa2\x00\x9e\xc0\xae\xc0\xac\xc0\xa2\xc0\x9e\xc0\\\xc0`\xc0V\xc0R\xc0$\xc0(\x00k\x00j\xc0#\xc0'\x00g\x00@\xc0\n\xc0\x14\x009\x008\xc0\t\xc0\x13\x003\x002\x00\x9d\xc0\xa1\xc0\x9d\xc0Q\x00\x9

In [3]:
# Read the exported hand labels back from disk, so the summary below does not
# depend on the in-memory `gdf` built by the processing cells.
labels_path = Path("/datadrive/dev_versions/hand_labels_model") / "labels" / "hand_labels.shp"
labels = gpd.read_file(labels_path)



Unnamed: 0,image_name,label,geometry
0,23JUL28162837-M3DS_R3C1-016043586010_01_P001_r4_c2_r3_c3.TIF,Building,"POLYGON ((263359.349 1519989.593, 263364.613 1519989.932, 263364.952 1519986.197, 263366.141 1519986.027, 263366.442 1519983.138, 263368.171 1519982.994, 263368.603 1519980.042, 263359.385 1519980.042, 263359.349 1519989.593))"
1,23JUL28162837-M3DS_R3C1-016043586010_01_P001_r4_c2_r3_c3.TIF,Building,"POLYGON ((263379.971 1519984.424, 263385.490 1519984.478, 263386.139 1519973.766, 263378.672 1519973.657, 263378.564 1519976.363, 263375.913 1519976.633, 263376.075 1519978.256, 263377.915 1519978.419, 263378.023 1519980.258, 263380.187 1519980.312, 263379.971 1519984.424))"
2,23JUL28162837-M3DS_R3C1-016043586010_01_P001_r4_c2_r3_c3.TIF,Building,"POLYGON ((263326.973 1519977.587, 263336.225 1519977.695, 263336.442 1519972.609, 263326.865 1519972.609, 263326.973 1519977.587))"
3,23JUL28162837-M3DS_R3C1-016043586010_01_P001_r4_c2_r3_c3.TIF,Building,"POLYGON ((263355.974 1520018.147, 263363.057 1520018.266, 263363.116 1520016.421, 263363.831 1520016.123, 263364.069 1520012.492, 263358.950 1520012.135, 263359.010 1520010.171, 263356.093 1520010.171, 263355.974 1520018.147))"
4,23JUL28162837-M3DS_R3C1-016043586010_01_P001_r4_c2_r3_c3.TIF,Building,"POLYGON ((263349.155 1520029.652, 263351.611 1520029.890, 263351.849 1520026.325, 263354.067 1520026.087, 263354.225 1520024.661, 263348.918 1520024.741, 263349.155 1520029.652))"


In [18]:
# Geometries per label, computed once and reused for both class columns.
class_counts = labels.groupby('label').geometry.count()
summary_df = pd.DataFrame({
    'number_tiles': [len(labels.image_name.unique())],
    'buildings_count': [labels.label.count()],
    # .get(..., 0) reports 0 instead of raising when a class has no polygons.
    'building_class': [class_counts.get('Building', 0)],
    'dense_building_class': [class_counts.get('dense building', 0)],
})

In [19]:
# Bare expression: shows the one-row summary table as the cell's output.
summary_df

Unnamed: 0,number_tiles,buildings_count,building_class,dense_building_class
0,64,4851,4610,241


Bad pipe message: %s [b'k\xed\x05\xdb_j\xa7+\xd0H\xcb\xb5Y\xbaP\x82\xc8R \x13\x9c\x8c\xb2~\x02[I\xee#']
Bad pipe message: %s [b'%\xccs\xf1<', b'\x96D6\xe9:\xb0\x88Z\x8d\xb0\xdc \xd6\t']
Bad pipe message: %s [b"F\x8fd\xc7\xf8\x8b\x1eW\x97i\t\x17\xc7\x91V\ry\x0c\x00\x00|\xc0,\xc00\x00\xa3\x00\x9f\xcc\xa9\xcc\xa8\xcc\xaa\xc0\xaf\xc0\xad\xc0\xa3\xc0\x9f\xc0]\xc0a\xc0W\xc0S\xc0+\xc0/\x00\xa2\x00\x9e\xc0\xae\xc0\xac\xc0\xa2\xc0\x9e\xc0\\\xc0`\xc0V\xc0R\xc0$\xc0(\x00k\x00j\xc0#\xc0'\x00g\x00@\xc0\n\xc0\x14\x009\x008\xc0\t\xc0\x13\x003\x002\x00\x9d\xc0\xa1\xc0\x9d\xc0", b'\x9c\xc0\xa0\xc0\x9c\xc0P\x00=\x00<\x005\x00/\x00\x9a\x00\x99\xc0\x07\xc0\x11\x00\x96\x00\x05\x00\xff\x01\x00\x00j\x00\x00\x00\x0e\x00\x0c\x00\x00\t127.0.0.1\x00\x0b\x00\x04\x03\x00\x01\x02\x00\n\x00\x0c\x00\n\x00\x1d\x00\x17\x00\x1e\x00\x19\x00\x18\x00#\x00\x00\x00']
Bad pipe message: %s [b'\x00\x00\x17\x00\x00\x00\r\x000\x00.\x04\x03\x05\x03\x06\x03\x08\x07\x08\x08']
Bad pipe message: %s [b"TE\xf4\x04\xb4!/\xfb\xf7\xbb\xebo