In [107]:
import os
import sys
# import tensorflow_hub as hub
import pandas as pd
# Root directory of the project
ROOT_DIR = os.path.abspath("../")
# Import maskRCNN
sys.path.append(ROOT_DIR)
import geopandas as gpd
import DataProcessing as DP
import json
import skimage
from tqdm import tqdm
import rasterio
from itertools import product
from rasterio import windows
import DataProcessing as DP
from pathlib import Path
import rioxarray
import shapely
from rasterio.warp import calculate_default_transform, reproject, Resampling
import shutil
import json
import random

In [108]:
# Input locations: the dataset root, resolved relative to the notebook's
# working directory, with image tiles and building labels in sub-folders.
DATA_DIR = os.path.abspath(
    "../../../../EthanBrewer/SN2_buildings_train_AOI_4_Shanghai/AOI_4_Shanghai_Train"
)
image_dir = os.path.join(DATA_DIR, "RGB-PanSharpen")
label_dir = os.path.join(os.path.join(DATA_DIR, "geojson"), "buildings")

# Root folder for generated outputs (annotation files are written beneath it).
out_label_dir = os.path.join(ROOT_DIR, "datasets_SH")

In [109]:
# Lookup tables keyed by the last "_"-separated token of each file stem,
# so an image and its label file share the same key.
image_dict = {}
label_dict = {}

# Index the image tiles; anything that is not a .tif is ignored.
for img in os.listdir(image_dir):
    if not img.endswith(".tif"):
        continue
    image_name = os.path.splitext(img)[0].split("_")[-1]
    # setdefault keeps the first path seen for a key, like the explicit
    # "not in" check it replaces.
    image_dict.setdefault(image_name, os.path.join(image_dir, img))

# Index the label files the same way; anything that is not a .geojson is ignored.
for label in os.listdir(label_dir):
    if not label.endswith(".geojson"):
        continue
    label_name = os.path.splitext(label)[0].split("_")[-1]
    label_dict.setdefault(label_name, os.path.join(label_dir, label))


In [146]:
def _exterior_pixel_coords(geom, dataset, n_rows, n_cols):
    """Convert the exterior ring of ``geom`` to clamped pixel indices.

    :param geom: a shapely polygon, or a multi-part geometry whose first part
        is a polygon (only that first part is used)
    :param dataset: open rasterio dataset used to map x/y to row/col
    :param n_rows: number of image rows; row indices are clamped to [0, n_rows - 1]
    :param n_cols: number of image columns; column indices are clamped to [0, n_cols - 1]
    :return: ``(rows, cols)`` as two lists of plain ints
    """
    # Multi-part geometries expose ``geoms`` rather than ``exterior``.
    # Only the first part is converted.
    if hasattr(geom, "geoms"):
        geom = list(geom.geoms)[0]

    xs, ys = geom.exterior.coords.xy

    rows, cols = [], []
    for x, y in zip(xs, ys):
        r, c = dataset.index(x, y)
        # int() keeps the values JSON-serialisable whatever integer type
        # index() hands back.
        rows.append(int(narrow_num(r, n_rows)))
        cols.append(int(narrow_num(c, n_cols)))
    return rows, cols


def generate_annotation(single_image_path, single_label_path, label):
    """
    Write one JSON annotation file describing the polygons of a label file.

    The file is saved as ``<out_label_dir>/annotations/<label stem>.json`` and
    holds the image path, the label file's basename (stored under
    ``"image_name"``), the image width/height and one annotation per geometry:
    ``{"label": label, "region": {"x": [...], "y": [...]}}``.

    Vertex coordinates are converted to pixel indices with the image's
    georeferencing and clamped to the image bounds (rows to the image height,
    columns to the image width). Missing or empty geometries are skipped.
    Nothing is written when the label file contains no geometries.

    :param single_image_path: path of the raster image the labels belong to
    :param single_label_path: path of the vector file holding the polygons
    :param label: object label stored with every annotation
    :return: None; the annotation file is written to the annotations folder
    :raises: whatever ``rasterio.open`` / ``geopandas.read_file`` raise for
        unreadable inputs (the image path is printed first for image errors)
    """

    try:
        dataset = rasterio.open(single_image_path)
    except Exception:
        # Report which image failed; the original handler printed a variable
        # that is unbound at this point and hid the real error.
        print(single_image_path)
        raise

    basename = os.path.basename(single_label_path)
    annotationfile = Path(out_label_dir) / 'annotations' / (basename.split('.')[0] + '.json')

    # The dataset is closed on exit, including on errors.
    with dataset:
        width = dataset.width
        height = dataset.height

        record = {"image_path": single_image_path, "image_name": basename, "annotations": [], "width": width,
                  "height": height}

        label_gdf = gpd.read_file(single_label_path)
        polygeos = label_gdf.geometry.to_list()

        for geom in polygeos:
            # A feature without usable geometry has no outline to record.
            if geom is None or geom.is_empty:
                continue

            rows, cols = _exterior_pixel_coords(geom, dataset, height, width)

            # NOTE(review): 'x' receives the row indices and 'y' the column
            # indices. Kept unchanged so existing consumers still work;
            # confirm this is the orientation they expect.
            regions = {'x': rows, 'y': cols}
            record['annotations'].append({"label": label, "region": regions})

    # Write once, after all geometries are collected, and only when the label
    # file had features (an empty label file never produced an output file).
    if polygeos:
        annotationfile.parent.mkdir(parents=True, exist_ok=True)
        with open(annotationfile, 'w') as js:
            json.dump(record, js)
            
        


In [147]:
def narrow_num(num, bound):
    """
    Clamp ``num`` so it is a valid index into a range of ``bound`` items.

    :param num: value to clamp
    :param bound: exclusive upper limit (converted with ``int``)
    :return: ``int(bound) - 1`` when ``num`` reaches the limit, ``0`` when the
        result is negative, otherwise ``num`` unchanged
    """
    limit = int(bound)

    # Cap at the last valid index first; flooring at zero second also covers
    # a non-positive bound.
    capped = limit - 1 if num >= limit else num
    return 0 if capped < 0 else capped
    

In [149]:
# Generating json annotation for each image

# Set to True to (re)generate the per-image annotation files; left off by
# default so re-running the notebook does not redo the work.
gen_annotation = False

if gen_annotation:

    for k in tqdm(image_dict):

        # An image without a matching label file cannot be annotated; report
        # and move on instead of failing with a KeyError.
        label_path = label_dict.get(k)
        if label_path is None:
            print("No label file for {}, skipping".format(image_dict[k]))
            continue

        # Images whose label file holds no features are skipped.
        gdf = gpd.read_file(label_path)
        if not gdf.empty:
            print("Working on {}".format(image_dict[k]))
            generate_annotation(image_dict[k], label_path, "building")
            #dest = Path(out_label_dir)/"images"
            #shutil.copy(image_dict[k], dest)

In [151]:

def _stage_split(img_paths, idxs, target_dir):
    """Copy the selected images into ``target_dir`` and return their annotation file paths."""
    annotation_files = list()
    for idx in idxs:
        # The tile id is the last "_"-separated token of the image file stem.
        basename = os.path.basename(img_paths[idx]).split(".")[0].split("_")[-1]
        annotationfile = Path(out_label_dir) / "annotations" / ('buildings_AOI_4_Shanghai_{}.json').format(basename)
        shutil.copy(img_paths[idx], target_dir)
        annotation_files.append(annotationfile)
    return annotation_files


def split_train_val(img_folder, split_rate=None, seed=None):
    """
    Randomly split the .tif images of ``img_folder`` into train and val sets.

    ``train`` and ``val`` folders are created next to ``img_folder``. Each
    image is copied into one of them, and the matching per-image annotation
    files are merged into an ``annotation.json`` inside each folder.

    :param img_folder: folder holding the .tif images to split
    :param split_rate: fraction (0..1) of the images used for training;
        required despite the ``None`` default
    :param seed: optional seed for a reproducible split; when ``None`` the
        module-level ``random`` generator is used, as before
    :return: None
    :raises TypeError: if ``split_rate`` is None
    :raises ValueError: if ``split_rate`` is outside [0, 1]
    """
    # Validate before touching the file system so a bad call has no side effects.
    if split_rate is None:
        raise TypeError("split_rate must be a number between 0 and 1, got None")
    if not 0 <= split_rate <= 1:
        raise ValueError("split_rate must be between 0 and 1, got {}".format(split_rate))

    rpath = os.path.dirname(img_folder)

    train_path = os.path.join(rpath, "train")
    val_path = os.path.join(rpath, "val")

    # create training and val folder
    Path(train_path).mkdir(exist_ok=True, parents=True)
    Path(val_path).mkdir(exist_ok=True, parents=True)

    # os.listdir order is arbitrary; sorting makes a seeded split repeatable.
    img_paths = [os.path.join(img_folder, f) for f in sorted(os.listdir(img_folder)) if f.endswith(".tif")]

    # define the idx of images that are used as training dataset
    n_train = int(len(img_paths) * split_rate)
    rng = random if seed is None else random.Random(seed)
    tidxs = rng.sample(range(0, len(img_paths)), n_train)

    # A set makes the membership test constant-time.
    train_set = set(tidxs)
    vidxs = [i for i in range(0, len(img_paths)) if i not in train_set]

    trains_json = _stage_split(img_paths, tidxs, train_path)
    vals_json = _stage_split(img_paths, vidxs, val_path)

    agg_annotation(trains_json, train_path)
    agg_annotation(vals_json, val_path)

def agg_annotation(annote_list, despath):
    """
    Merge several JSON annotation files into ``<despath>/annotation.json``.

    The merged file is a single JSON object that maps each input file's name
    (the part before the first ".") to that file's parsed content.

    :param annote_list: iterable of paths to JSON annotation files
    :param despath: existing folder that receives ``annotation.json``
    :return: None
    """

    file_dict = dict()
    desfile = os.path.join(despath, "annotation.json")
    for file in annote_list:
        img_id = os.path.basename(file).split('.')[0]
        # Close each input as soon as it is parsed; a long list would
        # otherwise leak one file handle per annotation.
        with open(file) as f:
            file_dict[img_id] = json.load(f)

    with open(desfile, 'w') as js:
        json.dump(file_dict, js)
        

        
# Call the code
# Set to True to build the train/val folders; left off by default so
# re-running the notebook does not copy the images again.
gen_train_val = False

if gen_train_val:
    # `dest` was only ever assigned in commented-out code, so this call raised
    # NameError on a fresh kernel.
    # NOTE(review): location taken from that disabled copy step - confirm the
    # images really live in <out_label_dir>/images.
    dest = os.path.join(out_label_dir, "images")
    split_train_val(dest, split_rate=0.8)