# **PRE-PROCESS**


 
In the pre-processing step we:
1.   Get the tier 1 and tier 2 data sets from the Open Cities AI Challenge
2.   Generate the labels from the GeoJSON files
3.   Split the large TIFF images into frames of 512 x 512 pixels

<img src="https://s3.amazonaws.com/drivendata-public-assets/opendri_kam_4e7c7f.png" width=500 height=400>






In [0]:
# libraries installation

!pip install geopandas
!apt-get install python-numpy
!pip install solaris
!pip install rio-tiler
!pip install rasterio
!pip install descartes
!pip install pystac
!pip install pandas

In [0]:
from pathlib import Path

# define directories
# ROOT stays a plain string; everything derived from it is a pathlib.Path so
# that sub-folders are composed with `/` (Path + str is not supported).
ROOT = '/content'
DATA_PATH = Path(ROOT) / 'train'
IMAGE_PATH = DATA_PATH / 'image'
LABEL_PATH = DATA_PATH / 'label'

# create directories for data
# parents=True makes each call independent of the others and of whether ROOT
# already exists; exist_ok=True makes the cell safe to re-run.
for directory in (DATA_PATH, IMAGE_PATH, LABEL_PATH):
    directory.mkdir(parents=True, exist_ok=True)

In [0]:
from pystac import STAC_IO
from urllib.parse import urlparse
from pystac import (Catalog, CatalogType, Item, Asset, LabelItem, Collection)
from pathlib import Path  
import requests

# overwriting read method of STAC_IO lib so it can handle https links
def my_read(uri):
    """Return the text found at *uri*.

    http(s) URIs are fetched with ``requests``; anything else is delegated to
    pystac's default reader.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status, so a
            missing catalog fails here instead of later as a JSON parse error.
        requests.Timeout: if the server does not respond within 60 seconds.
    """
    if urlparse(uri).scheme.startswith('http'):
        response = requests.get(uri, timeout=60)
        response.raise_for_status()
        return response.text
    return STAC_IO.default_read_text_method(uri)

# Route every pystac read through my_read so remote (https) catalogs work.
STAC_IO.read_text_method = my_read

# Getting data from competition (in catalogs): tier1 first, then tier2.
_CATALOG_URLS = (
    'https://drivendata-competition-building-segmentation.s3-us-west-1.amazonaws.com/train_tier_1/catalog.json',
    'https://drivendata-competition-building-segmentation.s3-us-west-1.amazonaws.com/train_tier_2/catalog.json',
)
train_1, train_2 = (Catalog.from_file(url) for url in _CATALOG_URLS)

In [0]:
def _pair_items(collections):
    """Return (collection_id, first_item_id, second_item_id) tuples.

    The items of each collection are walked two at a time: items 0 and 1 form
    the first tuple, items 2 and 3 the second, and so on.  A trailing unpaired
    item is ignored.
    NOTE(review): this assumes get_all_items() yields each image item
    immediately followed by its label item - confirm against the catalogs.
    """
    pairs = []
    for col_id, collection in collections.items():
        items = list(collection.get_all_items())
        for first, second in zip(items[::2], items[1::2]):
            pairs.append((col_id, first.id, second.id))
    return pairs


# make a dict of collections for tier1 data set and tier2 data set
train_1_col = {collection.id: collection for collection in train_1.get_children()}
train_2_col = {collection.id: collection for collection in train_2.get_children()}

# Generate areas for tier1
areas_1 = _pair_items(train_1_col)

# Generate areas for tier2
areas_2 = _pair_items(train_2_col)

In [0]:
import geopandas as gpd
import numpy as np
import pandas as pd
import solaris as sol
import rasterio
from rasterio.transform import from_bounds
from shapely.geometry import Polygon
from rio_tiler import main as rt_mai
import skimage
import os
from rasterio.windows import Window
import pandas as pd

for area, image, label in areas_1: # CHANGE TO areas_2 TO GET TIER_2 DATA SET. 
                                   # NOT ENOUGH DISK SPACE TO PROCESS THE 2 DATA SETS AT THE SAME TIME
  
  items = cols[area].get_item(id=image)

  # Load shapefile of each label
  label_tmp = cols[area].get_item(id=label)
  geo_data_frame = geopandas.read_file(label_tmp.make_asset_hrefs_absolute().assets['labels'].href)

  # Get polygons from geodataframe
  polygons = geo_data_frame.geometry

  # Get outlines and zoom on each frame
  polygon_geo = Polygon(items.to_dict()['geometry']['coordinates'][0])
  polygon = geopandas.GeoDataFrame(index=[0], crs=geo_data_frame.crs, geometry=[polygon_geo])   
  polygon['geometry'].to_file(image+'.geojson', driver='GeoJSON')
  !cat {image}.geojson | supermercado burn {zoom_level} | mercantile shapes | fio collect > {img_id}{zoom_level}frames.geojson
  
  # Load frames and add convenience column
  frames = geopandas.read_file(f'{image}{zoom_level}frames.geojson')
  frames['xyz'] = frames.id.apply(lambda x: x.lstrip('(,)').rstrip('(,)').split(','))
  frames['xyz'] = [[int(q) for q in p] for p in frames['xyz']]

  # Frame url
  frame_url = items.assets['image'].href
  
# SAVE FRAMES
  x,y,z = frames['xyz']
  frame, mask = rt_main.frame(frame_url, x,y,z, tilesize=512) # our model input is 512 x 512
    
  skimage.io.imsave(f'{IMAGE_PATH}/{prefix}{z}_{x}_{y}.png',np.moveaxis(frame,0,2), check_contrast=False) # save in path

# SAVE MASKS
  frame_polygon = frame['geometry']
  tfm = from_bounds(*frame_polygon.bounds, tile_size, tile_size) 
  
  #cropped polygons
  polygons_crop = [poly for poly in labels_poly if poly.intersects(frame_polygon)]
  polygons_crop_gdf = geopandas.GeoDataFrame(geometry=polygons_crop, crs=4326)
  
  # RGB mask
  fbc_mask = sol.vector.mask.df_to_px_mask(df=polygons_crop_gdf,
                                         channels=['footprint', 'boundary', 'contact'],
                                         affine_obj=tfm, shape=(tile_size,tile_size),
                                         boundary_width=5, boundary_type='inner', contact_spacing=5, meters=True)
  
  
  skimage.io.imsave(f'{LABEL_PATH}/{prefix}{z}_{x}_{y}_mask.png',fbc_mask, check_contrast=False) 


# **TRAIN**

**The training images are stored as large Cloud Optimized GeoTIFFs (COG)**

* All images include 4 bands: red, green, blue and alpha.

* Spatial resolution varies from region to region

Image with Label: 

![alt text](https://lh3.googleusercontent.com/KI2eZWRkmqgdWBs1W4ZRcy7djEAuViSuMwWJRxuE5NiIW42cQJpYyM8JSwHjw9Tf16PS=s400)

In [0]:
# B. Basic data folder structure
#    Training frames live under <DATA_PATH>/image, their masks under
#    <DATA_PATH>/label.
import os

DATA_PATH = '/content/train'
FRAME_PATH = f'{DATA_PATH}/image'
MASK_PATH = f'{DATA_PATH}/label'


In [0]:
######################## ONE RUN PER RESET ONLY #############################
# C. Sort and shuffle frames and masks

# Importing required libraries
import os, random, re
from PIL import Image


def _natural_key(name):
    """Sort key that orders digit runs by numeric value ('2' before '10').

    The name is split into single non-digit characters and runs of ASCII
    digits.  Each token becomes a (rank, value) tuple so that an int is never
    compared with a str: digit runs get rank 0, everything else rank 1.
    """
    key = []
    for token in re.findall(r'[^0-9]|[0-9]+', name):
        if '0' <= token[0] <= '9':
            key.append((0, int(token)))
        else:
            key.append((1, token))
    return key


# Gathering all frames and masks names
all_frames = os.listdir(FRAME_PATH)
all_masks = os.listdir(MASK_PATH)

# Sort frames and masks with the same key so both lists are in the same order
all_frames.sort(key=_natural_key)
all_masks.sort(key=_natural_key)

# Shuffle frames and masks (with the same seed).
# Re-seeding before each shuffle applies the same permutation to both lists,
# which keeps frame i next to mask i only if the lists have equal length.
random.seed(150)
random.shuffle(all_frames)
random.seed(150)
random.shuffle(all_masks)


In [0]:
# D. Give every file in <DATA_PATH>/test/image/ an incremental numerical name
#    (0-based), eg: 'e96av3.png' becomes '0.png'.
#    name_dict maps each real file name to its number (stored as a string).
# NOTE(review): DATA_PATH is '/content/train' in this section, so this targets
# '/content/train/test/image/' - confirm that is the intended folder.
_test_image_dir = DATA_PATH + '/test/image/'
name_dict = {}
counter = 0
for test_file in os.listdir(_test_image_dir):
    original_name = str(test_file)
    numeric_name = str(counter)
    name_dict[original_name] = numeric_name
    os.rename(_test_image_dir + original_name, _test_image_dir + numeric_name + '.png')
    counter += 1

In [0]:
# D.2 Exporting name dict to csv (optional)
import csv
print("Saving test file name table to a file.")
# `with` closes (and therefore flushes) the file once all rows are written;
# newline='' is what the csv module expects for files it writes, so that row
# terminators are not translated a second time by the text layer.
with open("test_file_names.csv", "w", newline='') as outfile:
    w = csv.writer(outfile)
    # one row per entry: real file name, numerical name
    w.writerows(name_dict.items())

In [0]:
# D.3 Reading name dict from csv (optional)
#     Rebuilds name_dict from test_file_names.csv: the first column of each
#     row becomes the key, the second column the value.
import csv
with open('test_file_names.csv', mode='r') as infile:
    reader = csv.reader(infile)
    name_dict = dict((row[0], row[1]) for row in reader)

In [0]:
# E. Training dataset
import numpy as np 
import os
import skimage.io as io
import skimage.transform as trans
import numpy as np
from keras.models import *
from keras.layers import *
from keras.optimizers import *
from keras.callbacks import ModelCheckpoint, LearningRateScheduler
from keras import backend as keras
from keras.preprocessing.image import ImageDataGenerator

#model
def _conv_bn(tensor, filters, kernel_size=3):
    """Apply one ReLU, 'same'-padded, he_normal Conv2D, then BatchNormalization."""
    tensor = Conv2D(filters, kernel_size, activation='relu', padding='same',
                    kernel_initializer='he_normal')(tensor)
    return BatchNormalization()(tensor)


def _up_conv_bn(tensor, filters):
    """Upsample *tensor* by 2x2, apply a 2x2 ReLU Conv2D, then BatchNormalization."""
    # The Conv2D layer is created before the UpSampling2D layer, matching the
    # creation order of the original one-line expression.
    conv = Conv2D(filters, 2, activation='relu', padding='same',
                  kernel_initializer='he_normal')
    upsampled = UpSampling2D(size=(2, 2))(tensor)
    return BatchNormalization()(conv(upsampled))


def unet(weights_file_path):
    """Build and compile the U-Net; load weights from *weights_file_path* if present.

    The network takes 512 x 512 x 3 inputs and ends in a single-channel sigmoid
    layer.  It is compiled with Adam (lr 1e-4), binary cross-entropy loss and
    the accuracy metric.

    Args:
        weights_file_path: path of a weights file.  If the file exists its
            weights are loaded into the model; otherwise the model keeps its
            fresh initialisation.

    Returns:
        The compiled Keras model.
    """
    inputs = Input((512, 512, 3))

    # Contracting path: two conv+BN layers per level, then 2x2 max pooling.
    conv1 = _conv_bn(_conv_bn(inputs, 64), 64)
    pool1 = MaxPooling2D(pool_size=(2, 2))(conv1)

    conv2 = _conv_bn(_conv_bn(pool1, 128), 128)
    pool2 = MaxPooling2D(pool_size=(2, 2))(conv2)

    conv3 = _conv_bn(_conv_bn(pool2, 256), 256)
    pool3 = MaxPooling2D(pool_size=(2, 2))(conv3)

    conv4 = _conv_bn(_conv_bn(pool3, 512), 512)
    drop4 = Dropout(0.5)(conv4)
    pool4 = MaxPooling2D(pool_size=(2, 2))(drop4)

    # Bottleneck.
    conv5 = _conv_bn(_conv_bn(pool4, 1024), 1024)
    drop5 = Dropout(0.5)(conv5)

    # Expanding path: upsample, concatenate with the matching contracting
    # level (skip connection on the channel axis), then two conv+BN layers.
    up6 = _up_conv_bn(drop5, 512)
    merge6 = concatenate([drop4, up6], axis=3)
    conv6 = _conv_bn(_conv_bn(merge6, 512), 512)

    up7 = _up_conv_bn(conv6, 256)
    merge7 = concatenate([conv3, up7], axis=3)
    conv7 = _conv_bn(_conv_bn(merge7, 256), 256)

    up8 = _up_conv_bn(conv7, 128)
    merge8 = concatenate([conv2, up8], axis=3)
    conv8 = _conv_bn(_conv_bn(merge8, 128), 128)

    up9 = _up_conv_bn(conv8, 64)
    merge9 = concatenate([conv1, up9], axis=3)
    conv9 = _conv_bn(_conv_bn(merge9, 64), 64)
    conv9 = _conv_bn(conv9, 2)
    conv10 = Conv2D(1, 1, activation='sigmoid')(conv9)

    # `inputs=` / `outputs=` are the Keras 2 keyword names.
    model = Model(inputs=inputs, outputs=conv10)
    model.compile(optimizer=Adam(lr=1e-4), loss='binary_crossentropy', metrics=['accuracy'])
    if os.path.exists(weights_file_path):
        model.load_weights(weights_file_path)
        print("Weights file found. Updating existing network weights.")
    else:
        print("No weights file found. Starting training from scratch.")
    return model


def train(batch_size, path):
    """Yield augmented (image_batch, mask_batch) pairs, without end.

    Images are read from ``<path>/image`` as RGB and masks from
    ``<path>/label`` as grayscale, both resized to 512 x 512.  The two
    directory iterators share the same augmentation settings and the same
    seed so that they apply the same random transforms in the same order.

    Args:
        batch_size: number of samples per yielded batch.
        path: directory that contains the ``image`` and ``label`` sub-folders.

    Yields:
        (img, mask): ``img`` scaled to [0, 1]; ``mask`` holding only 0.0 and
        1.0 (threshold 0.5), in the same dtype as ``img``.
    """
    #Data augmentation for our training set
    augmentation = dict(rotation_range=0.5, width_shift_range=0.1, height_shift_range=0.1,
                    shear_range=0.1, zoom_range=0.1, horizontal_flip=True, fill_mode='nearest')

    image_datagen = ImageDataGenerator(**augmentation)
    mask_datagen = ImageDataGenerator(**augmentation)

    image_generator = image_datagen.flow_from_directory(
        path, classes = ['image'], class_mode = None, color_mode = "rgb",
        target_size = (512, 512), batch_size = batch_size, save_to_dir = None,
        save_prefix  = "image", seed = 1)

    # NOTE(review): the label PNGs are written elsewhere in this notebook as
    # 3-channel footprint/boundary/contact masks; reading them as grayscale
    # and thresholding at 0.5 may not select plain footprint pixels - confirm.
    mask_generator = mask_datagen.flow_from_directory(
        path, classes = ['label'], class_mode = None, color_mode = "grayscale",
        target_size = (512, 512), batch_size = batch_size, save_to_dir = None,
        save_prefix  = "mask", seed = 1)

    for (img, mask) in zip(image_generator, mask_generator):
        # Scale each array based on its own range.  Previously the mask was
        # only scaled and binarised when the *image* batch had values above 1,
        # so an all-dark image batch passed a raw 0-255 mask through.
        if np.max(img) > 1:
            img = img / 255
        if np.max(mask) > 1:
            mask = mask / 255
        # Always binarise, and keep a float dtype so every batch has the same
        # target type.
        mask = (mask > 0.5).astype(img.dtype)
        yield (img, mask)

# defining paths
DATA_PATH = '/content/train'
WEIGHTS_PATH  = '/content/checkpoint.hdf5'

# defining train params
# (two separate statements: `EPOCHS = 5, BATCH_SIZE = 16` is a syntax error)
EPOCHS = 5
BATCH_SIZE = 16
# one epoch = one pass over the image folder; never fewer than one step, even
# when the folder holds fewer than BATCH_SIZE images
STEPS_PER_EPOCH = max(1, len(os.listdir(DATA_PATH + '/image')) // BATCH_SIZE)

# actual training
tr = train(BATCH_SIZE, DATA_PATH + '/')
m = unet(WEIGHTS_PATH)
# the checkpoint is overwritten whenever the training loss improves
model_checkpoint = ModelCheckpoint(WEIGHTS_PATH, monitor='loss', verbose=1, save_best_only=True)
m.fit_generator(tr, steps_per_epoch=STEPS_PER_EPOCH, epochs=EPOCHS, callbacks=[model_checkpoint])

# **TEST**

**The test set contains 11,481 COG fragments of 1024x1024 pixels, derived from scenes that are not present in the training set.**

**The submission file must contain a building footprint mask for each chip in the test set. Each mask must have the same name as its corresponding imagery chip in the test set**

Mask Preview : 

![alt text](https://i1.wp.com/opendri.org/wp-content/uploads/2020/02/segment.png?w=400&ssl=1)





---

**Import "checkpoint.hdf5" manually**



---



In [0]:
######################## ONE RUN PER RESET ONLY ################################
# A. Downloading Test dataset with 1024x1024 TIF images
import shutil, os

!wget https://drivendata-public-assets.s3.amazonaws.com/test.tgz # downloading
!tar -xf test.tgz                                                # extracting
if os.path.exists('/content/sample_data'):                       # removing sample_data
    shutil.rmtree('/content/sample_data')  

os.remove('/content/test.tgz')                                   #removing test.tgz                        

In [0]:
# B. Changing directory names
#    Turns /content/test into /content/test/image by parking the folder under
#    a temporary name, recreating an empty /content/test and moving the parked
#    folder into it.  Skipped when /content/test/image already exists.
import os

_test_dir = '/content/test'
_parked_dir = '/content/image'

already_nested = os.path.exists('/content/test/image')
if not already_nested:
    os.rename(_test_dir, _parked_dir)
    os.mkdir(_test_dir)
    shutil.move(_parked_dir, _test_dir)

In [0]:
######################## ONE RUN PER RESET ONLY ################################
# C. Moving images from subdirectories to root directory
import os
import shutil  # imported here so the cell does not rely on an earlier cell

DATA_PATH = '/content/test/image' #root directory

# Each sub-folder <name> is expected to hold <name>/<name>.tif; the image is
# moved up into DATA_PATH and the sub-folder is then deleted.
for folder in os.listdir(DATA_PATH):
    folder_path = DATA_PATH + '/' + folder
    if os.path.isdir(folder_path):
        shutil.move(folder_path + '/' + folder + '.tif', DATA_PATH)
        shutil.rmtree(folder_path)

# len() returns an int, so it has to be converted before concatenation
print('Number of Images : ' + str(len(os.listdir(DATA_PATH))))
print(os.listdir(DATA_PATH))

In [0]:
######################## ONE RUN PER RESET ONLY ################################
# D. Give every file in DATA_PATH an incremental numerical name (0-based),
#    eg: 'e96av3.png' becomes '0.png'.
#    file_names maps each number back to the original file name.

file_names = {}
for counter, original_name in enumerate(os.listdir(DATA_PATH)):
    file_names[counter] = original_name
    os.rename(f'{DATA_PATH}/{original_name}', f'{DATA_PATH}/{counter}.png')
# counter ends up equal to the number of renamed files
counter = len(file_names)

print(os.listdir(DATA_PATH))

In [0]:
# E. Removing 4th channel (alpha channel) from TIF images (This may take a while...)
#    Every file is re-read in 3-channel colour mode and written back in place;
#    files that cannot be decoded are deleted.
import cv2, os
for test_file in os.listdir(DATA_PATH):
    image_path = DATA_PATH + '/' + test_file
    img = cv2.imread(image_path, cv2.IMREAD_COLOR)
    if img is None:
        # unreadable (corrupted) file: remove it altogether
        os.remove(image_path)
    else:
        cv2.imwrite(image_path, img)

In [0]:
# F. Runs prediction model and saves the results
import skimage.io as io
import numpy as np
import skimage.transform as transform
from skimage import img_as_ubyte
import os


def _numbered_pngs(path):
    """Return, in ascending order, the integer n of every '<n>.png' in *path*."""
    stems = (os.path.splitext(name) for name in os.listdir(path))
    return sorted(int(stem) for stem, ext in stems
                  if ext == '.png' and stem.isascii() and stem.isdigit())


# defines a test generator
def test(path, indices=None):
    """Yield one model-ready batch of shape (1, 512, 512, channels) per image.

    Args:
        path: folder holding the numbered test images ('<n>.png').
        indices: image numbers to read, in order.  Defaults to every numbered
            PNG found in *path*, ascending, so gaps in the numbering (files
            deleted as unreadable) are skipped instead of raising.
    """
    if indices is None:
        indices = _numbered_pngs(path)
    for i in indices:
        # Read in colour: the model input is 512 x 512 x 3.
        # NOTE(review): assumes the files are already 3-channel (the alpha
        # channel is stripped in an earlier cell) - confirm.
        img = io.imread(os.path.join(path, "%d.png" % i))
        img = img / 255
        img = transform.resize(img, (512, 512))
        img = np.reshape(img, (1,) + img.shape)
        yield img

# define paths
WEIGHTS_PATH = '/content/checkpoint.hdf5'
OUTPUT_PATH = '/content/test/results'
if not os.path.exists(OUTPUT_PATH): os.mkdir(OUTPUT_PATH)

# build the model, create a test generator and make predictions
m = unet(WEIGHTS_PATH)
# the same index list drives both the generator and the output names, so
# prediction k is always saved under the number of the image it came from
indices = _numbered_pngs(DATA_PATH)
te = test(DATA_PATH, indices)
results = m.predict_generator(te, len(indices), verbose = 1)

# saving results as 0/255 masks named '<n>_predict.png' in OUTPUT_PATH
for i, item in zip(indices, results):
    img = item[:, :, 0] > 0.5
    io.imsave(os.path.join(OUTPUT_PATH, "%d_predict.png" % i), img_as_ubyte(img))

In [0]:
######################## ONE RUN PER RESET ONLY ################################
# G. Post processing - Resizing final images from 512x512 back to 1024x1024
import cv2, os, ast

# define submissions path
SUBMISSIONS_PATH = '/content/submissions'
if not os.path.exists(SUBMISSIONS_PATH): os.mkdir(SUBMISSIONS_PATH)

# resizing + renaming: the number in front of the first '_' of each result
# file is looked up in file_names to recover the original file name
for result_name in os.listdir(OUTPUT_PATH):
    if not result_name.endswith('.png'):
        continue
    img = cv2.imread(OUTPUT_PATH + '/' + result_name, cv2.IMREAD_UNCHANGED)
    resized = cv2.resize(img, (1024, 1024), interpolation = cv2.INTER_AREA)
    original_index = int(result_name[:result_name.index('_')])
    cv2.imwrite(SUBMISSIONS_PATH + '/' + file_names[original_index], resized)