# Introduction

Downloads and prepares the **Inria Aerial Image Labeling** dataset, used for pretraining.

https://project.inria.fr/aerialimagelabeling/

In [1]:
import glob, os, pandas as pd, shutil, matplotlib.pyplot as plt
from PIL import Image
import image_slicer
from pathlib import Path

In [2]:
# Paths
from constants import DATADIR, INRIA_DATA

In [3]:
# Create the dataset root (and any missing parents); a no-op if it already exists.
INRIA_DATA.mkdir(parents=True, exist_ok=True)

# Download data

In [4]:
# Guard flag: the download/extraction cells below only run when this is False.
downloaded = True

We base our download method on the script `getAerial.sh`:

In [5]:
# Fetch the reference download script `getAerial.sh` (skipped once the data is downloaded).
if not downloaded: 
    !wget https://files.inria.fr/aerialimagelabeling/getAerial.sh

In [8]:
# Print the script so the steps it performs are visible in the notebook.
if not downloaded: 
    !cat getAerial.sh

#!/bin/bash

which 7z >/dev/null || {
	echo 'you need 7z ; plz install it'
	echo 'ubuntu: sudo apt install p7zip-full'
	echo 'centos: sudo yum install p7zip p7zip-pluginsi -y'
	exit 1
}
which unzip >/dev/null || {
	echo 'you need unzip command ; plz install it'
	echo 'ubuntu: sudo apt install unzip'
	echo 'centos: sudo yum install zip unzip -y'
	exit 2
}
wget --no-check-certificate https://files.inria.fr/aerialimagelabeling/aerialimagelabeling.7z.001
wget --no-check-certificate https://files.inria.fr/aerialimagelabeling/aerialimagelabeling.7z.002
wget --no-check-certificate https://files.inria.fr/aerialimagelabeling/aerialimagelabeling.7z.003
wget --no-check-certificate https://files.inria.fr/aerialimagelabeling/aerialimagelabeling.7z.004
wget --no-check-certificate https://files.inria.fr/aerialimagelabeling/aerialimagelabeling.7z.005
7z x aerialimagelabeling.7z.001
unzip NEW2-AerialImageDataset.zip
rm -i aerialimagelabeling.7z.* 
rm -i NEW2-AerialImageDataset.zip


We run the steps manually below as we want to make sure we're not redownloading or re-extracting anything every time we run the notebook. 

In [6]:
if not downloaded:
    if os.path.exists(INRIA_DATA/'AerialImageDataset'):
        print('Already extracted')
    else:
        !wget -nc --no-check-certificate https://files.inria.fr/aerialimagelabeling/aerialimagelabeling.7z.001 -P $INRIA_DATA
        !wget -nc --no-check-certificate https://files.inria.fr/aerialimagelabeling/aerialimagelabeling.7z.002 -P $INRIA_DATA
        !wget -nc --no-check-certificate https://files.inria.fr/aerialimagelabeling/aerialimagelabeling.7z.003 -P $INRIA_DATA
        !wget -nc --no-check-certificate https://files.inria.fr/aerialimagelabeling/aerialimagelabeling.7z.004 -P $INRIA_DATA
        !wget -nc --no-check-certificate https://files.inria.fr/aerialimagelabeling/aerialimagelabeling.7z.005 -P $INRIA_DATA
        !7z x -aos $INRIA_DATA/aerialimagelabeling.7z.001 -o.$INRIA_DATA
        !unzip -n $INRIA_DATA/NEW2-AerialImageDataset.zip -d $INRIA_DATA
        #    !rm $INRIA_DATA/aerialimagelabeling.7z.* 
        #    !rm $INRIA_DATA/NEW2-AerialImageDataset.zip

In [18]:
# Training split of the extracted dataset.
INRIA_TRAIN_DATA = INRIA_DATA.joinpath('AerialImageDataset', 'train')

In [19]:
# Show what the training folder contains.
[entry for entry in INRIA_TRAIN_DATA.iterdir()]

[PosixPath('/home/alex/data/mapai/external/inria_aerialimagelabeling/AerialImageDataset/train/gt'),
 PosixPath('/home/alex/data/mapai/external/inria_aerialimagelabeling/AerialImageDataset/train/images')]

# Slice the images into 500x500 patches

To conform with the MapAI dataset, we slice the images into 500x500 patches and store the patches as PNGs. Each Inria image is 5000x5000 pixels, so this gives 100 tiles per image. We use the `image-slicer` library: https://github.com/samdobson/image_slicer

In [20]:
# Output root for the sliced PNG tiles, with one subfolder per kind of tile.
PNG = INRIA_DATA.joinpath('AerialImageDataset', 'train', 'png')
PNG.mkdir(parents=True, exist_ok=True)

for subdir in ('images', 'gt'):
    (PNG / subdir).mkdir(exist_ok=True)

In [21]:
import multiprocessing as mp
from functools import partial

In [22]:
def split_to_tiles(fn):
    """
    Slice one image into 100 tiles and move the tiles into `PNG/<kind>`.

    Input: path (string) to an image, laid out as `.../<kind>/<name>.<ext>`;
    <kind> is the name of the image's parent folder.
    Produces sliced images stored as PNGs (named like `<name>_01_01.png`).

    Errors are reported on stdout instead of being raised, so that one bad
    image does not abort a whole `pool.map` run.
    """
    try:
        image_slicer.slice(fn, 100)
        path, basename = os.path.split(fn)
        kind = os.path.basename(path)
        current_fn = basename.split(".")[0]
        # Require the "_" separator so that only this image's tiles match.
        # A bare "<name>*" pattern would also pick up other images sharing the
        # prefix (e.g. "austin1*" matches "austin10_..."), which makes parallel
        # workers race to move each other's files.
        pngs = glob.glob(os.path.join(path, current_fn + "_*.png"))
        for png in pngs:
            shutil.move(png, PNG/kind)
    except Exception as e:
        # Best effort: keep going, but say what actually went wrong.
        # (KeyboardInterrupt/SystemExit are deliberately not caught.)
        print()
        print(f"Error in {fn}!")
        print(f"{type(e).__name__}: {e}")
        print()

In [23]:
# Name of the dataset split folder to slice.
train_val = "train"

In [24]:
# Source *.tif files of the selected split: aerial images and their masks.
img_list = glob.glob(str(INRIA_DATA / 'AerialImageDataset' / train_val / 'images' / '*.tif'))
mask_list = glob.glob(str(INRIA_DATA / 'AerialImageDataset' / train_val / 'gt' / '*.tif'))

In [25]:
# Worker pool used to slice several images in parallel.
pool = mp.Pool(32)

In [26]:
# Slice the aerial images; the trailing ";" hides the list of None results.
pool.map(func=split_to_tiles, iterable=img_list);

In [27]:
# Slice the ground-truth masks; the trailing ";" hides the list of None results.
pool.map(split_to_tiles, mask_list);

# This is the last use of the pool: shut the workers down so they do not stay
# alive for the rest of the kernel session. Re-create the pool before slicing again.
pool.close()
pool.join()

In [14]:
# From here on the lists hold the sliced PNG tiles, not the source *.tif files.
img_list = sorted(str(img) for img in (PNG/'images').glob('*.png'))
# Pair each image tile with the mask tile of the same file name. The mask path
# is built from the `gt` folder directly: a plain str.replace('images', 'gt')
# would also rewrite any other occurrence of 'images' in the absolute path.
mask_list = [str(PNG/'gt'/Path(fn).name) for fn in img_list]

In [15]:
# One row per tile; the scalar `is_val` value is applied to every row.
train_data = dict(image=img_list, mask=mask_list, is_val=False)
train_df_inria = pd.DataFrame(train_data)

In [29]:
def get_location(path):
    """
    Input: '/'-separated path to a tile
    Returns the part of the file name that precedes the first '.' and the
    first '_', e.g. '.../austin10_01_01.png' -> 'austin10'
    """
    file_name = path.rsplit("/", 1)[-1]
    stem = file_name.partition(".")[0]
    return stem.partition("_")[0]

In [32]:
# Sanity check of get_location on a sample tile path.
get_location('/home/ubuntu/data/mapai/external/inria_aerialimagelabeling/AerialImageDataset/train/png/images/austin10_01_01.png')

'austin10'

In [33]:
# Derive the location label of every tile from its image path.
train_df_inria['location'] = train_df_inria['image'].map(get_location)

In [34]:
# Preview the first rows of the tile index.
train_df_inria.head(n=5)

Unnamed: 0,image,mask,is_val,location
0,/home/ubuntu/data/mapai/external/inria_aeriali...,/home/ubuntu/data/mapai/external/inria_aeriali...,False,austin10
1,/home/ubuntu/data/mapai/external/inria_aeriali...,/home/ubuntu/data/mapai/external/inria_aeriali...,False,austin10
2,/home/ubuntu/data/mapai/external/inria_aeriali...,/home/ubuntu/data/mapai/external/inria_aeriali...,False,austin10
3,/home/ubuntu/data/mapai/external/inria_aeriali...,/home/ubuntu/data/mapai/external/inria_aeriali...,False,austin10
4,/home/ubuntu/data/mapai/external/inria_aeriali...,/home/ubuntu/data/mapai/external/inria_aeriali...,False,austin10


In [35]:
from datetime import date
# Current date; it is interpolated into the CSV file name when the index is saved.
today = date.today()

In [37]:
# Save the tile index; the file name carries the creation date.
csv_path = f'../csv/inria_aerial_image_dataset_train-{today}.csv'
train_df_inria.to_csv(csv_path, index=False)

# Inspect data

In [None]:
from plotting import plot_image_and_masks_from_df

In [23]:
from ipywidgets import interact, interactive, IntSlider, Select, RadioButtons, fixed, BoundedIntText

In [24]:
# Interactive browser over the tiles: pick a row index, a figure size and
# whether to show the segmentation.
# `plot_image` is not defined or imported anywhere in this notebook, so a fresh
# kernel raised NameError here; use the plotting helper that is imported above.
# NOTE(review): assumes plot_image_and_masks_from_df accepts the keyword
# arguments df, imgidx, figsize and with_segm - confirm against plotting.py.
interactive_plot = interactive(plot_image_and_masks_from_df, df=fixed(train_df_inria),
                               imgidx = BoundedIntText(min=0, max=len(train_df_inria)-1, step=1, value=0),
                               figsize = BoundedIntText(min=4, max=12, step=1, value=6),
                               with_segm= RadioButtons(options=[True,False], value=True, 
                                                      description="With segmentation"))

# The last child of the interactive container is its output area.
output = interactive_plot.children[-1]

In [25]:
# Render the widget.
interactive_plot

interactive(children=(BoundedIntText(value=0, description='imgidx', max=17999), BoundedIntText(value=6, descri…