In [1]:
# Load the Colab Drive helper and mount Google Drive at /content/drive.
# The zipped dataset is copied into this mount by a later cell, so this
# cell has to run (and be authorised) first.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Create the training / testing / validation folder tree, each with an
# `input` (images) and `output` (masks) sub-folder.
# `-p` creates missing parent directories and does not fail when a directory
# already exists, so this cell can be re-run without "File exists" errors.
! mkdir -p building_segmentation/training/input building_segmentation/training/output
! mkdir -p building_segmentation/testing/input building_segmentation/testing/output
! mkdir -p building_segmentation/validation/input building_segmentation/validation/output

In [3]:
# code from https://github.com/BBarbosa/tflearn-image-recognition-toolkit/blob/4a0528dcfb206b1e45997f2fbc097aafacfa0fa0/scripts/html_link_parser.py

import argparse
import os
import re
from io import BytesIO
from urllib.parse import urljoin
from urllib.request import urlopen

from bs4 import BeautifulSoup
from PIL import Image
from skimage import io as skio


def html_url_parser(url, save_dir, show=False, wait=False):
    """
    Download every image linked from an HTML index page and store it as PNG.

    The page at `url` is fetched and parsed; every `<a href=...>` element
    except the first one is treated as a link to an image. The link at
    position N on the page (0-based) is saved as `img-N.png` in `save_dir`.
    Files that already exist are skipped, so an interrupted run can simply be
    started again. A link that cannot be downloaded or decoded is reported and
    skipped; it does not abort the run.

    Params:\n
    `url` - URL of the HTML index page listing the images\n
    `save_dir` - Directory to save extracted images (created if missing)\n
    `show` - Show each downloaded image\n
    `wait` - Prompt after each link; entering 'q' stops the run
    """

    # Make sure the destination exists; an empty string means "current directory".
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)

    # Close the HTTP connection as soon as the page has been read.
    with urlopen(url) as website:
        html = website.read()

    soup = BeautifulSoup(html, "html5lib")

    for image_id, link in enumerate(soup.find_all("a", href=True)):
        # The first link on the page is never downloaded
        # (presumably a navigation link rather than an image - TODO confirm).
        if image_id == 0:
            continue

        # Resolve relative hrefs against the index page; absolute hrefs are
        # returned unchanged by urljoin.
        img_url = urljoin(url, link["href"])
        target_path = os.path.join(save_dir, "img-%d.png" % image_id)
        # Write to a side file first so that an interrupted save never leaves a
        # truncated img-N.png that the next run would report as "skipped".
        partial_path = target_path + ".part"

        try:
            if not os.path.isfile(target_path):
                print("[INFO] Downloading image from URL:", img_url)
                with urlopen(img_url) as response:
                    payload = response.read()
                # Decode from an in-memory, seekable buffer instead of the
                # (non-seekable) HTTP response object.
                image = Image.open(BytesIO(payload))
                image.save(partial_path, "PNG")
                os.replace(partial_path, target_path)
                if show:
                    image.show()
            else:
                print("skipped")
        except KeyboardInterrupt:
            print("[EXCEPTION] Pressed 'Ctrl+C'")
            break
        except Exception as image_exception:
            # Best effort: report the failing link and move on to the next one.
            print("[EXCEPTION]", image_exception)
            continue
        finally:
            # Only present when the save did not complete.
            if os.path.exists(partial_path):
                os.remove(partial_path)

        if wait:
            key = input("[INFO] Press any key to continue ('q' to exit)... ")
            if key.lower() == "q":
                break


# ///////////////////////////////////////////////////
#                   Main method
# ///////////////////////////////////////////////////
if __name__ == "__main__":
    # Index pages of the `mass_buildings` dataset. The `sat/` pages feed the
    # `input` folders, the `map/` pages (the *_GT URLs) feed the `output` folders.
    # Earlier revisions of this cell pointed at the parallel `mass_roads/...`
    # pages on the same server instead.
    URL_TRAIN_IMG = "https://www.cs.toronto.edu/~vmnih/data/mass_buildings/train/sat/index.html"
    URL_TRAIN_GT = "https://www.cs.toronto.edu/~vmnih/data/mass_buildings/train/map/index.html"
    URL_VALID_IMG = "https://www.cs.toronto.edu/~vmnih/data/mass_buildings/valid/sat/index.html"
    URL_VALID_GT = "https://www.cs.toronto.edu/~vmnih/data/mass_buildings/valid/map/index.html"
    URL_TEST_IMG = "https://www.cs.toronto.edu/~vmnih/data/mass_buildings/test/sat/index.html"
    URL_TEST_GT = "https://www.cs.toronto.edu/~vmnih/data/mass_buildings/test/map/index.html"

    # (index page, destination folder) pairs, processed in this order:
    # training, then validation, then testing - images before masks.
    for index_url, target_dir in (
        (URL_TRAIN_IMG, "./building_segmentation/training/input/"),
        (URL_TRAIN_GT, "./building_segmentation/training/output/"),
        (URL_VALID_IMG, "./building_segmentation/validation/input/"),
        (URL_VALID_GT, "./building_segmentation/validation/output/"),
        (URL_TEST_IMG, "./building_segmentation/testing/input/"),
        (URL_TEST_GT, "./building_segmentation/testing/output/"),
    ):
        html_url_parser(url=index_url, save_dir=target_dir)

    print("[INFO] All done!")

[INFO] Downloading image from URL: http://www.cs.toronto.edu/~vmnih/data/mass_buildings/train/sat//22678930_15.tiff
[INFO] Downloading image from URL: http://www.cs.toronto.edu/~vmnih/data/mass_buildings/train/sat//22678945_15.tiff
[INFO] Downloading image from URL: http://www.cs.toronto.edu/~vmnih/data/mass_buildings/train/sat//22678960_15.tiff
[INFO] Downloading image from URL: http://www.cs.toronto.edu/~vmnih/data/mass_buildings/train/sat//22678975_15.tiff
[INFO] Downloading image from URL: http://www.cs.toronto.edu/~vmnih/data/mass_buildings/train/sat//22678990_15.tiff
[INFO] Downloading image from URL: http://www.cs.toronto.edu/~vmnih/data/mass_buildings/train/sat//22679005_15.tiff
[INFO] Downloading image from URL: http://www.cs.toronto.edu/~vmnih/data/mass_buildings/train/sat//22679020_15.tiff
[INFO] Downloading image from URL: http://www.cs.toronto.edu/~vmnih/data/mass_buildings/train/sat//22679035_15.tiff
[INFO] Downloading image from URL: http://www.cs.toronto.edu/~vmnih/data

In [4]:
# Report the total on-disk size of the downloaded dataset directory
# (-s: one summary line, -h: human-readable units).
! du -hs building_segmentation

694M	building_segmentation


In [None]:
import glob

# Location of the downloaded dataset on the Colab VM.
DATASET_ROOT = '/content/building_segmentation'


def list_dataset_files(split, role, root=DATASET_ROOT):
    """
    Return the sorted list of file paths in `<root>/<split>/<role>/`.

    `glob.glob` yields files in arbitrary order, so the result is sorted to
    make the listing deterministic. Images and masks are saved under the same
    `img-N.png` names, so two sorted listings of matching folders line up
    index by index.

    Params:\n
    `split` - Sub-folder of the dataset, e.g. 'training'\n
    `role` - 'input' (images) or 'output' (masks)\n
    `root` - Dataset root directory
    """
    return sorted(glob.glob('%s/%s/%s/*' % (root, split, role)))


all_testing_img = list_dataset_files('testing', 'input')
all_testing_mask = list_dataset_files('testing', 'output')
all_validation_img = list_dataset_files('validation', 'input')
all_validation_mask = list_dataset_files('validation', 'output')
all_training_img = list_dataset_files('training', 'input')
all_training_mask = list_dataset_files('training', 'output')

# Print the six file counts: testing, validation, training - images before masks.
for file_list in (
    all_testing_img,
    all_testing_mask,
    all_validation_img,
    all_validation_mask,
    all_training_img,
    all_training_mask,
):
    print(len(file_list))

# Every image needs a mask; flag a split whose counts differ (e.g. after a
# failed download) instead of letting it go unnoticed.
for split_name, images, masks in (
    ('testing', all_testing_img, all_testing_mask),
    ('validation', all_validation_img, all_validation_mask),
    ('training', all_training_img, all_training_mask),
):
    if len(images) != len(masks):
        print("[WARN] %s: %d images but %d masks" % (split_name, len(images), len(masks)))

In [5]:
# Bundle the whole dataset directory, recursively (-r), into one zip archive.
! zip -r building_segmentation.zip building_segmentation

  adding: building_segmentation/ (stored 0%)
  adding: building_segmentation/validation/ (stored 0%)
  adding: building_segmentation/validation/input/ (stored 0%)
  adding: building_segmentation/validation/input/img-3.png (deflated 0%)
  adding: building_segmentation/validation/input/img-1.png (deflated 0%)
  adding: building_segmentation/validation/input/img-2.png (deflated 0%)
  adding: building_segmentation/validation/output/ (stored 0%)
  adding: building_segmentation/validation/output/img-3.png (deflated 0%)
  adding: building_segmentation/validation/output/img-1.png (deflated 0%)
  adding: building_segmentation/validation/output/img-2.png (deflated 2%)
  adding: building_segmentation/testing/ (stored 0%)
  adding: building_segmentation/testing/input/ (stored 0%)
  adding: building_segmentation/testing/input/img-4.png (deflated 0%)
  adding: building_segmentation/testing/input/img-6.png (deflated 0%)
  adding: building_segmentation/testing/input/img-5.png (deflated 0%)
  adding: b

In [6]:
# Copy the archive into the Google Drive mount created in the first cell.
# 'My Drive' is quoted because the folder name contains a space.
! cp building_segmentation.zip /content/drive/'My Drive'/

In [7]:
# Report the size of the zip archive (-s: one summary line, -h: human-readable units).
! du -hs building_segmentation.zip

694M	building_segmentation.zip
