# What's in this Notebook?

This notebook contains the code that generates the dataset of random image pairs.  
It does not cover the inspection of that dataset.  

First, a function is defined for every step (in the order of operation).  
Then those functions are combined in a `for` loop at the end.

# Imports

In [None]:
from ml_project.constants import paths,data,mapbox
from ml_project.utils import files

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import requests

In [None]:
import geopandas as gpd
import shapely.geometry
from shapely.geometry import Point

In [None]:
import mercantile

In [None]:
# for progress bars
from tqdm.notebook import tqdm

# Define Variables

In [None]:
batch_size = 1000 # number of random coordinates drawn per batch
epochs = 10 # number of batches to generate

# Load the needed external data

In [None]:
def load_ne_datasets():
    """Load both European country datasets into module-level globals.

    Fills ``countries_eu_110m`` and ``countries_eu_10m`` through the
    project's ``files`` helpers so that the functions defined further down
    in this notebook can read them. Returns nothing.
    """
    global countries_eu_110m, countries_eu_10m
    countries_eu_110m, countries_eu_10m = (
        files.load_ne_countries_eu_110m(),
        files.load_ne_countries_eu_10m(),
    )

# Draw random points on european land

In [None]:
def draw_points_in_countries(rng,batch_size):
    """Draw random points and label the ones that fall on European land.

    Parameters
    ----------
    rng
        Random generator offering ``uniform(low=, high=, size=)``
        (presumably a numpy generator / random state -- confirm against
        ``files.load_random_state``).
    batch_size : int
        Number of points to draw.

    Returns
    -------
    geopandas.GeoDataFrame
        One row per drawn point (CRS ``EPSG:4326``) with the columns
        ``geometry``, ``in_eu``, ``country``, ``id`` and ``batch_id``.
        ``country`` stays NaN for points that match no country.
    """
    global countries_eu_110m,combined_countries_boundary
    # draw random points inside the configured lon/lat bounding box
    lons = rng.uniform(low=data.LON_MIN,high=data.LON_MAX,size=batch_size)
    lats = rng.uniform(low=data.LAT_MIN,high=data.LAT_MAX,size=batch_size)
    points = [Point(x,y) for x,y in zip(lons,lats)]
    points_df = gpd.GeoDataFrame(geometry=points,crs='EPSG:4326')

    # check if they are on european land
    # (the union of all countries is computed once and cached in a global)
    if 'combined_countries_boundary' not in globals():
        combined_countries_boundary = countries_eu_110m['geometry'].unary_union
    points_df['in_eu'] = points_df.within(combined_countries_boundary)

    # check in which country they are
    # object dtype: the column receives country names (strings); starting
    # from a float NaN column would force an implicit dtype upcast on the
    # first assignment, which newer pandas versions deprecate
    points_df['country'] = pd.Series(np.nan,index=points_df.index,dtype=object)
    for _, country in countries_eu_110m.iterrows():
        # only points on land that have no country yet are tested
        unassigned = points_df.index[points_df['in_eu'] & points_df['country'].isnull()]
        if unassigned.empty:
            break
        inside = points_df.loc[unassigned,'geometry'].within(country['geometry'])
        points_df.loc[inside.index[inside.to_numpy()],'country'] = country['NAME_EN']

    # load current max ids (ids start at 0 when no earlier point exists)
    max_point_id = -1
    max_batch_id = -1
    if paths.FILE_POINTS.is_file():
        points_df_old = pd.read_csv(paths.FILE_POINTS)
        # a file without data rows would yield NaN maxima and thus NaN ids
        if not points_df_old.empty:
            max_point_id = points_df_old['id'].max()
            max_batch_id = points_df_old['batch_id'].max()

    # assign ids
    points_df['id'] = points_df.index + max_point_id + 1
    points_df['batch_id'] = max_batch_id + 1

    return points_df

# Check if the resulting Tile is already downloaded

In [None]:
def mark_new_tiles(points_df):
    """Attach tile coordinates to every point and flag tiles not yet sampled.

    Modifies ``points_df`` in place and returns nothing. Added columns:

    * ``tile_x``, ``tile_y``, ``tile_z`` -- the tile at zoom ``data.Z`` that
      contains the point; ``-1`` for points with ``in_eu == False``.
    * ``new_tile`` -- True for points on land whose tile is not listed in
      the samples file ``paths.FILE_SAMPLES`` (or when that file does not
      exist yet); always False for points that are not on land.
    """
    tile_columns = ['tile_x','tile_y','tile_z']
    on_land = points_df['in_eu']

    # calculate the corresponding tiles
    for col in tile_columns:
        points_df[col] = -1
    for i in points_df.index[on_land]:
        point = points_df.loc[i,'geometry']
        tile = mercantile.tile(lng=point.x,lat=point.y,zoom=data.Z)
        points_df.loc[i,'tile_x'] = tile.x
        points_df.loc[i,'tile_y'] = tile.y
        points_df.loc[i,'tile_z'] = tile.z

    # check if Tile exists already in the dataset
    # A tile is identified by the whole (x, y, z) triple. Testing every
    # column on its own would treat a tile as known as soon as its x, its y
    # and its z each occur in *some* sample row, even in different rows.
    if paths.FILE_SAMPLES.is_file():
        samples_df = pd.read_csv(paths.FILE_SAMPLES)
        known_tiles = set(zip(*(samples_df[col] for col in tile_columns)))
    else:
        known_tiles = set()

    is_new = pd.Series(
        [tile not in known_tiles
         for tile in zip(*(points_df[col] for col in tile_columns))],
        index=points_df.index,
        dtype=bool)
    points_df['new_tile'] = on_land & is_new

# Check if the new Tiles contain water and are accepted

A tile is accepted only if it contains water, but not only water.

In [None]:
def get_inner_bbox_in_outer(x_i,y_i,z_i,x_o,y_o,z_o):
    """Return the inner tile's square in the pixel grid of the outer tile.

    ``x_i, y_i, z_i`` address the inner tile and ``x_o, y_o, z_o`` the outer
    vector tile. The result is a shapely box expressed in pixel coordinates
    of the outer tile, whose edge length is ``data.VECTOR_TILE_EXTENT``.
    """
    # number of inner tiles along one edge of the outer tile
    subdivisions = 2**(z_i-z_o)
    # edge length of one inner tile, in pixels of the outer vector tile
    inner_extent = data.VECTOR_TILE_EXTENT/subdivisions
    # Offset of the inner tile from the top-left inner-zoom tile of the
    # outer tile, scaled to pixels.
    # https://wiki.openstreetmap.org/wiki/Slippy_map_tilenames#Subtiles
    left = inner_extent * (x_i - subdivisions * x_o)
    top = inner_extent * (y_i - subdivisions * y_o)
    # opposite (bottom right) corner lies one inner tile further along both axes
    return shapely.geometry.box(left, top, left + inner_extent, top + inner_extent)

In [None]:
def is_enough_water_in_tile(water_polygon,bbox,z_i,z_o):
    """Tell whether the share of water inside ``bbox`` is within the limits.

    ``water_polygon`` and ``bbox`` are geometries in pixel coordinates of
    the outer tile (zoom ``z_o``); ``bbox`` covers one inner tile (zoom
    ``z_i``). Returns True when the water area divided by the inner tile
    area lies between ``data.WATER_RATIO_MIN`` and ``data.WATER_RATIO_MAX``
    (both inclusive), False otherwise -- also when there is no overlap.
    """
    # no overlap at all: nothing to measure
    if not water_polygon.intersects(bbox):
        return False

    water_area = water_polygon.intersection(bbox).area
    # edge length of the inner tile in pixels of the outer tile
    inner_extent = data.VECTOR_TILE_EXTENT/(2**(z_i-z_o))
    water_ratio = water_area/inner_extent**2
    return bool(data.WATER_RATIO_MIN <= water_ratio <= data.WATER_RATIO_MAX)

In [None]:
def get_accepted_samples_df(points_df,session=None):
    """Select the new tiles whose water share is acceptable.

    For every point flagged ``new_tile`` the enclosing outer tile (zoom
    ``data.Z_OUTER``) is determined, its water data is fetched through
    ``files.load_water_df`` and the water share inside the inner tile is
    checked with ``is_enough_water_in_tile``.

    ``points_df`` is modified in place: the columns ``accepted_water`` and
    ``outer_tile_x`` / ``outer_tile_y`` / ``outer_tile_z`` are added
    (``False`` / ``-1`` for points that were not checked).

    Parameters
    ----------
    points_df
        Points with the columns ``id``, ``tile_x``, ``tile_y``, ``tile_z``
        and ``new_tile``.
    session
        Passed on unchanged to ``files.load_water_df``.

    Returns
    -------
    pandas.DataFrame or None
        Copy of the ``id`` and tile columns of all accepted points, or None
        when no point was accepted.
    """
    points_df['accepted_water'] = False
    points_df[['outer_tile_x','outer_tile_y','outer_tile_z']] = -1
    columns_to_copy = ['id','tile_x','tile_y','tile_z']
    # the outer zoom level is the same for every point
    z_o = data.Z_OUTER
    accepted = []

    for i in tqdm(points_df.index[points_df['new_tile']], desc='Checking Point: ', leave=False):
        # inner Tile
        x_i = points_df.loc[i,'tile_x']
        y_i = points_df.loc[i,'tile_y']
        z_i = points_df.loc[i,'tile_z']
        # calculate outer Tile
        tile_o = mercantile.parent(x_i,y_i,z_i,zoom=z_o)
        x_o = tile_o.x
        y_o = tile_o.y
        points_df.loc[i,'outer_tile_x'] = x_o
        points_df.loc[i,'outer_tile_y'] = y_o
        points_df.loc[i,'outer_tile_z'] = z_o
        # get water information of the outer tile
        water_df = files.load_water_df(x_o,y_o,z_o,session=session)
        # no water data for the outer tile: reject the point
        if water_df is None:
            continue
        # check if enough water is in the inner Tile
        bbox_pixel = get_inner_bbox_in_outer(x_i,y_i,z_i,x_o,y_o,z_o)
        water_polygon = water_df.iloc[0,0]
        if is_enough_water_in_tile(water_polygon,bbox_pixel,z_i,z_o):
            points_df.loc[i,'accepted_water'] = True
            accepted.append(i)

    if not accepted:
        return None

    # the id and tile columns are not touched above, so selecting them now
    # gives the same rows as copying first and dropping rejected ones
    return points_df.loc[accepted,columns_to_copy].copy()

# Add additional information to the dataset

In [None]:
def add_sample_geography(samples_df):
    """Add geographic information for every sample tile, in place.

    For each row the tile given by ``tile_x`` / ``tile_y`` / ``tile_z`` is
    converted to geodetic coordinates and these columns are filled:

    * ``geometry`` -- bounding box of the tile (shapely box),
    * ``lon``, ``lat`` -- centroid of that bounding box,
    * ``country`` -- ``NAME_EN`` of the first entry of the global
      ``countries_eu_10m`` that contains the centroid; NaN when none does.

    Returns nothing.
    """
    global countries_eu_10m
    for i in samples_df.index:
        tile = mercantile.Tile(
            samples_df.loc[i,'tile_x'],
            samples_df.loc[i,'tile_y'],
            samples_df.loc[i,'tile_z'])
        # get bbox in geodetic coordinates
        bounds = mercantile.bounds(tile)
        bbox = shapely.geometry.box(bounds.west,bounds.south,bounds.east,bounds.north)
        samples_df.loc[i,'geometry'] = bbox
        # get center of tile in geodetic coordinates
        centroid = bbox.centroid
        samples_df.loc[i,'lon'] = centroid.x
        samples_df.loc[i,'lat'] = centroid.y
        # get country of the center point
        country_mask = countries_eu_10m.contains(centroid)
        if country_mask.any():
            samples_df.loc[i,'country'] = countries_eu_10m.loc[country_mask,'NAME_EN'].values[0]

    # When no tile centre lies inside any country the column was never
    # created above. Add it (as the last column, where it ends up otherwise)
    # so every batch has the same set of columns.
    if 'country' not in samples_df.columns:
        samples_df['country'] = np.nan

# Download the Image Pairs

In [None]:
def download_images(samples_df,session=None):
    """Download the satellite image and the mask image of every sample.

    For each row of ``samples_df`` the tile coordinates are read from
    ``tile_x`` / ``tile_y`` / ``tile_z``; the satellite image is fetched
    first, then the mask image, both through ``files.download_file``.
    ``session`` is handed to ``files.download_file`` unchanged.
    """
    # (target path builder, source url builder) per image, in download order
    image_kinds = (
        (paths.FILE_SATELLITE_IMAGE, mapbox.URL_SATELLITE),
        (paths.FILE_MASK_IMAGE, mapbox.URL_MASK),
    )
    for row in tqdm(samples_df.index, desc='Downloading Sample: ', leave=False):
        x = samples_df.loc[row,'tile_x']
        y = samples_df.loc[row,'tile_y']
        z = samples_df.loc[row,'tile_z']
        for build_path, build_url in image_kinds:
            files.download_file(file_path=build_path(x,y,z),
                                url=build_url(x,y,z),session=session)

# Put it all together and create or extend the dataset

In [None]:
# Load the country datasets and the random state once, then run `epochs`
# batches. Every batch draws random points, keeps the new and accepted tiles,
# downloads their images and finally saves the samples, the points and the
# random state.
load_ne_datasets()
rng = files.load_random_state(seed=42)

for epoch in tqdm(range(epochs), desc='Epoch: '):
    # 7 progress units per epoch; the else branch below reports the two
    # skipped steps at once so the bar always reaches its total
    bar = tqdm(total=7, desc='Step: ', leave=False)

    # one HTTP session per epoch; the context manager closes it (and its
    # pooled connections) once the network steps are done
    with requests.Session() as session:

        bar.set_postfix_str('draw random points');bar.update(n=1)
        points_df = draw_points_in_countries(rng, batch_size)

        bar.set_postfix_str('mark new tiles');bar.update(n=1)
        mark_new_tiles(points_df)

        bar.set_postfix_str('accept or reject samples');bar.update(n=1)
        samples_df = get_accepted_samples_df(points_df,session)

        if samples_df is not None:
            bar.set_postfix_str('add geography information');bar.update(n=1)
            add_sample_geography(samples_df)

            bar.set_postfix_str('download the images');bar.update(n=1)
            download_images(samples_df,session)

            print(f'Epoch Finished: {epoch} ; New Samples: {len(samples_df)} ; Points on land: {points_df["in_eu"].sum()}')
        else:
            bar.update(n=2)
            print(f'Epoch Finished: {epoch} ; No new and accepted samples')

    bar.set_postfix_str('save the data files');bar.update(n=1)
    if samples_df is not None:
        files.save_samples_df(samples_df)

    files.save_points_df(points_df)

    files.save_random_state(rng)
    bar.update(n=1)
    bar.close()