In [None]:
from ml_project.constants import paths,data

In [None]:
from ml_project.dataset_creation import utils

In [None]:
import numpy as np
import shapely.geometry

# Defining Variables

In [None]:
# Number of random candidate coordinates drawn per batch; this value is bound as the
# default `batch_size` argument of draw_points_in_countries when that function is defined.
batch_size = 1000 # number of random coordinates
# NOTE(review): `epochs` is not referenced by any cell visible in this notebook —
# presumably the number of batches to generate; confirm or remove.
epochs = 1

# Draw random points on European land

In [None]:
import geopandas as gpd
import pandas as pd
from shapely.geometry import Point

In [None]:
# Countries (matched against the Natural Earth `NAME_EN` column) that are removed
# from the European country sets built in the following cells.
excluded_countries = ['Iceland','Russia']

In [None]:
# Coarse (1:110m) Natural Earth country polygons
# https://www.naturalearthdata.com/downloads/110m-cultural-vectors/110m-admin-0-countries/
countries_110m = gpd.read_file(paths.NE_110M_COUNTRIES_SHP)
# keep the European countries, then filter out the explicitly excluded ones
european_110m = countries_110m.loc[countries_110m['CONTINENT'] == 'Europe']
drop_countries_mask = european_110m['NAME_EN'].isin(excluded_countries)
countries_eu_110m = european_110m.loc[~drop_countries_mask].copy()

In [None]:
# Detailed (1:10m) Natural Earth country polygons
# https://www.naturalearthdata.com/downloads/10m-cultural-vectors/10m-admin-0-countries/
countries_10m = gpd.read_file(paths.NE_10M_COUNTRIES_SHP)
# keep the European countries, then filter out the explicitly excluded ones
european_10m = countries_10m.loc[countries_10m['CONTINENT'] == 'Europe']
drop_countries_mask = european_10m['NAME_EN'].isin(excluded_countries)
countries_eu_10m = european_10m.loc[~drop_countries_mask].copy()

In [None]:
def draw_points_in_countries(rng,batch_size=batch_size,countries_df=countries_eu_110m):
    """Draw random lon/lat points and label the ones that fall on the given countries.

    Parameters
    ----------
    rng : object providing ``uniform(low, high, size)``
        Source of randomness, e.g. ``np.random.default_rng(seed)``.
    batch_size : int
        Number of points to draw.
    countries_df : GeoDataFrame
        Country polygons with a ``geometry`` and a ``NAME_EN`` column.

    Returns
    -------
    GeoDataFrame
        One row per point (CRS EPSG:4326) with the columns ``geometry``,
        ``in_eu`` (point lies within the union of all country polygons),
        ``country`` (``NAME_EN`` of the first matching country, NaN otherwise),
        ``id`` and ``batch_id`` (both continuing the numbering found in
        ``paths.FILE_POINTS`` when that file exists, starting at 0 otherwise).
    """
    global combined_countries_boundary, _combined_countries_source

    # draw random points (longitudes first, then latitudes, so a given seed
    # keeps producing the same coordinates)
    print('Drawing random points')
    lons = rng.uniform(low=data.LON_MIN,high=data.LON_MAX,size=batch_size)
    lats = rng.uniform(low=data.LAT_MIN,high=data.LAT_MAX,size=batch_size)
    points = [Point(x,y) for x,y in zip(lons,lats)]
    points_df = gpd.GeoDataFrame(geometry=points,crs='EPSG:4326')

    # check if they are on european land
    print('Checking if the points are on land')
    # The union is expensive, so it is cached in a module-level global. The cache
    # is only reused for the very same countries_df object; previously a union
    # computed for one frame was silently reused for every other frame.
    cache_is_valid = (
        'combined_countries_boundary' in globals()
        and globals().get('_combined_countries_source') is countries_df
    )
    if not cache_is_valid:
        country_geometries = countries_df['geometry']
        if hasattr(country_geometries,'union_all'):
            # newer geopandas: replacement for the deprecated unary_union
            combined_countries_boundary = country_geometries.union_all()
        else:
            combined_countries_boundary = country_geometries.unary_union
        _combined_countries_source = countries_df
    points_df['in_eu'] = points_df.within(combined_countries_boundary)

    # check in which country they are
    print('Assign a country to all points on land')
    # object dtype from the start: a float NaN column would have to be upcast
    # when the first country name is written into it
    points_df['country'] = pd.Series(np.nan,index=points_df.index,dtype=object)
    for _, country in countries_df.iterrows():
        unassigned = points_df.index[points_df['in_eu'] & points_df['country'].isnull()]
        if unassigned.empty:
            # every point on land already has a country
            break
        inside = points_df.loc[unassigned,'geometry'].within(country['geometry'])
        points_df.loc[unassigned[inside.to_numpy()],'country'] = country['NAME_EN']

    # load current max ids
    points_df_old = pd.read_csv(paths.FILE_POINTS) if paths.FILE_POINTS.is_file() else None
    if points_df_old is None or points_df_old.empty:
        # no previous points (missing file or header-only file): start at 0
        max_point_id = -1
        max_batch_id = -1
    else:
        max_point_id = points_df_old['id'].max()
        max_batch_id = points_df_old['batch_id'].max()

    # assign ids
    points_df['id'] = points_df.index + max_point_id + 1
    points_df['batch_id'] = max_batch_id + 1

    return points_df

In [None]:
# The generator is seeded with a fixed value (0), so each run of this cell starts
# from the same random state.
# NOTE(review): together with the "already downloaded" check below, a rerun after the
# samples were saved presumably yields no new tiles — consider varying the seed per batch.
points_df = draw_points_in_countries(np.random.default_rng(seed=0))

In [None]:
# preview the first five drawn points
points_df.head(5)

In [None]:
def plot_points(points_df,countries_df):
    """Plot the drawn points on top of the country polygons.

    Points with ``in_eu == False`` are drawn as faint black crosses; all points
    are additionally drawn coloured by their ``country`` value. The view is
    clipped to the lon/lat window defined in ``data``.
    """
    lon_window = (data.LON_MIN,data.LON_MAX)
    lat_window = (data.LAT_MIN,data.LAT_MAX)

    # country polygons as translucent background
    ax = countries_df.plot(column='MAPCOLOR7',alpha=0.3,figsize=(10,10))

    # points outside of the countries
    outside = points_df.loc[~points_df['in_eu']]
    outside.plot(ax=ax,marker='x',color='k',markersize=10,alpha=0.2)

    # all points, coloured by country
    points_df.plot(ax=ax,column='country',markersize=10)

    ax.set_xlim(*lon_window)
    ax.set_ylim(*lat_window)

In [None]:
# show the drawn points over the coarse (110m) country polygons
plot_points(points_df,countries_eu_110m)

# Check if the resulting Tile is already downloaded

In [None]:
import mercantile

In [None]:
def mark_new_tiles(points_df):
    """Add tile coordinates and a ``new_tile`` flag to ``points_df`` in place.

    For every point with ``in_eu == True`` the XYZ tile at zoom ``data.Z`` that
    contains the point is stored in ``tile_x``/``tile_y``/``tile_z`` (all other
    rows keep -1). ``new_tile`` is True for points on land whose tile is not yet
    listed in ``paths.FILE_SAMPLES``; if that file does not exist every point on
    land is flagged as new. Returns None.

    NOTE(review): two points of the same batch that fall into the same tile are
    both flagged as new; de-duplicate downstream if that matters.
    """
    tile_columns = ['tile_x','tile_y','tile_z']
    in_eu = points_df['in_eu']

    # calculate the corresponding tiles
    points_df[tile_columns] = -1
    for i in points_df.index[in_eu]:
        point = points_df.loc[i,'geometry']
        tile = mercantile.tile(lng=point.x,lat=point.y,zoom=data.Z)
        points_df.loc[i,'tile_x'] = tile.x
        points_df.loc[i,'tile_y'] = tile.y
        points_df.loc[i,'tile_z'] = tile.z

    # check if Tile exists already in the dataset
    points_df['new_tile'] = False
    if paths.FILE_SAMPLES.is_file():
        samples_df = pd.read_csv(paths.FILE_SAMPLES)
        # A tile is known only if its complete (x, y, z) triple is in the dataset.
        # Testing each column separately would treat a tile as known as soon as
        # its x, y and z each appear in *some* (possibly different) sample.
        known_tiles = set(zip(*(samples_df[col] for col in tile_columns)))
        candidates = points_df.loc[in_eu,tile_columns]
        is_new = pd.Series(
            [tile not in known_tiles
             for tile in candidates.itertuples(index=False,name=None)],
            index=candidates.index,
            dtype=bool,
        )
        points_df.loc[in_eu,'new_tile'] = is_new
    else:
        points_df.loc[in_eu,'new_tile'] = True

In [None]:
# adds the tile_x/tile_y/tile_z and new_tile columns to points_df in place
mark_new_tiles(points_df)

In [None]:
# inspect the points together with their tile columns
points_df

# Check if the new Tiles contain water

A tile is kept only if it contains some water but is not covered entirely by water.

In [None]:
def get_inner_bbox_in_outer(x_i,y_i,z_i,x_o,y_o,z_o):
    """Return the footprint of an inner tile in the pixel grid of an outer tile.

    ``(x_i, y_i, z_i)`` is the inner tile, ``(x_o, y_o, z_o)`` the outer tile.
    The outer tile is ``data.VECTOR_TILE_EXTENT`` pixels wide and high; the
    result is a shapely box covering the inner tile inside that square.

    NOTE(review): the y offset is measured from the outer tile's first row of
    sub-tiles, i.e. this assumes the vector tile's pixel origin matches the XYZ
    tile numbering (top-left) — confirm against the decoder used elsewhere.
    """
    # number of inner tiles along one edge of the outer tile
    # https://wiki.openstreetmap.org/wiki/Slippy_map_tilenames#Subtiles
    tiles_per_edge = 2**(z_i-z_o)
    # edge length of one inner tile in outer-tile pixels
    inner_extent = data.VECTOR_TILE_EXTENT/tiles_per_edge
    # column/row of the inner tile relative to the first sub-tile of the outer tile
    column = x_i - tiles_per_edge * x_o
    row = y_i - tiles_per_edge * y_o
    # first corner of the inner tile in pixel coordinates
    px_start = inner_extent * column
    py_start = inner_extent * row
    # opposite corner lies one inner tile edge further in both directions
    return shapely.geometry.box(
        px_start, py_start, px_start + inner_extent, py_start + inner_extent)

In [None]:
def get_accepted_samples_df(points_df):
    """Select the new tiles that contain water, but not only water.

    Adds the columns ``accepted_water`` and ``outer_tile_x/y/z`` to ``points_df``
    in place. For every row with ``new_tile == True`` the parent tile at zoom
    ``data.Z_OUTER`` is determined, its water geometry is requested from
    ``utils.get_water_df`` and the tile is accepted if that geometry overlaps
    the tile's footprint inside the outer tile.

    Returns
    -------
    DataFrame
        Copy of the ``id``, ``tile_x``, ``tile_y`` and ``tile_z`` columns of all
        accepted rows.

    Raises
    ------
    RuntimeError
        If no tile was accepted.
    """
    points_df['accepted_water'] = False
    points_df[['outer_tile_x','outer_tile_y','outer_tile_z']] = -1
    columns_to_copy = ['id','tile_x','tile_y','tile_z']
    z_o = data.Z_OUTER

    for i in points_df.index[points_df['new_tile']]:
        # inner Tile
        x_i = points_df.loc[i,'tile_x']
        y_i = points_df.loc[i,'tile_y']
        z_i = points_df.loc[i,'tile_z']
        # calculate outer Tile
        tile_o = mercantile.parent(x_i,y_i,z_i,zoom=z_o)
        x_o = tile_o.x
        y_o = tile_o.y
        points_df.loc[i,'outer_tile_x'] = x_o
        points_df.loc[i,'outer_tile_y'] = y_o
        points_df.loc[i,'outer_tile_z'] = z_o
        # get water information of the outer tile
        water_df = utils.get_water_df(x_o,y_o,z_o)
        if water_df is None:
            # No water in the whole outer tile, hence none in the inner tile:
            # the tile is rejected. (It used to stay in the returned samples
            # although its accepted_water flag was False.)
            continue
        # check if the water is partially in the inner Tile
        bbox_pixel = get_inner_bbox_in_outer(x_i,y_i,z_i,x_o,y_o,z_o)
        # presumably the first cell holds the complete water geometry of the
        # outer tile — verify against utils.get_water_df
        water_polygon = water_df.iloc[0,0]
        # overlaps() is False both when the tile has no water and when the tile
        # lies completely inside the water.
        # NOTE(review): it is also False when all water lies strictly inside the
        # tile footprint — confirm that this case should be rejected.
        if water_polygon.overlaps(bbox_pixel):
            points_df.loc[i,'accepted_water'] = True

    # the samples are exactly the rows that passed the water check
    samples_df = points_df.loc[points_df['accepted_water'],columns_to_copy].copy()

    if samples_df.empty:
        raise RuntimeError('No new and accepted samples were found and the samples_df is empty.')

    return samples_df

In [None]:
# raises a RuntimeError if no new tile passes the water check
samples_df = get_accepted_samples_df(points_df)

In [None]:
# inspect the points including the accepted_water and outer tile columns
points_df

In [None]:
# inspect the accepted samples (id and tile coordinates)
samples_df

In [None]:
def add_sample_geography(samples_df,countries_df):
    """Add the geographic description of every sample tile to ``samples_df`` in place.

    The columns ``geometry`` (tile bounding box in lon/lat), ``lon`` and ``lat``
    (centroid of that box) and ``country`` (``NAME_EN`` of the first polygon in
    ``countries_df`` that contains the centroid, NaN if there is none) are
    written, always in this order. Returns None.

    All four columns are created even if no centroid lies inside a country, so
    the column layout stays the same for every batch that is appended to the
    samples CSV.
    """
    geometries, lons, lats, country_names = [], [], [], []
    for i in samples_df.index:
        tile = mercantile.Tile(
            samples_df.loc[i,'tile_x'],
            samples_df.loc[i,'tile_y'],
            samples_df.loc[i,'tile_z'])
        # get bbox in geodetic coordinates
        bounds = mercantile.bounds(tile)
        bbox = shapely.geometry.box(bounds.west,bounds.south,bounds.east,bounds.north)
        geometries.append(bbox)
        # get center of tile in geodetic coordinates
        centroid = bbox.centroid
        lons.append(centroid.x)
        lats.append(centroid.y)
        # get country of the center point
        country_mask = countries_df.contains(centroid)
        if country_mask.any():
            country_names.append(countries_df.loc[country_mask,'NAME_EN'].iloc[0])
        else:
            country_names.append(np.nan)

    # one assignment per column instead of growing the frame cell by cell
    samples_df['geometry'] = pd.Series(geometries,index=samples_df.index,dtype=object)
    samples_df['lon'] = pd.Series(lons,index=samples_df.index,dtype=float)
    samples_df['lat'] = pd.Series(lats,index=samples_df.index,dtype=float)
    samples_df['country'] = pd.Series(country_names,index=samples_df.index,dtype=object)

In [None]:
# the detailed (10m) polygons are used to look up the country of each tile centre
add_sample_geography(samples_df,countries_eu_10m)

In [None]:
# inspect the samples including their geographic columns
samples_df

In [None]:
def plot_samples(samples_df,countries_df):
    """Scatter the sample centres (``lon``/``lat``) over the country polygons.

    The view is clipped to the lon/lat window defined in ``data``.
    """
    lon_window = (data.LON_MIN,data.LON_MAX)
    lat_window = (data.LAT_MIN,data.LAT_MAX)

    # country polygons as translucent background
    ax = countries_df.plot(column='MAPCOLOR7',alpha=0.3,figsize=(10,10))

    # centre points of the sample tiles
    sample_lons = samples_df['lon']
    sample_lats = samples_df['lat']
    ax.scatter(sample_lons,sample_lats)

    ax.set_xlim(*lon_window)
    ax.set_ylim(*lat_window)

In [None]:
# show the accepted samples over the coarse (110m) country polygons
plot_samples(samples_df,countries_eu_110m)

# Saving the CSV files

In [None]:
def save_points_df(points_df):
    """Write ``points_df`` to ``paths.FILE_POINTS``.

    If the file already exists the rows are appended without a header line,
    otherwise the file is created with a header. The index is never written.
    """
    target = paths.FILE_POINTS
    file_exists = target.is_file()
    write_mode = 'a' if file_exists else 'w'
    # the header is only needed when the file is created
    points_df.to_csv(target,mode=write_mode,header=not file_exists,index=False)

In [None]:
def save_samples_df(samples_df):
    """Write ``samples_df`` to ``paths.FILE_SAMPLES``.

    If the file already exists the rows are appended without a header line,
    otherwise the file is created with a header. The index is never written.
    """
    target = paths.FILE_SAMPLES
    file_exists = target.is_file()
    write_mode = 'a' if file_exists else 'w'
    # the header is only needed when the file is created
    samples_df.to_csv(target,mode=write_mode,header=not file_exists,index=False)

In [None]:
# persist this batch: both CSV files are appended to if they already exist
save_points_df(points_df)
save_samples_df(samples_df)

# Download the Image Pairs

In [None]:
from ml_project.constants import mapbox

In [None]:
def download_images(samples_df):
    """Download the satellite image and the mask image of every sample tile.

    For each row the tile coordinates are read from ``tile_x``/``tile_y``/``tile_z``;
    the satellite image is requested first, then the mask image. Target paths
    come from ``paths``, source URLs from ``mapbox``.
    """
    tile_columns = ('tile_x','tile_y','tile_z')
    for i in samples_df.index:
        x, y, z = (samples_df.loc[i,column] for column in tile_columns)
        # (target path, source url) pairs in download order: satellite, then mask
        downloads = (
            (paths.FILE_SATELLITE_IMAGE, mapbox.URL_SATELLITE),
            (paths.FILE_MASK_IMAGE, mapbox.URL_MASK),
        )
        for make_path, make_url in downloads:
            utils.download_file(file_path=make_path(x,y,z),
                                url=make_url(x,y,z))

In [None]:
# fetch the satellite and mask image for every accepted sample
download_images(samples_df)

In [None]:
from IPython.display import Image
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from PIL import Image

In [None]:
# https://stackoverflow.com/questions/50559000/how-do-i-make-2-images-appear-side-by-side-in-jupyter-notebook-ipython
def show_samples(samples_df):
    """Show the satellite image and the mask of every sample side by side.

    One figure with two panels (satellite left, mask right, no axis ticks) is
    created and shown per row of ``samples_df``.
    """
    tile_columns = ('tile_x','tile_y','tile_z')
    for i in samples_df.index:
        x, y, z = (samples_df.loc[i,column] for column in tile_columns)
        # read both images before the figure is created
        img_satellite = plt.imread(paths.FILE_SATELLITE_IMAGE(x,y,z),format=data.SATELLITE_FORMAT)
        img_mask = plt.imread(paths.FILE_MASK_IMAGE(x,y,z),format=data.MASK_FORMAT)
        fig, ax = plt.subplots(1,2,figsize=(8,4))
        # left panel: satellite image, right panel: mask
        for panel, image in zip(ax,(img_satellite,img_mask)):
            panel.imshow(image)
            panel.set_xticks([])
            panel.set_yticks([])
        plt.tight_layout()
        plt.show()

In [None]:
# display each downloaded satellite/mask pair
show_samples(samples_df)