# 1.4. Construct training/test dataset
---
Last edited: 05/02/2022 by Jeongkyung Won<br>
This notebook:
* filters out, from the full set of map images, those covering areas that have always been natural
* randomly samples images and labels them to form the training dataset for the CNN
* keeps the remaining images as the pool for the test dataset


### 1.4.1. Load Data

In [1]:
# Define Dropbox Function

import requests
import zipfile
from urllib.parse import unquote # for url string conversion
import re
from tqdm import tqdm

def download_dropbox(url, folder):
  """Download a Dropbox shareable file link into `folder`, unzipping zip archives.

  Tested for single-file links, not for shared folders.

  Args:
    url: Dropbox shareable link ending in "?dl=0" or "?dl=1" whose last path
      segment is a file name with an extension.
    folder: Directory the file is saved into (created when missing). A zip
      archive is additionally extracted into this same directory.

  Raises:
    ValueError: If no file name can be parsed from `url`, or if the server
      does not answer with HTTP status code 200.
  """
  ## version 2.0, last modified by Hyunjoo Yang (hyang@sogang.ac.kr) on Jan. 14 2022

  # Imported locally so the function does not rely on a later cell importing `os`.
  import os

  # convert dropbox shareable link ("dl=0" to "dl=1") BEFORE sending the request,
  # otherwise the conversion has no effect on what is downloaded
  url = url.replace("?dl=0", "?dl=1")

  # grab filename from the url, using regular expressions (and replace space to "_" )
  # -- done before any network traffic so that a malformed link fails fast
  name_match = re.search(r'\/([^\/]+\.([\w]+))\?dl=([01])$', url)
  if name_match is None:
    raise ValueError('Cannot find a file name in the dropbox link. Check the URL link again')
  file_name = unquote(name_match.group(1)).replace(" ", "_")
  folder_n_fname = os.path.join(folder, file_name)

  headers = {'user-agent': 'Wget/1.16 (linux-gnu)'}

  # the context manager releases the streamed connection even if writing fails
  with requests.get(url, stream=True, headers=headers) as r:

    # check if the url returns valid status code (200)
    if r.status_code != 200:
      raise ValueError('Nothing to download: dropbox link is not valid. Check the URL link again')
    print('The url is valid.')

    # make sure the destination exists (an empty `folder` means the current directory)
    if folder:
      os.makedirs(folder, exist_ok=True)

    # download straight to the path reported in the messages below
    print('Begin downloading < {} >'.format(folder_n_fname))

    # Total size in bytes.
    total_size = int(r.headers.get('content-length', 0))
    block_size = 1024  # 1 Kibibyte
    with open(folder_n_fname, 'wb') as f, tqdm(total=total_size, unit='B', unit_scale=True) as t:
      for data in r.iter_content(block_size):
        t.update(len(data))
        f.write(data)

  print('Finished downloading < {} >'.format(folder_n_fname))

  # unzip if zip file (non-zip files are already in place inside `folder`)
  if file_name.endswith('.zip'):
    print('Extracting zip file...')
    with zipfile.ZipFile(folder_n_fname, "r") as zip_ref:
      zip_ref.extractall(folder)

  print('Done!')

In [18]:
## dropbox shareable url links

import os

url_png = 'https://www.dropbox.com/s/wcsr3qnoqx17njo/black_to_transparent_2km_all_pruned.zip?dl=0'  # NGII_1970_clipped_maps_2km_pruned
url_grid = 'https://www.dropbox.com/s/rkosd9lbyy8biua/pop_grid_2km_5179.zip?dl=0'  # cookie mold: NGII population grid 2km
url_lc = 'https://www.dropbox.com/s/1daydxy05s42v7r/mosaic_output.zip?dl=0'  # landcover classification raster file

# local folders each archive is downloaded into and unzipped in

png_path = './all_maps'
grid_path = './pop_grid'
lc_path = './landcover'

# fetch and unzip every archive in turn: maps, population grid, landcover raster

for _link, _dest in ((url_png, png_path), (url_grid, grid_path), (url_lc, lc_path)):
  download_dropbox(_link, _dest)

The url is valid.
Begin downloading < ./all_maps/black_to_transparent_2km_all_pruned.zip >


100%|██████████| 6.81G/6.81G [02:31<00:00, 44.9MB/s]


Finished downloading < ./all_maps/black_to_transparent_2km_all_pruned.zip >
Extracting zip file...
Done!


### 1.4.2. Filter out ever-natural areas from all maps

In [25]:
# Collect All Map Images

from pathlib import Path
import os.path
import pandas as pd
import glob
import numpy as np

# Folder that holds the unzipped map images
image_dr = Path('./all_maps/black_to_transparent_2km_all_pruned/')

# One string path per PNG file found directly inside that folder.
# NOTE(review): glob order is not guaranteed to be stable across machines --
# confirm whether any later seeded sampling relies on this order.
png_files = list(image_dr.glob(r'*.png'))
images_path = pd.Series(png_files, name='filepath').astype(str)
len(images_path)

25449

In [33]:
# Derive each map's integer id from its file name (e.g. ".../8813.png" -> 8813)
file_stems = images_path.apply(lambda p: os.path.splitext(os.path.basename(p))[0])
images_id = pd.Series(file_stems, name='id').astype(int)
images_id[:1000]

0       8813
1      10932
2      26153
3      20747
4      13855
       ...  
995    24350
996     8884
997     5324
998     2899
999    23902
Name: id, Length: 1000, dtype: int64

In [34]:
# Put file path and id side by side, then shuffle all rows with a fixed seed
# and renumber the index from 0
paths_and_ids = pd.concat([images_path, images_id], axis=1)
images_df = paths_and_ids.sample(frac=1.0, random_state=1).reset_index(drop=True)
images_df

Unnamed: 0,filepath,id
0,all_maps/black_to_transparent_2km_all_pruned/1...,14722
1,all_maps/black_to_transparent_2km_all_pruned/2...,27822
2,all_maps/black_to_transparent_2km_all_pruned/2...,23942
3,all_maps/black_to_transparent_2km_all_pruned/2...,21532
4,all_maps/black_to_transparent_2km_all_pruned/6...,6592
...,...,...
25444,all_maps/black_to_transparent_2km_all_pruned/5...,5262
25445,all_maps/black_to_transparent_2km_all_pruned/1...,16109
25446,all_maps/black_to_transparent_2km_all_pruned/1...,19627
25447,all_maps/black_to_transparent_2km_all_pruned/1...,11043


In [None]:
pip install geopandas
pip install rasterio
pip install rasterstats

In [9]:
import pandas as pd
import geopandas as gpd
import rasterstats
import rasterio as rio
from rasterio.crs import CRS
from rasterio.plot import show
from rasterio.merge import merge
import time

In [28]:
# Inputs: the population-grid shapefile and the landcover raster
dir_shp = "./pop_grid/population_grid_combined_2km.shp"
dir_raster = "./landcover/mosaic_output.tif"
# Folder that generated tables are written to
dir_out = "./drive/MyDrive/Maps/"

# Load the grid polygons, decoding the attribute table as EUC-KR
mask = gpd.read_file(dir_shp, encoding='euc-kr')
mask.head(3)

Unnamed: 0,id,r2_val_r,geometry
0,1,60,"POLYGON ((746000.000 2002000.000, 748000.000 2..."
1,2,0,"POLYGON ((746000.000 2000000.000, 748000.000 2..."
2,3,0,"POLYGON ((746000.000 1998000.000, 748000.000 1..."


In [None]:
# Run Zonal Statistics to obtain the dominant landcover class

# Open the raster inside a context manager so the dataset is closed as soon as
# band 1 and its affine transform have been read into memory.
with rio.open(dir_raster) as raster:
  raster_array = raster.read(1)
  affine = raster.transform

start_time = time.time()

# geojson_out=True returns one feature per polygon of `mask`, with the
# 'majority' statistic stored alongside the polygon's own properties
majority = rasterstats.zonal_stats(mask, raster_array,
                                   affine=affine, stats=['majority'], geojson_out=True)

# keep only the property dictionaries, one per polygon
majority_lc = [feature['properties'] for feature in majority]

df = pd.DataFrame(majority_lc)
df.to_csv(dir_out + 'pop_grid_2km_w_lc_v2.csv')
print("Running time : {}".format(time.time() - start_time)) # total time

In [29]:
# Reload the saved zonal-statistics table, keeping only the three listed columns
df = pd.read_csv(dir_out + 'pop_grid_2km_w_lc_v2.csv', usecols=['id', '_majority', 'r2_val_r'])
df

Unnamed: 0,id,r2_val_r,_majority
0,1,60,
1,2,0,
2,3,0,
3,4,0,
4,5,413,
...,...,...,...
27977,27978,7,7.0
27978,27979,0,7.0
27979,27980,10,7.0
27980,27981,0,0.0


In [35]:
# Attach the grid table's columns to every map image by matching on `id`;
# the inner join keeps only ids present in both tables
maps_w_lc = images_df.merge(df, how='inner', left_on='id', right_on='id', indicator=False)
maps_w_lc

Unnamed: 0,filepath,id,r2_val_r,_majority
0,all_maps/black_to_transparent_2km_all_pruned/1...,14722,171,3.0
1,all_maps/black_to_transparent_2km_all_pruned/2...,27822,170,3.0
2,all_maps/black_to_transparent_2km_all_pruned/2...,23942,177,3.0
3,all_maps/black_to_transparent_2km_all_pruned/2...,21532,93,3.0
4,all_maps/black_to_transparent_2km_all_pruned/6...,6592,63647,1.0
...,...,...,...,...
25444,all_maps/black_to_transparent_2km_all_pruned/5...,5262,60,3.0
25445,all_maps/black_to_transparent_2km_all_pruned/1...,16109,1118,3.0
25446,all_maps/black_to_transparent_2km_all_pruned/1...,19627,144,3.0
25447,all_maps/black_to_transparent_2km_all_pruned/1...,11043,170,3.0
