# 1.4. Construct labelled dataset
---
Last edited: 05/08/2022 by Jeongkyung Won<br>
This code does: 
* load all 2㎢ clipped map images and the corresponding grid-level dataset 
* construct two dataframes and merge them using the grid id: </br> 
* randomly sample 1,000 images by each class of the dummy and save them in the local environment.

### 1.4.1. Load Data
___
I will load 1)clipped map images dataset 2) stata dta file that contains information on grid-level population, landcover, and whether the grid intersects with national road in 2020.

In [1]:
# Define Dropbox Function

import requests
import zipfile
from urllib.parse import unquote # for url string conversion
import re
from tqdm import tqdm

def download_dropbox(url, folder):
  ## version 2.0, last modified by Hyunjoo Yang (hyang@sogang.ac.kr) on Jan. 14 2022
  ## This function downloads dropbox shareable link to a local folder (tested for file downloading, but not for shared folder!)
  
  # url: dropbox shareable link for downloading
  # folder: where to download
  
  headers = {'user-agent': 'Wget/1.16 (linux-gnu)'}
  r = requests.get(url, stream=True, headers=headers)
  
  # convert dropbox shareable link ("dl=0" to "dl=1")
  url = url.replace("?dl=0", "?dl=1")

  # check if the url returns valid status code (200)
  if r.status_code == 200:
    print('The url is valid.')

    # grab filename from the url, using regular expressions (and replace space to "_" )
    file_name = unquote(re.search(r'\/([^\/]+\.([\w]+))\?dl=([01])$', url).group(1)).replace(" ", "_")

    folder_n_fname = os.path.join(folder, file_name)

    # download
    print('Begin downloading < {} >'.format(folder_n_fname))


    # Total size in bytes.
    total_size = int(r.headers.get('content-length', 0))
    block_size = 1024  # 1 Kibibyte
    t = tqdm(total=total_size, unit='B', unit_scale=True)
    with open(file_name, 'wb') as f:
      for data in r.iter_content(block_size):
        t.update(len(data))
        f.write(data)
    t.close()

    print('Finished downloading < {} >'.format(folder_n_fname))

    # unzip if zip file
    if file_name.endswith('.zip'):
      print('Extracting zip file...')
      with zipfile.ZipFile(file_name,"r") as zip_ref:
        zip_ref.extractall(folder)
    else:
      if not os.path.exists(folder):
        os.makedirs(folder)
      shutil.move(file_name, folder_n_fname)

    print('Done!')
  else:
    raise ValueError('Nothing to download: dropbox link is not valid. Check the URL link again')

In [2]:
## dropbox shareable url links

import os

url_png = 'https://www.dropbox.com/s/wcsr3qnoqx17njo/black_to_transparent_2km_all_pruned.zip?dl=0' # NGII_1970_clipped_maps_2km_pruned
url_dta = 'https://www.dropbox.com/s/3qmrr6xcjz2j0ps/NGII_2km_pop_grid_16_intersect_natl_road.zip?dl=0' # NGII_2km_pop_grid_16_natl_road.dta

# file paths for download and unzip

png_path = './all_maps'
dta_path = './dta'

# download dropbox shared file and unzip it

download_dropbox(url_png, png_path)
download_dropbox(url_dta, dta_path)


The url is valid.
Begin downloading < ./all_maps/black_to_transparent_2km_all_pruned.zip >


100%|██████████| 6.81G/6.81G [13:12<00:00, 8.60MB/s]


Finished downloading < ./all_maps/black_to_transparent_2km_all_pruned.zip >
Extracting zip file...
Done!
The url is valid.
Begin downloading < ./dta/NGII_2km_pop_grid_16_intersect_natl_road.zip >


100%|██████████| 112k/112k [00:00<00:00, 321kB/s] 

Finished downloading < ./dta/NGII_2km_pop_grid_16_intersect_natl_road.zip >
Extracting zip file...
Done!





### 1.4.2. Construct two dataframes and merge them
---
 *df1: filepaths of map images, and their grid id </br>
 *df2: grid-level id, population, landcover, and dummy variable for whether the grid intersects national road </br>

In [3]:
# Collect All Map Images (df1)

from pathlib import Path
import os.path
import pandas as pd
import glob
import numpy as np
import shutil

image_dr=Path('./all_maps/black_to_transparent_2km_all_pruned/')

images_path=pd.Series(list(image_dr.glob(r'*.png')),name='filepath').astype(str)
len(images_path)

25449

In [4]:
# To add map id info
images_id=pd.Series(images_path.apply(lambda x: os.path.splitext(os.path.split(x)[1])[0]), name='id').astype(int)
images_id[:1000]

0      18525
1       5622
2       4151
3      23973
4       2710
       ...  
995    27793
996    18897
997    24447
998    25663
999    11529
Name: id, Length: 1000, dtype: int64

In [5]:
df1=pd.concat([images_path, images_id],axis=1).sample(frac=1.0,random_state=1).reset_index(drop=True)
df1

Unnamed: 0,filepath,id
0,all_maps/black_to_transparent_2km_all_pruned/7...,7009
1,all_maps/black_to_transparent_2km_all_pruned/1...,1500
2,all_maps/black_to_transparent_2km_all_pruned/2...,23977
3,all_maps/black_to_transparent_2km_all_pruned/1...,18659
4,all_maps/black_to_transparent_2km_all_pruned/6...,6281
...,...,...
25444,all_maps/black_to_transparent_2km_all_pruned/2...,22690
25445,all_maps/black_to_transparent_2km_all_pruned/1...,18965
25446,all_maps/black_to_transparent_2km_all_pruned/1...,14568
25447,all_maps/black_to_transparent_2km_all_pruned/2...,20671


In [6]:
# Read the stata .dta file (df2)

dta_dr='./dta/NGII_2km_pop_grid_16_intersect_natl_road.dta'

df2=pd.read_stata(dta_dr)
df2

Unnamed: 0,gid,grid_pop_16,landcover,w_nat_road
0,1,60,,0.0
1,2,0,,0.0
2,3,0,,0.0
3,4,0,,0.0
4,5,413,,0.0
...,...,...,...,...
27977,27978,7,Water,0.0
27978,27979,0,Water,0.0
27979,27980,10,Water,0.0
27980,27981,0,,0.0


In [7]:
# Merge two dataframes

df=pd.merge(df1, df2, left_on='id', right_on='gid', indicator=False, how='inner')
df["id"]=df["id"].astype(str)
df

Unnamed: 0,filepath,id,gid,grid_pop_16,landcover,w_nat_road
0,all_maps/black_to_transparent_2km_all_pruned/7...,7009,7009,14,Forest,0.0
1,all_maps/black_to_transparent_2km_all_pruned/1...,1500,1500,0,Water,0.0
2,all_maps/black_to_transparent_2km_all_pruned/2...,23977,23977,0,Water,0.0
3,all_maps/black_to_transparent_2km_all_pruned/1...,18659,18659,48,Forest,0.0
4,all_maps/black_to_transparent_2km_all_pruned/6...,6281,6281,142,Forest,0.0
...,...,...,...,...,...,...
25444,all_maps/black_to_transparent_2km_all_pruned/2...,22690,22690,428,Forest,0.0
25445,all_maps/black_to_transparent_2km_all_pruned/1...,18965,18965,21,Forest,0.0
25446,all_maps/black_to_transparent_2km_all_pruned/1...,14568,14568,23,Forest,0.0
25447,all_maps/black_to_transparent_2km_all_pruned/2...,20671,20671,88,Water,1.0


In [19]:
df.to_csv('./all_maps_w_road_info.csv')

### 1.4.3. Random sampling
---
After sampling 1,000 images by road indicator, save them in the local environment and manually check if their labels are correct. 

In [8]:
df["w_nat_road"].value_counts()

0.0    16612
1.0     8837
Name: w_nat_road, dtype: int64

In [9]:
df_wo_road=df.drop(df[df.w_nat_road==0].index)
wo_road=df_wo_road.sample(n=1000)

df_w_road=df.drop(df[df.w_nat_road==1].index)
w_road=df_w_road.sample(n=1000)

In [10]:
wo_road_id=list(wo_road['id'])
w_road_id=list(w_road['id'])

print(wo_road_id[:3],len(wo_road_id), w_road_id[:3], len(w_road_id))

['13561', '11090', '5821'] 1000 ['24484', '25502', '630'] 1000


In [16]:
# Set up directories & zipfile names (mount the drive)

dr_original = "./all_maps/black_to_transparent_2km_all_pruned/"

# where to move images w/ and w/o road
dr_wo_road = "./drive/MyDrive/Maps/labelled/wo_road/"
dr_w_road = "./drive/MyDrive/Maps/labelled/w_road/"

# name of zipfiles
name_zip_wo_road = "maps_wo_road" 
name_zip_w_road = "maps_w_road"

In [18]:
# wo_road

for png_id in wo_road_id :
    
    dr_png = dr_original + png_id + ".png"
    try : shutil.copy(dr_png, # original directory
                      dr_wo_road) # where to move
    except : continue
        
shutil.make_archive(name_zip_wo_road, # name of zipfile 
                    'zip', # compressing method
                    dr_wo_road) # folder to compress

'/content/maps_wo_road.zip'

In [17]:
# w_road

for png_id in w_road_id :
    
    dr_png = dr_original + png_id + ".png"
    try : shutil.copy(dr_png, # original directory
                      dr_w_road) # where to move
    except : continue
        
shutil.make_archive(name_zip_w_road, # name of zipfile 
                    'zip', # compressing method
                    dr_w_road) # folder to compress

'/content/maps_w_road.zip'