# 3. Classify images with road (entire South Korea)
---
Last edited: 04/30/2022 by Jeongkyung Won<br>
This code uses **all** 1970 NGII maps to classify images with road

In [3]:
# Common imports
import numpy as np
import os
import pandas as pd
import cv2
import time

import shutil
from pathlib import Path
import os.path

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

from tensorflow.data import AUTOTUNE

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
"""
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)
"""

# random seed
my_seed = 42

# to make this notebook's output stable across runs
np.random.seed(my_seed)

## 3.1. Load Data
---

#### 3.1.1. Labelled Images  

In [4]:
import requests
import zipfile
from urllib.parse import unquote # for url string conversion
import re
from tqdm import tqdm

def download_dropbox(url, folder):
  ## version 2.0, last modified by Hyunjoo Yang (hyang@sogang.ac.kr) on Jan. 14 2022
  ## This function downloads dropbox shareable link to a local folder (tested for file downloading, but not for shared folder!)
  # url: dropbox shareable link for downloading
  # folder: where to download
  
  headers = {'user-agent': 'Wget/1.16 (linux-gnu)'}
  r = requests.get(url, stream=True, headers=headers)
  
  # convert dropbox shareable link ("dl=0" to "dl=1")
  url = url.replace("?dl=0", "?dl=1")

  # check if the url returns valid status code (200)
  if r.status_code == 200:
    print('The url is valid.')

    # grab filename from the url, using regular expressions (and replace space to "_" )
    file_name = unquote(re.search(r'\/([^\/]+\.([\w]+))\?dl=([01])$', url).group(1)).replace(" ", "_")

    folder_n_fname = os.path.join(folder, file_name)

    # download
    print('Begin downloading < {} >'.format(folder_n_fname))


    # Total size in bytes.
    total_size = int(r.headers.get('content-length', 0))
    block_size = 1024  # 1 Kibibyte
    t = tqdm(total=total_size, unit='B', unit_scale=True)
    with open(file_name, 'wb') as f:
      for data in r.iter_content(block_size):
        t.update(len(data))
        f.write(data)
    t.close()

    print('Finished downloading < {} >'.format(folder_n_fname))

    # unzip if zip file
    if file_name.endswith('.zip'):
      print('Extracting zip file...')
      with zipfile.ZipFile(file_name,"r") as zip_ref:
        zip_ref.extractall(folder)
    else:
      if not os.path.exists(folder):
        os.makedirs(folder)
      shutil.move(file_name, folder_n_fname)

    print('Done!')
  else:
    raise ValueError('Nothing to download: dropbox link is not valid. Check the URL link again')

In [5]:
## dropbox shareable url link

url = 'https://www.dropbox.com/s/wjssbvx18pkcu2q/clipped_maps_2km_labelled.zip?dl=0' # NGII_1970_clipped_maps_2km_labelled

# file path for download and unzip
labelled_path = './labelled_images'

# download dropbox shared file and unzip it
download_dropbox(url, labelled_path)

The url is valid.
Begin downloading < ./labelled_images/clipped_maps_2km_labelled.zip >


100%|██████████| 210M/210M [00:03<00:00, 66.7MB/s]


Finished downloading < ./labelled_images/clipped_maps_2km_labelled.zip >
Extracting zip file...
Done!


In [6]:
w_road_dr=Path('./labelled_images/with_road')
wo_road_dr=Path('./labelled_images/without_road')

In [7]:
w_road=pd.Series(list(w_road_dr.glob(r'*.png')),name='filepath').astype(str)
w_road_df=w_road.to_frame()
w_road_df

Unnamed: 0,filepath
0,labelled_images/with_road/13775.png
1,labelled_images/with_road/13248.png
2,labelled_images/with_road/13426.png
3,labelled_images/with_road/12377.png
4,labelled_images/with_road/12950.png
...,...
495,labelled_images/with_road/9899.png
496,labelled_images/with_road/14671.png
497,labelled_images/with_road/15143.png
498,labelled_images/with_road/12623.png


In [8]:
wo_road=pd.Series(list(wo_road_dr.glob(r'*.png')),name='filepath').astype(str)
wo_road_df=wo_road.to_frame()
wo_road_df

Unnamed: 0,filepath
0,labelled_images/without_road/22047.png
1,labelled_images/without_road/22372.png
2,labelled_images/without_road/190.png
3,labelled_images/without_road/8447.png
4,labelled_images/without_road/5576.png
...,...
495,labelled_images/without_road/226.png
496,labelled_images/without_road/23144.png
497,labelled_images/without_road/21839.png
498,labelled_images/without_road/11668.png


In [9]:
# Add Labels indicating whether the image contains any road
w_road_df['road']=1
wo_road_df['road']=0

In [10]:
train_images=pd.concat([w_road_df,wo_road_df],axis=0).sample(frac=1.0,random_state=1).reset_index(drop=True)

# Add an indicator for training dataset  
train_images['train']=1
train_images

Unnamed: 0,filepath,road,train
0,labelled_images/without_road/11305.png,0,1
1,labelled_images/without_road/202.png,0,1
2,labelled_images/with_road/13772.png,1,1
3,labelled_images/with_road/14626.png,1,1
4,labelled_images/with_road/13180.png,1,1
...,...,...,...
995,labelled_images/without_road/10408.png,0,1
996,labelled_images/with_road/12130.png,1,1
997,labelled_images/without_road/21864.png,0,1
998,labelled_images/with_road/14842.png,1,1


In [11]:
# add a column for grid id (gid)
train_images_list=train_images['filepath']
train_images_path=train_images_list.reset_index(drop=True)

train_images_id=pd.Series(train_images_path.apply(lambda x: os.path.splitext(os.path.split(x)[1])[0]), name='id').astype(str)
train_images_id

0      11305
1        202
2      13772
3      14626
4      13180
       ...  
995    10408
996    12130
997    21864
998    14842
999    11708
Name: id, Length: 1000, dtype: object

In [12]:
train_gid=pd.concat([train_images_path, train_images_id],axis=1).sample(frac=1.0,random_state=1).reset_index(drop=True)
train_gid

Unnamed: 0,filepath,id
0,labelled_images/without_road/22950.png,22950
1,labelled_images/without_road/196.png,196
2,labelled_images/without_road/11453.png,11453
3,labelled_images/with_road/13826.png,13826
4,labelled_images/with_road/14220.png,14220
...,...,...
995,labelled_images/without_road/9695.png,9695
996,labelled_images/with_road/13429.png,13429
997,labelled_images/with_road/12565.png,12565
998,labelled_images/without_road/11448.png,11448


In [13]:
# merge two dataset 
train_df=pd.merge(train_images,train_gid, left_on='filepath', right_on='filepath', indicator=True, how='outer')
train_df=train_df.drop(columns=['_merge'])
train_df

Unnamed: 0,filepath,road,train,id
0,labelled_images/without_road/11305.png,0,1,11305
1,labelled_images/without_road/202.png,0,1,202
2,labelled_images/with_road/13772.png,1,1,13772
3,labelled_images/with_road/14626.png,1,1,14626
4,labelled_images/with_road/13180.png,1,1,13180
...,...,...,...,...
995,labelled_images/without_road/10408.png,0,1,10408
996,labelled_images/with_road/12130.png,1,1,12130
997,labelled_images/without_road/21864.png,0,1,21864
998,labelled_images/with_road/14842.png,1,1,14842


#### 3.1.2. All Images



In [14]:
## dropbox shareable url link

url = 'https://www.dropbox.com/s/wcsr3qnoqx17njo/black_to_transparent_2km_all_pruned.zip?dl=0' # NGII_1970_clipped_maps_2km_all

# file path for download and unzip
all_images_path = './all_images'

# download dropbox shared file and unzip it
download_dropbox(url, all_images_path)

The url is valid.
Begin downloading < ./all_images/black_to_transparent_2km_all_pruned.zip >


100%|██████████| 6.81G/6.81G [01:51<00:00, 61.0MB/s]


Finished downloading < ./all_images/black_to_transparent_2km_all_pruned.zip >
Extracting zip file...
Done!


In [15]:
image_dr=Path('./all_images/black_to_transparent_2km_all_pruned/')

In [16]:
# Collect All Map Images in the Drive
images_path=pd.Series(list(image_dr.glob(r'*.png')),name='filepath').astype(str)
images_path[:1000]

0      all_images/black_to_transparent_2km_all_pruned...
1      all_images/black_to_transparent_2km_all_pruned...
2      all_images/black_to_transparent_2km_all_pruned...
3      all_images/black_to_transparent_2km_all_pruned...
4      all_images/black_to_transparent_2km_all_pruned...
                             ...                        
995    all_images/black_to_transparent_2km_all_pruned...
996    all_images/black_to_transparent_2km_all_pruned...
997    all_images/black_to_transparent_2km_all_pruned...
998    all_images/black_to_transparent_2km_all_pruned...
999    all_images/black_to_transparent_2km_all_pruned...
Name: filepath, Length: 1000, dtype: object

In [17]:
images_id=pd.Series(images_path.apply(lambda x: os.path.splitext(os.path.split(x)[1])[0]), name='id').astype(str)
images_id[:1000]

0      27243
1      27331
2      19646
3      10872
4      19837
       ...  
995    25452
996    14124
997    22001
998     3246
999     9237
Name: id, Length: 1000, dtype: object

In [18]:
images_df=pd.concat([images_path, images_id], axis=1).sample(frac=1.0,random_state=1).reset_index(drop=True)
images_df

Unnamed: 0,filepath,id
0,all_images/black_to_transparent_2km_all_pruned...,17610
1,all_images/black_to_transparent_2km_all_pruned...,12238
2,all_images/black_to_transparent_2km_all_pruned...,14652
3,all_images/black_to_transparent_2km_all_pruned...,22836
4,all_images/black_to_transparent_2km_all_pruned...,18041
...,...,...
25444,all_images/black_to_transparent_2km_all_pruned...,6510
25445,all_images/black_to_transparent_2km_all_pruned...,929
25446,all_images/black_to_transparent_2km_all_pruned...,10375
25447,all_images/black_to_transparent_2km_all_pruned...,882


#### 3.1.3. Merge Two Dataframe

In [19]:
all_maps=pd.merge(train_df,images_df, left_on='id', right_on='id', indicator=True, how='outer')

#Clean Columns
all_maps=all_maps.drop(columns=['filepath_x', '_merge']).reset_index(drop=True)
all_maps.rename(columns={'filepath_y': 'filepath'},inplace=True)

#Replace values
all_maps.loc[all_maps.train!=1,'train']=0
## set missing values equal to 2 (test generator requires the target var to be integer)
all_maps.loc[(all_maps['road']!=0)&(all_maps['road']!=1),'road']=2 

all_maps['road']=all_maps['road'].astype(str)
all_maps['filepath']=all_maps['filepath'].astype(str)

all_maps

Unnamed: 0,road,train,id,filepath
0,0.0,1.0,11305,all_images/black_to_transparent_2km_all_pruned...
1,0.0,1.0,202,all_images/black_to_transparent_2km_all_pruned...
2,1.0,1.0,13772,all_images/black_to_transparent_2km_all_pruned...
3,1.0,1.0,14626,all_images/black_to_transparent_2km_all_pruned...
4,1.0,1.0,13180,all_images/black_to_transparent_2km_all_pruned...
...,...,...,...,...
25484,2.0,0.0,6510,all_images/black_to_transparent_2km_all_pruned...
25485,2.0,0.0,929,all_images/black_to_transparent_2km_all_pruned...
25486,2.0,0.0,10375,all_images/black_to_transparent_2km_all_pruned...
25487,2.0,0.0,882,all_images/black_to_transparent_2km_all_pruned...


## 3.2. Predict all maps 
___

#### 3.2.1. Load the saved model 

In [20]:
#load the saved model from 2_classify_images_w_road.ipynb
url = 'https://www.dropbox.com/s/em95r6e7lfnsssv/best_model.zip?dl=0' 

# file path for download and unzip
model_path = './best_model'

# download dropbox shared file and unzip it
download_dropbox(url, model_path)

The url is valid.
Begin downloading < ./best_model/best_model.zip >


100%|██████████| 162M/162M [00:05<00:00, 30.7MB/s]


Finished downloading < ./best_model/best_model.zip >
Extracting zip file...
Done!


In [21]:
# load the model using keras
from keras.models import load_model

saved_model=load_model('./best_model/best_model.h5')

#### 3.2.2. Predict all maps

In [22]:
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

In [23]:
test_generator = tf.keras.preprocessing.image.ImageDataGenerator(
  rescale = 1./255
)

image_size = (200,200)
target_var = 'road' # 0: without road, 1: with road, 2: no labels

test_images = test_generator.flow_from_dataframe(
  dataframe=all_maps, 
  x_col='filepath',
  y_col=target_var,
  target_size=image_size,
  color_mode='rgb',
  class_mode='categorical',
  batch_size=16,
  shuffle=False,
)

Found 25449 validated image filenames belonging to 3 classes.


  .format(n_invalid, x_col)


In [35]:
for data_batch, labels_batch in test_images:
    print('배치 데이터 크기:', data_batch.shape)
    print('배치 레이블 크기:', labels_batch.shape)
    break

배치 데이터 크기: (16, 200, 200, 3)
배치 레이블 크기: (16, 3)


In [24]:
y_cnn_pred_class = saved_model.predict(test_images)

y_cnn_pred_class[:20] # this displays probabilities of each image to be in either class

array([[7.5610781e-01, 2.4389215e-01],
       [8.4310389e-01, 1.5689613e-01],
       [7.1557008e-02, 9.2844301e-01],
       [5.6560896e-02, 9.4343913e-01],
       [5.4805223e-03, 9.9451953e-01],
       [9.1745031e-01, 8.2549646e-02],
       [3.5082044e-03, 9.9649173e-01],
       [7.0160592e-01, 2.9839411e-01],
       [8.2000244e-01, 1.7999756e-01],
       [5.3772773e-04, 9.9946231e-01],
       [1.6417620e-01, 8.3582377e-01],
       [1.7873490e-02, 9.8212653e-01],
       [9.9385852e-01, 6.1414437e-03],
       [4.2549949e-03, 9.9574500e-01],
       [7.4860173e-01, 2.5139827e-01],
       [3.8408738e-02, 9.6159130e-01],
       [5.3990144e-02, 9.4600987e-01],
       [4.2925581e-02, 9.5707440e-01],
       [9.3250197e-01, 6.7497998e-02],
       [1.3311109e-01, 8.6688888e-01]], dtype=float32)

#### 3.2.3. Tabulate the result

In [25]:
filepaths=all_maps['filepath'].to_numpy()
labels=all_maps['road'].to_numpy()

In [26]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
road_labels = le.fit_transform(labels)

road_predicted = le.inverse_transform(np.argmax(y_cnn_pred_class, axis=-1)) # convert to 0,1
road_predicted[:20]

array(['0.0', '0.0', '1.0', '1.0', '1.0', '0.0', '1.0', '0.0', '0.0',
       '1.0', '1.0', '1.0', '0.0', '1.0', '0.0', '1.0', '1.0', '1.0',
       '0.0', '1.0'], dtype=object)

In [27]:
all_maps_pred=pd.DataFrame(data=[filepaths, road_labels, road_predicted]).T
all_maps_pred.rename(columns={0: 'filepath', 1: 'road', 2: 'pred_road'},inplace=True)
all_maps_pred

Unnamed: 0,filepath,road,pred_road
0,all_images/black_to_transparent_2km_all_pruned...,0,0.0
1,all_images/black_to_transparent_2km_all_pruned...,0,0.0
2,all_images/black_to_transparent_2km_all_pruned...,1,1.0
3,all_images/black_to_transparent_2km_all_pruned...,1,1.0
4,all_images/black_to_transparent_2km_all_pruned...,1,1.0
...,...,...,...
25484,all_images/black_to_transparent_2km_all_pruned...,2,
25485,all_images/black_to_transparent_2km_all_pruned...,2,
25486,all_images/black_to_transparent_2km_all_pruned...,2,
25487,all_images/black_to_transparent_2km_all_pruned...,2,


In [42]:
# probabilites of each image to have any road in it (rounded to 3-degit decimals)
prob=np.around(y_cnn_pred_class[:,1],decimals=3)

all_maps_pred['prob_road']=pd.Series(prob)
all_maps_pred

Unnamed: 0,filepath,road,pred_road,prob_road
0,all_images/black_to_transparent_2km_all_pruned...,0,0.0,0.244
1,all_images/black_to_transparent_2km_all_pruned...,0,0.0,0.157
2,all_images/black_to_transparent_2km_all_pruned...,1,1.0,0.928
3,all_images/black_to_transparent_2km_all_pruned...,1,1.0,0.943
4,all_images/black_to_transparent_2km_all_pruned...,1,1.0,0.995
...,...,...,...,...
25484,all_images/black_to_transparent_2km_all_pruned...,2,,
25485,all_images/black_to_transparent_2km_all_pruned...,2,,
25486,all_images/black_to_transparent_2km_all_pruned...,2,,
25487,all_images/black_to_transparent_2km_all_pruned...,2,,


In [41]:
final_df=pd.merge(all_maps_pred, all_maps, left_on='filepath', right_on='filepath', indicator=False, how='inner')

#Clean Columns
final_df=final_df.drop(columns=['road_y']).reset_index(drop=True)
final_df.rename(columns={'id': 'grid_id', 'road_x': 'road'},inplace=True)
final_df = final_df.iloc[:, [0,5,4,1,2,3]] 

#Retreive Missing values 
#final_df.loc[final_df['road']==2,'road']=np.nan

final_df

Unnamed: 0,filepath,grid_id,train,road,pred_road,prob_road
0,all_images/black_to_transparent_2km_all_pruned...,11305,1.0,0,0.0,0.244
1,all_images/black_to_transparent_2km_all_pruned...,202,1.0,0,0.0,0.157
2,all_images/black_to_transparent_2km_all_pruned...,13772,1.0,1,1.0,0.928
3,all_images/black_to_transparent_2km_all_pruned...,14626,1.0,1,1.0,0.943
4,all_images/black_to_transparent_2km_all_pruned...,13180,1.0,1,1.0,0.995
...,...,...,...,...,...,...
27044,all_images/black_to_transparent_2km_all_pruned...,6510,0.0,,,
27045,all_images/black_to_transparent_2km_all_pruned...,929,0.0,,,
27046,all_images/black_to_transparent_2km_all_pruned...,10375,0.0,,,
27047,all_images/black_to_transparent_2km_all_pruned...,882,0.0,,,


In [None]:
""" 
Save the final df as csv
from google.colab import drive
drive.mount('/content/drive')

final_df.to_csv('/content/drive/MyDrive/Maps/final_df.csv')

"""

## 3.3. Visualize the prediction results
___

In [None]:
#load the NGII 1970 population grid 2km
url = 'https://www.dropbox.com/s/rkosd9lbyy8biua/Population_grid.zip?dl=0' 

# file path for download and unzip
model_path = './pop_grid'

# download dropbox shared file and unzip it
download_dropbox(url, model_path)

The url is valid.
Begin downloading < ./pop_grid/Population_grid.zip >


100%|██████████| 738k/738k [00:01<00:00, 715kB/s] 

Finished downloading < ./pop_grid/Population_grid.zip >
Extracting zip file...
Done!





In [None]:
pip install geopandas

In [None]:
import geopandas as gpd

In [None]:
dir_shp="./pop_grid/population_grid_combined_2km.shp"

pop_grid_2km=gpd.read_file(dir_shp, encoding='euc-kr') # NGII
pop_grid_2km.head(3)

Unnamed: 0,id,r2_val_r,geometry
0,1,60,"POLYGON ((746000.000 2002000.000, 748000.000 2..."
1,2,0,"POLYGON ((746000.000 2000000.000, 748000.000 2..."
2,3,0,"POLYGON ((746000.000 1998000.000, 748000.000 1..."


In [None]:
#Make dtypes of the key variable the same
final_df['grid_id']=final_df['grid_id'].astype(str)
pop_grid_2km['id']=pop_grid_2km['id'].astype(str)

#Merge Two dataframe (final df + population grid)
pop_grid_w_road_info=pd.merge(final_df, pop_grid_2km, left_on='grid_id', right_on='id', indicator=False, how='inner')

#Clean Columns
pop_grid_w_road_info=pop_grid_w_road_info.drop(columns=['id']).reset_index(drop=True)
pop_grid_w_road_info.rename(columns={'r2_val_r':'pop_2016'},inplace=True)
pop_grid_w_road_info

Unnamed: 0,filepath,grid_id,train,road,pred_road,prob_road,pop_2016,geometry
0,all_images/145.png,145,1.0,0,0.0,0.032,0,"POLYGON ((844000.000 1624000.000, 846000.000 1..."
1,all_images/138.png,138,1.0,0,0.0,0.219,0,"POLYGON ((842000.000 1646000.000, 844000.000 1..."
2,all_images/12565.png,12565,1.0,1,1.0,0.998,498,"POLYGON ((1002000.000 1916000.000, 1004000.000..."
3,all_images/9703.png,9703,1.0,1,1.0,0.819,679,"POLYGON ((976000.000 1950000.000, 978000.000 1..."
4,all_images/10437.png,10437,1.0,1,1.0,1.000,3569,"POLYGON ((982000.000 1824000.000, 984000.000 1..."
...,...,...,...,...,...,...,...,...
16167,all_images/17350.png,17350,0.0,,0.0,0.001,16,"POLYGON ((1048000.000 1808000.000, 1050000.000..."
16168,all_images/17907.png,17907,0.0,,1.0,0.788,127,"POLYGON ((1054000.000 1900000.000, 1056000.000..."
16169,all_images/1234.png,1234,0.0,,0.0,0.084,28,"POLYGON ((888000.000 1594000.000, 890000.000 1..."
16170,all_images/24349.png,24349,0.0,,0.0,0.041,0,"POLYGON ((1120000.000 1896000.000, 1122000.000..."


In [None]:
# Save the population grid as shp

A=pop_grid_w_road_info.apply(pd.to_numeric,errors='ignore')

gdf=gpd.GeoDataFrame(A)
gdf.set_geometry(col='geometry',inplace=True)

gdf.to_file('/content/drive/MyDrive/Maps/pop_grid_w_road_info.shp', encoding='euc-kr')