In [None]:
import pandas as pd
import numpy as np

import platform
import matplotlib.pyplot as plt

%matplotlib inline

import platform

from matplotlib import font_manager, rc
# Draw negative numbers with an ASCII hyphen instead of the Unicode minus sign.
# This only needs to be set once; rc('font', ...) below does not touch it.
plt.rcParams['axes.unicode_minus'] = False

# Pick a font per operating system so that Korean (Hangul) labels can be drawn.
system_name = platform.system()
if system_name == 'Darwin':
    rc('font', family='AppleGothic')
elif system_name == 'Windows':
    # Resolve the family name from the Malgun Gothic font file.
    path = "c:/Windows/Fonts/malgun.ttf"
    font_name = font_manager.FontProperties(fname=path).get_name()
    rc('font', family=font_name)
else:
    # No font is configured on any other platform.
    print('Unknown system... sorry~~~~')

import folium
import json
import warnings
# Ignore every warning of category FutureWarning from this point on.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# Read the population workbook; header=1 takes the column names from the
# second spreadsheet row, i.e. the first row is skipped.
# NOTE(review): absolute, machine-specific path - consider a configurable data directory.
population = pd.read_excel('D:\\OneDrive-Jongki\\OneDrive - Illinois Institute of Technology\\Jongki-study\\05_Manual\\17_Python\\DataScience\\data\\05. population_raw_data.xlsx', header=1)
# population = pd.read_excel('D:\\OneDrive - Illinois Institute of Technology\\Jongki-study\\05_Manual\\17_Python\\DataScience\\data\\05. population_raw_data.xlsx', header=1)

# Forward-fill missing cells so each row carries the last value seen above it.
# `ffill()` is the direct replacement for `fillna(method='pad')`, whose
# `method` argument is deprecated in recent pandas releases.
population = population.ffill()

population = population.rename(columns={'행정구역(동읍면)별(1)':'Province', '행정구역(동읍면)별(2)':'City-level', '계':'Population'})

# Remove the '소계' (subtotal) rows of 'City-level'. The explicit copy makes
# the result an independent frame that can be modified safely afterwards.
population = population[population['City-level'] != '소계'].copy()

print(population)

In [None]:
# Work on an explicit copy so the label assignments below write to an
# independent frame. The former `population.is_copy = False` switch no longer
# exists in current pandas; assigning it there only attaches a plain attribute.
population = population.copy()

population = population.rename(columns={'항목': 'Category'})

# Translate the Korean category labels to English with label-based `.loc`
# assignment: select the rows where 'Category' equals the Korean label and
# overwrite that same column.
# Pattern: data.loc[data['column'] == 'something', 'column'] = 'this'
category_labels = {
    '총인구수 (명)': 'Total',
    '남자인구수 (명)': 'Male',
    '여자인구수 (명)': 'Female',
}
for korean_label, english_label in category_labels.items():
    population.loc[population['Category'] == korean_label, 'Category'] = english_label

print(population)

In [None]:
# Five-year age brackets that feed each aggregated column.
young_adult_cols = ['20 - 24세', '25 - 29세', '30 - 34세', '35 - 39세']
senior_cols = ['65 - 69세', '70 - 74세', '75 - 79세', '80 - 84세',
               '85 - 89세', '90 - 94세', '95 - 99세', '100+']

# Add the bracket columns left to right, starting from the first bracket.
population['Under 39'] = sum((population[col] for col in young_adult_cols[1:]),
                             population[young_adult_cols[0]])
population['Over 65'] = sum((population[col] for col in senior_cols[1:]),
                            population[senior_cols[0]])

print(population)

In [None]:
# Reshape with a pivot table:
#   rows    -> ('Province', 'City-level')
#   columns -> two levels: the value name ('Population', 'Under 39', 'Over 65')
#              on top and the 'Category' label underneath
pop = population.pivot_table(
    values=['Population', 'Under 39', 'Over 65'],
    index=['Province', 'City-level'],
    columns=['Category'],
)

print(pop)

In [None]:
# 'Shrunk rate' = female 'Under 39' population divided by half of the total
# 'Over 65' population. This ratio is the notebook's definition of a
# shrinking district.
young_women = pop[('Under 39', 'Female')]
half_of_seniors = pop[('Over 65', 'Total')] / 2
pop['Shrunk rate'] = young_women / half_of_seniors
print(pop)

In [None]:
# Flag a district as shrinking when its 'Shrunk rate' is below 1.0.
pop['Danger in shrunk'] = pop['Shrunk rate'].lt(1.0)
print(pop)

In [None]:
# Keep the rows flagged in 'Danger in shrunk' and print the second level of
# their index. The index is a MultiIndex: level 0 is 'Province', level 1 is
# 'City-level'; `.index.get_level_values(n)` returns the labels of level n.
danger_rows = pop.loc[pop['Danger in shrunk'] == True]
print(danger_rows.index.get_level_values(1))

In [None]:
# Move the ('Province', 'City-level') index levels back into ordinary columns,
# leaving a plain, non-hierarchical row index.
pop = pop.reset_index()
print(pop)

In [None]:
# Collapse the two-level column header into single strings by concatenating
# the upper and lower label of every column.
upper_labels = pop.columns.get_level_values(0)
lower_labels = pop.columns.get_level_values(1)
tmp_coloumns = [upper + lower for upper, lower in zip(upper_labels, lower_labels)]

# Replace the column index of 'pop' with the flattened names.
pop.columns = tmp_coloumns

print(pop)

In [None]:
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would only add a stray "None" line.
pop.info()

In [None]:
# List the distinct 'City-level' names.
city_level_names = pop['City-level'].unique()
print(city_level_names)

In [None]:
# The raw district names are ambiguous because of the administrative hierarchy
# (plus a few exceptions), so they are rewritten into IDs that fit the map.
# Rule: metropolitan/special city -> '<first two characters of province> <district>',
#       ordinary province         -> city/county name without its last character.

# Cities in ordinary provinces that are subdivided into districts ('구').
tmp_gu_dict = {
    '수원': ['장안구', '권선구', '팔달구', '영통구'],
    '성남': ['수정구', '중원구', '분당구'],
    '안양': ['만안구', '동안구'],
    '안산': ['상록구', '단원구'],
    '고양': ['덕양구', '일산동구', '일산서구'],
    '용인': ['처인구', '기흥구', '수지구'],
    '청주': ['상당구', '서원구', '흥덕구', '청원구'],
    '천안': ['동남구', '서북구'],
    '전주': ['완산구', '덕진구'],
    '포항': ['남구', '북구'],
    '창원': ['의창구', '성산구', '진해구', '마산합포구', '마산회원구'],
    '부천': ['오정구', '원미구', '소사구'],
}


def _build_city_id(province, district):
    """Return the map ID for one ('Province', 'City-level') pair."""
    if province[-3:] not in ['광역시', '특별시', '자치시']:
        # Ordinary province: drop the trailing '시'/'군' (e.g. '수원시' -> '수원').
        trimmed = district[:-1]
        # '고성' exists in two provinces, so it is disambiguated explicitly.
        if trimmed == '고성' and province == '강원도':
            label = '고성(강원)'
        elif trimmed == '고성' and province == '경상남도':
            label = '고성(경남)'
        else:
            label = trimmed

        # Districts of a subdivided city are prefixed with the city name.
        for parent_city, gu_names in tmp_gu_dict.items():
            if district not in gu_names:
                continue
            if len(district) == 2:
                label = parent_city + ' ' + district
            elif district in ['마산합포구', '마산회원구']:
                label = parent_city + ' ' + district[2:-1]
            else:
                label = parent_city + ' ' + trimmed
        return label

    # Sejong is the one exception: its ID is fixed.
    if province == '세종특별자치시':
        return '세종'

    # Metropolitan/special city: first two characters of the province, a
    # space, then the district (kept whole when it is only two characters).
    if len(district) == 2:
        return province[:2] + ' ' + district
    return province[:2] + ' ' + district[:-1]


city_name = [_build_city_id(pop['Province'][n], pop['City-level'][n]) for n in pop.index]

In [None]:
# Store the generated names as the 'ID' column.
pop = pop.assign(ID=city_name)

In [None]:
# Drop the columns that this study does not use.
unused_columns = ['Under 39Male', 'Over 65Male', 'Over 65Female']
pop = pop.drop(columns=unused_columns)

print(pop)

In [None]:
# Block-map layout based on PinkWink's Excel file: it gives the position of
# each district, each one holding around 100 k of population.
draw_korea_raw_path = 'D:\\OneDrive-Jongki\\OneDrive - Illinois Institute of Technology\\Jongki-study\\05_Manual\\17_Python\\DataScience\\data\\05. draw_korea_raw.xlsx'
# draw_korea_raw = pd.read_excel('D:\\OneDrive - Illinois Institute of Technology\\Jongki-study\\05_Manual\\17_Python\\DataScience\\data\\05. draw_korea_raw.xlsx')
draw_korea_raw = pd.read_excel(draw_korea_raw_path)

print(draw_korea_raw)

In [None]:
# Turn the grid into a long table: stack the columns into rows, move the
# (row, column) index into regular columns, and name them 'y', 'x' and 'ID'.
draw_korea_raw_stacked = (
    draw_korea_raw.stack()
    .to_frame()
    .reset_index()
    .rename(columns={'level_0': 'y', 'level_1': 'x', 0: 'ID'})
)

print(draw_korea_raw_stacked)

In [None]:
# 'draw_korea' is a second name for the stacked layout table (plain assignment, not a copy).
draw_korea = draw_korea_raw_stacked

In [None]:
# Border polylines drawn on top of the block map. Each entry is a list of
# (y, x) grid points; the plotting code unpacks them with `ys, xs = zip(*path)`.
# The trailing comments are the region labels from the original notebook,
# translated to English.
BORDER_LINES = [
    [(5, 1), (5,2), (7,2), (7,3), (11,3), (11,0)], # Incheon
    [(5,4), (5,5), (2,5), (2,7), (4,7), (4,9), (7,9), 
     (7,7), (9,7), (9,5), (10,5), (10,4), (5,4)], # Seoul
    [(1,7), (1,8), (3,8), (3,10), (10,10), (10,7), 
     (12,7), (12,6), (11,6), (11,5), (12, 5), (12,4), 
     (11,4), (11,3)], # Gyeonggi-do
    [(8,10), (8,11), (6,11), (6,12)], # Gangwon-do
    [(12,5), (13,5), (13,4), (14,4), (14,5), (15,5), 
     (15,4), (16,4), (16,2)], # Chungcheongbuk-do (NOTE(review): same label as a later entry - confirm which region this is)
    [(16,4), (17,4), (17,5), (16,5), (16,6), (19,6), 
     (19,5), (20,5), (20,4), (21,4), (21,3), (19,3), (19,1)], # Jeollabuk-do
    [(13,5), (13,6), (16,6)], # Daejeon
    [(13,5), (14,5)], # Sejong
    [(21,2), (21,3), (22,3), (22,4), (24,4), (24,2), (21,2)], # Gwangju
    [(20,5), (21,5), (21,6), (23,6)], # Jeollanam-do
    [(10,8), (12,8), (12,9), (14,9), (14,8), (16,8), (16,6)], # Chungcheongbuk-do
    [(14,9), (14,11), (14,12), (13,12), (13,13)], # Gyeongsangbuk-do
    [(15,8), (17,8), (17,10), (16,10), (16,11), (14,11)], # Daegu
    [(17,9), (18,9), (18,8), (19,8), (19,9), (20,9), (20,10), (21,10)], # Busan
    [(16,11), (16,13)], # Ulsan
#     [(9,14), (9,15)],  (disabled segment, unlabeled in the original)
    [(27,5), (27,6), (25,6)],
]

In [None]:
plt.figure(figsize=(8, 11))

# Write the name of every district at the centre of its grid cell.
for _, cell in draw_korea.iterrows():
    name_parts = cell['ID'].split()

    # Districts of metropolitan cities often share names (e.g. Jung-gu,
    # Seo-gu), so the city name is shown as well, on its own line.
    if len(name_parts) == 2:
        dispname = '\n'.join(name_parts)
    elif cell['ID'].startswith('고성'):
        dispname = '고성'
    else:
        dispname = cell['ID']

    # Names of three or more characters (e.g. Seodaemun-gu, Seogwipo-si)
    # are drawn with a smaller font.
    last_line = dispname.splitlines()[-1]
    if len(last_line) < 3:
        fontsize, linespacing = 11, 1.2
    else:
        fontsize, linespacing = 9.5, 1.5

    plt.annotate(
        dispname,
        (cell['x'] + 0.5, cell['y'] + 0.5),
        weight='bold',
        fontsize=fontsize,
        ha='center',
        va='center',
        linespacing=linespacing,
    )

# Draw the province / metropolitan-city borders.
for border in BORDER_LINES:
    border_ys = [point[0] for point in border]
    border_xs = [point[1] for point in border]
    plt.plot(border_xs, border_ys, c='black', lw=1.5)

plt.gca().invert_yaxis()
#plt.gca().set_aspect(1)

plt.axis('off')

plt.tight_layout()
plt.show()

In [None]:
# List the distinct IDs generated for 'pop'.
pop_ids = pop['ID'].unique()
print(pop_ids)

In [None]:
# Block map of Korea coloured by one data column.
# NOTE(review): an identical `drawKorea` is defined again in a later cell of
# this notebook; after a full run the later definition is the one in effect.
def drawKorea(targetData, blockedMap, cmapname):
    """Draw the block map, colouring every cell by ``blockedMap[targetData]``.

    Parameters
    ----------
    targetData : column label
        Column of ``blockedMap`` that supplies the colour values; it is also
        used as the colour-bar label.
    blockedMap : pandas.DataFrame
        Table with one row per map cell. The columns 'x', 'y', 'ID' and
        ``targetData`` are read.
    cmapname
        Colormap handed to ``plt.pcolor`` as ``cmap``.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # Evaluate the data range once; it drives both the colour scale and the
    # label-colour threshold.
    values = blockedMap[targetData]
    vmin = min(values)
    vmax = max(values)

    # Cells whose value lies above the lowest quarter of the range get white
    # labels; all other cells get black labels.
    whitelabelmin = (vmax - vmin) * 0.25 + vmin

    # Arrange the values on the (y, x) grid and mask the empty cells.
    mapdata = blockedMap.pivot_table(index='y', columns='x', values=targetData)
    masked_mapdata = np.ma.masked_where(np.isnan(mapdata), mapdata)

    plt.figure(figsize=(9, 11))
    plt.pcolor(masked_mapdata, vmin=vmin, vmax=vmax, cmap=cmapname, edgecolor='#aaaaaa', linewidth=0.5)

    # Write the district name at the centre of every cell.
    for idx, row in blockedMap.iterrows():
        name_parts = row['ID'].split()
        # Districts of metropolitan cities often share names (e.g. Jung-gu,
        # Seo-gu), so the city name is shown as well, on its own line.
        if len(name_parts) == 2:
            dispname = '{}\n{}'.format(name_parts[0], name_parts[1])
        elif row['ID'][:2] == '고성':
            dispname = '고성'
        else:
            dispname = row['ID']

        # Names of three or more characters (e.g. Seodaemun-gu, Seogwipo-si)
        # are drawn with a smaller font.
        if len(dispname.splitlines()[-1]) >= 3:
            fontsize, linespacing = 10.0, 1.1
        else:
            fontsize, linespacing = 11, 1

        annocolor = 'white' if row[targetData] > whitelabelmin else 'black'
        plt.annotate(dispname, (row['x']+0.5, row['y']+0.5), weight='bold', fontsize=fontsize, ha='center', va='center', color=annocolor, linespacing=linespacing)

    # Draw the province / metropolitan-city borders; points are (y, x).
    for path in BORDER_LINES:
        ys, xs = zip(*path)
        plt.plot(xs, ys, c='black', lw=2)

    plt.gca().invert_yaxis()

    plt.axis('off')

    cb = plt.colorbar(shrink=.1, aspect=10)
    cb.set_label(targetData)

    plt.tight_layout()
    plt.show()

In [None]:
# List the distinct IDs present in the block-map layout.
layout_ids = draw_korea['ID'].unique()
print(layout_ids)

In [None]:
# IDs that exist in 'pop' but have no cell in 'draw_korea'.
tmp_list = list(set(pop['ID'].unique()).difference(draw_korea['ID'].unique()))

# Keep only the rows whose 'ID' is not in that list.
pop = pop[~pop['ID'].isin(tmp_list)]

# Check: the difference should now be empty.
print(set(pop['ID'].unique()) - set(draw_korea['ID'].unique()))
# set()

In [None]:
# Attach the grid position of every district by joining on 'ID'.
# how='left' keeps the rows of 'pop'; choosing 'right' instead would merge
# and order the result based on 'draw_korea'.
pop = pop.merge(draw_korea, on='ID', how='left')

print(pop)

In [None]:
# Lay the data out as a grid: 'y' becomes the index, 'x' the columns, and the
# cell values come from 'PopulationTotal'.
mapdata = pd.pivot_table(pop, values='PopulationTotal', index='y', columns='x')

print(mapdata)

In [None]:
# Hide the empty grid cells: np.ma.masked_where(condition, a) returns 'a' as a
# masked array in which every position where 'condition' is True is masked.
# Here the condition marks the cells that hold NaN.
nan_cells = mapdata.isna()
masked_mapdata = np.ma.masked_where(nan_cells, mapdata)
print(masked_mapdata)

In [None]:
# Block map of Korea coloured by one data column.
# NOTE(review): an identical `drawKorea` is also defined in an earlier cell of
# this notebook; this later definition replaces it when the cells run in order.
def drawKorea(targetData, blockedMap, cmapname):
    """Draw the block map, colouring every cell by ``blockedMap[targetData]``.

    Parameters
    ----------
    targetData : column label
        Column of ``blockedMap`` that supplies the colour values; it is also
        used as the colour-bar label.
    blockedMap : pandas.DataFrame
        Table with one row per map cell. The columns 'x', 'y', 'ID' and
        ``targetData`` are read.
    cmapname
        Colormap handed to ``plt.pcolor`` as ``cmap``.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # Evaluate the data range once; it drives both the colour scale and the
    # label-colour threshold.
    values = blockedMap[targetData]
    vmin = min(values)
    vmax = max(values)

    # Cells whose value lies above the lowest quarter of the range get white
    # labels; all other cells get black labels.
    whitelabelmin = (vmax - vmin) * 0.25 + vmin

    # Arrange the values on the (y, x) grid and mask the empty cells.
    mapdata = blockedMap.pivot_table(index='y', columns='x', values=targetData)
    masked_mapdata = np.ma.masked_where(np.isnan(mapdata), mapdata)

    plt.figure(figsize=(9, 11))
    plt.pcolor(masked_mapdata, vmin=vmin, vmax=vmax, cmap=cmapname, edgecolor='#aaaaaa', linewidth=0.5)

    # Write the district name at the centre of every cell.
    for idx, row in blockedMap.iterrows():
        name_parts = row['ID'].split()
        # Districts of metropolitan cities often share names (e.g. Jung-gu,
        # Seo-gu), so the city name is shown as well, on its own line.
        if len(name_parts) == 2:
            dispname = '{}\n{}'.format(name_parts[0], name_parts[1])
        elif row['ID'][:2] == '고성':
            dispname = '고성'
        else:
            dispname = row['ID']

        # Names of three or more characters (e.g. Seodaemun-gu, Seogwipo-si)
        # are drawn with a smaller font.
        if len(dispname.splitlines()[-1]) >= 3:
            fontsize, linespacing = 10.0, 1.1
        else:
            fontsize, linespacing = 11, 1

        annocolor = 'white' if row[targetData] > whitelabelmin else 'black'
        plt.annotate(dispname, (row['x']+0.5, row['y']+0.5), weight='bold', fontsize=fontsize, ha='center', va='center', color=annocolor, linespacing=linespacing)

    # Draw the province / metropolitan-city borders; points are (y, x).
    for path in BORDER_LINES:
        ys, xs = zip(*path)
        plt.plot(xs, ys, c='black', lw=2)

    plt.gca().invert_yaxis()

    plt.axis('off')

    cb = plt.colorbar(shrink=.1, aspect=10)
    cb.set_label(targetData)

    plt.tight_layout()
    plt.show()

In [None]:
# Total population per district, on a blue colour scale.
drawKorea(targetData='PopulationTotal', blockedMap=pop, cmapname='Blues')

In [None]:
# Convert the flag to 1/0 so it can be used as a colour value, then draw it.
pop['Danger in shrunk'] = pop['Danger in shrunk'].map(lambda flagged: 1 if flagged else 0)
drawKorea(targetData='Danger in shrunk', blockedMap=pop, cmapname='Reds')

In [None]:
# Variant of the block map whose colour scale is symmetric around zero.
def drawKorea_1(targetData, blockedMap, cmapname, whitelabelmin=20.):
    """Draw the block map with a colour range of ``[-m, m]``.

    ``m`` is the larger of the absolute minimum and absolute maximum of
    ``blockedMap[targetData]``.

    Parameters
    ----------
    targetData : column label
        Column of ``blockedMap`` that supplies the colour values; it is also
        used as the colour-bar label.
    blockedMap : pandas.DataFrame
        Table with one row per map cell. The columns 'x', 'y', 'ID' and
        ``targetData`` are read.
    cmapname
        Colormap handed to ``plt.pcolor`` as ``cmap``.
    whitelabelmin : float, optional
        Labels of cells whose value is greater than this threshold are drawn
        in white, all others in black. Defaults to 20., the value that was
        previously fixed inside the function.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # Evaluate the extremes once and build the symmetric colour range.
    values = blockedMap[targetData]
    tmp_max = max([np.abs(min(values)), np.abs(max(values))])
    vmin, vmax = -tmp_max, tmp_max

    # Arrange the values on the (y, x) grid and mask the empty cells.
    mapdata = blockedMap.pivot_table(index='y', columns='x', values=targetData)
    masked_mapdata = np.ma.masked_where(np.isnan(mapdata), mapdata)

    plt.figure(figsize=(9, 11))
    plt.pcolor(masked_mapdata, vmin=vmin, vmax=vmax, cmap=cmapname, edgecolor='#aaaaaa', linewidth=0.5)

    # Write the district name at the centre of every cell.
    for idx, row in blockedMap.iterrows():
        name_parts = row['ID'].split()
        # Districts of metropolitan cities often share names (e.g. Jung-gu,
        # Seo-gu), so the city name is shown as well, on its own line.
        if len(name_parts) == 2:
            dispname = '{}\n{}'.format(name_parts[0], name_parts[1])
        elif row['ID'][:2] == '고성':
            dispname = '고성'
        else:
            dispname = row['ID']

        # Names of three or more characters (e.g. Seodaemun-gu, Seogwipo-si)
        # are drawn with a smaller font.
        if len(dispname.splitlines()[-1]) >= 3:
            fontsize, linespacing = 10.0, 1.1
        else:
            fontsize, linespacing = 11, 1

        annocolor = 'white' if row[targetData] > whitelabelmin else 'black'
        plt.annotate(dispname, (row['x']+0.5, row['y']+0.5), weight='bold', fontsize=fontsize, ha='center', va='center', color=annocolor, linespacing=linespacing)

    # Draw the province / metropolitan-city borders; points are (y, x).
    for path in BORDER_LINES:
        ys, xs = zip(*path)
        plt.plot(xs, ys, c='black', lw=2)

    plt.gca().invert_yaxis()

    plt.axis('off')

    cb = plt.colorbar(shrink=.1, aspect=10)
    cb.set_label(targetData)

    plt.tight_layout()
    plt.show()

In [None]:
# Preview the first five rows of the merged table.
pop_preview = pop.head(5)
print(pop_preview)

In [None]:
# Female share of the total population, as percentage points above or below 50 %.
female_share = pop['PopulationFemale'] / pop['PopulationTotal']
pop['Female ratio'] = (female_share - 0.5) * 100
drawKorea_1(targetData='Female ratio', blockedMap=pop, cmapname='RdBu')

In [None]:
# Female share within the 'Under 39' group, as percentage points above or below 50 %.
young_female_share = pop['Under 39Female'] / pop['Under 39Total']
pop['Female ratio 2030'] = (young_female_share - 0.5) * 100
drawKorea_1(targetData='Female ratio 2030', blockedMap=pop, cmapname='RdBu')

In [None]:
# Index the table by 'ID' so values can be looked up by district ID.
pop_folium = pop.set_index(keys='ID')
pop_folium_preview = pop_folium.head()
print(pop_folium_preview)

In [None]:
# Load the municipality boundaries (GeoJSON). The `with` block closes the file
# as soon as it has been parsed.
# NOTE(review): absolute, machine-specific path - consider a configurable data directory.
geo_path = 'D:\\OneDrive-Jongki\\OneDrive - Illinois Institute of Technology\\Jongki-study\\05_Manual\\17_Python\\DataScience\\data\\05. skorea_municipalities_geo_simple.json'
with open(geo_path, encoding='utf-8') as geo_file:
    geo_str = json.load(geo_file)

# NOTE(review): the name `map` shadows the builtin map(); it is kept because
# other cells may refer to it.
map = folium.Map(location=[36.2002, 127.054], zoom_start=7)

# `folium.Choropleth(...).add_to(map)` replaces the deprecated
# `Map.choropleth` method. `data` is a Series indexed by 'ID', so its index is
# matched against `feature.id`; `columns` applies only to DataFrame input and
# is therefore not passed.
folium.Choropleth(
    geo_data=geo_str,
    data=pop_folium['PopulationTotal'],
    fill_color='YlGnBu',
    key_on='feature.id',
).add_to(map)

map

In [None]:
# Load the municipality boundaries (GeoJSON). The `with` block closes the file
# as soon as it has been parsed.
# NOTE(review): absolute, machine-specific path - consider a configurable data directory.
geo_path = 'D:\\OneDrive-Jongki\\OneDrive - Illinois Institute of Technology\\Jongki-study\\05_Manual\\17_Python\\DataScience\\data\\05. skorea_municipalities_geo_simple.json'
with open(geo_path, encoding='utf-8') as geo_file:
    geo_str = json.load(geo_file)

# NOTE(review): the name `map` shadows the builtin map(); it is kept because
# other cells may refer to it.
map = folium.Map(location=[36.2002, 127.054], zoom_start=7)

# `folium.Choropleth(...).add_to(map)` replaces the deprecated
# `Map.choropleth` method. `data` is a Series indexed by 'ID', so its index is
# matched against `feature.id`; `columns` applies only to DataFrame input and
# is therefore not passed.
folium.Choropleth(
    geo_data=geo_str,
    data=pop_folium['Danger in shrunk'],
    fill_color='PuRd',
    key_on='feature.id',
).add_to(map)

map