# Process population data

In [1]:
%load_ext autoreload
%autoreload 2
%cd D:\mobi-social-segregation-se

D:\mobi-social-segregation-se


In [2]:
# Load libs
from urllib.parse import quote_plus

import geopandas as gpd
import pandas as pd
import sqlalchemy

from lib import preprocess as preprocess

In [3]:
# Data location: database settings are read from the project's key manager.
db_keys = preprocess.keys_manager['database']
user = db_keys['user']
password = db_keys['password']
port = db_keys['port']
db_name = db_keys['name']
# Percent-encode the credentials before building the URL: characters such as
# '@', ':' or '/' in a user name or password would otherwise be parsed as URL
# delimiters and the connection string would be split in the wrong place.
engine = sqlalchemy.create_engine(
    f'postgresql://{quote_plus(str(user))}:{quote_plus(str(password))}@localhost:{port}/{db_name}'
)

Population data from [Swedish University of Agricultural Sciences](https://maps.slu.se/).

In [4]:
# Root folder of the gridded population data; each shapefile path below is
# given relative to it.
pop_folder = "dbs/gridded_population/"
income_path, birth_path, job_path = (
    pop_folder + relative_path
    for relative_path in (
        "income_2012_latest_shp/IH1_20191231_SweRef_shape/SLU_IH1_SweRef_2019_region.shp",
        "population_2013_latest_shp/B5RutaSW_20190101/B5_SweRef_region.shp",
        "labour_market_2012_latest_shp/A4_20191231_SweRef_shape/A4_SweRef_2020_region.shp",
    )
)

In [53]:
# Read the three gridded layers. The job layer's 'astRutstor' column is
# renamed to 'Rutstorl', the column name the other two layers are queried by.
grid_size_alias = {'astRutstor': 'Rutstorl'}
gdf_birth = gpd.read_file(birth_path)
gdf_job = gpd.read_file(job_path)
gdf_job = gdf_job.rename(columns=grid_size_alias)
gdf_income = gpd.read_file(income_path)

## 0. Deal with duplicated grids
### 0.1 Income file

In [54]:
# Total count per income grid: the four quartile counts added together.
gdf_income.loc[:, 'Totalt'] = (
    gdf_income['Kvartil_1']
    .add(gdf_income['Kvartil_2'])
    .add(gdf_income['Kvartil_3'])
    .add(gdf_income['Kvartil_4'])
)

In [55]:
# Split the income grids by whether their 'Ruta' id occurs more than once.
repeated_rows = gdf_income.duplicated(subset=['Ruta'])
duplicated_grids = gdf_income.loc[repeated_rows, 'Ruta'].to_list()
has_duplicate = gdf_income['Ruta'].isin(duplicated_grids)
# Grids whose id appears exactly once
gdf_income_1 = gdf_income.loc[~has_duplicate, :]
# Every row of a repeated grid id, ordered by id
gdf_income_2 = gdf_income.loc[has_duplicate, :].sort_values(by=['Ruta'])

In [56]:
# Placeholder rows: id, size and geometry of the big (Rutstorl == 1000)
# record of each duplicated grid id.
is_big_grid = gdf_income_2['Rutstorl'] == 1000
gdf_income_2_big = gdf_income_2.loc[is_big_grid, ['Ruta', 'Rutstorl', 'geometry']]

In [57]:
def merge_income_duplicated(data):
    """Collapse the income records that share one grid id into a single record.

    'Kvartil_1'..'Kvartil_4', 'Totalt' and 'Tot_CDISP0' are summed.
    'MedianInk' is averaged using 'Totalt' as the weight. When the summed
    'Totalt' of the group is zero the weighted average is undefined (0 / 0),
    so the plain column sums are returned instead.

    Parameters
    ----------
    data : pandas.DataFrame
        The rows of one grid id; must hold the columns named above.

    Returns
    -------
    pandas.Series
        One value per column, in the order Kvartil_1, Kvartil_2, Kvartil_3,
        Kvartil_4, Totalt, MedianInk, Tot_CDISP0.
    """
    columns = ['Kvartil_1', 'Kvartil_2', 'Kvartil_3', 'Kvartil_4',
               'Totalt', 'MedianInk', 'Tot_CDISP0']
    total = data['Totalt'].sum()
    # The guard must inspect the values: `0 in series` on a pandas Series
    # tests the index labels, not the data, so it cannot detect an empty grid.
    if total == 0:
        return data[columns].sum()
    # skipna=False keeps a missing median visible as NaN in the result.
    weighted_median = (data['Totalt'] * data['MedianInk']).sum(skipna=False) / total
    merged = {name: data[name].sum() for name in columns}
    merged['MedianInk'] = weighted_median
    return pd.Series(merged)

In [58]:
# Aggregate each duplicated grid id and attach the result to its placeholder
# row; rows with any missing value are discarded.
income_merged_stats = gdf_income_2.groupby('Ruta').apply(merge_income_duplicated)
gdf_income_2 = pd.merge(gdf_income_2_big, income_merged_stats.reset_index(), on='Ruta')
gdf_income_2 = gdf_income_2.dropna()

  # This is added back by InteractiveShellApp.init_path()


In [59]:
# Recombine the grids that had a unique id with the merged duplicates.
# The row labels are not reset here, so the index of the result may repeat.
gdf_income = pd.concat([gdf_income_1, gdf_income_2])

### 0.2 Birth background file

In [62]:
# Total count per birth-background grid: the four birth-region counts added up.
gdf_birth.loc[:, 'TotFland'] = (
    gdf_birth['Sverige']
    .add(gdf_birth['Norden_uto'])
    .add(gdf_birth['EU28_utom_'])
    .add(gdf_birth['Övriga_vär'])
)

In [63]:
# Split the birth-background grids by whether their 'Ruta' id occurs more than once.
repeated_rows = gdf_birth.duplicated(subset=['Ruta'])
duplicated_grids = gdf_birth.loc[repeated_rows, 'Ruta'].to_list()
has_duplicate = gdf_birth['Ruta'].isin(duplicated_grids)
# Grids whose id appears exactly once
gdf_birth_1 = gdf_birth.loc[~has_duplicate, :]
# Every row of a repeated grid id, ordered by id
gdf_birth_2 = gdf_birth.loc[has_duplicate, :].sort_values(by=['Ruta'])

In [64]:
# Placeholder rows: id, size and geometry of the big (Rutstorl == 1000)
# record of each duplicated grid id.
is_big_grid = gdf_birth_2['Rutstorl'] == 1000
gdf_birth_2_big = gdf_birth_2.loc[is_big_grid, ['Ruta', 'Rutstorl', 'geometry']]

In [65]:
def merge_birth_duplicated(data):
    """Sum the birth-background counts of all rows that share one grid id.

    Returns a Series indexed by 'Sverige', 'Norden_uto', 'EU28_utom_',
    'Övriga_vär' and 'TotFland', in that order.
    """
    birth_columns = ['Sverige', 'Norden_uto', 'EU28_utom_', 'Övriga_vär', 'TotFland']
    counts = data.loc[:, birth_columns]
    return counts.sum()

In [66]:
# Aggregate each duplicated grid id, attach the result to its placeholder row
# (discarding rows with missing values), then recombine with the unique grids.
birth_merged_stats = gdf_birth_2.groupby('Ruta').apply(merge_birth_duplicated)
gdf_birth_2 = pd.merge(gdf_birth_2_big, birth_merged_stats.reset_index(), on='Ruta')
gdf_birth_2 = gdf_birth_2.dropna()
gdf_birth = pd.concat([gdf_birth_1, gdf_birth_2])

### 0.3 Job file

In [68]:
# Total count per job grid: the 'Offentliga' and 'Naringsliv' counts added together.
gdf_job.loc[:, 'Totalt'] = gdf_job['Offentliga'].add(gdf_job['Naringsliv'])

In [69]:
# Split the job grids by whether their 'Ruta' id occurs more than once.
repeated_rows = gdf_job.duplicated(subset=['Ruta'])
duplicated_grids = gdf_job.loc[repeated_rows, 'Ruta'].to_list()
has_duplicate = gdf_job['Ruta'].isin(duplicated_grids)
# Grids whose id appears exactly once
gdf_job_1 = gdf_job.loc[~has_duplicate, :]
# Every row of a repeated grid id, ordered by id
gdf_job_2 = gdf_job.loc[has_duplicate, :].sort_values(by=['Ruta'])

In [70]:
# Placeholder rows: id, size and geometry of the big (Rutstorl == 1000)
# record of each duplicated grid id.
is_big_grid = gdf_job_2['Rutstorl'] == 1000
gdf_job_2_big = gdf_job_2.loc[is_big_grid, ['Ruta', 'Rutstorl', 'geometry']]

In [71]:
def merge_job_duplicated(data):
    """Sum the job counts of all rows that share one grid id.

    Returns a Series indexed by 'Offentliga', 'Naringsliv' and 'Totalt',
    in that order.
    """
    job_columns = ['Offentliga', 'Naringsliv', 'Totalt']
    counts = data.loc[:, job_columns]
    return counts.sum()

In [72]:
# Aggregate each duplicated grid id, attach the result to its placeholder row
# (discarding rows with missing values), then recombine with the unique grids.
job_merged_stats = gdf_job_2.groupby('Ruta').apply(merge_job_duplicated)
gdf_job_2 = pd.merge(gdf_job_2_big, job_merged_stats.reset_index(), on='Ruta')
gdf_job_2 = gdf_job_2.dropna()
gdf_job = pd.concat([gdf_job_1, gdf_job_2])

### 0.4 Relationship between three gridding systems

They share a large number of grids.

In [73]:
# Count the birth-background grids whose id also exists in the job layer and
# in the income layer, next to the total number of birth-background grids.
birth_ids = gdf_birth['Ruta']
shared_grids = {
    'pop_in_job': int(birth_ids.isin(gdf_job['Ruta']).sum()),
    'pop_in_income': int(birth_ids.isin(gdf_income['Ruta']).sum()),
    'pop_total': len(gdf_birth),
}
shared_grids

{'pop_in_job': 134831, 'pop_in_income': 211873, 'pop_total': 215917}

The small grids of 250 m x 250 m account for about 85% of the Swedish population (see the output of the next cell).

In [74]:
# People counted in the small (Rutstorl == 250) grids, as a percentage of
# 10.28 million.
# NOTE(review): 10.28 million is presumably Sweden's total population — confirm.
small_grid_pop = gdf_birth.loc[gdf_birth['Rutstorl'] == 250, 'TotFland'].sum()
share_small_grids_in_pop = small_grid_pop / (10.28*(10**6)) * 100
print("Share of small grids in total population: %.2f %%."%share_small_grids_in_pop)

Share of small grids in total population: 85.07 %.


In [75]:
# Percentage of grids with a non-zero total in each layer.
nonzero_flags = {
    'pop': gdf_birth['TotFland'] != 0,
    'income': gdf_income['Totalt'] != 0,
    'job': gdf_job['Totalt'] != 0,
}
share_nonzero_grids = {layer: int(flags.sum()) / len(flags) * 100
                       for layer, flags in nonzero_flags.items()}
share_nonzero_grids

{'pop': 92.68005761473158,
 'income': 78.62472756824647,
 'job': 76.79936852220467}

In [76]:
# Percentage of grids that are small (Rutstorl == 250) in each layer.
small_grid_flags = {
    'pop': gdf_birth['Rutstorl'] == 250,
    'income': gdf_income['Rutstorl'] == 250,
    'job': gdf_job['Rutstorl'] == 250,
}
share_small_grids = {layer: int(flags.sum()) / len(flags) * 100
                     for layer, flags in small_grid_flags.items()}
share_small_grids

{'pop': 51.30536270881867,
 'income': 51.843300949355395,
 'job': 58.93952913720915}

## 1. Organize socio-economic information and the zoning system

In [77]:
birth_columns = ['Ruta', 'Sverige', 'Norden_uto', 'EU28_utom_', 'Övriga_vär', 'TotFland']
job_counts = gdf_job.loc[:, ['Ruta', 'Totalt']].rename(columns={'Totalt': 'Job_count'})

# Start from the income grids; the default (inner) merge keeps only the grids
# that also appear in the birth-background layer.
gdf = gdf_income.copy()
print("Number of grids: %s"%len(gdf))
gdf = pd.merge(left=gdf, right=gdf_birth.loc[:, birth_columns], on='Ruta')
print("After adding birth background, number of grids: %s"%len(gdf))
# Left merge: grids without a job record are kept and filled with 0 below.
gdf = pd.merge(left=gdf, right=job_counts, on='Ruta', how='left')
gdf.fillna(0, inplace=True)
gdf.iloc[0]

Number of grids: 216568
After adding birth background, number of grids: 211873


Rutstorl                                                    250
Ruta                                              3230006403750
Kvartil_1                                                  31.0
Kvartil_2                                                  30.0
Kvartil_3                                                  26.0
Kvartil_4                                                  24.0
Totalt                                                    111.0
MedianInk                                              227829.0
Tot_CDISP0                                           30804340.0
geometry      POLYGON ((323000.0001730283 6403749.995208162,...
Sverige                                                     133
Norden_uto                                                    7
EU28_utom_                                                   13
Övriga_vär                                                   44
TotFland                                                    197
Job_count                               

Socio-economic dimensions

In [78]:
# Statistic columns carried through the merging steps, grouped by source layer.
income_dims = ['Kvartil_1', 'Kvartil_2', 'Kvartil_3', 'Kvartil_4',
               'Totalt', 'MedianInk', 'Tot_CDISP0']
birth_dims = ['Sverige', 'Norden_uto', 'EU28_utom_', 'Övriga_vär', 'TotFland']
cls = income_dims + birth_dims + ['Job_count']

### 1.1 Find the overlapped small grids and big grids

In [79]:
# Intersect the small (Rutstorl == 250) grids with the big (Rutstorl == 1000)
# grids; each result row pairs a small-grid id with a big-grid id.
small_grids = gdf.loc[gdf['Rutstorl'] == 250, ['Ruta', 'geometry']].rename(columns={'Ruta': 'Ruta_s'})
big_grids = gdf.loc[gdf['Rutstorl'] == 1000, ['Ruta', 'geometry']].rename(columns={'Ruta': 'Ruta_l'})
gdf_overlapped = gpd.tools.overlay(small_grids, big_grids, how='intersection')
# Only the id pairs are needed from here on, so the geometry is dropped.
df_overlapped = gdf_overlapped.dropna(how='any').drop(columns=['geometry'])

  after removing the cwd from sys.path.


In [80]:
# Ids of every grid, small or big, that takes part in an overlap.
overlapped_ruta = df_overlapped['Ruta_s'].to_list()
overlapped_ruta.extend(df_overlapped['Ruta_l'].to_list())

### 1.2 Simplify the overlapped grids by merging them to big grids

In [81]:
# Attach the statistics of each small grid to its (small id, big id) pair.
small_grid_stats = gdf.loc[gdf['Rutstorl'] == 250, cls + ['Ruta']].rename(columns={'Ruta': 'Ruta_s'})
df_overlapped_simp = pd.merge(df_overlapped, small_grid_stats, on='Ruta_s')
df_overlapped_simp.columns

Index(['Ruta_s', 'Ruta_l', 'Kvartil_1', 'Kvartil_2', 'Kvartil_3', 'Kvartil_4',
       'Totalt', 'MedianInk', 'Tot_CDISP0', 'Sverige', 'Norden_uto',
       'EU28_utom_', 'Övriga_vär', 'TotFland', 'Job_count'],
      dtype='object')

In [82]:
def merge_overlapped(data):
    """Collapse the statistics of several grids into one record.

    Every column is summed except 'MedianInk', which is averaged using
    'Totalt' as the weight. When the summed 'Totalt' of the group is zero the
    weighted average is undefined (0 / 0), so the plain sums of the columns
    listed in the module-level `cls` are returned instead.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to merge; must hold the thirteen statistic columns.

    Returns
    -------
    pandas.Series
        One value per statistic column, in the order Kvartil_1..Kvartil_4,
        Totalt, MedianInk, Tot_CDISP0, Sverige, Norden_uto, EU28_utom_,
        Övriga_vär, TotFland, Job_count.
    """
    total = data['Totalt'].sum()
    # The guard must inspect the values: `0 in series` on a pandas Series
    # tests the index labels, not the data, so it cannot detect an empty grid.
    if total == 0:
        return data[cls].sum()
    stat_columns = ('Kvartil_1', 'Kvartil_2', 'Kvartil_3', 'Kvartil_4',
                    'Totalt', 'MedianInk', 'Tot_CDISP0',
                    'Sverige', 'Norden_uto', 'EU28_utom_', 'Övriga_vär',
                    'TotFland', 'Job_count')
    merged = {name: data[name].sum() for name in stat_columns}
    # skipna=False keeps a missing median visible as NaN in the result.
    merged['MedianInk'] = (data['Totalt'] * data['MedianInk']).sum(skipna=False) / total
    return pd.Series(merged)

# Aggregate the small grids onto the big grid they overlap; the result is
# keyed by the big-grid id, marked with grid size 1000, and has missing
# values replaced by 0.
df_overlapped_simp = (
    df_overlapped_simp
    .groupby('Ruta_l')
    .apply(merge_overlapped)
    .reset_index()
    .rename(columns={'Ruta_l': 'Ruta'})
    .assign(Rutstorl=1000)
    .fillna(0)
)

  after removing the cwd from sys.path.


In [85]:
# Grids that take part in no overlap are kept untouched; the big grids that
# received small-grid statistics are pulled out (without geometry) for merging.
in_overlap = gdf['Ruta'].isin(overlapped_ruta)
is_merge_target = gdf['Ruta'].isin(df_overlapped_simp['Ruta'])
gdf_no_overlap = gdf.loc[~in_overlap, :]
df_overlap = gdf.loc[is_merge_target, cls + ['Ruta', 'Rutstorl']]
len(gdf_no_overlap), len(df_overlap)

(171321, 7599)

In [86]:
# Combine each big grid's own statistics with those aggregated from the small
# grids on top of it; the result is marked with grid size 1000 and has
# missing values replaced by 0.
df_overlap = (
    pd.concat([df_overlap, df_overlapped_simp])
    .groupby('Ruta')
    .apply(merge_overlapped)
    .reset_index()
    .assign(Rutstorl=1000)
    .fillna(0)
)

  after removing the cwd from sys.path.


In [88]:
# Put the geometry back: look up the polygon of every merged grid id in `gdf`.
merged_grid_shapes = gdf.loc[gdf['Ruta'].isin(df_overlap['Ruta']), ['Ruta', 'geometry']]
gdf_overlap = pd.merge(merged_grid_shapes, df_overlap, on='Ruta')

In [89]:
# Create the final grid system: the grids that needed no merging plus the
# merged overlapping grids, carrying the income, birth-background and job
# statistics. The first row is displayed as a sanity check.
gdf_final = pd.concat([gdf_no_overlap, gdf_overlap])
gdf_final.iloc[0]

Rutstorl                                                    250
Ruta                                              3230006403750
Kvartil_1                                                  31.0
Kvartil_2                                                  30.0
Kvartil_3                                                  26.0
Kvartil_4                                                  24.0
Totalt                                                    111.0
MedianInk                                              227829.0
Tot_CDISP0                                           30804340.0
geometry      POLYGON ((323000.0001730283 6403749.995208162,...
Sverige                                                   133.0
Norden_uto                                                  7.0
EU28_utom_                                                 13.0
Övriga_vär                                                 44.0
TotFland                                                  197.0
Job_count                               

### 1.3 Rename the columns

In [91]:
# Output columns, in their final order, grouped by theme.
id_fields = ['zone', 'grid_size']
income_fields = ['income_q1', 'income_q2', 'income_q3', 'income_q4', 'income_med', 'pop_income']
birth_fields = ['birth_se', 'birth_nord', 'birth_eu', 'birth_other', 'pop']
cols_selected = id_fields + income_fields + birth_fields + ['job', 'geometry']

In [92]:
# Source column name -> output column name. Columns that are not listed in
# `cols_selected` (e.g. 'Tot_CDISP0') are left out by the selection below.
column_names = {
    'Ruta': 'zone',
    'Rutstorl': 'grid_size',
    'Kvartil_1': 'income_q1',
    'Kvartil_2': 'income_q2',
    'Kvartil_3': 'income_q3',
    'Kvartil_4': 'income_q4',
    'MedianInk': 'income_med',
    'Totalt': 'pop_income',
    'Sverige': 'birth_se',
    'Norden_uto': 'birth_nord',
    'EU28_utom_': 'birth_eu',
    'Övriga_vär': 'birth_other',
    'TotFland': 'pop',
    'Job_count': 'job',
}
gdf_final = gdf_final.rename(columns=column_names)
gdf_final = gdf_final[cols_selected]
gdf_final.iloc[0]

zone                                               3230006403750
grid_size                                                    250
income_q1                                                   31.0
income_q2                                                   30.0
income_q3                                                   26.0
income_q4                                                   24.0
income_med                                              227829.0
pop_income                                                 111.0
birth_se                                                   133.0
birth_nord                                                   7.0
birth_eu                                                    13.0
birth_other                                                 44.0
pop                                                        197.0
job                                                          0.0
geometry       POLYGON ((323000.0001730283 6403749.995208162,...
Name: 0, dtype: object

### 1.4 Save the grids

In [96]:
# Save grids as .geojson and to database
# NOTE(review): no `if_exists` is passed to `to_postgis`; per the geopandas
# documentation the default is 'fail', so re-running this cell against a
# database that already holds a "grids" table raises — confirm this is intended.
gdf_final.to_postgis("grids", con=engine)
gdf_final.to_file('dbs/gridded_population/grids.geojson', driver='GeoJSON')