In [3]:
import os

# The project root comes from the REPO_DIR environment variable. Fail early with
# a clear message rather than the opaque TypeError os.path.join raises on None.
repo_dir = os.environ.get("REPO_DIR")
if not repo_dir:
    raise RuntimeError(
        "The REPO_DIR environment variable must be set to the repository root "
        "before running this notebook."
    )

# Trailing slashes are kept on purpose: later cells build file paths by plain
# string concatenation (e.g. data_dir + "int/...").
code_dir = os.path.join(repo_dir, "code/")
data_dir = os.path.join(repo_dir, "data/")

# Work from the code directory, presumably so the project-local imports further
# down (prediction_utils, analysis.*) resolve -- TODO confirm.
os.chdir(code_dir)


import matplotlib.pyplot as plt
import numpy as np
import scipy.linalg
import pickle
import pandas as pd
import sklearn 
import sys
import pandas as pd
from importlib import reload

import matplotlib

from sklearn.model_selection import KFold
from sklearn.linear_model import Ridge
import seaborn as sns
import geopandas as gpd

import rasterio

from prediction_utils import flatten_raster, upscale_grid_vector
from analysis.NL_feature_creation_and_other_NL_processing.nl_helpers import apply_polygon_mask_and_return_flat_array

## In this notebook, we compute population counts for administrative (ADM) polygons and for a 0.1 x 0.1 degree grid

We use the Global Human Settlement Layer population grid (GHS-POP) for the year 2020, the same data product used elsewhere in the project.

### Read shapefiles

In [2]:
# GDL ADM1 polygons: a cleaned, pickled frame produced earlier in the pipeline.
gdl_adm1_path = data_dir + "int/GDL_HDI/HDI_ADM1_shapefile_clean.p"
gpdf = pd.read_pickle(gdl_adm1_path)

# ADM2 polygons from the geoBoundaries CGAZ GeoJSON.
adm2_path = data_dir + "raw/geoBoundaries/geoBoundariesCGAZ_ADM2.geojson"
adm2 = gpd.read_file(adm2_path)

### Read the total population raster (population counts per cell, not population density)

In [4]:
# GHS-POP 2020 total-population GeoTIFF (the file name indicates EPSG:4326 at
# 30 arc-seconds).
# NOTE(review): data_dir already ends with "/", so the leading "/" below yields
# ".../data//raw/..." in the final path; this is harmless on POSIX.
pop_tot_path = data_dir + "/raw/GHS_pop/GHS_POP_E2020_GLOBE_R2023A_4326_30ss_V1_0.tif"

# Open read-only. The handle is reused by the cells below and is not closed
# anywhere in the visible notebook.
src_tot = rasterio.open(pop_tot_path, mode="r")

In [5]:
# Echo the dataset handle so its repr (file name and open mode) shows as the cell output.
src_tot

<open DatasetReader name='/shares/maps100/code/code_LS/hdi_downscaling/data//raw/GHS_pop/GHS_POP_E2020_GLOBE_R2023A_4326_30ss_V1_0.tif' mode='r'>

In [5]:
# Sanity check: total population over the whole raster, in billions.
# Negative cells are zeroed first, matching the treatment the 0.1-degree grid
# section applies before aggregating (negative values are presumably nodata
# sentinels -- TODO confirm against the GHS-POP documentation).
global_band = src_tot.read(1)
global_band[global_band < 0] = 0
global_pop_billion = (global_band.sum() / 1e9).round(3)

# Release the full-raster array right away; it is large and no longer needed.
del global_band

print("Global population in 2020:", global_pop_billion, "billion")

Global population in 2020: 7.841 billion


## Get population counts for ADM2 polygons

In [6]:
# For every ADM2 polygon, mask the population raster with the polygon and add
# up the returned flat array (presumably the cells inside the polygon).
adm2_geoms = adm2["geometry"]
sums = [
    apply_polygon_mask_and_return_flat_array(adm2_geoms.at[idx], raster_file=src_tot).sum()
    for idx in adm2.index
]
    

In [8]:
# Round the per-polygon sums to whole counts and store them as integers.
adm2["total_pop"] = np.asarray(sums).round().astype(int)

# Polygon area: reproject to EPSG:6933, take the area, and scale by 1e6 to get
# square kilometres (the CRS units are presumably metres).
adm2["area_sq_km"] = (
    adm2.to_crs("epsg:6933")["geometry"].area.div(1e6)
)

In [9]:
# Keep only the population and area columns, indexed by shapeID.
adm2_pop = adm2.set_index("shapeID")[["total_pop", "area_sq_km"]]

# Write the result once (the original repeated this identical write in a second
# cell). Create the output directory first so a fresh checkout does not fail.
adm2_pop_path = data_dir + "int/GHS_pop/pop_count_sums_for_ADM2_polygons.p"
os.makedirs(os.path.dirname(adm2_pop_path), exist_ok=True)
adm2_pop.to_pickle(adm2_pop_path)

## Get population counts for GDL ADM1 polygons

In [11]:
# For every GDL ADM1 polygon, mask the population raster with the polygon and
# add up the returned flat array (presumably the cells inside the polygon).
gdl_geoms = gpdf["geometry"]
sums = [
    apply_polygon_mask_and_return_flat_array(gdl_geoms.at[idx], raster_file=src_tot).sum()
    for idx in gpdf.index
]
    

In [12]:
# Round the per-polygon sums to whole counts and store them as integers.
gpdf["total_pop"] = np.asarray(sums).round().astype(int)

# Polygon area: reproject to EPSG:6933, take the area, and scale by 1e6 to get
# square kilometres (the CRS units are presumably metres).
gpdf["area_sq_km"] = (
    gpdf.to_crs("epsg:6933")["geometry"].area.div(1e6)
)

In [13]:
# Keep only the population and area columns; the frame's existing index is preserved.
gpdf_pop = gpdf[["total_pop", "area_sq_km"]]

# Create the output directory first so a fresh checkout does not fail on write.
gdl_adm1_pop_path = data_dir + "int/GHS_pop/pop_count_sums_for_GDL_ADM1_polygons.p"
os.makedirs(os.path.dirname(gdl_adm1_pop_path), exist_ok=True)
gpdf_pop.to_pickle(gdl_adm1_pop_path)

## Get population counts for a 0.1 x 0.1 degree grid

In [None]:
# Load band 1 of the population raster into memory as an array.
arr = src_tot.read(indexes=1)

In [None]:
# Overwrite every negative cell with 0, in place (negative values are
# presumably nodata sentinels -- TODO confirm).
np.putmask(arr, arr < 0, 0)

In [None]:
# Flatten the raster into three parallel vectors using its affine transform.
# The next cell uses them as longitude, latitude and population count.
raster_transform = src_tot.transform
x, y, vals = flatten_raster(arr, raster_transform)

In [None]:
# One row per raster cell: its coordinates and its population count.
df = pd.DataFrame(dict(lon=x, lat=y, pop_count=vals))

In [None]:
# Attach the coarse-grid coordinate of every cell, latitude first and then
# longitude. The second argument (1) is presumably the number of decimal places
# kept, i.e. 0.1-degree bins -- TODO confirm in prediction_utils.
for coarse_col, fine_col in {"lat10": "lat", "lon10": "lon"}.items():
    df[coarse_col] = upscale_grid_vector(df[fine_col], 1)

In [None]:
# Total population per coarse grid cell. Only pop_count is aggregated: the
# original also summed the lon/lat columns over every group and then dropped
# them. The result has the same columns in the same order:
# lon10, lat10, pop_count.
out_df = df.groupby(["lon10", "lat10"], as_index=False)["pop_count"].sum()

In [None]:
# Persist the gridded population sums. Create the output directory first so a
# fresh checkout does not fail on write. The file name is left unchanged because
# other code may load it by this exact name.
grid_pop_path = data_dir + "int/GHS_pop/pop_count_sums_for_.1x1_grid.p"
os.makedirs(os.path.dirname(grid_pop_path), exist_ok=True)
out_df.to_pickle(grid_pop_path)