In [1]:
import glob
import os

import geopandas as gpd
import nivapy3 as nivapy
import pandas as pd
from tqdm.notebook import tqdm

# Create PostGIS database for TEOTIL3

To streamline the workflow for TEOTIL3, I propose migrating all key datasets from Oracle to the Hub's PostGIS database. The code in this notebook creates a new schema named `teotil3` within the `general` database and loads relevant datasets to PostGIS.

In [2]:
# Connect to PostGIS. `admin=True` requests an admin login and the username
# and password are prompted for interactively, so no credentials are stored
# in the notebook. The resulting engine, `eng`, is reused by the later cells
# to run SQL and upload tables.
# NOTE(review): admin rights are presumably required for the CREATE SCHEMA
# and GRANT statements that follow - confirm
eng = nivapy.da.connect_postgis(admin=True)

Username:  ········
Password:  ········


Connection successful.


## 1. Create schema and set permissions

In [None]:
# Create the "teotil3" schema. "IF NOT EXISTS" makes this safe to re-run
eng.execute("CREATE SCHEMA IF NOT EXISTS teotil3")

In [None]:
# Give the default "jovyan" user read-only access to the teotil3 schema:
#   1. permission to use the schema at all
#   2. SELECT on every table that already exists in it
#   3. SELECT on any table created in it in the future
sql_list = [
    "GRANT USAGE ON SCHEMA teotil3 TO jovyan",
    "GRANT SELECT ON ALL TABLES IN SCHEMA teotil3 TO jovyan",
    "ALTER DEFAULT PRIVILEGES IN SCHEMA teotil3 GRANT SELECT ON TABLES TO jovyan",
]

# Run the statements one at a time, in the order listed
for grant_sql in sql_list:
    eng.execute(grant_sql)

## 2. Basic non-spatial tables

In [3]:
# Folder containing the CSV files, and the year tag appended to table names
data_fold = r"../../data"
data_year = 2022

# Maps each CSV file name to the column used as its table's primary key
csv_dict = {
    "ar50_artype_classes.csv": "artype",
    "lake_residence_times_10m_dem.csv": "vatnLnr",
    "offshore_hierarchy.csv": "regine",
    "spatially_static_background_coefficients.csv": "variable",
    "spatially_variable_background_coefficients.csv": "regine",
    "spatiotemporally_variable_background_coefficients.csv": "regine",
    "vassdragsomrader_ospar_regions.csv": "vassom",
}

for csv_name, pk_col in csv_dict.items():
    # Table name is the file name without its extension, plus the data year
    table_name = f"{os.path.splitext(csv_name)[0]}_{data_year}"

    # Read the CSV and upload it, replacing any existing table of that name
    pd.read_csv(os.path.join(data_fold, csv_name)).to_sql(
        table_name,
        eng,
        schema="teotil3",
        index=False,
        if_exists="replace",
    )

    # Add the primary key constraint once the table has been written.
    # The column name is double-quoted to preserve its case (e.g. "vatnLnr")
    eng.execute(
        f"ALTER TABLE teotil3.{table_name} "
        f"ADD CONSTRAINT {table_name}_pk "
        f'PRIMARY KEY ("{pk_col}")'
    )

## 3. Spatial tables

In [4]:
# Read the regine layer for `data_year` from the tidied TEOTIL3 GeoPackage
spatial_data_fold = f"/home/jovyan/shared/teotil3/core_data_june_{data_year}"
teo_gpkg = os.path.join(spatial_data_fold, "tidied", "teotil3_data.gpkg")
reg_gdf = gpd.read_file(
    teo_gpkg,
    layer=f"regine_{data_year}",
    driver="GPKG",
)

# Preview the first rows as a sanity check
reg_gdf.head()

Unnamed: 0,regine,a_cat_land_km2,a_cat_poly_km2,upstr_a_km2,upstr_runoff_Mm3/yr,q_sp_m3/s/km2,runoff_mm/yr,q_cat_m3/s,vassom,ospar_region,...,fylnr_2018,komnr_2019,fylnr_2019,komnr_2020,fylnr_2020,komnr_2021,fylnr_2021,komnr_2022,fylnr_2022,geometry
0,001.10,1.16085,1.44279,0.0,0.0,0.01393,439.597368,0.016159,1,Skagerrak,...,1,101,1,3001,30,3001,30,3001,30,"MULTIPOLYGON (((297006.830 6543966.950, 297169..."
1,001.1A1,1.432412,1.432479,777.9,448.15,0.01419,447.802344,0.020292,1,Skagerrak,...,1,101,1,3001,30,3001,30,3001,30,"MULTIPOLYGON (((297505.440 6543157.790, 297543..."
2,001.1A20,0.340114,0.34016,777.9,448.15,0.01036,326.936736,0.003522,1,Skagerrak,...,1,101,1,3001,30,3001,30,3001,30,"MULTIPOLYGON (((297770.368 6543429.036, 297787..."
3,001.1A2A,17.647822,17.647822,58.96,22.97,0.0121,381.84696,0.213565,1,Skagerrak,...,1,101,1,3001,30,3001,30,3001,30,"MULTIPOLYGON (((299678.370 6544460.320, 299667..."
4,001.1A2B,41.298255,41.298255,41.3,16.23,0.01245,392.89212,0.514185,1,Skagerrak,...,1,101,1,3001,30,3001,30,3001,30,"MULTIPOLYGON (((303353.460 6552989.330, 303341..."


In [5]:
# Upload the regine layer to the teotil3 schema, replacing any existing table
table_name = f"regine_{data_year}"
spidx_name = f"{table_name}_spidx"  # presumably the spatial index name - confirm

nivapy.da.gdf_to_postgis(
    reg_gdf,
    table_name,
    "teotil3",
    eng,
    spidx_name,
    create_pk=False,  # The key is added explicitly below, on "regine"
    index=False,
    if_exists="replace",
)

# Use the regine ID as the table's primary key
eng.execute(
    f"ALTER TABLE teotil3.{table_name} "
    f"ADD CONSTRAINT {table_name}_pk "
    'PRIMARY KEY ("regine")'
)

  self.meta.reflect(bind=self.connectable, only=[table_name], schema=schema)


<sqlalchemy.engine.cursor.LegacyCursorResult at 0x7f732b616470>

## 4. HBV modelled discharge from NVE

In [None]:
# First and last years of HBV data to upload
st_yr = 2016
end_yr = 2021

# How to treat an existing table: "replace" it entirely or "append" to it
if_exists = "replace"

In [None]:
# Folder containing modelled data. Files are searched for as
# "RID_<year>/hbv_*.var" within it
data_fold = r"/home/jovyan/shared/teotil3/nve_hbv_data"

years = range(st_yr, end_yr + 1)
df_list = []
for year in years:
    search_path = os.path.join(data_fold, f"RID_{year}", "hbv_*.var")

    # Sort so the row order is reproducible (glob returns files in arbitrary
    # order)
    flist = sorted(glob.glob(search_path))

    # Fail loudly if a year has no files. Otherwise the year would be silently
    # omitted, which in "replace" mode drops it from the database table
    if not flist:
        raise FileNotFoundError(f"No HBV files found matching '{search_path}'.")

    # Expected series length: number of days from 1990-01-01 to the end of the
    # year of interest (inclusive)
    days = len(pd.date_range(start="1990-01-01", end=f"{year}-12-31", freq="D"))

    for fpath in flist:
        # Extract the 3 characters immediately before the 4-character file
        # extension (presumably the vassdragsomrade code - confirm)
        name = os.path.basename(fpath)
        vassom = name.split("_")[1][-7:-4]

        # Whitespace-delimited file with no header row. 'sep=r"\s+"' replaces
        # 'delim_whitespace=True', which is deprecated from pandas 2.2
        df = pd.read_csv(
            fpath, sep=r"\s+", header=None, names=["date", "flow_m3/s"]
        )

        # The "/1200" suffix is matched literally, so parsed timestamps are at
        # midnight and can be compared directly with plain dates below
        df["date"] = pd.to_datetime(df["date"], format="%Y%m%d/1200")
        df["vassom"] = vassom
        # NOTE(review): presumably data to the end of 'year' are supplied
        # during the following year - confirm
        df["data_supply_year"] = year + 1
        df = df[["data_supply_year", "vassom", "date", "flow_m3/s"]]

        # Check st, end and length
        assert df["date"].iloc[0] == pd.Timestamp(
            "1990-01-01"
        ), "Series does not start on 01/01/1990."
        assert df["date"].iloc[-1] == pd.Timestamp(
            f"{year}-12-31"
        ), f"Series does not end on 31/12/{year}."
        assert len(df) == days, "Unexpected length for new series."

        df_list.append(df)

# An empty year range would otherwise give an obscure error from pd.concat
if not df_list:
    raise ValueError(f"No years to process between {st_yr} and {end_yr}.")

df = pd.concat(df_list, axis="rows")

# These three columns form the primary key of the database table, so each
# combination must be unique
assert (
    df.duplicated(["data_supply_year", "vassom", "date"], keep=False).sum() == 0
), "Duplicated (data_supply_year, vassom, date) combinations found."

print(f"{len(df)/1e6:.1f} million rows to insert.")

In [None]:
%%time

# The database can't cope with writing 16 M rows directly from pandas.
# Instead, manually split the dataframe into chunks and write one
# at a time
chunk_size = 100000

table_name = "nve_hbv_discharge"

if if_exists == "replace":
    # Replace with empty table (same columns, no rows)
    df.iloc[:0].to_sql(
        table_name,
        eng,
        schema="teotil3",
        index=False,
        if_exists=if_exists,
    )

# Write chunks in append mode. Slicing is positional, so the dataframe's
# index values do not matter
for start in tqdm(range(0, len(df), chunk_size)):
    df.iloc[start : start + chunk_size].to_sql(
        table_name,
        eng,
        schema="teotil3",
        index=False,
        if_exists="append",
        method="multi",
    )

# Only add the primary key if the table does not already have one. A freshly
# replaced table never does, but in "append" mode the key usually exists
# already and adding it a second time would raise an error after all the
# data had been written
sql = (
    "SELECT 1 FROM information_schema.table_constraints "
    "WHERE table_schema = 'teotil3' "
    f"AND table_name = '{table_name}' "
    "AND constraint_type = 'PRIMARY KEY'"
)
if eng.execute(sql).fetchone() is None:
    sql = (
        f"ALTER TABLE teotil3.{table_name} "
        f"ADD CONSTRAINT {table_name}_pk "
        f"PRIMARY KEY (data_supply_year, vassom, date)"
    )
    eng.execute(sql)