In [None]:
from typing import (Any, Callable, Dict, Generic, Iterable, List, Mapping,
                    NewType, Sequence, Tuple, TypeVar, Union)

import os
import re
import sys
import traceback

import datetime
import logging

import math

from operator import itemgetter
from subprocess import PIPE, Popen, call
import importlib

import csv
import json

import numpy as np  # type: ignore
import pandas as pd  # type: ignore

import fiona  # type: ignore
from fiona.crs import from_epsg # type: ignore
import geopandas as gpd  # type: ignore

import matplotlib.pyplot as plt  # type: ignore
import seaborn as sns  # type: ignore

from arpeggio import ParserPython, visit_parse_tree  # type: ignore

from arpeggio import RegExMatch, Optional, ZeroOrMore, OneOrMore, EOF, UnorderedGroup, And, Not, Combine  # type: ignore
from arpeggio import ParserPython, PTNodeVisitor, visit_parse_tree, NoMatch  # type: ignore

import soil_lib

from soil_lib.LoimisLookups import siffer_rules_lookup, updated_texture_error_lookup

from soil_lib.LoimisGrammarV2 import update_main_siffer_l, loimisp_short, loimisp_short_dk, parse_test, split_and_cut, consolidate_loimis, test_brackets, parse_reconstituate, load_default_texture_defensively, soilParts, soilParts_dk

from soil_lib.LoimisVisitor import LpVisitor, loimis_grammar_product, test_layer_depths, set_texture_values

%matplotlib inline

In [None]:
# ----------------------------------------------------------------
# Logging: one stdout handler attached to this notebook's logger
# ----------------------------------------------------------------
# To log to a file as well, build a
#   logging.FileHandler('temp_out/soil_convert.log', "w", encoding="utf-8")
# give it the same formatter and add it to the handler list below.
# A more verbose alternative format that includes the logger name:
#   '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
formatter = logging.Formatter('%(asctime)s - %(levelname)s [%(filename)s:%(lineno)d] - %(message)s')

console = logging.StreamHandler(sys.stdout)
console.setFormatter(formatter)

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Replace (rather than extend) the handler list so that re-running this
# cell does not make every message appear several times.
logger.handlers = [console]


In [None]:
##############################
# global vars and path names
##############################

# Notebook-wide plotting and DataFrame display defaults.
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (25, 10)
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 200)

# --- input files ------------------------------------------------------------
soil_legend_file = "data/soil_types_legend.csv"
the_soilmap_dataset = 'data/Mullakaart.shp'

# --- output files (CSV checkpoints and the final shapefile export) ----------
siffer_update_matchinfo = 'data/soil_orig_siffer_update_matchinfo.csv'

unique_loimis_list = 'data/unique_loimis1.csv'
unique_loimis_with_siffer = 'data/unique_loimis1_with_siffer.csv'

loimis_update_parseinfo = 'data/soil_orig_loimis_update_parseinfo.csv'

# The texture checkpoint further down writes to `soil_orig_texture_parse`,
# which was never defined and made a fresh "Run All" fail with a NameError.
# NOTE(review): the file name follows the pattern of the sibling CSV paths;
# adjust it if a different location is expected downstream.
soil_orig_texture_parse = 'data/soil_orig_texture_parse.csv'

shape_export_texture_values = 'data/eesti_soil_red1_texture_overview.shp'

# --- soil legend lookup -----------------------------------------------------
logger.info('##############################')
logger.info(f'# loading soil legend, estonian and english descriptions: ({soil_legend_file})')
logger.info('##############################')

# NOTE(review): the legend is decoded as latin1 while a non-ASCII column name
# is looked up right below - confirm the file really is latin1 encoded.
soil_legend_df = pd.read_csv(soil_legend_file, encoding='latin1', sep=';', quotechar='"')

# Legend codes, once as a list (passed to update_main_siffer_l later on) and
# once as a set.
soil_legend_lookup = soil_legend_df['Tähistus kaardil'].tolist()
soil_legend = set(soil_legend_lookup)

logger.debug(soil_legend_df.sample(10))


## Load the original source file, or a progress checkpoint or export

### Soil map shapefile from the Estonian Land Board (Maa-amet)

https://geoportaal.maaamet.ee/docs/muld/Mullakaart_SHP.zip?t=20170301161200

```
#> md5sum Mullakaart_SHP.zip
e6f25e2bd089926933df673798c9ac71 *Mullakaart_SHP.zip
```

### or from DataDOI deposit archive

With English information

https://datadoi.ut.ee/handle/33/103

In [None]:
# Pre-cleanup: read the raw soil map. `start` is the reference point for
# every "elapsed seconds" message logged by the later cells.
start = datetime.datetime.now()
logger.info(f'initialising and loading soil map: {start}')

initial_soil_dataframe = gpd.read_file(the_soilmap_dataset, encoding='utf-8')

# Keep the freshly loaded index as a regular column, so each feature can
# still be identified after later steps change the index.
initial_soil_dataframe['orig_fid'] = initial_soil_dataframe.index.copy()

logger.debug(initial_soil_dataframe.sample(10))

tmp_step = datetime.datetime.now()
delta = tmp_step - start
logger.info(f'elapsed seconds so far: {int(delta.total_seconds())}')

In [None]:
logger.info("#########################################")
logger.info("# apply dataframe update soil type code (siffer)")
logger.info("#########################################")

# Main soil type code: every value is passed to update_main_siffer_l together
# with the legend codes; the result fills the updated code column and an
# info column about the update.
initial_soil_dataframe[['upd_siffer', 'siffer_upd_info']] = (
    initial_soil_dataframe['Siffer']
    .apply(lambda code: update_main_siffer_l(code, soil_legend_lookup))
)

# Sub-type codes Sif1..Sif4 get the same treatment; missing values are
# replaced by an empty string before the update function sees them.
for n in (1, 2, 3, 4):
    sub_codes = initial_soil_dataframe[f'Sif{n}'].astype(object).fillna("")
    initial_soil_dataframe[[f'upd_sif{n}', f'sif{n}_upd_info']] = sub_codes.apply(
        lambda code: update_main_siffer_l(code, soil_legend_lookup)
    )

logger.debug(initial_soil_dataframe.sample(10))

tmp_step = datetime.datetime.now()
delta = tmp_step - start
logger.info(f'elapsed seconds so far: {int(delta.total_seconds())}')

In [None]:
logger.info("#########################################")
logger.info("# merge for soil types and WRB codes")
logger.info("#########################################")

# Join the legend onto the features via the updated soil type code.
# pd.merge defaults to an inner join: features whose `upd_siffer` has no
# legend entry are dropped, and duplicate legend codes would multiply
# features. Neither would be visible otherwise, so report any change in
# the number of rows.
rows_before_merge = len(initial_soil_dataframe)
initial_soil_dataframe = pd.merge(initial_soil_dataframe, soil_legend_df, left_on='upd_siffer', right_on='Tähistus kaardil')
rows_after_merge = len(initial_soil_dataframe)
if rows_after_merge != rows_before_merge:
    logger.warning('legend merge changed the number of features from {} to {}'.format(
        rows_before_merge, rows_after_merge))

# The legend key and its description columns are not needed on the features.
initial_soil_dataframe.drop(['Tähistus kaardil','nimetus','scientific_english'], axis=1, inplace=True)

tmp_step = datetime.datetime.now()
delta = tmp_step - start
logger.info('elapsed seconds so far: {}'.format(int(delta.total_seconds())))

# Original / share / updated / update-info columns of the four sub-types:
# Sif1, Osa1, upd_sif1, sif1_upd_info, Sif2, ...
sub_type_columns = [
    column
    for n in range(1, 5)
    for column in (f'Sif{n}', f'Osa{n}', f'upd_sif{n}', f'sif{n}_upd_info')
]

# Checkpoint: write the code-update bookkeeping to CSV ...
siffer_checkpoint_columns = (
    ['orig_fid', 'Siffer', 'upd_siffer', 'siffer_upd_info', 'WRB_code', 'Varv']
    + sub_type_columns
)
initial_soil_dataframe[siffer_checkpoint_columns].to_csv(siffer_update_matchinfo, encoding="utf-8")

# ... and then remove those bookkeeping columns from the working frame.
initial_soil_dataframe.drop(['siffer_upd_info'] + sub_type_columns, axis=1, inplace=True)

tmp_step = datetime.datetime.now()
delta = tmp_step - start
logger.info('elapsed seconds so far: {}'.format(int(delta.total_seconds())))

In [None]:
logger.info("#########################################")
logger.info("# step by step loimis wrangling")
logger.info("#########################################")

# TODO
# Two parsers built from the short loimis grammars; both are handed to
# parse_test in the next cell.
t_parser = ParserPython(loimisp_short, memoization=False)
tk_parser = ParserPython(loimisp_short_dk, memoization=False)

# Export 1: sorted list of the distinct Loimis1 values.
distinct_loimis = pd.Series(initial_soil_dataframe['Loimis1'].unique())
unique_loimis1 = distinct_loimis.sort_values(ascending=True).reset_index(drop=True)
unique_loimis1.to_csv(unique_loimis_list, encoding='utf-8')

# Export 2: for every Loimis1 value, the distinct updated soil type codes
# it occurs with.
pf_df = initial_soil_dataframe.groupby('Loimis1')['upd_siffer'].unique().reset_index()
pf_df.to_csv(unique_loimis_with_siffer, encoding='utf-8')

# Missing Loimis1 values become the literal "no_info" before splitting;
# the split result is then passed through the bracket check.
loimis_filled = initial_soil_dataframe['Loimis1'].astype(object).fillna("no_info")
initial_soil_dataframe['split_layered'] = loimis_filled.apply(split_and_cut)
initial_soil_dataframe['num_brackets_fixed'] = initial_soil_dataframe['split_layered'].apply(test_brackets)

tmp_step = datetime.datetime.now()
delta = tmp_step - start
logger.info(f'elapsed seconds so far: {int(delta.total_seconds())}')

In [None]:
# Run every bracket-checked loimis value through parse_test with both
# short-grammar parsers; the result fills a parse column and an error column.
initial_soil_dataframe[['test_parse','test_parse_errors']] = initial_soil_dataframe['num_brackets_fixed'].apply(lambda x: parse_test(x, t_parser, tk_parser))

logger.debug(initial_soil_dataframe.sample(10))

tmp_step = datetime.datetime.now()
delta = tmp_step - start
logger.info('elapsed seconds so far (after parse_test): {}'.format(int(delta.total_seconds())))

# Rebuild the loimis value from the parse result; `has_no_info` is compared
# against 0 below to pick the rows flagged by parse_reconstituate.
initial_soil_dataframe[['loimis_reconst','has_no_info']] = initial_soil_dataframe['test_parse'].apply(lambda x: parse_reconstituate(x))

no_info_rows = initial_soil_dataframe.loc[initial_soil_dataframe['has_no_info'] > 0]
logger.debug(no_info_rows.count())
# The arguments of logger.debug are evaluated even when DEBUG is disabled,
# and DataFrame.sample raises ValueError when asked for more rows than
# exist - so cap the sample size at the number of flagged rows.
logger.debug(no_info_rows.sample(min(10, len(no_info_rows))))

tmp_step = datetime.datetime.now()
delta = tmp_step - start
logger.info('elapsed seconds so far (after parse_reconstituate): {}'.format(int(delta.total_seconds())))

In [None]:
# Intermediate columns produced by the split / bracket / parse steps.
parse_debug_columns = ['split_layered', 'num_brackets_fixed', 'test_parse', 'test_parse_errors']

# Checkpoint: original and reconstructed loimis values plus all parse
# intermediates go to CSV ...
loimis_checkpoint_columns = (
    ['orig_fid', 'Loimis1', 'Loimis2', 'Lihtloimis', 'Varv']
    + parse_debug_columns
    + ['loimis_reconst', 'has_no_info']
)
initial_soil_dataframe[loimis_checkpoint_columns].to_csv(loimis_update_parseinfo, encoding="utf-8")

# ... after which Loimis2 and the intermediates are removed from the
# working frame.
initial_soil_dataframe.drop(columns=['Loimis2'] + parse_debug_columns, inplace=True)

tmp_step = datetime.datetime.now()
delta = tmp_step - start
logger.info(f'elapsed seconds so far (after parquet_checkpoint_loimis_fix): {int(delta.total_seconds())}')

In [None]:
logger.info("#########################################")
logger.info("# loimis grammar analysis")
logger.info("#########################################")

# Keep Varv codes 1..20 only. Codes 0, 21 and 22 are dropped (ocean/sea,
# under water (e.g. Peipsi), outside of EE territory).
land_varv_codes = list(range(1, 21))
is_land = initial_soil_dataframe['Varv'].isin(land_varv_codes)
eesti_soil_red1 = initial_soil_dataframe.loc[is_land].copy()

# The unfiltered frame is not used after this point.
del initial_soil_dataframe

# Row-wise pass that overwrites the reconstructed loimis and its no-info
# flag with whatever load_default_texture_defensively returns for the row.
eesti_soil_red1[['loimis_reconst', 'has_no_info']] = eesti_soil_red1.apply(
    load_default_texture_defensively, axis=1
)

tmp_step = datetime.datetime.now()
delta = tmp_step - start
logger.info(f'elapsed seconds so far (after load_default_texture_defensively): {int(delta.total_seconds())}')

# Reduce the frame to the columns used from here on, in this order.
reorder = [
    'orig_fid', 'upd_siffer', 'WRB_code', 'Siffer',
    'loimis_reconst', 'has_no_info', 'Loimis1', 'Lihtloimis',
    'Huumus', 'Kivisus', 'Varv', 'Boniteet',
    'Shape_Area', 'wkb_geom', 'geometry',
]
eesti_soil_red1 = eesti_soil_red1[reorder]

tmp_step = datetime.datetime.now()
delta = tmp_step - start
logger.info(f'elapsed seconds so far (after df re-org): {int(delta.total_seconds())}')

In [None]:
# Parsers for the soil-parts grammars. Only the `_dk` variant is passed to
# loimis_grammar_product below; `sp_parser` is built but not used in this cell.
sp_parser = ParserPython(soilParts , memoization=False)
sp_tk_parser = ParserPython(soilParts_dk , memoization=False)

# Turn every reconstructed loimis value into its grammar product plus a
# parse-info value describing the outcome.
eesti_soil_red1[['loimis_grammar','parse_info']] = eesti_soil_red1['loimis_reconst'].apply(lambda x: loimis_grammar_product(x, sp_tk_parser))

# The helper `loimis_stats` called here originally is neither imported nor
# defined in this notebook, so a fresh "Run All" stopped with a NameError.
# Log the frequency of each parse_info value instead.
# NOTE(review): if loimis_stats lives somewhere in soil_lib, import it in the
# import cell and restore the call.
logger.info(eesti_soil_red1['parse_info'].value_counts(dropna=False))

# Optional preview of the new columns; the sample size is capped so that a
# small frame cannot make DataFrame.sample raise.
preview_columns = ['Loimis1', 'loimis_reconst', 'has_no_info', 'loimis_grammar','parse_info']
logger.debug(eesti_soil_red1[preview_columns].sample(min(15, len(eesti_soil_red1))))

# Number of features whose parse_info is 'empty_loimis'.
logger.info(eesti_soil_red1.loc[eesti_soil_red1['parse_info'].isin(['empty_loimis'])].index.size)

tmp_step = datetime.datetime.now()
delta = tmp_step - start
logger.info('elapsed seconds so far (after loimis_grammar_product): {}'.format(int(delta.total_seconds())))

In [None]:
logger.info("#########################################")
logger.info("# loimis grammar analysis layer values")
logger.info("#########################################")

# Derive the layer count, SOL_ZMX and the four SOL_Z* columns from the
# grammar product of every feature.
eesti_soil_red1[['nlayers', 'SOL_ZMX', 'SOL_Z1', 'SOL_Z2', 'SOL_Z3', 'SOL_Z4']] = eesti_soil_red1['loimis_grammar'].apply(lambda x: test_layer_depths(x))

# Spot-check features with SOL_ZMX below 100. DataFrame.sample raises
# ValueError when asked for more rows than exist, so cap the sample size at
# the number of matching rows.
shallow_rows = eesti_soil_red1.loc[eesti_soil_red1['SOL_ZMX'] < 100]
logger.info(shallow_rows.sample(min(15, len(shallow_rows))))

tmp_step = datetime.datetime.now()
delta = tmp_step - start
logger.info('elapsed seconds so far (after test_layer_depths): {}'.format(int(delta.total_seconds())))

In [None]:
logger.info("#########################################")
logger.info("# apply dataframe set_texture_values ")
logger.info("#########################################")

# Six texture columns per layer for layers 1..4, ordered layer by layer:
# EST_TXT1, LXTYPE1, SOL_CLAY1, SOL_SILT1, SOL_SAND1, SOL_ROCK1, EST_TXT2, ...
texture_columns = [
    f'{prefix}{layer}'
    for layer in range(1, 5)
    for prefix in ('EST_TXT', 'LXTYPE', 'SOL_CLAY', 'SOL_SILT', 'SOL_SAND', 'SOL_ROCK')
]

# set_texture_values is applied to every grammar product; its result fills
# the 24 columns above.
eesti_soil_red1[texture_columns] = eesti_soil_red1['loimis_grammar'].apply(set_texture_values)

logger.info(eesti_soil_red1.sample(10))

tmp_step = datetime.datetime.now()
delta = tmp_step - start
logger.info(f'elapsed seconds so far (after set_texture_values): {int(delta.total_seconds())}')

In [None]:
# Per-column non-null counts of the features whose SOL_ROCK1 exceeds 20.
logger.info(eesti_soil_red1.loc[eesti_soil_red1['SOL_ROCK1'] > 20].count())

lxtype_columns = ['LXTYPE1', 'LXTYPE2', 'LXTYPE3', 'LXTYPE4']

# Distinct LXTYPE values of each layer.
for lxtype in lxtype_columns:
    logger.info(set(eesti_soil_red1[lxtype].unique()))

# For each layer: number of features whose LXTYPE is 0, 'no_info' or ''.
for lxtype in lxtype_columns:
    values = eesti_soil_red1[lxtype]
    for mask in (values == 0, values.isin(['no_info']), values.isin([''])):
        logger.info(eesti_soil_red1.loc[mask].index.size)

# Checkpoint: original vs. reconstructed loimis and the per-layer texture
# labels go to CSV. The target path `soil_orig_texture_parse` has to be
# defined earlier in the notebook.
texture_checkpoint_columns = ['orig_fid', 'Loimis1', 'loimis_reconst', 'Lihtloimis']
for layer in range(1, 5):
    texture_checkpoint_columns += [f'EST_TXT{layer}', f'LXTYPE{layer}']

eesti_soil_red1[texture_checkpoint_columns].to_csv(soil_orig_texture_parse, encoding="utf-8")

In [None]:
# Column selection for the shapefile export: identifiers and source values,
# layer depths, per-layer texture labels, per-layer fractions, geometry.
layers = range(1, 5)
export_columns = (
    ['orig_fid', 'upd_siffer', 'WRB_code',
     'Boniteet', 'Varv', 'Loimis1', 'loimis_reconst',
     'nlayers', 'SOL_ZMX']
    + [f'SOL_Z{layer}' for layer in layers]
    + [f'{prefix}{layer}' for layer in layers for prefix in ('EST_TXT', 'LXTYPE')]
    + [f'{prefix}{layer}'
       for layer in layers
       for prefix in ('SOL_CLAY', 'SOL_SILT', 'SOL_SAND', 'SOL_ROCK')]
    + ['geometry']
)

# Write the selection as an ESRI Shapefile.
eesti_soil_red1[export_columns].to_file(
    shape_export_texture_values, driver='ESRI Shapefile', encoding="utf-8"
)

tmp_step = datetime.datetime.now()
delta = tmp_step - start
logger.info(f'elapsed seconds so far (after checkpointing and textures physical): {int(delta.total_seconds())}')