In [1]:
import geopandas as gpd
import folium
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as st
from calendar import monthrange
import os, json
from rasterstats import zonal_stats
import rasterio

In [2]:
data_year = 2015
project_files = "/home/sergei/Downloads/vkr/code_base/project_data/"
fires_path = project_files + "dataset_plus_grid/"
fires_nonfires = f"dataset_{str(data_year)}.geojson"
meteo_data = f"irkutsk_{str(data_year)}_hourly.json"
meteo_data_path = project_files + "meteo_data/data/"
# aggregation period 7 days
agg_period = 7
project_crs = "epsg:3857"
print(
    "Paths:", project_files, fires_path, meteo_data_path, fires_nonfires, meteo_data,
    "Aggregation period:", agg_period, "Project crs:", project_crs, sep="\n"
)

Paths:
/home/sergei/Downloads/vkr/code_base/project_data/
/home/sergei/Downloads/vkr/code_base/project_data/dataset_plus_grid/
/home/sergei/Downloads/vkr/code_base/project_data/meteo_data/data/
dataset_2016.geojson
irkutsk_2016_hourly.json
Aggregation period:
7
Project crs:
epsg:3857


#### Connect meteo factors

In [3]:
fires_nonfires_ds = gpd.read_file(fires_path + fires_nonfires)
if not fires_nonfires_ds.crs == project_crs:
    print(f"Converting \"{fires_nonfires_ds.name}\" to {project_crs}")
    fires_nonfires_ds = fires_nonfires_ds.to_crs(project_crs)
print(fires_nonfires_ds, fires_nonfires_ds.columns, fires_nonfires_ds.dtypes, fires_nonfires_ds.event_date, sep="\n")

      year        lat         lon event_date  is_fire  grid_lat  grid_lon  \
0     2016  55.373500   96.901600 2016-06-20        1      55.5      97.0   
1     2016  55.356800   96.878100 2016-06-12        1      55.5      97.0   
2     2016  55.315700   96.973700 2016-06-21        1      55.5      97.0   
3     2016  55.298500   96.946800 2016-06-11        1      55.5      97.0   
4     2016  57.909100   99.470500 2016-09-18        1      58.0      99.5   
...    ...        ...         ...        ...      ...       ...       ...   
3644  2016  60.001336  109.538007 2016-04-30        0      60.0     109.5   
3645  2016  58.545747  117.853937 2016-05-30        0      58.5     118.0   
3646  2016  54.220022   96.083512 2016-07-28        0      54.0      96.0   
3647  2016  62.608092  107.632860 2016-08-17        0      62.5     107.5   
3648  2016  53.361969   98.292232 2016-04-28        0      53.5      98.5   

                                               geometry  
0     POLYGON ((1

In [4]:
# load meteo data json for the given
meteo_data_p = meteo_data_path + meteo_data
print("Path:", meteo_data_p)
with open(meteo_data_p, mode='r', encoding="Windows-1251") as f:
    meteo_data = json.load(f)
print(type(meteo_data))

Path: /home/sergei/Downloads/vkr/code_base/project_data/meteo_data/data/irkutsk_2016_hourly.json
<class 'dict'>


In [5]:
# json data exploration
for k, v in meteo_data.items():
    print("Keys:", k, type(k), "Values:", type(meteo_data[k]))

print(
    "Meta length:", len(meteo_data["METADATA"]),
    "Meta:", json.dumps(meteo_data["METADATA"], indent=4, ensure_ascii=False),
    "Data length:", len(meteo_data["DATA"]),
    "Data[0]:", json.dumps(meteo_data["DATA"][0], indent=4), sep="\n",
)

# print the data fields russian description
metadata_desc_rus = {}
for k, v in meteo_data["METADATA"].items():
    metadata_desc_rus[k] = v["DESC_RUS"]
print("Data fields russian description:", json.dumps(metadata_desc_rus, indent=4, ensure_ascii=False))

# data structure
# METADATA: {"factor1": {}, "factor2": {}, ...}, DATA: [{factor1: val1, factor2: val2, ...}, {factor1: val, ...}, ...]

# ensure that all metadata keys used in data keys
# only id is an extra key not stated in metadata
d = {}
for k in meteo_data["DATA"][0].keys():
    d[k] = k in meteo_data["METADATA"].keys()
print(d)

Keys: METADATA <class 'str'> Values: <class 'dict'>
Keys: DATA <class 'str'> Values: <class 'list'>
Meta length:
20
Meta:
{
    "SOILW40": {
        "MIN": 0.15,
        "DESC_RUS": "Влажность почвы (в слое 10-40см), %",
        "UNSIGNED": 1,
        "DESC_ENG": "Soil humidity (in layer 10-40cm), %",
        "MAX": 0.47
    },
    "SOILW200": {
        "MIN": 0.15,
        "DESC_RUS": "Влажность почвы (в слое 100-200см), %",
        "UNSIGNED": 1,
        "DESC_ENG": "Soil humidity (in layer 100-200cm), %",
        "MAX": 0.46
    },
    "TMIN": {
        "MIN": -55.6,
        "DESC_RUS": "Минимальная температура, °С",
        "DESC_ENG": "Minimum temperature, °С",
        "MAX": 32.7
    },
    "SNOD": {
        "MIN": 0,
        "DESC_RUS": "Глубина снега, м",
        "UNSIGNED": 1,
        "DESC_ENG": "Snow depth, m",
        "MAX": 1.709
    },
    "TMPGR40": {
        "MIN": -18.42,
        "DESC_RUS": "Температура почвы (в слое 10-40см), °С",
        "DESC_ENG": "Soil temperatur

In [6]:
meteo_ds = pd.DataFrame(meteo_data["DATA"])
# pd.options.display.max_columns = None
print(meteo_ds, meteo_ds.dtypes, sep="\n")

                                     ID  SOILW40  SOILW200  TMIN   SNOD  \
0        2016-01-01 00:00:00 112.5 54.5      NaN       NaN   NaN    NaN   
1             2016-01-01 00:00:00 97 58      NaN       NaN   NaN    NaN   
2            2016-01-01 00:00:00 111 59      NaN       NaN   NaN    NaN   
3            2016-01-01 00:00:00 101 56      NaN       NaN   NaN    NaN   
4        2016-01-01 00:00:00 117.5 54.5      NaN       NaN   NaN    NaN   
...                                 ...      ...       ...   ...    ...   
1789003      2016-12-31 18:00:00 103 58     0.26      0.31 -10.5  0.369   
1789004   2016-12-31 18:00:00 96.5 54.5     0.35      0.40 -16.0  0.113   
1789005     2016-12-31 18:00:00 98.5 57     0.27      0.28  -8.0  0.377   
1789006      2016-12-31 18:00:00 103 62     0.29      0.33 -19.9  0.340   
1789007    2016-12-31 18:00:00 109 62.5     0.25      0.30 -21.7  0.584   

         TMPGR40      T        DATE  WIND_SPEED  TMPGR100  ... TMPGR10    LON  \
0            NaN  

In [7]:
metadata_desc_rus

{'SOILW40': 'Влажность почвы (в слое 10-40см), %',
 'SOILW200': 'Влажность почвы (в слое 100-200см), %',
 'TMIN': 'Минимальная температура, °С',
 'SNOD': 'Глубина снега, м',
 'TMPGR40': 'Температура почвы (в слое 10-40см), °С',
 'T': 'Температура, °С',
 'DATE': 'Дата',
 'WIND_SPEED': 'Скорость ветра, м/c',
 'TMPGR100': 'Температура почвы (в слое 40-100см), °С',
 'LAT': 'Широта, °',
 'TMPGR10': 'Температура почвы (в слое 0-10см), °С',
 'LON': 'Долгота, °',
 'TMAX': 'Максимальная температура, °С',
 'RH': 'Относительная влажность, %',
 'APCP': 'Количество осадков, кг/м2',
 'TIME': 'Время',
 'WIND_DIR': 'Направление ветра, °',
 'TMPGR200': 'Температура почвы (в слое 100-200см), °С',
 'SOILW10': 'Влажность почвы (в слое 0-10см), %',
 'SOILW100': 'Влажность почвы (в слое 40-100см), %'}

In [8]:
meteo_ds["DATE"] = pd.to_datetime(meteo_ds["DATE"])
meteo_ds["LAT"] = meteo_ds["LAT"].astype(float)
meteo_ds["LON"] = meteo_ds["LON"].astype(float)
meteo_ds["LAT"].head(5), meteo_ds["LON"].head(5), meteo_ds["DATE"].head(5)

(0    54.5
 1    58.0
 2    59.0
 3    56.0
 4    54.5
 Name: LAT, dtype: float64,
 0    112.5
 1     97.0
 2    111.0
 3    101.0
 4    117.5
 Name: LON, dtype: float64,
 0   2016-01-01
 1   2016-01-01
 2   2016-01-01
 3   2016-01-01
 4   2016-01-01
 Name: DATE, dtype: datetime64[ns])

In [9]:
# convert meteo data to the gmt+8
meteo_date = meteo_ds.DATE
meteo_time = meteo_ds.TIME
meteo_date_time = pd.to_datetime(meteo_date.astype(str) + " " + meteo_time)
meteo_ds["date_time"] = meteo_date_time + pd.DateOffset(hours=8)

In [10]:
def mode_num(x: pd.Series) -> float:
    """
    Description: counts mode for the series and gets its first value
    Params:
    x - the series
    Returns: one of the series modes or numpy nan if the mode was not found
    """
    m = x.mode()
    if len(m) == 0:
        return np.nan
    return float(m[0])

ex_cols = ["ID", "DATE", "TIME", "LAT", "LON", "date_time"]
target_columns = meteo_ds.columns[~meteo_ds.columns.isin(ex_cols)]

In [11]:
# T, TMAX (?), RH, WIND_DIR, WIND_SPEED, APCP
# !!! Attention: nan and None values are skipped
# SOILW10, TMPGR10 - ?? (soil temperature and humidity at 0-10 sm). Only possible to add starting from 2006
# aggregate for the aggregation period = 7
factors = {}
for i in fires_nonfires_ds.index:
    lat = fires_nonfires_ds.iloc[i].grid_lat
    lon = fires_nonfires_ds.iloc[i].grid_lon
    date = fires_nonfires_ds.iloc[i].event_date
    start = date - pd.DateOffset(days=agg_period)
    # include the target day as well, totally there're actually 8 days for the aggregation (the last one included)
    end = date + pd.DateOffset(days=1)
    # print("Start date:", start, "End date:", end, "Grid lat:", lat, "Grid lon:", lon)
    date_mask = (meteo_ds["date_time"] >= start) & (meteo_ds["date_time"] <= end)
    meteo_date_mask = meteo_ds[date_mask]
    # print(meteo_date_mask)
    # consider the floating point error
    coord_mask = ((meteo_date_mask["LAT"] - lat).abs() <= 1.0e-5) & ((meteo_date_mask["LON"] - lon).abs() <= 1.0e-5)
    meteo_factors = meteo_date_mask[coord_mask]
    
    factors[i] = {}
    # should be competable with the pd.df.aggregate method
    agg_functions = ["mean", "max", "min", "std", mode_num, "median",]
    for col in target_columns:
        agg_d = meteo_factors[col].aggregate(agg_functions).to_dict()
        # factors[i][col] = {f"{col.lower()}_{k}" : v for k, v in agg_d.items()}
        agg_d_ = {f"{col.lower()}_{k}" : v for k, v in agg_d.items()}
        factors[i].update(agg_d_)

assert len(factors) == fires_nonfires_ds.shape[0]
k_ = json.dumps({k : factors[k] for k in list(factors.keys())[:1]}, indent=4)
print(k_)

  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, ou

{
    "0": {
        "soilw40_mean": 0.36281250000000004,
        "soilw40_max": 0.39,
        "soilw40_min": 0.34,
        "soilw40_std": 0.018357449924810778,
        "soilw40_mode_num": 0.34,
        "soilw40_median": 0.37,
        "soilw200_mean": 0.4228125,
        "soilw200_max": 0.43,
        "soilw200_min": 0.42,
        "soilw200_std": 0.004568034093991748,
        "soilw200_mode_num": 0.42,
        "soilw200_median": 0.42,
        "tmin_mean": 14.390625,
        "tmin_max": 26.1,
        "tmin_min": 3.9,
        "tmin_std": 4.860264122319864,
        "tmin_mode_num": 13.3,
        "tmin_median": 14.4,
        "snod_mean": 0.0,
        "snod_max": 0.0,
        "snod_min": 0.0,
        "snod_std": 0.0,
        "snod_mode_num": 0.0,
        "snod_median": 0.0,
        "tmpgr40_mean": 5.0759375,
        "tmpgr40_max": 5.57,
        "tmpgr40_min": 4.48,
        "tmpgr40_std": 0.33725513817554564,
        "tmpgr40_mode_num": 4.48,
        "tmpgr40_median": 5.13,
        "t_mean": 1

In [12]:
meteo_factors_ds = pd.DataFrame.from_dict(factors, orient="index")
print(meteo_factors_ds.dtypes, meteo_factors_ds.shape, sep="\n")
meteo_factors_ds.head(5)

soilw40_mean         float64
soilw40_max          float64
soilw40_min          float64
soilw40_std          float64
soilw40_mode_num     float64
                      ...   
soilw100_max         float64
soilw100_min         float64
soilw100_std         float64
soilw100_mode_num    float64
soilw100_median      float64
Length: 96, dtype: object
(3649, 96)


Unnamed: 0,soilw40_mean,soilw40_max,soilw40_min,soilw40_std,soilw40_mode_num,soilw40_median,soilw200_mean,soilw200_max,soilw200_min,soilw200_std,...,soilw10_min,soilw10_std,soilw10_mode_num,soilw10_median,soilw100_mean,soilw100_max,soilw100_min,soilw100_std,soilw100_mode_num,soilw100_median
0,0.362813,0.39,0.34,0.018357,0.34,0.37,0.422812,0.43,0.42,0.004568,...,0.26,0.048692,0.27,0.335,0.398125,0.41,0.38,0.008206,0.4,0.4
1,0.34875,0.36,0.34,0.008328,0.34,0.35,0.411875,0.42,0.41,0.003966,...,0.27,0.039467,0.28,0.3,0.429688,0.45,0.41,0.009667,0.43,0.43
2,0.364375,0.39,0.34,0.017402,0.34,0.37,0.424063,0.43,0.42,0.00499,...,0.26,0.04698,0.34,0.335,0.394375,0.41,0.38,0.008776,0.4,0.4
3,0.34875,0.36,0.34,0.008328,0.34,0.35,0.410625,0.42,0.41,0.002459,...,0.27,0.03905,0.28,0.31,0.434063,0.45,0.42,0.009791,0.43,0.43
4,0.25,0.25,0.25,0.0,0.25,0.25,0.28,0.28,0.28,0.0,...,0.21,0.009837,0.23,0.22,0.26,0.26,0.26,0.0,0.26,0.26


In [13]:
# round to 6 decimal places
prec = 6
meteo_factors_ds = meteo_factors_ds.apply(lambda x: np.round(x, decimals=6))
factors_ds_nan = fires_nonfires_ds.join(meteo_factors_ds)
factors_ds_nan.dtypes

year                          int32
lat                         float64
lon                         float64
event_date           datetime64[ms]
is_fire                       int32
                          ...      
soilw100_max                float64
soilw100_min                float64
soilw100_std                float64
soilw100_mode_num           float64
soilw100_median             float64
Length: 104, dtype: object

In [14]:
# replace nans by the parameter mode calculated for the event month

nan_idx = factors_ds_nan[factors_ds_nan.isnull().any(axis=1)].index
# print(factors_ds.loc[nan_idx].geometry.geom_type)
print(nan_idx)

# for each row that has at least one nan value, extract the column of that value
# get a month of the row, get all values for the month and get mode, write to nan
factors_ds = factors_ds_nan.copy()
for idx in nan_idx:
    row = factors_ds.loc[idx]
    cols = row[row.isnull()].index.to_list()
    for col in cols:
        row_month = row.event_date.month
        m_ = factors_ds[factors_ds.event_date.dt.month == row_month][col].mode()
        m = 0 if len(m_) == 0 else m_[0]
        factors_ds.loc[idx, col] = m

# assert that no nans are left
assert factors_ds.isnull().sum().sum() == 0

Index([1083, 1086, 1090, 1091, 1101, 1103, 1104, 1106, 1107, 1119, 1129, 1634,
       1668, 1722, 1802, 1841, 1854, 1894, 1931, 1935, 1938, 2033, 2058, 2075,
       2126, 2168, 2243, 2347, 2481, 2511, 2559, 2611, 2771, 2866, 2901, 2903,
       2995, 3006, 3008, 3100, 3130, 3224, 3452, 3520, 3543],
      dtype='int64')


In [15]:
# factors_ds.explore()

#### Connect raster factors

In [16]:
# topography
topography_dir = project_files + "topography/"
elevation_path = topography_dir + "elevation.tif"
slope_path = topography_dir + "slope.tif"
aspect_path = topography_dir + "aspect.tif"
# topography columns names
elevation = "elevation"
slope = "slope"
aspect = "aspect"

# vegetation
vegetation_dir = project_files + "vegetation/"
vegetation_types_path = vegetation_dir + "CompositeMerged.tif"
# vegetation column name
vegetation_t = "vegetation_type"

print(elevation_path, slope_path, aspect_path, vegetation_types_path, elevation, slope, aspect, vegetation_t, sep="\n")

/home/sergei/Downloads/vkr/code_base/project_data/topography/elevation.tif
/home/sergei/Downloads/vkr/code_base/project_data/topography/slope.tif
/home/sergei/Downloads/vkr/code_base/project_data/topography/aspect.tif
/home/sergei/Downloads/vkr/code_base/project_data/vegetation/CompositeMerged.tif
elevation
slope
aspect
vegetation_type


In [17]:
print("We'll use a dataset with the connected meteo factors onwards")
factors_ds

We'll use a dataset with the connected meteo factors onwards


Unnamed: 0,year,lat,lon,event_date,is_fire,grid_lat,grid_lon,geometry,soilw40_mean,soilw40_max,...,soilw10_min,soilw10_std,soilw10_mode_num,soilw10_median,soilw100_mean,soilw100_max,soilw100_min,soilw100_std,soilw100_mode_num,soilw100_median
0,2016,55.373500,96.901600,2016-06-20,1,55.5,97.0,"POLYGON ((10792583.465 7430210.583, 10792583.2...",0.362813,0.39,...,0.26,0.048692,0.27,0.335,0.398125,0.41,0.38,0.008206,0.40,0.40
1,2016,55.356800,96.878100,2016-06-12,1,55.5,97.0,"POLYGON ((10775448.202 7432110.021, 10774925 7...",0.348750,0.36,...,0.27,0.039467,0.28,0.300,0.429688,0.45,0.41,0.009667,0.43,0.43
2,2016,55.315700,96.973700,2016-06-21,1,55.5,97.0,"POLYGON ((10783808.63 7422637.175, 10783808.56...",0.364375,0.39,...,0.26,0.046980,0.34,0.335,0.394375,0.41,0.38,0.008776,0.40,0.40
3,2016,55.298500,96.946800,2016-06-11,1,55.5,97.0,"POLYGON ((10795038.51 7415256.996, 10795037.99...",0.348750,0.36,...,0.27,0.039050,0.28,0.310,0.434062,0.45,0.42,0.009791,0.43,0.43
4,2016,57.909100,99.470500,2016-09-18,1,58.0,99.5,"POLYGON ((11073958.764 7948392.44, 11074854.48...",0.250000,0.25,...,0.21,0.009837,0.23,0.220,0.260000,0.26,0.26,0.000000,0.26,0.26
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3644,2016,60.001336,109.538007,2016-04-30,0,60.0,109.5,POINT (12193715.193 8400035.269),0.389375,0.42,...,0.37,0.030154,0.46,0.460,0.270000,0.27,0.27,0.000000,0.27,0.27
3645,2016,58.545747,117.853937,2016-05-30,0,58.5,118.0,POINT (13119440.283 8082846.395),0.351875,0.36,...,0.20,0.009158,0.20,0.210,0.340000,0.34,0.34,0.000000,0.34,0.34
3646,2016,54.220022,96.083512,2016-07-28,0,54.0,96.0,POINT (10695967.6 7211936.53),0.312500,0.32,...,0.30,0.023409,0.32,0.320,0.365625,0.37,0.36,0.005040,0.37,0.37
3647,2016,62.608092,107.632860,2016-08-17,0,62.5,107.5,POINT (11981635.22 9004792.815),0.244688,0.25,...,0.20,0.012443,0.20,0.210,0.240000,0.24,0.24,0.000000,0.24,0.24


##### Connect raster factors functions

In [18]:
def get_geometry_by_geom_type(
    geometry_column: gpd.GeoSeries, geom_type: str, raster_crs: rasterio.crs.CRS
) -> tuple[gpd.GeoSeries, gpd.GeoSeries]:
    """
    Description: extract the geometry data with a given type and in a given crs
    Params:
    geometry_column - a column that contains the GeoDataFrame geometry
    geom_type - a geometry type to be extracted
    raster_crs - a crs of a raster layer
    Returns: the GeoSeries that contains the geometry column with the given type and in the given crs and a mask according
    to which this series was extracted from the original column
    """
    # print(type(geometry_column))
    geom_mask = geometry_column.geometry.geom_type == geom_type
    masked = geometry_column[geom_mask].to_crs(raster_crs).geometry
    return masked, geom_mask

In [19]:
def add_raster_factor(factor: str, factor_path: str, 
                      dataset: gpd.GeoDataFrame, copy: bool = True, 
                      statistics: list=["mean"], add_stats: dict | None = None,
                     ) -> gpd.GeoDataFrame:
    """
    Description: extracts values from the raster file describing the factor in a given vector geometry dataset
    Params:
    factor - a factor to extract
    factor_path - a path to the factor raster file 
    dataset - a GeoDataFrame with the geometry column
    copy - copy the dataset
    statistics - the statistics to apply to polygons, default: mean, should be a list
    add_stats - is an optional argument that will be passed to the zonal statistics to calculate the custom statistics
    (for ex, get the array of pixel values that intersect the given geometry). The argument should be passed in the
    following form: {'statistics_name':function_name}. Only for print use, if not none not printed
    Returns: the dataset with the factor which values are extracted from the corresponding raster file
    Limitations: works only with the first raster band and computes zonal statistics only for the first 
    value in the statistics list
    """
    # Algorithm
    # add a factor column to the dataset
    # open the raster file
    # get the raster nodata value 
    # get all available geometry types in the dataset
    # for each geom type: 
    # separate points and polygons using get_geometry_by_geom_type function
    # if type==point, perform sampling using rasterio, if type==(multi)polygon apply zonal statistics from rasterstats
    # write values to the factor column
    # endfor
    # close file
    if copy:
        dataset = dataset.copy()
    
    dataset[factor] = 0.0
    raster_band = 1
    
    factor_raster = rasterio.open(factor_path)
    factor_raster_ds = factor_raster.read(raster_band)
    # nodatavals returns the nodata value for each band, here we get only for the first one
    nodata_value = factor_raster.nodatavals[raster_band - 1]

    factor_raster_ds = factor_raster_ds.astype("float64")
  
    geometry_column = dataset.geometry
    geom_types = geometry_column.geometry.geom_type.unique()

    supported_geom_types = ["Point", "Polygon", "MultiPolygon"]
    for geom_type in geom_types:        
        geometry_by_type, geometry_by_type_mask = get_geometry_by_geom_type(geometry_column, geom_type, factor_raster.crs)
        if geom_type == supported_geom_types[0]:
            coord_list = [(x, y) for x, y in zip(geometry_by_type.x, geometry_by_type.y)]
            factor_l = [x[0] for x in factor_raster.sample(coord_list, masked=True)]
            # to get rid of the masked values by converting them to nan
            factor_arr = np.array(factor_l).astype(float)  
            dataset.loc[geometry_by_type_mask, factor] = factor_arr     # assign values
            assert dataset[geometry_by_type_mask].geometry.geom_type.unique() == [geom_type]
        elif geom_type in supported_geom_types[1:]:
            transform = factor_raster.transform
            # source: https://pythonhosted.org/rasterstats/rasterstats.html
            # params: polygons geopandas dataframe (or path to the shp file), an array from a raster image, a coordinates
            # transformation matrix and the stats to count. all_touched is used to include pixels that're touched by
            # the geometry (by default if the geometry center doesn't intersect with the pixel that doesn't count) and
            # nodata_value is used to exclude nodata value from computation
            # pass only the geometry column as the function works a little bit faster
            # print("Statistics:", statistics)
            stats = zonal_stats(
                geometry_by_type, factor_raster_ds, 
                affine=transform, stats=statistics,
                all_touched=True, nodata=nodata_value, add_stats=add_stats,  #raster_out=False,
            )
            if not add_stats is None:
                print(stats)
            poly_stats_l = [x[statistics[0]] for x in stats]
            # to get rid of the "masked" values
            poly_stats_arr = np.array(poly_stats_l).astype(float)
            dataset.loc[geometry_by_type_mask, factor] = poly_stats_arr
            assert dataset[geometry_by_type_mask].geometry.geom_type.unique() == [geom_type]
    
    if not factor_raster.closed:
        factor_raster.close()
    
    return dataset

In [20]:
def my_stat(x):
    """
    Pass as an argument "add_stats" to the add_raster_factor function to get the underlying array of pixel values 
    that intersect the polygons
    """
    return np.ma.getdata(x)

In [21]:
statistics_ = ["mean", "majority"]
print(statistics_[1])
raster_ds = add_raster_factor(elevation, elevation_path, factors_ds)
add_raster_factor(slope, slope_path, raster_ds, copy=False)
add_raster_factor(aspect, aspect_path, raster_ds, copy=False)
add_raster_factor(vegetation_t, vegetation_types_path, raster_ds, copy=False, statistics=[statistics_[1]])
print(raster_ds)

majority


  factor_arr = np.array(factor_l).astype(float)
  factor_arr = np.array(factor_l).astype(float)
  factor_arr = np.array(factor_l).astype(float)


      year        lat         lon event_date  is_fire  grid_lat  grid_lon  \
0     2016  55.373500   96.901600 2016-06-20        1      55.5      97.0   
1     2016  55.356800   96.878100 2016-06-12        1      55.5      97.0   
2     2016  55.315700   96.973700 2016-06-21        1      55.5      97.0   
3     2016  55.298500   96.946800 2016-06-11        1      55.5      97.0   
4     2016  57.909100   99.470500 2016-09-18        1      58.0      99.5   
...    ...        ...         ...        ...      ...       ...       ...   
3644  2016  60.001336  109.538007 2016-04-30        0      60.0     109.5   
3645  2016  58.545747  117.853937 2016-05-30        0      58.5     118.0   
3646  2016  54.220022   96.083512 2016-07-28        0      54.0      96.0   
3647  2016  62.608092  107.632860 2016-08-17        0      62.5     107.5   
3648  2016  53.361969   98.292232 2016-04-28        0      53.5      98.5   

                                               geometry  soilw40_mean  \
0 

  factor_arr = np.array(factor_l).astype(float)


In [22]:
e = elevation
s = slope
asp = aspect
vt = vegetation_t
raster_ds[[elevation, slope, aspect, vegetation_t]].describe()

Unnamed: 0,elevation,slope,aspect,vegetation_type
count,3642.0,3642.0,3582.0,3610.0
mean,581.760206,2.521982,179.115979,13.786981
std,278.592608,2.645533,83.954202,4.90071
min,140.0,0.0,0.0,1.0
25%,402.5,0.951951,118.071574,12.0
50%,505.333333,1.647682,176.93772,15.0
75%,681.848837,3.084498,237.89514,18.0
max,2341.0,26.674868,359.370392,25.0


#####  None type values fill

In [23]:
# nan masks
el_nan_m = raster_ds.elevation.isnull()
slope_nan_m = raster_ds.slope.isnull()
asp_nan_m = raster_ds.aspect.isnull()
vt_nan_m = raster_ds.vegetation_type.isnull()
print(
    "The nans number for each raster factor:",
    raster_ds[el_nan_m].shape, raster_ds[slope_nan_m].shape, raster_ds[asp_nan_m].shape, raster_ds[vt_nan_m].shape
)
print(
    "Nans in the raster factors",
    raster_ds[el_nan_m],
    raster_ds[slope_nan_m],
    raster_ds[asp_nan_m],
    raster_ds[vt_nan_m],
    sep="\n"
)

The nans number for each raster factor: (7, 108) (7, 108) (67, 108) (39, 108)
Nans in the raster factors
      year        lat         lon event_date  is_fire  grid_lat  grid_lon  \
4     2016  57.909100   99.470500 2016-09-18        1      58.0      99.5   
240   2016  56.101600   97.405800 2016-04-26        1      56.0      97.5   
480   2016  58.665500  104.663000 2016-09-14        1      58.5     104.5   
1670  2016  58.926071  103.700382 2016-07-20        0      59.0     103.5   
1926  2016  63.630342  108.441738 2016-06-21        0      63.5     108.5   
2210  2016  59.103455  112.614028 2016-06-30        0      59.0     112.5   
2544  2016  53.696308   96.489557 2016-09-30        0      53.5      96.5   

                                               geometry  soilw40_mean  \
4     POLYGON ((11073958.764 7948392.44, 11074854.48...      0.250000   
240   POLYGON ((10843283.592 7577770.532, 10843283.5...      0.459375   
480   POLYGON ((11653069.811 8109496.053, 11653285.2...    

In [24]:
# first drop the vegetation type where nan
vt_nan_idx = raster_ds[vt_nan_m].index
print(vt_nan_idx)
raster_ds_clean = raster_ds.drop(index=vt_nan_idx).reset_index(drop=True)
assert abs(raster_ds_clean.shape[0] - raster_ds.shape[0]) == vt_nan_idx.shape[0]
raster_ds_clean[raster_ds_clean.vegetation_type.isnull()].shape
# raster_ds_clean.head(2), raster_ds_clean.index
# raster_ds_clean.is_fire.value_counts()

Index([ 282,  283,  284,  300,  301,  305,  306, 1714, 1764, 2052, 2157, 2178,
       2192, 2229, 2407, 2444, 2515, 2628, 2631, 2658, 2728, 2819, 2842, 2886,
       2904, 2915, 2930, 2992, 2995, 3018, 3118, 3121, 3192, 3273, 3373, 3473,
       3535, 3544, 3603],
      dtype='int64')


(0, 108)

In [25]:
# second: fill elevation and slope nans with the mean and aspect
# with the mode value for each column
# !!! Attention: changes the original dataset
prec = 6
el_nan_idx = raster_ds_clean[raster_ds_clean.elevation.isnull()].index
raster_ds_clean.loc[el_nan_idx, elevation] = raster_ds_clean.elevation.mean().round(prec)

slope_nan_idx = raster_ds_clean[raster_ds_clean.slope.isnull()].index
raster_ds_clean.loc[slope_nan_idx, slope] = raster_ds_clean.slope.mean().round(prec)

asp_nan_idx = raster_ds_clean[raster_ds_clean.aspect.isnull()].index
raster_ds_clean.loc[asp_nan_idx, aspect] = raster_ds_clean.aspect.mode()[0].round(prec)

# print(el_nan_idx, slope_nan_idx, asp_nan_idx, sep="\n")
# print(raster_ds_clean.loc[el_nan_idx, elevation], raster_ds_clean.loc[slope_nan_idx, slope],
#       raster_ds_clean.loc[asp_nan_idx, aspect], sep="\n")

In [26]:
# finally check all columns for nones/nans
print("None values left in the dataset:", raster_ds_clean.isnull().sum().sum())

None values left in the dataset: 0


#### Connect social factors

In [27]:
# social
social_dir = project_files + "social/"
roads_path = social_dir + "auto_roads.geojson"
rivers_path = social_dir + "rivers.geojson"
localities_path = social_dir + "localities_Irk_obl.geojson"
techno_objects_path = social_dir + "techno_obj.csv"
# social columns names
# roads = "elevation"
print(roads_path, rivers_path, localities_path, techno_objects_path, sep="\n")

/home/sergei/Downloads/vkr/code_base/project_data/social/auto_roads.geojson
/home/sergei/Downloads/vkr/code_base/project_data/social/rivers.geojson
/home/sergei/Downloads/vkr/code_base/project_data/social/localities_Irk_obl.geojson
/home/sergei/Downloads/vkr/code_base/project_data/social/techno_obj.csv


In [28]:
print("We'll use a dataset with the connected meteo and raster factors onwards")
raster_ds_clean.head(5)

We'll use a dataset with the connected meteo and raster factors onwards


Unnamed: 0,year,lat,lon,event_date,is_fire,grid_lat,grid_lon,geometry,soilw40_mean,soilw40_max,...,soilw100_mean,soilw100_max,soilw100_min,soilw100_std,soilw100_mode_num,soilw100_median,elevation,slope,aspect,vegetation_type
0,2016,55.3735,96.9016,2016-06-20,1,55.5,97.0,"POLYGON ((10792583.465 7430210.583, 10792583.2...",0.362813,0.39,...,0.398125,0.41,0.38,0.008206,0.4,0.4,481.88172,1.906583,143.342791,12.0
1,2016,55.3568,96.8781,2016-06-12,1,55.5,97.0,"POLYGON ((10775448.202 7432110.021, 10774925 7...",0.34875,0.36,...,0.429688,0.45,0.41,0.009667,0.43,0.43,510.92,1.098209,158.754916,12.0
2,2016,55.3157,96.9737,2016-06-21,1,55.5,97.0,"POLYGON ((10783808.63 7422637.175, 10783808.56...",0.364375,0.39,...,0.394375,0.41,0.38,0.008776,0.4,0.4,497.466667,1.347755,108.616395,12.0
3,2016,55.2985,96.9468,2016-06-11,1,55.5,97.0,"POLYGON ((10795038.51 7415256.996, 10795037.99...",0.34875,0.36,...,0.434062,0.45,0.42,0.009791,0.43,0.43,455.569106,1.753551,178.803949,12.0
4,2016,57.9091,99.4705,2016-09-18,1,58.0,99.5,"POLYGON ((11073958.764 7948392.44, 11074854.48...",0.25,0.25,...,0.26,0.26,0.26,0.0,0.26,0.26,584.334323,2.538001,135.0,11.0


In [29]:
# 
# # tried to make a function here
# def add_social_factor(dataset: gpd.GeoDataFrame, factor_name: str, 
#                       keep_columns: list[str] | None = None, ) -> gpd.GeoDataFrame:
#     # select columns we need
#     # keep_cols = ["type", "id"], "geometry"]
#     keep_cols = []
#     keep_cols = keep_columns + [dataset.geometry.name]
#     print(keep_cols, id(keep_cols) == id(keep_columns))
#     roads_ = roads[keep_cols]
#     roads_.rename(columns={"type" : "road_type", "id" : "road_id"}, inplace=True)
    
#     # # join datasets by distance
#     # roads_ds = gpd.sjoin_nearest(raster_ds_clean, roads_, how="left", distance_col="road_dist")
#     # roads_ds.drop(columns=["index_right"], inplace=True)
#     # print(roads_ds.columns, roads_ds.shape, raster_ds_clean.shape, sep="\n")
    
#     # # drop duplicates in the index
#     # print("Duplicates indeces: ", roads_ds[roads_ds.index.duplicated()].index)
#     # social_fac_ds = roads_ds[~roads_ds.index.duplicated(keep="first")]    # keep only the first found duplicated row
#     # print("Duplicates left:", social_fac_ds2.index.duplicated().sum())
#     # social_fac_ds.reset_index(drop=True, inplace=True)
#     # social_fac_ds.index
# keep_cols = ["type", "id"]
# add_social_factor(dataset=raster_ds_clean, factor_name="road", keep_columns=keep_cols)
# keep_cols
# 

In [30]:
# write the dataset with the connected social factors to the social_fac_ds
social_fac_ds = None

In [31]:
# connect roads
roads = gpd.read_file(roads_path)
if not roads.crs == project_crs:
    print(f"Converting \"{type(roads)}\" to {project_crs}")
    roads = roads.to_crs(project_crs)
print(roads.head(5), roads.dtypes, roads.shape, roads.crs, sep="\n")

# select columns we need
keep_cols = ["type", "id", "geometry"]
roads_ = roads[keep_cols]
roads_.rename(columns={"type" : "road_type", "id" : "road_id"}, inplace=True)

# join datasets by distance
roads_ds = gpd.sjoin_nearest(raster_ds_clean, roads_, how="left", distance_col="road_dist")
roads_ds.drop(columns=["index_right"], inplace=True)
print(roads_ds.columns, roads_ds.shape, raster_ds_clean.shape, sep="\n")

# drop duplicates in the index
print("Duplicates indeces: ", roads_ds[roads_ds.index.duplicated()].index)
roads_ds = roads_ds[~roads_ds.index.duplicated(keep="first")]    # keep only the first found duplicated row
print("Duplicates left:", roads_ds.index.duplicated().sum())
roads_ds.reset_index(drop=True, inplace=True)
roads_ds.index, roads_ds

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  roads_.rename(columns={"type" : "road_type", "id" : "road_id"}, inplace=True)


Converting "<class 'geopandas.geodataframe.GeoDataFrame'>" to epsg:3857
          type   id is_deleted symbol                created_by edited_by  \
0  федеральная  122          f   None  57cd206ea47f4dd00900000f      None   
1       прочая  123          f   None  57cd206ea47f4dd00900000f      None   
2       прочая  124          f   None  57cd206ea47f4dd00900000f      None   
3  федеральная  125          f   None  57cd206ea47f4dd00900000f      None   
4  федеральная  126          f   None  57cd206ea47f4dd00900000f      None   

                edited_on                       created_on published  \
0 2017-04-18 18:15:04.311 2017-04-18 18:15:04.311000+09:00         f   
1 2017-04-18 18:15:04.574 2017-04-18 18:15:04.574000+09:00         f   
2 2017-04-18 18:15:04.576 2017-04-18 18:15:04.576000+09:00         f   
3 2017-04-18 18:15:04.577 2017-04-18 18:15:04.577000+09:00         f   
4 2017-04-18 18:15:04.579 2017-04-18 18:15:04.579000+09:00         f   

                                

(RangeIndex(start=0, stop=3610, step=1),
       year        lat         lon event_date  is_fire  grid_lat  grid_lon  \
 0     2016  55.373500   96.901600 2016-06-20        1      55.5      97.0   
 1     2016  55.356800   96.878100 2016-06-12        1      55.5      97.0   
 2     2016  55.315700   96.973700 2016-06-21        1      55.5      97.0   
 3     2016  55.298500   96.946800 2016-06-11        1      55.5      97.0   
 4     2016  57.909100   99.470500 2016-09-18        1      58.0      99.5   
 ...    ...        ...         ...        ...      ...       ...       ...   
 3605  2016  60.001336  109.538007 2016-04-30        0      60.0     109.5   
 3606  2016  58.545747  117.853937 2016-05-30        0      58.5     118.0   
 3607  2016  54.220022   96.083512 2016-07-28        0      54.0      96.0   
 3608  2016  62.608092  107.632860 2016-08-17        0      62.5     107.5   
 3609  2016  53.361969   98.292232 2016-04-28        0      53.5      98.5   
 
                     

In [32]:
# connect rivers
rivers = gpd.read_file(rivers_path)
if not rivers.crs == project_crs:
    print(f"Converting \"{type(rivers)}\" to {project_crs}")
    rivers = rivers.to_crs(project_crs)
print(rivers.head(5), rivers.dtypes, rivers.shape, rivers.crs, sep="\n")

# select columns we need
keep_cols = ["id", "geometry"]
rivers_ = rivers[keep_cols]
rivers_.rename(columns={"id" : "river_id"}, inplace=True)

# join datasets by distance (use dataset with the connected roads)
rivers_ds = gpd.sjoin_nearest(roads_ds, rivers_, how="left", distance_col="river_dist")
rivers_ds.drop(columns=["index_right"], inplace=True)
print(rivers_ds.columns, rivers_ds.shape, raster_ds_clean.shape, sep="\n")

# drop duplicates in the index
print("Duplicates indeces: ", rivers_ds[rivers_ds.index.duplicated()].index)
rivers_ds = rivers_ds[~rivers_ds.index.duplicated(keep="first")]    # keep only the first found duplicated row
print("Duplicates left:", rivers_ds.index.duplicated().sum())
rivers_ds.reset_index(drop=True, inplace=True)
rivers_ds.index, rivers_ds

Converting "<class 'geopandas.geodataframe.GeoDataFrame'>" to epsg:3857
              name   basin         basin2  linewidth   sq  id is_deleted  \
0         Артюгина  Енисей       Артюгина        0.3  A01   1          f   
1    Верх. Сарчиха  Енисей  Верх. Сарчиха        0.3  A01   2          f   
2  Каменный Дубчес  Енисей         Дубчес        0.3  A01   3          f   
3           Елогуй  Енисей         Елогуй        0.3  A01   8          f   
4          Делтула  Енисей          Бахта        0.3  A02  19          f   

                 created_by edited_by               edited_on  \
0  50f7a1d80d58140037000006      None 2016-05-27 16:52:38.553   
1  50f7a1d80d58140037000006      None 2016-05-27 16:52:40.242   
2  50f7a1d80d58140037000006      None 2016-05-27 16:52:40.272   
3  50f7a1d80d58140037000006      None 2016-05-27 16:52:40.284   
4  50f7a1d80d58140037000006      None 2016-05-27 16:52:40.305   

                        created_on published  \
0 2016-05-27 16:52:38.553000+08:

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rivers_.rename(columns={"id" : "river_id"}, inplace=True)


Index(['year', 'lat', 'lon', 'event_date', 'is_fire', 'grid_lat', 'grid_lon',
       'geometry', 'soilw40_mean', 'soilw40_max',
       ...
       'soilw100_median', 'elevation', 'slope', 'aspect', 'vegetation_type',
       'road_type', 'road_id', 'road_dist', 'river_id', 'river_dist'],
      dtype='object', length=113)
(3620, 113)
(3610, 108)
Duplicates indeces:  Index([50, 94, 196, 197, 529, 532, 542, 1194, 1370, 1417], dtype='int64')
Duplicates left: 0


(RangeIndex(start=0, stop=3610, step=1),
       year        lat         lon event_date  is_fire  grid_lat  grid_lon  \
 0     2016  55.373500   96.901600 2016-06-20        1      55.5      97.0   
 1     2016  55.356800   96.878100 2016-06-12        1      55.5      97.0   
 2     2016  55.315700   96.973700 2016-06-21        1      55.5      97.0   
 3     2016  55.298500   96.946800 2016-06-11        1      55.5      97.0   
 4     2016  57.909100   99.470500 2016-09-18        1      58.0      99.5   
 ...    ...        ...         ...        ...      ...       ...       ...   
 3605  2016  60.001336  109.538007 2016-04-30        0      60.0     109.5   
 3606  2016  58.545747  117.853937 2016-05-30        0      58.5     118.0   
 3607  2016  54.220022   96.083512 2016-07-28        0      54.0      96.0   
 3608  2016  62.608092  107.632860 2016-08-17        0      62.5     107.5   
 3609  2016  53.361969   98.292232 2016-04-28        0      53.5      98.5   
 
                     

In [33]:
# connect localities
locs = gpd.read_file(localities_path)
if not locs.crs == project_crs:
    print(f"Converting \"{type(locs)}\" to {project_crs}")
    locs = locs.to_crs(project_crs)
print(locs.head(5), locs.columns, locs.dtypes, locs.shape, locs.crs, sep="\n")

# select columns we need
keep_cols = ["type", "id", "geometry"]
locs_ = locs[keep_cols]
locs_.rename(columns={"type" : "locality_type", "id" : "locality_id"}, inplace=True)

# join datasets by distance (use rivers_ds)
locs_ds = gpd.sjoin_nearest(rivers_ds, locs_, how="left", distance_col="locality_dist")
locs_ds.drop(columns=["index_right"], inplace=True)
print(locs_ds.columns, locs_ds.shape, raster_ds_clean.shape, sep="\n")

# drop duplicates in the index
print("Duplicates indeces: ", locs_ds[locs_ds.index.duplicated()].index)
locs_ds = locs_ds[~locs_ds.index.duplicated(keep="first")]    # keep only the first found duplicated row
print("Duplicates left:", locs_ds.index.duplicated().sum())
locs_ds.reset_index(drop=True, inplace=True)
locs_ds.index, locs_ds.head(2)

Converting "<class 'geopandas.geodataframe.GeoDataFrame'>" to epsg:3857
            name              type                                    name_MO  \
0          Аларь     село сельский          Муниципальное образование «Аларь»   
1  Александровск     село сельский  Муниципальное образование «Александровск»   
2        Алзобей  деревня сельский          Муниципальное образование «Аларь»   
3          Аляты     село сельский          Муниципальное образование «Аляты»   
4      Ангарский  поселок сельский      Муниципальное образование «Ангарский»   

          code distance       ado      id  \
0  25123902001       44  Аларский  01.���   
1  25123904001       10  Аларский  01.���   
2  25123902002       36  Аларский  01.���   
3  25123907001       55  Аларский  01.���   
4  25123910001       55  Аларский  01.���   

                                           query  \
0          Аларь село Аларский иркутская область   
1  Александровск село Аларский иркутская область   
2     Алзобей 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  locs_.rename(columns={"type" : "locality_type", "id" : "locality_id"}, inplace=True)


(RangeIndex(start=0, stop=3610, step=1),
    year      lat      lon event_date  is_fire  grid_lat  grid_lon  \
 0  2016  55.3735  96.9016 2016-06-20        1      55.5      97.0   
 1  2016  55.3568  96.8781 2016-06-12        1      55.5      97.0   
 
                                             geometry  soilw40_mean  \
 0  POLYGON ((10792583.465 7430210.583, 10792583.2...      0.362813   
 1  POLYGON ((10775448.202 7432110.021, 10774925 7...      0.348750   
 
    soilw40_max  ...      aspect  vegetation_type    road_type  road_id  \
 0         0.39  ...  143.342791             12.0  федеральная      145   
 1         0.36  ...  158.754916             12.0  федеральная      145   
 
        road_dist  river_id   river_dist     locality_type  locality_id  \
 0  141540.161582       387     0.000000  деревня сельский        24.13   
 1  144243.686388       387  7466.074068  деревня сельский        24.13   
 
    locality_dist  
 0   71677.947067  
 1   81299.287791  
 
 [2 rows x 116 c

In [34]:
techno_obj_raw = gpd.pd.read_csv(techno_objects_path, sep=";", usecols=["id", "WKT"])
techno_obj_geom = gpd.GeoSeries.from_wkt(techno_obj_raw["WKT"])
techno_obj = gpd.GeoDataFrame(data=techno_obj_raw, geometry=techno_obj_geom, crs="epsg:4326")
if not techno_obj.crs == project_crs:
    print(f"Converting \"{type(techno_obj)}\" to {project_crs}")
    techno_obj = techno_obj.to_crs(project_crs)
print(techno_obj.head(5), techno_obj.columns, techno_obj.dtypes, techno_obj.shape, techno_obj.crs, sep="\n")

# select columns we need
keep_cols = ["geometry"]
techno_obj_ = techno_obj[keep_cols]
# techno_obj_.rename(columns={"id" : "techno_obj_id"}, inplace=True)
# print(techno_obj_)

# join datasets by distance (use locs_ds)
techno_obj_ds = gpd.sjoin_nearest(locs_ds, techno_obj_, how="left", distance_col="techno_obj_dist")
techno_obj_ds.drop(columns=["index_right"], inplace=True)
print(techno_obj_ds.columns, techno_obj_ds.shape, raster_ds_clean.shape, sep="\n")
# print(techno_obj_ds.head(5))
# techno_obj_ds[techno_obj_ds.techno_obj_dist.isnull() != True]

# drop duplicates in the index
print("Duplicates indeces: ", techno_obj_ds[techno_obj_ds.index.duplicated()].index)
social_fac_ds = techno_obj_ds[~techno_obj_ds.index.duplicated(keep="first")]    # keep only the first found duplicated row
print("Duplicates left:", social_fac_ds.index.duplicated().sum())
social_fac_ds.reset_index(drop=True, inplace=True)
social_fac_ds.index

Converting "<class 'geopandas.geodataframe.GeoDataFrame'>" to epsg:3857
   id                                                WKT  \
0   0  MULTIPOLYGON (((105.999880426311 56.7999540577...   
1   1  MULTIPOLYGON (((105.985713935395 56.8240593653...   
2   2  MULTIPOLYGON (((105.967972910118 56.8666326931...   
3   3  MULTIPOLYGON (((105.974120505029 56.8693588037...   
4   4  MULTIPOLYGON (((105.98247195472 56.85509207094...   

                                            geometry  
0  MULTIPOLYGON (((11799852.713 7719340.458, 1179...  
1  MULTIPOLYGON (((11798275.707 7724242.632, 1179...  
2  MULTIPOLYGON (((11796300.785 7732908.265, 1179...  
3  MULTIPOLYGON (((11796985.132 7733463.49, 11796...  
4  MULTIPOLYGON (((11797914.811 7730558.243, 1179...  
Index(['id', 'WKT', 'geometry'], dtype='object')
id             int64
WKT           object
geometry    geometry
dtype: object
(1809, 3)
epsg:3857
Index(['year', 'lat', 'lon', 'event_date', 'is_fire', 'grid_lat', 'grid_lon',
       'geome

RangeIndex(start=0, stop=3610, step=1)

In [35]:
print(social_fac_ds.dtypes, social_fac_ds.shape, social_fac_ds.crs, sep="\n")
social_fac_ds.head(5)

year                        int32
lat                       float64
lon                       float64
event_date         datetime64[ms]
is_fire                     int32
                        ...      
river_dist                float64
locality_type              object
locality_id                object
locality_dist             float64
techno_obj_dist           float64
Length: 117, dtype: object
(3610, 117)
EPSG:3857


Unnamed: 0,year,lat,lon,event_date,is_fire,grid_lat,grid_lon,geometry,soilw40_mean,soilw40_max,...,vegetation_type,road_type,road_id,road_dist,river_id,river_dist,locality_type,locality_id,locality_dist,techno_obj_dist
0,2016,55.3735,96.9016,2016-06-20,1,55.5,97.0,"POLYGON ((10792583.465 7430210.583, 10792583.2...",0.362813,0.39,...,12.0,федеральная,145,141540.161582,387,0.0,деревня сельский,24.13,71677.947067,71675.144461
1,2016,55.3568,96.8781,2016-06-12,1,55.5,97.0,"POLYGON ((10775448.202 7432110.021, 10774925 7...",0.34875,0.36,...,12.0,федеральная,145,144243.686388,387,7466.074068,деревня сельский,24.13,81299.287791,81292.680149
2,2016,55.3157,96.9737,2016-06-21,1,55.5,97.0,"POLYGON ((10783808.63 7422637.175, 10783808.56...",0.364375,0.39,...,12.0,федеральная,145,153615.572695,387,1558.898454,деревня сельский,24.13,78742.794577,78778.038109
3,2016,55.2985,96.9468,2016-06-11,1,55.5,97.0,"POLYGON ((10795038.51 7415256.996, 10795037.99...",0.34875,0.36,...,12.0,федеральная,145,155251.233439,387,0.0,деревня сельский,24.13,77162.777572,77206.240617
4,2016,57.9091,99.4705,2016-09-18,1,58.0,99.5,"POLYGON ((11073958.764 7948392.44, 11074854.48...",0.25,0.25,...,11.0,прочая,151,23068.115932,264,27436.05501,село сельский,31.37,50226.247078,19845.983899


In [36]:
# drop rows where the distance to the locality < 50 m or to the techno object < 100 m
# then drop techno_obj_dist colum
loc_col = "locality_dist"
tech_col = "techno_obj_dist"
idx_ = social_fac_ds[(social_fac_ds[loc_col] < 50) | (social_fac_ds[tech_col] < 100)].index
print(social_fac_ds[(social_fac_ds[loc_col] < 50)].shape, social_fac_ds[(social_fac_ds[tech_col] < 100)].shape)
s_ds = social_fac_ds.drop(index=idx_)

soc_fac_ds = s_ds.reset_index(drop=True)
print(soc_fac_ds.shape, social_fac_ds.shape)
print(
    "Distance to locality < 50:",soc_fac_ds[soc_fac_ds[loc_col] < 50].shape, 
    "Distance to techno obj < 100:", soc_fac_ds[soc_fac_ds[tech_col] < 100].shape,
    sep="\n"
)
print("Duplicates in idx:", soc_fac_ds.index.duplicated().sum())
vc = soc_fac_ds.is_fire.value_counts()
print("Fires number:", vc[1], "Non-fires number:", vc[0])

print("Drop techno_obj_dist column")
soc_fac_ds.drop(columns=["techno_obj_dist"], inplace=True)
print("Dropped?", not soc_fac_ds.columns.isin(["techno_obj_dist"]).any())
print("Shape:", soc_fac_ds.shape)

(129, 117) (71, 117)
(3436, 117) (3610, 117)
Distance to locality < 50:
(0, 117)
Distance to techno obj < 100:
(0, 117)
Duplicates in idx: 0
Fires number: 1485 Non-fires number: 1951
Drop techno_obj_dist column
Dropped? True
Shape: (3436, 116)


In [40]:
# m = soc_fac_ds.loc[:, ["lat", "lon", "geometry"]].explore(color="red")
m = soc_fac_ds.explore(color="red")
rivers_.explore(m=m, color="blue")
roads_.explore(m=m, color="black")
locs_.explore(m=m, color="green")
techno_obj_.explore(m=m, color="purple")

# save the interactive map
maps_dir = "interactive_maps" + "/fires_geo_factors_maps"
if not os.path.exists(maps_dir):
    os.mkdir(maps_dir)
m.save(maps_dir + f"/fires_geo_factors_map_{data_year}.html")

In [38]:
# write file
write_path = project_files + "fires_with_factors/"
if not os.path.exists(write_path):
  os.mkdir(write_path)
soc_fac_ds.to_file(write_path + "factor_dataset_" + str(data_year) + ".geojson")