# Prepare environment

In [38]:
import os
import numpy as np
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt

import rioxarray as rxr
from tqdm.notebook import tqdm

In [39]:
import sys
sys.path.append("..//..")

import config

# Load IBAMA data

In [40]:
folder = os.path.join(config.RAW_DATA_PATH, "ibama")

In [41]:
os.listdir(folder)

['Access_Minut_Beef_2012.tif',
 'Access_Minut_City.tif',
 'Access_Minut_soy.tif',
 'Access_Minut_soy_p25.tif',
 'Access_Minut_wood_2012.tif',
 'garimpos.tif',
 'Modelling_layers_metadata.xlsx',
 'Multas_upto2019.tif',
 'Pasture_Mapbiomas.tif',
 'Soybean_Mapbiomas.tif',
 'Terras_Devolutas.tif',
 'Terras_Devolutas_IO.tif',
 'TI_Dist.tif',
 'TI_IO.tif',
 'UCPI_dist.tif',
 'UCPI_IO.tif',
 'UCUS_Dist.tif',
 'UCUS_IO.tif']

In [48]:
gdf_list = []
for name in tqdm(os.listdir(folder)):
    if name[-3:] == "tif":
        name = name.split(".tif")[0]
        dataarray = rxr.open_rasterio(
            os.path.join(folder, f"{name}.tif")
        )
        df = dataarray[0].to_pandas()
        df.replace(dataarray[0].attrs["_FillValue"], np.nan, inplace=True)
        df = pd.melt(df, ignore_index=False).reset_index().rename(columns={"value": name})
        df.dropna(inplace=True)
        gdf = gpd.GeoDataFrame(
            df, geometry=gpd.points_from_xy(df.x, df.y)
        ).drop(["x", "y"], axis=1).reset_index(drop=True)
        
        gdf_list.append(gdf)

  0%|          | 0/18 [00:00<?, ?it/s]

In [58]:
gdf_data = pd.DataFrame([k.index[-1] for k in gdf_list], columns=["len"]).reset_index()
gdf_data["len"].value_counts()

len
5012128    6
5011816    5
5014087    4
5014103    1
5014151    1
Name: count, dtype: int64

## Aggregate datasets by geometry values

In [90]:
# validate that datasets with same size correspond to same geometry distribution
for len_ in gdf_data["len"].unique():
    len_idx = gdf_data[gdf_data["len"] == len_].index
    for i in range(len(len_idx) - 1):
        ndiff1 = sum(gdf_list[len_idx[i]]["geometry"].head(100) != gdf_list[len_idx[i+1]]["geometry"].head(100))
        ndiff2 = sum(gdf_list[len_idx[i]]["geometry"].tail(100) != gdf_list[len_idx[i+1]]["geometry"].tail(100))
        if ndiff1 > 0:
            print(1, len_, i, ndiff1)
        if ndiff2 > 0:
            print(2, len_, i, ndiff2)

In [122]:
# create a concatenated dataset for each size group
concat_gdf_list = []
for len_ in gdf_data["len"].unique():
    # datasets with same size
    len_idx = gdf_data[gdf_data["len"] == len_].index
    # correspondent geometry list
    geom_ = gdf_list[len_idx[0]]["geometry"].values
    len_gdf = gpd.GeoDataFrame(geometry=geom_)
    # add variable columns
    for idx in len_idx:
        len_idx_gdf = gdf_list[idx]
        for col in len_idx_gdf.columns:
            if col != "geometry":
                len_gdf[col] = len_idx_gdf[col]
                
    concat_gdf_list.append(len_gdf)

## Join with defined grid

In [134]:
%%time

# load grid data (takes some minutes)
frames = gpd.read_file(config.TR_FRAMES)

CPU times: total: 14min 56s
Wall time: 14min 56s


In [161]:
ibama_df = pd.DataFrame()
for concat_gdf in tqdm(concat_gdf_list):
    gdf_frames = gpd.overlay(
        frames[frames["in_borders"] == 1][["frame_id", "geometry"]],
        concat_gdf.set_crs(frames.crs),
        how="intersection", 
        keep_geom_type=False
    ).dissolve("frame_id", aggfunc="mean")\
        .drop("geometry", axis=1)
    for col in gdf_frames.columns:
        ibama_df[col] = gdf_frames[col]

  0%|          | 0/5 [00:00<?, ?it/s]

In [173]:
ibama_df.isna().sum()

Access_Minut_Beef_2012       0
Access_Minut_City            0
Access_Minut_soy             0
Access_Minut_soy_p25         0
Access_Minut_wood_2012       0
garimpos                  1783
Multas_upto2019           1783
Terras_Devolutas          1783
TI_Dist                   1783
UCPI_dist                 1783
UCUS_Dist                 1783
Pasture_Mapbiomas         1824
Soybean_Mapbiomas         1824
UCPI_IO                   1824
UCUS_IO                   1824
Terras_Devolutas_IO       1817
TI_IO                     1789
dtype: int64

In [174]:
frames[(frames["in_borders"] == 1) &
      (~frames["frame_id"].isin(ibama_df.index))].shape

(8976, 5)

# Save files

In [166]:
ibama_df.head()

Unnamed: 0_level_0,Access_Minut_Beef_2012,Access_Minut_City,Access_Minut_soy,Access_Minut_soy_p25,Access_Minut_wood_2012,garimpos,Multas_upto2019,Terras_Devolutas,TI_Dist,UCPI_dist,UCUS_Dist,Pasture_Mapbiomas,Soybean_Mapbiomas,UCPI_IO,UCUS_IO,Terras_Devolutas_IO,TI_IO
frame_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1048,1731.996704,441494.65625,1766.176025,3627.775146,1733.241089,6.405919,0.356288,0.419389,0.414698,0.0,0.930295,,,,,,
1049,1724.342529,438095.75,1758.582275,3620.121094,1725.647339,6.401271,0.358501,0.412888,0.408937,0.0,0.929087,0.0,0.0,1.0,0.0,0.0,0.0
1050,1715.001465,434546.84375,1749.272095,3610.780029,1716.337158,6.396634,0.360931,0.406487,0.403301,0.0,0.927966,,,,,,
3378,1717.439209,436251.4375,1751.699097,3613.217773,1718.76416,6.402734,0.345374,0.419785,0.413693,0.0,0.922568,0.0,0.0,1.0,0.0,0.0,0.0
3379,1710.102661,432888.8125,1744.338989,3605.881104,1711.404053,6.398072,0.347417,0.413089,0.407714,0.0,0.921259,0.0,0.0,1.0,0.0,0.0,0.0


In [167]:
# forest areas
ibama_df.to_csv(config.TR_IBAMA)