In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import geopandas as gpd
import shapely.geometry
from shapely.geometry import Point
from mpl_toolkits.basemap import Basemap
import glob
import rasterio
import os
import shutil
import pyimpute
from pathlib import Path

In [2]:
np.random.seed(42)

In [3]:
BIO_PATH = "assets/wc2.1_30s_bio/*.tif"
FILE_PATH = "generated_files\gbif.parquet"
raster_features = sorted(glob.glob(BIO_PATH))

In [4]:
# Ocurrence species data -> OSD_df
OSD_df = pd.read_parquet(FILE_PATH)
OSD_df['geometry'] = list(zip(OSD_df["Longitude"], OSD_df["Latitude"]))
OSD_df['geometry'] = OSD_df["geometry"].apply(Point)

# Create the geodataframe
OSD_geoframe = gpd.GeoDataFrame(
    OSD_df,
    crs = {'init': 'epsg:4326'},
    geometry = OSD_df['geometry']
)
OSD_geoframe = OSD_geoframe.to_crs("EPSG:4326")
OSD_geoframe.reset_index(drop=True, inplace = True)

coord_list = [(x,y) for x,y in zip(OSD_geoframe['geometry'].x , OSD_geoframe['geometry'].y)]

  in_crs_string = _prepare_from_proj_string(in_crs_string)


In [5]:
# Source: IBGE:  bcim_2016_21_11_2018
import json

STATES = ["RS","SC","PR","SP","MG","ES","RJ"]

In [7]:

INFOS_UFS = gpd.read_file("assets/FEATURES/MALHAS/bcim_2016_21_11_2018.gpkg", layer = 'lim_unidade_federacao_a')[['sigla','geometry']]
INFOS_UFS.to_file("assets/UFS_JSON", driver = "GeoJSON")

def generate_random_points(polygon, number):   
    minx, miny, maxx, maxy = polygon.bounds
    x = np.random.uniform( minx, maxx, number )
    y = np.random.uniform( miny, maxy, number )
    return x, y

with open("assets/UFS_JSON") as geofile:
    geojson_file = json.load(geofile)
INFOS_UFS = INFOS_UFS[INFOS_UFS['sigla'].isin(STATES)].reset_index()
br_union_geo = INFOS_UFS['geometry'].unary_union

gdf_poly = gpd.GeoDataFrame(index=["myPoly"], geometry=[br_union_geo], crs = {'init': 'epsg:4326'},)

x, y = generate_random_points(br_union_geo, 10_000)

df = pd.DataFrame()
df['geometry'] = list(zip(x,y))
df['geometry'] = df['geometry'].apply(Point)
gdf_points = gpd.GeoDataFrame(df, geometry='geometry')
Sjoin = gpd.tools.sjoin(gdf_points, gdf_poly, predicate="within", how='left')

# Keep points in "myPoly"
absence = gdf_points[Sjoin.index_right=='myPoly']

  in_crs_string = _prepare_from_proj_string(in_crs_string)
Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: None
Right CRS: +init=epsg:4326 +type=crs

  Sjoin = gpd.tools.sjoin(gdf_points, gdf_poly, predicate="within", how='left')


In [8]:
OSD_geoframe = OSD_geoframe[['Latitude','Longitude','geometry']].copy()
OSD_geoframe['presence'] = 1

In [9]:
absence['Latitude'] = absence['geometry'].y
absence['Longitude'] = absence['geometry'].x
absence['presence'] = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


In [10]:
merged_df = pd.concat([OSD_geoframe,absence])

  return GeometryArray(data, crs=_get_common_crs(to_concat))


In [11]:
coord_list = [(x,y) for x,y in zip(merged_df['geometry'].x , merged_df['geometry'].y)]

In [12]:
# Point sampling
for f in raster_features:
    src = rasterio.open(f)
    merged_df[Path(f).stem] = [x for x in src.sample(coord_list)]
    merged_df[Path(f).stem] = merged_df[Path(f).stem].astype('float64')

In [13]:
COLUMNS_RENAME_DICT = {
    'wc2.1_30s_bio_1':'01_annual_mean_temp',
    'wc2.1_30s_bio_2':'02_mean_diurnal_range',
    'wc2.1_30s_bio_3':'03_isothermality',
    'wc2.1_30s_bio_4':'04_temperature_seasonality',
    'wc2.1_30s_bio_5':'05_maximum_temp_warmest_month',
    'wc2.1_30s_bio_6':'06_minimum_temp_coldest_month',
    'wc2.1_30s_bio_7':'07_temp_annual_range',
    'wc2.1_30s_bio_8':'08_mean_temp_wettest_quarter',
    'wc2.1_30s_bio_9':'09_mean_temp_driest_quarter',
    'wc2.1_30s_bio_10':'10_mean_temp_warmest_quarter',
    'wc2.1_30s_bio_11':'11_mean_temp_coldest_quarter',
    'wc2.1_30s_bio_12':'12_annual_precipitation',
    'wc2.1_30s_bio_13':'13_precipitation_wettest_month',
    'wc2.1_30s_bio_14':'14_precipitation_driest_month',
    'wc2.1_30s_bio_15':'15_precipitation_seasonality',
    'wc2.1_30s_bio_16':'16_precipitation_of_wettest_quarter',
    'wc2.1_30s_bio_17':'17_precipitation_of_driest_quarter',
    'wc2.1_30s_bio_18':'18_precipitation_of_warmest_quarter',
    'wc2.1_30s_bio_19':'19_precipitation_of_coldest_quarter',
}

In [14]:
merged_df.rename(columns=COLUMNS_RENAME_DICT, inplace=True)

In [15]:
merged_df.drop(columns=["geometry"]).to_parquet("generated_files/bio_variables_dataframe_occurence_and_abscence.parquet", index=False)
merged_df.drop(columns=["geometry"]).to_csv("generated_files/bio_variables_dataframe_occurence_and_abscence.csv", index=False)