# Extraction of ESA WorldCover land-cover data (land use): % cropland, % urban area, % natural vegetation

In [32]:
import warnings
warnings.filterwarnings("ignore")

import planetary_computer as pc
import pandas as pd
import numpy as np
from tqdm import tqdm
from scipy.spatial import cKDTree

import pystac_client
import planetary_computer as pc
import requests
import time

from odc.stac import stac_load

from datetime import date
from tqdm import tqdm
import os

tqdm.pandas()

## Running only on unique coordinates

In [37]:


def get_lulc_features(row):
    """Compute ESA WorldCover land-cover percentages around one sampling point.

    A square bounding box of +/- 0.009 degrees is built around the point, the
    first "esa-worldcover" item returned by the Planetary Computer STAC API
    for that box is loaded, and the share of pixels belonging to each class
    group is computed. Transient failures are retried up to three times with
    an increasing pause between attempts.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide 'Latitude' and 'Longitude'.

    Returns
    -------
    pandas.Series
        'pct_agriculture' (class 40), 'pct_urban' (class 50) and
        'pct_natural' (classes 10, 20, 30), each as a percentage of all
        pixels in the loaded window. Every value is NaN when the search
        returns no item, the loaded window is empty, or all attempts fail.

    Raises
    ------
    KeyError
        If ``row`` has no 'Latitude' or 'Longitude' entry.
    """
    columns = ('pct_agriculture', 'pct_urban', 'pct_natural')

    def _all_nan():
        # Fresh Series on every call so callers never share a mutable result.
        return pd.Series({name: np.nan for name in columns})

    max_retries = 3

    # Read the coordinates before the retry loop: they are needed by the
    # error messages, and a missing column is not something a retry can fix.
    lat, lon = row['Latitude'], row['Longitude']

    # 1. Buffer of roughly 1 km around the point (0.009 degrees), so the
    #    surroundings (nearby farms or towns) are taken into account.
    buffer = 0.009
    bbox = [lon - buffer, lat - buffer, lon + buffer, lat + buffer]

    for attempt in range(max_retries):
        try:
            catalog = pystac_client.Client.open("https://planetarycomputer.microsoft.com/api/stac/v1")
            search = catalog.search(collections=["esa-worldcover"], bbox=bbox)
            items = list(search.get_items())

            if not items:
                return _all_nan()

            # 2. Load the class map (band "map") of the first item only.
            # NOTE(review): a box straddling several items is not mosaicked.
            selected_item = items[0]
            data = stac_load(
                [selected_item],
                bands=["map"],
                bbox=bbox,
                chunks={},
                patch_url=pc.sign_url
            ).compute().isel(time=0)

            lulc = data["map"].values
            total_pixels = lulc.size
            if total_pixels == 0:
                # Nothing was loaded: a percentage would be a division by zero.
                return _all_nan()

            # 3. Percentages of the classes of interest.
            # Class 40 = Cropland
            pct_agri = np.sum(lulc == 40) / total_pixels * 100

            # Class 50 = Built-up
            pct_urban = np.sum(lulc == 50) / total_pixels * 100

            # Classes 10, 20, 30 = natural vegetation (trees, shrubland, grassland)
            pct_natural = np.sum(np.isin(lulc, [10, 20, 30])) / total_pixels * 100

            return pd.Series({
                'pct_agriculture': float(pct_agri),
                'pct_urban': float(pct_urban),
                'pct_natural': float(pct_natural)
            })
        except Exception as e:
            print(f"Erro ao processar ponto ({lat}, {lon}): {e}")
            if attempt < max_retries - 1:
                print("tentando novamente...")
                # Growing back-off (3, 5 seconds); no pause after the last attempt.
                time.sleep(3 ** attempt + 2)

    # Every attempt failed: return NaNs instead of falling through to None.
    return _all_nan()

def get_lulc_features_v2(row):
    """Compute an extended set of ESA WorldCover percentages around one point.

    A square bounding box of +/- 0.009 degrees is built around the point, the
    first "esa-worldcover" item returned by the Planetary Computer STAC API
    for that box is signed and loaded, and the share of pixels belonging to
    each class group is computed. Transient failures are retried up to three
    times with an increasing pause between attempts.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide 'Latitude' and 'Longitude'.

    Returns
    -------
    pandas.Series
        Percentages of all pixels in the loaded window:
        'pct_agri' (class 40), 'pct_urban' (class 50),
        'pct_natural' (classes 10, 20, 30), 'pct_water' (class 80),
        'pct_wetlands' (classes 90, 95) and 'pct_others' (the remainder up
        to 100, floored at 0). All six values are NaN when the search
        returns no item, the loaded window is empty, or all attempts fail,
        so "no data" is never reported as "0 % of everything".

    Raises
    ------
    KeyError
        If ``row`` has no 'Latitude' or 'Longitude' entry.
    """
    columns = ('pct_agri', 'pct_urban', 'pct_natural', 'pct_water', 'pct_wetlands', 'pct_others')

    def _all_nan():
        # Same six keys as a successful result, so every row has one schema.
        return pd.Series({name: np.nan for name in columns})

    max_retries = 3

    # Read the coordinates before the retry loop: they are needed by the
    # error messages, and a missing column is not something a retry can fix.
    lat, lon = row['Latitude'], row['Longitude']

    buffer = 0.009
    bbox = [lon - buffer, lat - buffer, lon + buffer, lat + buffer]

    for attempt in range(max_retries):
        try:
            catalog = pystac_client.Client.open("https://planetarycomputer.microsoft.com/api/stac/v1")
            search = catalog.search(collections=["esa-worldcover"], bbox=bbox)
            items = list(search.get_items())

            if not items:
                return _all_nan()

            # Only the first item is used.
            # NOTE(review): a box straddling several items is not mosaicked.
            selected_item = pc.sign(items[0])
            data = stac_load([selected_item], bands=["map"], bbox=bbox, chunks={}).compute().isel(time=0)
            lulc = data["map"].values
            total_pixels = lulc.size
            if total_pixels == 0:
                # Nothing was loaded: a percentage would be a division by zero.
                return _all_nan()

            # 1. Agriculture (class 40)
            pct_agri = np.sum(lulc == 40) / total_pixels * 100

            # 2. Urban (class 50)
            pct_urban = np.sum(lulc == 50) / total_pixels * 100

            # 3. Natural vegetation (forest, shrubland, grassland)
            pct_natural = np.sum(np.isin(lulc, [10, 20, 30])) / total_pixels * 100

            # 4. Water (rivers and lakes)
            pct_water = np.sum(lulc == 80) / total_pixels * 100

            # 5. Flooded areas (wetlands and mangroves)
            pct_wetlands = np.sum(np.isin(lulc, [90, 95])) / total_pixels * 100

            # 6. Everything else (bare soil, snow, ...) so the total is 100 %
            pct_others = 100 - (pct_agri + pct_urban + pct_natural + pct_water + pct_wetlands)

            return pd.Series({
                'pct_agri': pct_agri,
                'pct_urban': pct_urban,
                'pct_natural': pct_natural,
                'pct_water': pct_water,
                'pct_wetlands': pct_wetlands,
                'pct_others': max(0, pct_others)  # never negative because of rounding error
            })

        except Exception as e:
            print(f"Erro ao processar ponto ({lat}, {lon}): {e}")
            if attempt < max_retries - 1:
                print("tentando novamente...")
                # Growing back-off (3, 5 seconds); no pause after the last attempt.
                time.sleep(3 ** attempt + 2)

    # Every attempt failed: return NaNs instead of falling through to None.
    return _all_nan()

## Training data

In [34]:
# Load the training samples and keep one row per distinct (Latitude, Longitude)
# pair, so each location is queried only once.
Water_Quality_df = pd.read_csv('water_quality_training_dataset.csv')
unique_coords = (
    Water_Quality_df
    .loc[:, ['Latitude', 'Longitude']]
    .drop_duplicates()
    .reset_index(drop=True)
)
unique_coords.shape

(162, 2)

In [35]:



unique_10 = unique_coords[0:10]  # Only the first 10 unique coordinates, for a quick test
unique_10

Unnamed: 0,Latitude,Longitude
0,-28.760833,17.730278
1,-26.861111,28.884722
2,-26.45,28.085833
3,-27.671111,27.236944
4,-27.356667,27.286389
5,-27.010111,26.698083
6,-25.127778,27.628889
7,-25.20639,27.558
8,-24.69514,27.40906
9,-26.984722,26.632278


In [38]:
# Compute the land-cover percentages once per unique training location
# (progress_apply shows a progress bar), then place them next to the
# coordinates they were computed for.
lulc_results = unique_coords.progress_apply(
    get_lulc_features_v2,
    axis=1,
)
lulc_final_df = pd.concat(
    [unique_coords, lulc_results],
    axis=1,
)

lulc_final_df


100%|██████████| 162/162 [04:44<00:00,  1.76s/it]


Unnamed: 0,Latitude,Longitude,pct_agri,pct_urban,pct_natural,pct_water,pct_wetlands,pct_others
0,-28.760833,17.730278,0.000000,0.000000,12.073306,12.318655,0.0,75.608039
1,-26.861111,28.884722,43.965682,0.628597,55.248572,0.157149,0.0,0.000000
2,-26.450000,28.085833,33.746999,1.914009,59.443587,4.880401,0.0,0.015003
3,-27.671111,27.236944,4.635902,42.714859,47.679925,4.253647,0.0,0.715666
4,-27.356667,27.286389,33.081584,1.169141,65.740741,0.006400,0.0,0.002133
...,...,...,...,...,...,...,...,...
157,-32.601389,18.750556,47.187241,0.862197,51.795536,0.000000,0.0,0.155026
158,-28.308889,31.902500,0.151476,0.008534,96.516044,3.302611,0.0,0.021335
159,-24.958611,29.395278,7.613243,2.406082,86.540381,2.070547,0.0,1.369747
160,-26.619444,27.980833,40.337515,5.572623,53.908517,0.010667,0.0,0.170678


In [39]:
# Re-read the training samples without the three water-quality measurement
# columns (presumably the prediction targets - confirm), keeping the
# location/date columns that the land-cover features are joined onto.
train_df = pd.read_csv('water_quality_training_dataset.csv').drop(
    columns=['Total Alkalinity', 'Electrical Conductance', 'Dissolved Reactive Phosphorus']
)

# Attach the per-location land-cover percentages to every sample.
# validate='many_to_one' makes pandas raise if lulc_final_df ever contains
# the same coordinate pair twice, which would silently duplicate samples.
train_df_final = pd.merge(
    train_df,
    lulc_final_df,
    on=['Latitude', 'Longitude'],
    how='left',
    validate='many_to_one',
)

# to_csv does not create missing parent directories, so make sure it exists.
os.makedirs("../Datasets", exist_ok=True)
train_df_final.to_csv("../Datasets/soil_use_data_training.csv", index=False)
train_df_final



Unnamed: 0,Latitude,Longitude,Sample Date,pct_agri,pct_urban,pct_natural,pct_water,pct_wetlands,pct_others
0,-28.760833,17.730278,02-01-2011,0.000000,0.000000,12.073306,12.318655,0.0,75.608039
1,-26.861111,28.884722,03-01-2011,43.965682,0.628597,55.248572,0.157149,0.0,0.000000
2,-26.450000,28.085833,03-01-2011,33.746999,1.914009,59.443587,4.880401,0.0,0.015003
3,-27.671111,27.236944,03-01-2011,4.635902,42.714859,47.679925,4.253647,0.0,0.715666
4,-27.356667,27.286389,03-01-2011,33.081584,1.169141,65.740741,0.006400,0.0,0.002133
...,...,...,...,...,...,...,...,...,...
9314,-27.527500,30.858056,23-12-2015,8.378136,0.804318,90.399386,0.100273,0.0,0.317887
9315,-26.861111,28.884722,23-12-2015,43.965682,0.628597,55.248572,0.157149,0.0,0.000000
9316,-26.984722,26.632278,23-12-2015,42.536473,5.222026,52.152307,0.008495,0.0,0.080698
9317,-27.935000,26.126667,23-12-2015,28.079990,0.199331,70.994084,0.289352,0.0,0.437243


## Getting the validation data

In [40]:
# Load the submission template and keep one row per distinct
# (Latitude, Longitude) pair, so each location is queried only once.
validation_template = pd.read_csv('submission_template.csv')
unique_submission_coords = (
    validation_template
    .loc[:, ['Latitude', 'Longitude']]
    .drop_duplicates()
    .reset_index(drop=True)
)
unique_submission_coords.shape

(24, 2)

In [41]:
# Same land-cover extraction as for training, now for the unique
# submission locations.
lulc_results_validation = unique_submission_coords.progress_apply(
    get_lulc_features_v2,
    axis=1,
)


100%|██████████| 24/24 [00:42<00:00,  1.77s/it]


In [42]:

# Place the computed percentages next to the coordinates they belong to.
lulc_final_validation_df = pd.concat(
    [unique_submission_coords, lulc_results_validation],
    axis=1,
)

lulc_final_validation_df

Unnamed: 0,Latitude,Longitude,pct_agri,pct_urban,pct_natural,pct_water,pct_wetlands,pct_others
0,-32.043333,27.822778,5.760369,8.689623,83.8475,0.524834,0.0,1.177675
1,-33.329167,26.0775,0.0,0.5637,99.434156,0.0,0.0,0.002143347
2,-32.991639,27.640028,0.019113,0.155026,3.512498,0.0,0.0,96.31336
3,-34.096389,24.439167,16.156767,0.0,83.828298,0.0,0.0,0.01493429
4,-32.000556,28.581667,0.887523,0.232548,89.782813,8.44214,0.0,0.6549753
5,-32.08639,25.57556,43.109856,0.180509,54.528658,1.841194,0.0,0.3397821
6,-33.185361,27.39075,0.014934,1.516897,97.409968,0.398959,0.305086,0.354156
7,-33.731111,24.618333,0.004267,0.025602,99.750384,0.192012,0.0,0.02773511
8,-31.905,25.43,47.320816,1.363169,48.756859,1.129544,0.0,1.429612
9,-32.515278,28.015556,3.191828,0.014865,92.269957,3.314999,1.06819,0.1401601


In [43]:
# Re-read the submission template without the three water-quality
# measurement columns (presumably the values to be predicted - confirm),
# keeping the location/date columns that the features are joined onto.
validation_df = pd.read_csv('submission_template.csv').drop(
    columns=['Total Alkalinity', 'Electrical Conductance', 'Dissolved Reactive Phosphorus']
)

# Attach the per-location land-cover percentages to every submission row.
# validate='many_to_one' makes pandas raise if lulc_final_validation_df ever
# contains the same coordinate pair twice, which would silently duplicate rows.
validation_df_final = pd.merge(
    validation_df,
    lulc_final_validation_df,
    on=['Latitude', 'Longitude'],
    how='left',
    validate='many_to_one',
)

# to_csv does not create missing parent directories, so make sure it exists.
os.makedirs("../Datasets", exist_ok=True)
validation_df_final.to_csv("../Datasets/soil_use_data_validation.csv",index=False)
validation_df_final

Unnamed: 0,Latitude,Longitude,Sample Date,pct_agri,pct_urban,pct_natural,pct_water,pct_wetlands,pct_others
0,-32.043333,27.822778,01-09-2014,5.760369,8.689623,83.847500,0.524834,0.000000,1.177675
1,-33.329167,26.077500,16-09-2015,0.000000,0.563700,99.434156,0.000000,0.000000,0.002143
2,-32.991639,27.640028,07-05-2015,0.019113,0.155026,3.512498,0.000000,0.000000,96.313364
3,-34.096389,24.439167,07-02-2012,16.156767,0.000000,83.828298,0.000000,0.000000,0.014934
4,-32.000556,28.581667,01-10-2014,0.887523,0.232548,89.782813,8.442140,0.000000,0.654975
...,...,...,...,...,...,...,...,...,...
195,-33.771111,25.386667,06-12-2012,3.070063,51.047534,44.122290,0.938727,0.100273,0.721113
196,-33.185361,27.390750,04-09-2014,0.014934,1.516897,97.409968,0.398959,0.305086,0.354156
197,-32.043333,27.822778,28-09-2015,5.760369,8.689623,83.847500,0.524834,0.000000,1.177675
198,-33.001667,25.161389,08-01-2015,0.234682,0.089606,56.012118,0.000000,0.000000,43.663594
