# Extraction of ESA World cover data ( soil ) % crops, % urban area, % woods

In [3]:
import warnings
warnings.filterwarnings("ignore")

import planetary_computer as pc
import pandas as pd
import numpy as np
from tqdm import tqdm
from scipy.spatial import cKDTree

import pystac_client
import planetary_computer as pc
import requests
import time

from odc.stac import stac_load

from datetime import date
from tqdm import tqdm
import os

tqdm.pandas()

## Running only Unique coords

In [4]:


def get_lulc_features(row):
    max_retries = 3
    try:
        for attempt in range(max_retries):
            try:
                lat, lon = row['Latitude'], row['Longitude']
                catalog = pystac_client.Client.open("https://planetarycomputer.microsoft.com/api/stac/v1")
                
                # 1. Definimos um buffer de aprox. 1km ao redor do ponto (0.009 graus)
                # É importante ver o entorno para saber se há fazendas ou cidades perto
                buffer = 0.009 
                bbox = [lon - buffer, lat - buffer, lon + buffer, lat + buffer]
                
                search = catalog.search(collections=["esa-worldcover"], bbox=bbox)
                items = list(search.get_items())
                
                if not items:
                    return pd.Series({'pct_agriculture': np.nan, 'pct_urban': np.nan, 'pct_natural': np.nan})
                    
                # 2. Carregar o mapa de classes (banda "map")
                selected_item = items[0]
                data = stac_load(
                    [selected_item], 
                    bands=["map"], 
                    bbox=bbox, 
                    chunks={},
                    patch_url=pc.sign_url
                ).compute().isel(time=0)
                
                lulc = data["map"].values
                total_pixels = lulc.size
                
                # 3. Calcular porcentagens das classes de interesse
                # Classe 40 = Cropland (Agricultura)
                pct_agri = np.sum(lulc == 40) / total_pixels * 100
                
                # Classe 50 = Built-up (Urbano)
                pct_urban = np.sum(lulc == 50) / total_pixels * 100
                
                # Classes 10, 20, 30 = Vegetação Natural (Trees, Shrubland, Grassland)
                pct_natural = np.sum(np.isin(lulc, [10, 20, 30])) / total_pixels * 100
                
                return pd.Series({
                    'pct_agriculture': float(pct_agri),
                    'pct_urban': float(pct_urban),
                    'pct_natural': float(pct_natural)
                })
            except Exception as e:
                if attempt <= max_retries:
                    print(f"Erro ao processar ponto ({lat}, {lon}): {e}")
                    print("tentando novamente...")
                    time.sleep(3 ** attempt)  # Espera antes de tentar novamente
                time.sleep(2)  # Espera antes de tentar novamente

    except Exception as e:
        print(f"Erro ao processar ponto ({lat}, {lon}) (tentativa {attempt + 1}): {e}")
        return pd.Series({'pct_agriculture': np.nan, 'pct_urban': np.nan, 'pct_natural': np.nan})

## Training data

In [16]:
Water_Quality_df = pd.read_csv('water_quality_training_dataset.csv')
unique_coords = Water_Quality_df[['Latitude', 'Longitude']].drop_duplicates().reset_index(drop=True)
unique_coords.shape

(162, 2)

In [17]:



unique_10 = unique_coords[0:10]  # Usando apenas os últimos 10 registros para teste rápido
unique_10

Unnamed: 0,Latitude,Longitude
0,-28.760833,17.730278
1,-26.861111,28.884722
2,-26.45,28.085833
3,-27.671111,27.236944
4,-27.356667,27.286389
5,-27.010111,26.698083
6,-25.127778,27.628889
7,-25.20639,27.558
8,-24.69514,27.40906
9,-26.984722,26.632278


In [18]:
lulc_results = unique_coords.progress_apply(get_lulc_features, axis=1)
lulc_final_df = pd.concat([unique_coords, lulc_results], axis=1)

lulc_final_df


100%|██████████| 162/162 [04:44<00:00,  1.75s/it]


Unnamed: 0,Latitude,Longitude,pct_agriculture,pct_urban,pct_natural
0,-28.760833,17.730278,0.000000,0.000000,12.073306
1,-26.861111,28.884722,43.965682,0.628597,55.248572
2,-26.450000,28.085833,33.746999,1.914009,59.443587
3,-27.671111,27.236944,4.635902,42.714859,47.679925
4,-27.356667,27.286389,33.081584,1.169141,65.740741
...,...,...,...,...,...
157,-32.601389,18.750556,47.187241,0.862197,51.795536
158,-28.308889,31.902500,0.151476,0.008534,96.516044
159,-24.958611,29.395278,7.613243,2.406082,86.540381
160,-26.619444,27.980833,40.337515,5.572623,53.908517


In [30]:
train_df = pd.read_csv('water_quality_training_dataset.csv').drop(columns=['Total Alkalinity', 'Electrical Conductance', 'Dissolved Reactive Phosphorus'])

train_df_final = pd.merge(
    train_df, 
    lulc_final_df, 
    on=['Latitude', 'Longitude'], 
    how='left'
)

train_df_final.to_csv("../Datasets/soil_use_data_training.csv", index=False)
train_df_final



Unnamed: 0,Latitude,Longitude,Sample Date,pct_agriculture,pct_urban,pct_natural
0,-28.760833,17.730278,02-01-2011,0.000000,0.000000,12.073306
1,-26.861111,28.884722,03-01-2011,43.965682,0.628597,55.248572
2,-26.450000,28.085833,03-01-2011,33.746999,1.914009,59.443587
3,-27.671111,27.236944,03-01-2011,4.635902,42.714859,47.679925
4,-27.356667,27.286389,03-01-2011,33.081584,1.169141,65.740741
...,...,...,...,...,...,...
9314,-27.527500,30.858056,23-12-2015,8.378136,0.804318,90.399386
9315,-26.861111,28.884722,23-12-2015,43.965682,0.628597,55.248572
9316,-26.984722,26.632278,23-12-2015,42.536473,5.222026,52.152307
9317,-27.935000,26.126667,23-12-2015,28.079990,0.199331,70.994084


## getting validation data

In [24]:
validation_template = pd.read_csv('submission_template.csv')
unique_submission_coords = validation_template[['Latitude', 'Longitude']].drop_duplicates().reset_index(drop=True)
unique_submission_coords.shape

(24, 2)

In [None]:
lulc_results_validation = unique_submission_coords.progress_apply(get_lulc_features, axis=1)


 71%|███████   | 17/24 [00:29<00:11,  1.70s/it]

Erro ao processar ponto (-33.001667, 25.161389): Read failed. See previous exception for details.
tentando novamente...
Erro ao processar ponto (-33.001667, 25.161389): Read failed. See previous exception for details.
tentando novamente...
Erro ao processar ponto (-33.001667, 25.161389): Read failed. See previous exception for details.
tentando novamente...


 75%|███████▌  | 18/24 [00:54<00:51,  8.66s/it]

Erro ao processar ponto (-33.094444, 25.012778): Read failed. See previous exception for details.
tentando novamente...
Erro ao processar ponto (-33.094444, 25.012778): Read failed. See previous exception for details.
tentando novamente...
Erro ao processar ponto (-33.094444, 25.012778): Read failed. See previous exception for details.
tentando novamente...


 79%|███████▉  | 19/24 [01:19<01:07, 13.55s/it]

Erro ao processar ponto (-34.0325, 24.196389): Read failed. See previous exception for details.
tentando novamente...
Erro ao processar ponto (-34.0325, 24.196389): Read failed. See previous exception for details.
tentando novamente...
Erro ao processar ponto (-34.0325, 24.196389): Read failed. See previous exception for details.
tentando novamente...


 83%|████████▎ | 20/24 [01:44<01:08, 17.07s/it]

Erro ao processar ponto (-32.761111, 26.629444): Read failed. See previous exception for details.
tentando novamente...
Erro ao processar ponto (-32.761111, 26.629444): Read failed. See previous exception for details.
tentando novamente...
Erro ao processar ponto (-32.761111, 26.629444): Read failed. See previous exception for details.
tentando novamente...


 88%|████████▊ | 21/24 [02:09<00:58, 19.47s/it]

Erro ao processar ponto (-33.506389, 26.744722): Read failed. See previous exception for details.
tentando novamente...
Erro ao processar ponto (-33.506389, 26.744722): Read failed. See previous exception for details.
tentando novamente...
Erro ao processar ponto (-33.506389, 26.744722): Read failed. See previous exception for details.
tentando novamente...


 92%|█████████▏| 22/24 [02:34<00:42, 21.21s/it]

Erro ao processar ponto (-32.173889, 27.3725): Read failed. See previous exception for details.
tentando novamente...
Erro ao processar ponto (-32.173889, 27.3725): Read failed. See previous exception for details.
tentando novamente...
Erro ao processar ponto (-32.173889, 27.3725): Read failed. See previous exception for details.
tentando novamente...


 96%|█████████▌| 23/24 [03:01<00:22, 22.69s/it]

Erro ao processar ponto (-32.713889, 26.296667): Read failed. See previous exception for details.
tentando novamente...
Erro ao processar ponto (-32.713889, 26.296667): Read failed. See previous exception for details.
tentando novamente...
Erro ao processar ponto (-32.713889, 26.296667): Read failed. See previous exception for details.
tentando novamente...


100%|██████████| 24/24 [03:26<00:00, 23.48s/it]

Erro ao processar ponto (-33.23778, 26.99472): Read failed. See previous exception for details.
tentando novamente...
Erro ao processar ponto (-33.23778, 26.99472): Read failed. See previous exception for details.
tentando novamente...
Erro ao processar ponto (-33.23778, 26.99472): Read failed. See previous exception for details.
tentando novamente...


100%|██████████| 24/24 [03:52<00:00,  9.67s/it]


NameError: name 'unique__submission_coords' is not defined

In [26]:

lulc_final_validation_df = pd.concat([unique_submission_coords, lulc_results_validation], axis=1)

lulc_final_validation_df

Unnamed: 0,Latitude,Longitude,pct_agriculture,pct_urban,pct_natural
0,-32.043333,27.822778,5.760369,8.689623,83.8475
1,-33.329167,26.0775,0.0,0.5637,99.434156
2,-32.991639,27.640028,0.019113,0.155026,3.512498
3,-34.096389,24.439167,16.156767,0.0,83.828298
4,-32.000556,28.581667,0.887523,0.232548,89.782813
5,-32.08639,25.57556,43.109856,0.180509,54.528658
6,-33.185361,27.39075,0.014934,1.516897,97.409968
7,-33.731111,24.618333,0.004267,0.025602,99.750384
8,-31.905,25.43,47.320816,1.363169,48.756859
9,-32.515278,28.015556,3.191828,0.014865,92.269957


In [29]:
validation_df = pd.read_csv('submission_template.csv').drop(columns=['Total Alkalinity', 'Electrical Conductance', 'Dissolved Reactive Phosphorus'])

validation_df_final = pd.merge(
    validation_df, 
    lulc_final_validation_df, 
    on=['Latitude', 'Longitude'], 
    how='left'
)

validation_df_final.to_csv("../Datasets/soil_use_data_validation.csv",index=False)
validation_df_final

Unnamed: 0,Latitude,Longitude,Sample Date,pct_agriculture,pct_urban,pct_natural
0,-32.043333,27.822778,01-09-2014,5.760369,8.689623,83.847500
1,-33.329167,26.077500,16-09-2015,0.000000,0.563700,99.434156
2,-32.991639,27.640028,07-05-2015,0.019113,0.155026,3.512498
3,-34.096389,24.439167,07-02-2012,16.156767,0.000000,83.828298
4,-32.000556,28.581667,01-10-2014,0.887523,0.232548,89.782813
...,...,...,...,...,...,...
195,-33.771111,25.386667,06-12-2012,3.070063,51.047534,44.122290
196,-33.185361,27.390750,04-09-2014,0.014934,1.516897,97.409968
197,-32.043333,27.822778,28-09-2015,5.760369,8.689623,83.847500
198,-33.001667,25.161389,08-01-2015,,,
