In [1]:
import pandas as pd
import numpy as np
import os
from HelperFunctions import *

## Dados de terreno e uso do solo

In [5]:
# Read the raw training tables: land-use percentages and terrain features.
soil_use, terrain_features = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../Datasets/soil_use_data_training.csv",
        "../Datasets/nasa_terrain_features_training.csv",
    )
)

## Terrain_features

In [6]:
# Coarsen the terrain columns in place: slope to whole numbers, curvature to 3 decimals.
for terrain_column, n_decimals in (('slope', 0), ('curvature', 3)):
    terrain_features[terrain_column] = terrain_features[terrain_column].round(n_decimals)

# Persist the rounded table (row index not written).
terrain_features.to_csv("../Datasets/nasa_terrain_features_rounded_training.csv", index=False)

In [10]:
# Report how many distinct values each terrain column has at its rounding precision.
# nunique(dropna=False) counts NaN as a value, exactly like len(Series.unique()).
for terrain_column, n_decimals in (('slope', 0), ('curvature', 3)):
    rounded_values = terrain_features[terrain_column].round(n_decimals)
    print(rounded_values.nunique(dropna=False))

18
14


## soil use

In [None]:
# Land-use share columns that get rounded to whole percentage points.
percentages = [
    'pct_agri',
    'pct_urban',
    'pct_natural',
    'pct_water',
    'pct_wetlands',
    'pct_others',
]

for percentage in percentages:
    print(percentage)
    rounded_share = soil_use[percentage].round()  # default precision is 0 decimals
    soil_use[percentage] = rounded_share

# Persist the rounded table (row index not written).
soil_use.to_csv("../Datasets/soil_use_data_rounded_training.csv", index=False)

pct_agri
pct_urban
pct_natural
pct_water
pct_wetlands
pct_others


In [11]:
# Print each land-use column followed by its number of distinct whole-percent values.
for percentage in percentages:
    distinct_count = soil_use[percentage].round().nunique(dropna=False)
    print(percentage, distinct_count, sep='\n')

pct_agri
47
pct_urban
27
pct_natural
59
pct_water
15
pct_wetlands
4
pct_others
19


In [None]:
# Load every training feature table; the names on the left line up one-to-one,
# in order, with the paths in the list.
landsat_deles, landsat_meu, terraclimate, precip, terrain, soil_use = map(
    pd.read_csv,
    [
        '../Datasets/landsat_features_training.csv',
        '../Datasets/landsat_features_more_bands_train.csv',
        '../Datasets/terraclimate_features_more_bands_training.csv',
        '../Datasets/nasa_precip_features_training.csv',
        '../Datasets/nasa_terrain_features_rounded_training.csv',
        '../Datasets/soil_use_data_rounded_training.csv',
    ],
)

In [3]:
# 1. Load the datasets (using the file names that were provided)
datasets = {
    dataset_name: pd.read_csv(csv_path)
    for dataset_name, csv_path in (
        ('landsat_meu', '../Datasets/landsat_features_more_bands_train.csv'),
        ('terraclimate', '../Datasets/terraclimate_features_more_bands_training.csv'),
        ('precip', '../Datasets/nasa_precip_features_training.csv'),
        ('terrain', '../Datasets/nasa_terrain_features_rounded_training.csv'),
        ('soil_use', '../Datasets/soil_use_data_rounded_training.csv'),
    )
}

# Reference ("gabarito") table used as the baseline for row-order comparisons.
gabarito = pd.read_csv('../Datasets/landsat_features_training.csv')

def verificar_alinhamento(base_df, target_df, name, atol=1e-6):
    """Check that two tables list the same coordinates in the same row order.

    Rows are compared by position (row i of ``base_df`` against row i of
    ``target_df``); index labels are ignored.

    Parameters
    ----------
    base_df : pandas.DataFrame
        Reference table; must contain 'Latitude' and 'Longitude' columns.
    target_df : pandas.DataFrame
        Table being validated; must contain the same two columns.
    name : str
        Label inserted into the returned message.
    atol : float, default 1e-6
        Largest absolute coordinate difference still treated as equal.

    Returns
    -------
    str
        A status message: size mismatch, perfectly aligned, or misaligned
        (with the number of mismatching rows and the position of the first).
    """
    # Different lengths cannot be compared row by row.
    if len(base_df) != len(target_df):
        return f"‚ùå {name}: TAMANHO DIFERENTE! ({len(base_df)} vs {len(target_df)})"

    # rtol=0 makes `atol` the entire tolerance. Left at its default (1e-5),
    # np.isclose allows atol + rtol * |target|, which for coordinates of a few
    # tens of degrees is hundreds of times looser than the 6 decimals intended.
    lat_match = np.isclose(
        base_df['Latitude'].to_numpy(), target_df['Latitude'].to_numpy(), rtol=0, atol=atol
    )
    lon_match = np.isclose(
        base_df['Longitude'].to_numpy(), target_df['Longitude'].to_numpy(), rtol=0, atol=atol
    )

    # A row is misaligned when either coordinate differs.
    desalinhado = ~(lat_match & lon_match)
    total_desalinhado = int(np.count_nonzero(desalinhado))

    if total_desalinhado == 0:
        return f"‚úÖ {name}: Perfeitamente alinhado."

    # argmax on a boolean mask gives the position of the first True.
    primeiro_erro = int(np.argmax(desalinhado))
    return f"‚ùå {name}: DESALINHADO! {total_desalinhado} linhas n√£o batem. Primeiro erro na linha {primeiro_erro}."

# Compare every loaded dataset against the reference table and print one status line each.
print("üßê Verificando sincronia das linhas com o gabarito:\n")
for nome in datasets:
    df = datasets[nome]
    resultado = verificar_alinhamento(gabarito, df, nome)
    print(resultado)

üßê Verificando sincronia das linhas com o gabarito:

‚úÖ landsat_meu: Perfeitamente alinhado.
‚úÖ terraclimate: Perfeitamente alinhado.
‚úÖ precip: Perfeitamente alinhado.
‚úÖ terrain: Perfeitamente alinhado.
‚úÖ soil_use: Perfeitamente alinhado.


## testando helper function

In [2]:
# Training CSVs passed to DataOrganizer.load_training_data. The last entry is the
# water-quality file that is also read below as the reference table.
csv_training_files = ['../Datasets/landsat_features_more_bands_train.csv',
                          '../Datasets/terraclimate_features_more_bands_training.csv',
                          '../Datasets/nasa_precip_features_training.csv',
                          '../Datasets/soil_use_data_rounded_training.csv',
                          '../Datasets/nasa_terrain_features_rounded_training.csv',
                          '../Datasets/water_quality_training_dataset.csv']

# 1. Set up the test environment
# Load the original targets file to serve as the reference ("gabarito")
target_gabarito = pd.read_csv('../Datasets/water_quality_training_dataset.csv')

# Instantiate the DataOrganizer
target_cols = ['Total Alkalinity', 'Electrical Conductance', 'Dissolved Reactive Phosphorus']
# NOTE(review): presumably the columns DataOrganizer discards — confirm in HelperFunctions.
drop_cols = ['tmax', 'tmin', 'pct_urban', 'pct_water', 'pct_wetlands', 'pct_others', 'curvature', 'pet']

organizer = DataOrganizer(target_cols)

# 2. Run the loading step
print("üîÑ Carregando dados no DataOrganizer...")
organizer.load_training_data(csv_training_files, drop_cols, scale=False)

# 3. Extract the internally processed data
full_df = organizer.get_full_training_dataset()
features, targets = organizer.get_training_dataset()

print("\n--- üß™ TESTES DE INTEGRIDADE ---")

# TEST 1: size check
same_length = len(full_df) == len(target_gabarito)
if same_length:
    print(f"‚úÖ Tamanho do Dataset: OK ({len(full_df)} linhas)")
else:
    print(f"‚ùå Tamanho do Dataset: ERRO! ({len(full_df)} vs {len(target_gabarito)})")

# TEST 2: key synchronisation (Latitude/Longitude)
# Compare the internal full dataset with the original targets file row by row.
# Only the common prefix is compared so a size mismatch (already reported by TEST 1)
# is counted as misaligned rows instead of raising a broadcasting error.
n_common = min(len(full_df), len(target_gabarito))
n_unpaired = abs(len(full_df) - len(target_gabarito))
# rtol=0 makes atol the whole tolerance; np.isclose's default rtol (1e-5) would add
# rtol * |coordinate|, which is far looser than 1e-6 for coordinates in degrees.
lat_match = np.isclose(
    full_df['Latitude'].values[:n_common],
    target_gabarito['Latitude'].values[:n_common],
    rtol=0, atol=1e-6,
)
lon_match = np.isclose(
    full_df['Longitude'].values[:n_common],
    target_gabarito['Longitude'].values[:n_common],
    rtol=0, atol=1e-6,
)
desalinhados = np.sum(~(lat_match & lon_match)) + n_unpaired

if desalinhados == 0:
    print("‚úÖ Sincronia das Chaves (Lat/Lon): OK")
else:
    print(f"‚ùå Sincronia das Chaves (Lat/Lon): ERRO! {desalinhados} linhas desalinhadas.")

# TEST 3: feature -> target alignment
# Check that the target on row X of the organizer equals the one in the original file.
organizer_targets = targets['Total Alkalinity'].values
reference_targets = target_gabarito['Total Alkalinity'].values
n_targets = min(len(organizer_targets), len(reference_targets))
# equal_nan=True: a value missing at the same row on both sides is still aligned.
target_check = np.isclose(
    organizer_targets[:n_targets],
    reference_targets[:n_targets],
    rtol=0, atol=1e-4, equal_nan=True,
)
if len(organizer_targets) == len(reference_targets) and np.all(target_check):
    print("‚úÖ Alinhamento Feature -> Target: OK")
else:
    print("‚ùå Alinhamento Feature -> Target: ERRO! Os alvos foram misturados.")

# TEST 4: identifier leakage into the features
# Check that the forbidden columns were really removed.
proibidas = ['Latitude', 'Longitude', 'Sample Date', 'Year', 'MonthOfYear']
detectadas = [col for col in features.columns if col in proibidas]
if not detectadas:
    print("‚úÖ Limpeza de Colunas: OK (Nenhum ID vazou para o treino)")
else:
    print(f"‚ùå Limpeza de Colunas: ERRO! Colunas proibidas detectadas nas features: {detectadas}")

# TEST 5: column order (consistency) — informational only, nothing is asserted.
print(f"\nüìã Total de features sendo usadas: {len(features.columns)}")
print(f"Primeiras 5 features: {list(features.columns[:5])}")

üîÑ Carregando dados no DataOrganizer...

--- üß™ TESTES DE INTEGRIDADE ---
‚úÖ Tamanho do Dataset: OK (9319 linhas)
‚úÖ Sincronia das Chaves (Lat/Lon): OK
‚úÖ Alinhamento Feature -> Target: OK
‚úÖ Limpeza de Colunas: OK (Nenhum ID vazou para o treino)

üìã Total de features sendo usadas: 29
Primeiras 5 features: ['nir', 'green', 'swir16', 'swir22', 'coastal']


In [3]:
def audit_data_organizer(organizer, original_target_path, atol=1e-6):
    """Audit that an organizer keeps rows in the same coordinate order as the targets file.

    Three families of comparison are printed, each on (Latitude, Longitude) pairs
    compared row by row:

    1. the CSV at ``original_target_path`` vs the organizer's full dataset;
    2. the full dataset vs the rows selected by the index of the features table;
    3. the full dataset vs the rows selected by the index of each target series.

    Parameters
    ----------
    organizer : object
        Must expose ``get_full_training_dataset()`` (a DataFrame with 'Latitude'
        and 'Longitude' columns) and ``get_training_dataset()`` returning
        ``(features, targets)``, where ``targets.items()`` yields
        ``(name, series)`` pairs.
    original_target_path : str
        Path of the reference CSV; must contain 'Latitude' and 'Longitude'.
    atol : float, default 1e-6
        Largest absolute coordinate difference still treated as equal.

    Returns
    -------
    bool
        True when every comparison matched, False otherwise.
    """
    print("üîé Iniciando Auditoria de Sequ√™ncia de Coordenadas...")

    # 1. Load the original reference file (its row order is the expected one).
    df_gabarito = pd.read_csv(original_target_path)
    gabarito_coords = df_gabarito[['Latitude', 'Longitude']].values

    # 2. Coordinates of the organizer's complete internal dataset.
    full_df = organizer.get_full_training_dataset()
    internal_coords = full_df[['Latitude', 'Longitude']].values

    # 3. Features (X) and targets (Y). The features are looked up in full_df by
    #    index label to recover the coordinates of each feature row.
    features, targets_dict = organizer.get_training_dataset()
    x_coords = full_df.loc[features.index, ['Latitude', 'Longitude']].values

    def compare_coords(arr1, arr2, label):
        """Print whether two (n, 2) coordinate arrays match row by row; return the verdict."""
        if arr1.shape != arr2.shape:
            print(f"‚ùå {label}: TAMANHOS DIFERENTES! {arr1.shape} vs {arr2.shape}")
            return False

        # rtol=0 makes `atol` the entire tolerance; np.isclose's default rtol (1e-5)
        # would add rtol * |coordinate| and accept shifts of ~1e-4 degrees.
        mismatch = ~np.isclose(arr1, arr2, rtol=0, atol=atol).all(axis=1)
        count_errors = np.sum(mismatch)

        if count_errors == 0:
            print(f"‚úÖ {label}: Sequ√™ncia id√™ntica.")
            return True

        first_error = np.where(mismatch)[0][0]
        print(f"‚ùå {label}: DESALINHADO! {count_errors} linhas falharam.")
        print(f"   Primeiro erro no √≠ndice {first_error}:")
        print(f"   Esperado: {arr1[first_error]}, Obtido: {arr2[first_error]}")
        return False

    # Run every comparison (no short-circuit, so all reports are printed).
    results = [
        compare_coords(gabarito_coords, internal_coords, "Gabarito vs Dataset Interno"),
        compare_coords(internal_coords, x_coords, "Dataset Interno vs Features (X)"),
    ]

    # Each target series is checked individually through its own index.
    for target_name, target_series in targets_dict.items():
        target_coords = full_df.loc[target_series.index, ['Latitude', 'Longitude']].values
        results.append(
            compare_coords(internal_coords, target_coords, f"Dataset Interno vs Target ({target_name})")
        )

    return all(results)





In [7]:
# Build a fresh DataOrganizer from target_cols, csv_training_files and drop_cols
# (all defined in an earlier cell) and load the training CSVs with scale=False.
# NOTE(review): scale=False presumably disables feature scaling — confirm in HelperFunctions.
dataHandler = DataOrganizer(target_cols)
dataHandler.load_training_data(csv_training_files, drop_cols, scale=False)

# Audit the coordinate order against the original water-quality targets file.
audit_data_organizer(dataHandler, '../Datasets/water_quality_training_dataset.csv')

üîé Iniciando Auditoria de Sequ√™ncia de Coordenadas...
‚úÖ Gabarito vs Dataset Interno: Sequ√™ncia id√™ntica.
‚úÖ Dataset Interno vs Features (X): Sequ√™ncia id√™ntica.
‚úÖ Dataset Interno vs Target (Total Alkalinity): Sequ√™ncia id√™ntica.
‚úÖ Dataset Interno vs Target (Electrical Conductance): Sequ√™ncia id√™ntica.
‚úÖ Dataset Interno vs Target (Dissolved Reactive Phosphorus): Sequ√™ncia id√™ntica.


## Agrupando as características de uso do solo em "packs"

In [50]:
# Reload the rounded land-use table from disk and show its column names.
solo = pd.read_csv('../Datasets/soil_use_data_rounded_training.csv')
solo.columns

Index(['Latitude', 'Longitude', 'Sample Date', 'pct_agri', 'pct_urban',
       'pct_natural', 'pct_water', 'pct_wetlands', 'pct_others'],
      dtype='object')

In [51]:
# Per-column minimum. 'Sample Date' was read as plain text (read_csv without
# parse_dates), so its minimum is lexicographic rather than chronological.
solo.min()

Latitude        -34.405833
Longitude        17.730278
Sample Date     01-01-2013
pct_agri               0.0
pct_urban              0.0
pct_natural            3.0
pct_water              0.0
pct_wetlands           0.0
pct_others             0.0
dtype: object

In [52]:
# Per-column maximum. 'Sample Date' was read as plain text (read_csv without
# parse_dates), so its maximum is lexicographic rather than chronological.
solo.max()

Latitude        -22.225556
Longitude           32.325
Sample Date     31-12-2015
pct_agri              86.0
pct_urban             60.0
pct_natural          100.0
pct_water             14.0
pct_wetlands          16.0
pct_others            92.0
dtype: object

In [53]:
# Floor every land-use percentage to the start of its 10-point "pack";
# the key columns are left untouched.
for coluna in solo.columns:
    if coluna in ('Latitude', 'Longitude', 'Sample Date'):
        continue
    print(coluna)
    solo[coluna] = solo[coluna].floordiv(10).mul(10)




pct_agri
pct_urban
pct_natural
pct_water
pct_wetlands
pct_others


In [54]:
# Inspect the per-column range again. min() is printed explicitly because only
# the cell's last expression (max) is displayed automatically.
print(solo.min())
solo.max()

Latitude        -34.405833
Longitude        17.730278
Sample Date     01-01-2013
pct_agri               0.0
pct_urban              0.0
pct_natural            0.0
pct_water              0.0
pct_wetlands           0.0
pct_others             0.0
dtype: object


Latitude        -22.225556
Longitude           32.325
Sample Date     31-12-2015
pct_agri              80.0
pct_urban             60.0
pct_natural          100.0
pct_water             10.0
pct_wetlands          10.0
pct_others            90.0
dtype: object

In [39]:
# Print each non-key column followed by its number of distinct values
# (NaN counted as a value, as len(Series.unique()) does).
for coluna in solo.columns:
    if coluna in ('Latitude', 'Longitude', 'Sample Date'):
        continue
    print(coluna, solo[coluna].nunique(dropna=False), sep='\n')

# Persist the binned land-use table (row index not written).
solo.to_csv('../Datasets/soil_use_data_rounded_in_packs_training.csv', index=False)

pct_agri
9
pct_urban
7
pct_natural
11
pct_water
2
pct_wetlands
2
pct_others
9


## Terrain Features

In [77]:
# Reload the rounded terrain table from disk and show its column names.
terrain = pd.read_csv('../Datasets/nasa_terrain_features_rounded_training.csv')

terrain.columns

Index(['Latitude', 'Longitude', 'Sample Date', 'elevation', 'slope',
       'curvature'],
      dtype='object')

In [78]:
# Per-column minimum. 'Sample Date' was read as plain text (read_csv without
# parse_dates), so its minimum is lexicographic rather than chronological.
terrain.min()

Latitude       -34.405833
Longitude       17.730278
Sample Date    01-01-2013
elevation             5.0
slope                 0.0
curvature          -0.003
dtype: object

In [79]:
# Per-column maximum. 'Sample Date' was read as plain text (read_csv without
# parse_dates), so its maximum is lexicographic rather than chronological.
terrain.max()

Latitude       -22.225556
Longitude          32.325
Sample Date    31-12-2015
elevation          1594.0
slope                28.0
curvature           0.014
dtype: object

In [80]:
# Print each non-key terrain column followed by its number of distinct values
# (NaN counted as a value, as len(Series.unique()) does).
for coluna in terrain.columns:
    if coluna in ('Latitude', 'Longitude', 'Sample Date'):
        continue
    print(coluna, terrain[coluna].nunique(dropna=False), sep='\n')

elevation
148
slope
18
curvature
14


In [81]:

# Collapse each terrain metric into fixed-width "packs" by flooring it to a
# multiple of the pack width (110 for elevation, 2 for slope, 0.003 for curvature).
terrain['elevation'] = (terrain['elevation'] // 110) * 110
terrain['slope'] = (terrain['slope'] // 2) * 2
# Curvature is binned in integer thousandths instead of floor-dividing the floats
# directly: 0.003 has no exact binary representation, so 0.009 // 0.003 evaluates
# to 2.0 (putting 0.009 in the 0.006 pack) and 3 * 0.003 gives 0.009000000000000001.
# round() removes the float noise left by the multiplication; NaN stays NaN.
curvature_thousandths = (terrain['curvature'] * 1000).round()
terrain['curvature'] = (curvature_thousandths // 3) * 3 / 1000



# Show the resulting per-column range, separated by blank lines.
print()
print(terrain.min())
print()
print(terrain.max())
print()

# Number of distinct packs left in each non-key column.
for coluna in terrain.columns:
    if coluna not in ['Latitude', 'Longitude', 'Sample Date']:
        print(coluna)
        print(len(terrain[coluna].unique()))


# Persist the binned terrain table (row index not written).
terrain.to_csv('../Datasets/nasa_terrain_features_rounded_in_packs_training.csv', index=False)


Latitude       -34.405833
Longitude       17.730278
Sample Date    01-01-2013
elevation             0.0
slope                 0.0
curvature          -0.003
dtype: object

Latitude       -22.225556
Longitude          32.325
Sample Date    31-12-2015
elevation          1540.0
slope                28.0
curvature           0.012
dtype: object

elevation
15
slope
11
curvature
6
