# Path

In [None]:
from pathlib import Path

INPUT_DIR = Path("../data/sample_splits")
OUTPUT_DIR = Path("../models")

SAMPLES_FN = "samples.split_{split}.frac_{frac}.train.pq"

# Training

In [2]:
import time
import joblib
import numpy as np
import pandas as pd

import xgboost as xgb

import lightgbm as lgb

from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score, precision_recall_curve

In [3]:
N_SPLITS = 5
RANDOM_STATE = 0

TARGET_COLUMN = 'class'
SPATIAL_CROSS_VALIDATION_COLUMN = 'tile_id'

class_name = ['oxc', 'oxn']
class_values = [([0], [1]), ([0], [2])]

covariates = [f'B{n:02}' for n in range(1, 65)]

In [4]:
def target_ovo(samples: pd.DataFrame, class_name: str, class_a: list[int], class_b: list[int]):
    remap_dict = {}
    
    remap_dict.update({val: 0.0 for val in class_a})
    remap_dict.update({val: 1.0 for val in class_b})
    
    samples[class_name] = samples[TARGET_COLUMN].map(remap_dict)


def create_ovo_class(samples: pd.DataFrame, class_name: list[str], class_values: list[tuple[list[int], list[int]]]):
    class_data = dict(zip(class_name, class_values))
    
    for class_key in class_data:
        value_a = class_data[class_key][0]
        value_b = class_data[class_key][1]
        
        target_ovo(samples, class_key, value_a, value_b)

In [5]:
def split_data_np(data_list: list[str], ratio_a: float = 0.8) -> tuple[np.ndarray, np.ndarray]:
    """
    Separa uma lista de strings em dois arrays NumPy (80% para A, 20% para B).
    """
    data_array = np.array(data_list)
    
    # 2. Calcular o tamanho de A (80%)
    total_size = len(data_array)
    # np.round() garante que o índice seja um número inteiro
    size_a = int(np.round(total_size * ratio_a)) 
    
    # 3. Criar uma permutação aleatória dos índices
    # Isso garante que a divisão seja aleatória (evitando vieses)
    indices = np.arange(total_size)
    np.random.shuffle(indices)
    
    # 4. Dividir os índices
    indices_a = indices[:size_a]
    indices_b = indices[size_a:]
    
    # 5. Aplicar os índices para obter os arrays
    array_a = data_array[indices_a]
    array_b = data_array[indices_b]
    
    return array_a, array_b

## Random Forest

In [6]:
def get_estimator():
    return RandomForestClassifier(n_jobs=-1, random_state=RANDOM_STATE)


def random_forest(samples: pd.DataFrame, target_column: str, covariates: list[str]):
    x_train = samples[covariates]
    y_train = samples[target_column]

    t_start = time.time()

    estimator = get_estimator()
    estimator.fit(x_train, y_train)

    return {'model': estimator, 'training_time': time.time() - t_start}

In [None]:
class_name = ['oxc', 'oxn']
class_values = [([0], [1]), ([0], [2])]

covariates = [f'B{n + 1:02}' for n in range(64)]

for frac in [10, 20, 30, 40]:
    for target_column in class_name:
        filename = f'rf.frac_{frac}.{target_column}.lz4'

        if (OUTPUT_DIR / filename).exists():
            continue

        models = []

        for n_fold in [f'{n:02}' for n in range(1, 6)]:
            samples = pd.read_parquet(INPUT_DIR / SAMPLES_FN.format(split=n_fold, frac=frac))

            create_ovo_class(samples, class_name, class_values)

            samples = samples[np.logical_not(np.isnan(samples[target_column]))]

            model = random_forest(samples, target_column, covariates)

            model['#_fold'] = n_fold

            models.append(model)

        joblib.dump(models, OUTPUT_DIR / filename, compress='lz4')

## XGBoost

In [None]:
def get_estimator():
    return xgb.XGBClassifier(n_jobs=-1, objective='binary:logistic', booster='gbtree', eval_metric='mlogloss', random_state=RANDOM_STATE)


def xgb(samples: pd.DataFrame, target_column: str, covariates: list[str]):
    x_train = samples[covariates]
    y_train = samples[target_column]

    t_start = time.time()

    estimator = get_estimator()
    estimator.fit(x_train, y_train)

    return {'model': estimator, 'training_time': time.time() - t_start}

In [None]:
class_name = ['oxc', 'oxn']
class_values = [([0], [1]), ([0], [2])]

covariates = [f'B{n + 1:02}' for n in range(64)]

for target_column in class_name:
    filename = f'xgb.{target_column}.lz4'

    if (OUTPUT_DIR / filename).exists():
        continue

    models = []

    for n_fold in [f'{n:02}' for n in range(1, 6)]:
        samples = pd.read_parquet(INPUT_DIR / SAMPLES_FN.format(split=n_fold, frac=20))

        create_ovo_class(samples, class_name, class_values)

        samples = samples[np.logical_not(np.isnan(samples[target_column]))]

        model = xgb(samples, target_column, covariates)

        model['#_fold'] = n_fold

        models.append(model)

    joblib.dump(models, OUTPUT_DIR / filename, compress='lz4')

## LightGBM

In [None]:
def get_estimator():
    return lgb.LGBMClassifier(n_jobs=-1, random_state=RANDOM_STATE)


def lgbm(samples: pd.DataFrame, target_column: str, covariates: list[str]):
    x_train = samples[covariates]
    y_train = samples[target_column]

    t_start = time.time()

    estimator = get_estimator()
    estimator.fit(x_train, y_train)

    return {'model': estimator, 'training_time': time.time() - t_start}

In [None]:
class_name = ['oxc', 'oxn']
class_values = [([0], [1]), ([0], [2])]

covariates = [f'B{n + 1:02}' for n in range(64)]

for target_column in class_name:
    filename = f'lgbm.{target_column}.lz4'

    if (OUTPUT_DIR / filename).exists():
        continue

    models = []

    for n_fold in [f'{n:02}' for n in range(1, 6)]:
        samples = pd.read_parquet(INPUT_DIR / SAMPLES_FN.format(split=n_fold, frac=20))

        create_ovo_class(samples, class_name, class_values)

        samples = samples[np.logical_not(np.isnan(samples[target_column]))]

        model = lgbm(samples, target_column, covariates)

        model['#_fold'] = n_fold

        models.append(model)

    joblib.dump(models, OUTPUT_DIR / filename, compress='lz4')

[LightGBM] [Info] Number of positive: 622906, number of negative: 4399646
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.191535 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10363
[LightGBM] [Info] Number of data points in the train set: 5022552, number of used features: 64
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.124022 -> initscore=-1.954884
[LightGBM] [Info] Start training from score -1.954884
[LightGBM] [Info] Number of positive: 624842, number of negative: 4438208
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.230727 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10374
[LightGBM] [Info] Number of data points in the train set: 5063050, number of used features: 64
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.123412 -> initscore=-1.960507
[LightGBM] [Info] Start training from score -1.960507


## KNeighborsClassifier

In [None]:
def get_estimator(n_neighbors, metric):
    return KNeighborsClassifier(n_neighbors=n_neighbors, metric=metric)

def knn_classifier(samples: pd.DataFrame, target_column: str, covariates: list[str], n_neighbors=3, metric='minkowski', sample_ratio=0.008):
    x_train = samples.groupby(['tile_id', target_column], group_keys=False).apply(
            lambda group: group.sample(frac=sample_ratio, random_state=RANDOM_STATE)
        )[covariates]
    y_train = samples.loc[x_train.index][target_column]

    t_start = time.time()

    estimator = get_estimator(n_neighbors, metric)
    estimator.fit(x_train, y_train)

    return {'model': estimator, 'training_time': time.time() - t_start}

In [6]:
for metric in ['minkowski', 'euclidean', 'manhattan', 'cosine']:
    for target_column in class_name:
        if (OUTPUT_DIR / f'knn.m_{metric}.{target_column}.lz4').exists():
            continue

        models = []

        for n_fold in [f'{n:02}' for n in range(1, 6)]:
            samples = pd.read_parquet(INPUT_DIR / SAMPLES_FN.format(n_fold=n_fold))

            create_ovo_class(samples, class_name, class_values)

            samples = samples[np.logical_not(np.isnan(samples[target_column]))]

            model = knn_classifier(samples, target_column, covariates, metric=metric)

            model['#_fold'] = n_fold

            models.append(model)

        joblib.dump(models, OUTPUT_DIR / f'knn.m_{metric}.{target_column}.lz4', compress='lz4')

  x_train = samples.groupby(['tile_id', target_column], group_keys=False).apply(
  x_train = samples.groupby(['tile_id', target_column], group_keys=False).apply(
  x_train = samples.groupby(['tile_id', target_column], group_keys=False).apply(
  x_train = samples.groupby(['tile_id', target_column], group_keys=False).apply(
  x_train = samples.groupby(['tile_id', target_column], group_keys=False).apply(
  x_train = samples.groupby(['tile_id', target_column], group_keys=False).apply(
  x_train = samples.groupby(['tile_id', target_column], group_keys=False).apply(
  x_train = samples.groupby(['tile_id', target_column], group_keys=False).apply(
  x_train = samples.groupby(['tile_id', target_column], group_keys=False).apply(
  x_train = samples.groupby(['tile_id', target_column], group_keys=False).apply(
  x_train = samples.groupby(['tile_id', target_column], group_keys=False).apply(
  x_train = samples.groupby(['tile_id', target_column], group_keys=False).apply(
  x_train = samples.groupby(

In [None]:
for n_neighbors in [1, 3]:
    for target_column in class_name:
        if (OUTPUT_DIR / f'knn.nn_{n_neighbors}.{target_column}.lz4').exists():
            continue

        models = []

        for n_fold in [f'{n:02}' for n in range(1, 6)]:
            samples = pd.read_parquet(INPUT_DIR / SAMPLES_FN.format(n_fold=n_fold))

            create_ovo_class(samples, class_name, class_values)

            samples = samples[np.logical_not(np.isnan(samples[target_column]))]

            model = knn_classifier(samples, target_column, covariates, n_neighbors=n_neighbors)

            model['#_fold'] = n_fold

            models.append(model)

        joblib.dump(models, OUTPUT_DIR / f'knn.nn_{n_neighbors}.{target_column}.lz4', compress='lz4')

In [None]:
# TODO: Identify best model hyperparameters and retrain only that configuration.

# Best model: model trained with n_neighbors=3 and metric='minkowski', observed to be the best performing configuration when isolated.

for target_column in class_name:
    if (OUTPUT_DIR / f'knn.nn_{n_neighbors}.{target_column}.lz4').exists():
        continue

    models = []

    for n_fold in [f'{n:02}' for n in range(1, 6)]:
        samples = pd.read_parquet(INPUT_DIR / SAMPLES_FN.format(n_fold=n_fold))

        create_ovo_class(samples, class_name, class_values)

        samples = samples[np.logical_not(np.isnan(samples[target_column]))]

        model = knn_classifier(samples, target_column, covariates, n_neighbors=3, metrics='')

        model['#_fold'] = n_fold

        models.append(model)

    joblib.dump(models, OUTPUT_DIR / f'knn.nn_{n_neighbors}.{target_column}.lz4', compress='lz4')

## SVC

Para treinamento em quantidades maiores de amostras sera importante migrar para abordagens paralelas como propostas pelo framework [cuML SVM](https://medium.com/rapids-ai/fast-support-vector-classification-with-rapids-cuml-6e49f4a7d89e).

In [None]:
def get_estimator(kernel="linear"):
    return SVC(kernel=kernel, probability=True, random_state=RANDOM_STATE)


def linear_svc(samples: pd.DataFrame, target_column: str, covariates: list[str], kernel="linear", sample_ratio=0.01):
    x_train = samples.groupby(['tile_id', target_column], group_keys=False).apply(
            lambda group: group.sample(frac=sample_ratio, random_state=RANDOM_STATE)
        )[covariates]
    y_train = samples.loc[x_train.index][target_column]

    t_start = time.time()

    estimator = get_estimator(kernel)
    estimator.fit(x_train, y_train)

    return {'model': estimator, 'training_time': time.time() - t_start}

In [None]:
# metrics = ['linear', 'poly', 'rbf']

class_name = ['oxc', 'oxn']
class_values = [([0], [1]), ([0], [2])]

for kernel in ['linear']:
    for target_column in class_name:
        if (OUTPUT_DIR / f'svc.k_{kernel}.{target_column}.lz4').exists():
            continue

        models = []

        for n_fold in [f'{n:02}' for n in range(1, 6)]:
            samples = pd.read_parquet(INPUT_DIR / SAMPLES_FN.format(n_fold=n_fold))

            create_ovo_class(samples, class_name, class_values)

            samples = samples[np.logical_not(np.isnan(samples[target_column]))]

            model = linear_svc(samples, target_column, covariates, kernel)

            model['#_fold'] =  n_fold

            models.append(model)

        joblib.dump(models, OUTPUT_DIR / f'svc.k_{kernel}.{target_column}.lz4', compress='lz4')

  x_train = samples.groupby(['tile_id', target_column], group_keys=False).apply(
  x_train = samples.groupby(['tile_id', target_column], group_keys=False).apply(
  x_train = samples.groupby(['tile_id', target_column], group_keys=False).apply(
  x_train = samples.groupby(['tile_id', target_column], group_keys=False).apply(
  x_train = samples.groupby(['tile_id', target_column], group_keys=False).apply(
  x_train = samples.groupby(['tile_id', target_column], group_keys=False).apply(


# LogisticRegression

In [None]:
def get_estimator():
    return LogisticRegression(n_jobs=-1, random_state=RANDOM_STATE)


def random_forest(samples: pd.DataFrame, target_column: str, covariates: list[str]):
    x_train = samples[covariates]
    y_train = samples[target_column]

    t_start = time.time()

    estimator = get_estimator()
    estimator.fit(x_train, y_train)

    return {'model': estimator, 'training_time': time.time() - t_start}

In [None]:
class_name = ['oxc', 'oxn']
class_values = [([0], [1]), ([0], [2])]

for kernel in ['linear', 'poly', 'rbf']:
    for target_column in class_name:
        if (OUTPUT_DIR / f'svc.k_{kernel}.{target_column}.lz4').exists():
            continue

        models = []

        for n_fold in [f'{n:02}' for n in range(1, 6)]:
            samples = pd.read_parquet(INPUT_DIR / SAMPLES_FN)

            create_ovo_class(samples, class_name, class_values)

            samples = samples[np.logical_not(np.isnan(samples[target_column]))]

            model = linear_svc(samples, target_column, covariates, kernel)

            model['#_fold'] =  n_fold

            models.append(model)

        joblib.dump(models, OUTPUT_DIR / f'svc.k_{kernel}.{target_column}.lz4', compress='lz4')

## Neural Network

In [6]:
import tensorflow as tf

print(f"Versão do TensorFlow: {tf.__version__}")

# Lista os dispositivos físicos que o TensorFlow pode usar
gpus = tf.config.list_physical_devices('GPU')

if gpus:
    print(f"Número de GPUs encontradas: {len(gpus)}")
    print(f"Detalhes: {gpus}")
    
    try:
        # Tenta alocar memória na primeira GPU para confirmar que está funcional
        tf.config.experimental.set_memory_growth(gpus[0], True)
        print("✅ GPU está pronta para uso (memory growth ativado).")
    except RuntimeError as e:
        print(f"❌ Erro ao inicializar a GPU: {e}")
        
else:
    print("❌ NENHUMA GPU compatível foi encontrada pelo TensorFlow.")
    print("O modelo irá treinar usando a CPU (muito mais lento).")

2025-11-12 15:33:26.810542: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-11-12 15:33:26.818240: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1762972406.827880    2272 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1762972406.831672    2272 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1762972406.839884    2272 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

Versão do TensorFlow: 2.19.1
Número de GPUs encontradas: 1
Detalhes: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
✅ GPU está pronta para uso (memory growth ativado).


In [7]:
import tensorflow as tf
from keras import layers, models, optimizers
from keras.callbacks import EarlyStopping

# Modelo sequencial com uma única camada densa
# melhor modelo com 7 layers 256, activation='relu' e dropout=0.4; early_stopping -> default com 3 espera. Sempre apenas um epoch.
model = models.Sequential([
    layers.Dense(512, input_shape=(64,), activation='sigmoid'),
    layers.BatchNormalization(),
    layers.Dropout(0.4),
    layers.Dense(512, activation='sigmoid'),
    layers.BatchNormalization(),
    layers.Dropout(0.4),
    layers.Dense(512, activation='sigmoid'),
    layers.BatchNormalization(),
    layers.Dropout(0.4),
    layers.Dense(512, activation='sigmoid'),
    layers.BatchNormalization(),
    layers.Dropout(0.4),
    layers.Dense(1, activation='sigmoid')
])

# Compila o modelo
model.compile(
    optimizer='adam',
    loss='crossentropy',
    metrics=['precision', 'recall']
)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
I0000 00:00:1762972409.029788    2272 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 13065 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 5080, pci bus id: 0000:02:00.0, compute capability: 12.0


In [8]:
samples_train = pd.read_parquet(INPUT_DIR / 'samples.split_01.frac_20.train.pq')

create_ovo_class(samples_train, class_name, class_values)

samples_train = samples_train[np.logical_not(np.isnan(samples_train['oxc']))]

In [9]:
samples_valid = pd.read_parquet(INPUT_DIR / 'samples.split_01.test.pq')

create_ovo_class(samples_valid, class_name, class_values)

samples_valid = samples_valid[np.logical_not(np.isnan(samples_valid['oxc']))]

In [10]:
early_stopping_monitor = EarlyStopping(
    monitor='val_precision',     # Métrica a monitorizar
    min_delta=0.05,
    patience=10,            # Número de épocas sem melhoria antes de parar
    verbose=1,              # Imprime uma mensagem quando para
    mode='max',             # 'min' porque a perda (loss) deve minimizar
    restore_best_weights=True # Restaura os melhores pesos encontrados
)

model.fit(samples_train[covariates], samples_train['oxc'], epochs=50,
            batch_size=8192, callbacks=[early_stopping_monitor],
            validation_data=(samples_valid[covariates], samples_valid['oxc']))

Epoch 1/50


I0000 00:00:1762972427.154476    2377 service.cc:152] XLA service 0x767fb40099c0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1762972427.154504    2377 service.cc:160]   StreamExecutor device (0): NVIDIA GeForce RTX 5080, Compute Capability 12.0
2025-11-12 15:33:47.184483: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1762972427.363750    2377 cuda_dnn.cc:529] Loaded cuDNN version 91002


[1m 14/772[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m2s[0m 4ms/step - loss: 0.8955 - precision: 0.1676 - recall: 0.6456  

I0000 00:00:1762972428.678224    2377 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m772/772[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 12ms/step - loss: 0.2373 - precision: 0.6064 - recall: 0.5789 - val_loss: 0.2258 - val_precision: 0.6356 - val_recall: 0.6492
Epoch 2/50
[1m772/772[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 6ms/step - loss: 0.1669 - precision: 0.7753 - recall: 0.6487 - val_loss: 0.2242 - val_precision: 0.6823 - val_recall: 0.5868
Epoch 3/50
[1m772/772[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 6ms/step - loss: 0.1515 - precision: 0.7968 - recall: 0.6873 - val_loss: 0.2348 - val_precision: 0.6848 - val_recall: 0.5568
Epoch 4/50
[1m772/772[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 6ms/step - loss: 0.1411 - precision: 0.8121 - recall: 0.7129 - val_loss: 0.2400 - val_precision: 0.6939 - val_recall: 0.5557
Epoch 5/50
[1m772/772[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 6ms/step - loss: 0.1330 - precision: 0.8228 - recall: 0.7319 - val_loss: 0.2463 - val_precision: 0.6911 - val_recall: 0.5551

<keras.src.callbacks.history.History at 0x7682860872f0>