# Carregar Dados

In [1]:
from pathlib import Path

INPUT_DIR = Path("../data")
OUTPUT_DIR = Path("../models")

SAMPLES_FN = "samples.pq"

# Misc

In [2]:
import os

class PathHandler():
    __value: str = ''
    
    @classmethod
    def generate_path(cls, file_name: str):
        return OUTPUT_DIR / f"{file_name}_{cls.__value}.lz4"
        
    @classmethod
    def set_value(cls, value: str):
        cls.__value = value

# Treinamento Modelos

In [3]:
import joblib

import numpy as np
import pandas as pd

from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GroupKFold, KFold, cross_val_predict
from sklearn.metrics import precision_score, recall_score, f1_score, precision_recall_curve

In [4]:
TARGET_COLUMN = 'class'

SPATIAL_CROSS_VALIDATION_COLUMN = 'tile_id'

CROSS_VALIDATION_NJOBS, CROSS_VALIDATION_FOLDS = 5, 5

RANDOM_STATE = 1989

In [5]:
def target_ovo(samples: pd.DataFrame, class_name: str, class_a: list[int], class_b: list[int]):
    remap_dict = {}
    
    remap_dict.update({val: 0.0 for val in class_a})
    remap_dict.update({val: 1.0 for val in class_b})
    
    samples[class_name] = samples[TARGET_COLUMN].map(remap_dict)


def create_ovo_class(samples: pd.DataFrame, class_name: list[str], class_values: list[tuple[list[int], list[int]]]):
    class_data = dict(zip(class_name, class_values))
    
    for class_key in class_data:
        value_a = class_data[class_key][0]
        value_b = class_data[class_key][1]
        
        target_ovo(samples, class_key, value_a, value_b)

## Random Forest

In [6]:
def get_optimal_threshold(y_true: pd.DataFrame, y_pred):
    precision, recall, threshold = precision_recall_curve(y_true, y_pred)
    
    nonzero_mask = np.logical_and((precision != 0.0), (recall != 0.0))
    
    optimal_idx = np.argmax(1 - np.abs(precision[nonzero_mask] - recall[nonzero_mask]))
    
    return threshold[optimal_idx]

In [7]:
def get_estimator():
    return RandomForestClassifier(n_jobs=-1)


def random_forest(samples: pd.DataFrame, target_column: str, covariates: list[str]):
    tc_samples = samples[np.logical_not(np.isnan(samples[target_column]))]

    X = tc_samples[covariates]
    y = tc_samples[target_column]
    
    estimator = get_estimator()

    cv_result = cross_val_predict(
        estimator, X, y,
        method='predict_proba',
        cv=GroupKFold(CROSS_VALIDATION_FOLDS),
        groups=tc_samples[SPATIAL_CROSS_VALIDATION_COLUMN],
        verbose=False,
        n_jobs=-1,
    )

    estimator.fit(X, y)

    op_threshold = get_optimal_threshold(y, cv_result[:,1])

    y_pred = (cv_result[:, 1] >= op_threshold).astype(int)

    joblib.dump({
        'cv_result': pd.DataFrame({
            'predict_proba': cv_result[:,1],
            'expected': y.to_numpy(),
        }),
        'threshold': op_threshold,
        'recall': recall_score(y, y_pred),
        'precision': precision_score(y, y_pred),
        'f1_score': f1_score(y, y_pred),
        'model': estimator,
    }, OUTPUT_DIR / f'{target_column}.lz4', compress='lz4')

### AlphaEarth Embeddings

In [8]:
class_name = ['other_vs_cultivated', 'other_vs_natural']
class_values = [([0], [1]), ([0], [2])]

samples = pd.read_parquet(INPUT_DIR / SAMPLES_FN)

samples = samples[~samples['is_valid']].copy()

covariates = [f'B{n + 1}' for n in range(64)]

In [9]:
create_ovo_class(samples, class_name, class_values)

In [10]:
for target_column in class_name:
    PathHandler.set_value(target_column)

    random_forest(samples, target_column, covariates)

### Landsat

In [None]:
class_name = ['other_vs_cultivated', 'other_vs_natural']
class_values = [([3], [1]), ([3], [2])]

PathHandler.set_path(f'')

samples = pd.read_parquet(os.path.join(SAMPLES_PATH, "landsat_train_samples.parquet"))

covariates = list(samples.columns)[2:]

create_ovo_class(samples, class_name, class_values)

for target_column in class_name:
    PathHandler.set_value(target_column)

    random_forest(samples, target_column, covariates)

## kNN

In [None]:

def knn_classifier(samples: pd.DataFrame, target_column: str, covariates: list[str], save_path: str):
    tc_samples = samples[np.logical_not(np.isnan(samples[target_column]))]

    X = tc_samples[covariates]
    y = tc_samples[target_column]

    estimator = get_estimator()

    cv_result = cross_val_predict(
        estimator, X, y,
        method='predict_proba',
        cv=GroupKFold(CROSS_VALIDATION_FOLDS),
        groups=tc_samples[SPATIAL_CROSS_VALIDATION_COLUMN],
        n_jobs=-1,
        verbose=0
    )

    estimator.fit(X, y)

    op_threshold = get_optimal_threshold(y, cv_result[:, 1])

    y_pred = (cv_result[:, 1] >= op_threshold).astype(int)

    joblib.dump({
        'cv_result': pd.DataFrame({
            'predict_proba': cv_result[:, 1],
            'expected': y.to_numpy(),
        }),
        'threshold': op_threshold,
        'recall': recall_score(y, y_pred),
        'precision': precision_score(y, y_pred),
        'f1_score': f1_score(y, y_pred),
        'model': estimator,
    }, os.path.join(save_path, 'knn_model.joblib'), compress='lz4')

In [14]:
import joblib

# Arquivo salvo com joblib.dump(..., compress=('lz4', 3))
arquivo = "C:/Users/tiago/Documents/Workspace/UFG/pfc-cc-2025/models/other_vs_natural.lz4"

# Carregar o objeto do arquivo
dados = joblib.load(arquivo)

print(dados)

Exception ignored in: <_io.BufferedReader>
Traceback (most recent call last):
  File "c:\Users\tiago\anaconda3\envs\gdal\Lib\site-packages\lz4\frame\__init__.py", line 753, in flush
    self._fp.flush()
ValueError: I/O operation on closed file.


{'cv_result':          predict_proba  expected
0                 0.10       0.0
1                 0.02       0.0
2                 0.13       0.0
3                 0.13       0.0
4                 0.02       0.0
...                ...       ...
5574193           0.05       0.0
5574194           0.06       0.0
5574195           0.10       0.0
5574196           0.11       0.0
5574197           0.11       0.0

[5574198 rows x 2 columns], 'threshold': np.float64(0.32), 'recall': 0.6121106125187814, 'precision': 0.6038804474121208, 'f1_score': 0.6079676779442008, 'model': RandomForestClassifier(n_jobs=-1)}
