# Carregar Dados

In [None]:
import os
import pandas as pd

# Directory where trained model artifacts are written. Absolute, machine-specific
# path: adjust it when running the notebook on another machine.
MODELS_PATH = "C:/Users/tiago/Documents/Workspace/UFG/pfc-cc-2025/models"

# Directory the training sample files are read from. Absolute, machine-specific path.
SAMPLES_PATH = "C:/Users/tiago/Documents/Workspace/UFG/pfc-cc-2025/data"

# Misc

In [None]:
import os

class PathHandler:
    """Build the file paths under ``MODELS_PATH`` where model artifacts are saved.

    All state lives on the class itself (nothing is instantiated), so the tag
    and sub-directory configured here are shared by every caller in the session
    and persist until they are set again.
    """

    # Tag appended to every generated file name; configured with ``set_value``.
    __value: str = ''
    # Optional sub-directory below MODELS_PATH; empty means "directly in MODELS_PATH".
    __path: str = ''

    @classmethod
    def generate_path(cls, file_name: str):
        """Return ``<MODELS_PATH>[/<path>]/<file_name>_<value>.lz4`` as a string.

        The path is only composed, never created: any configured sub-directory
        must already exist before a file is written there.
        """
        base_dir = f'{MODELS_PATH}/{cls.__path}' if cls.__path else MODELS_PATH
        return f'{base_dir}/{file_name}_{cls.__value}.lz4'

    @classmethod
    def set_value(cls, value: str):
        """Set the tag that is appended to every generated file name."""
        cls.__value = value

    @classmethod
    def set_path(cls, path: str):
        """Set an optional sub-directory (relative to ``MODELS_PATH``) for outputs.

        Passing an empty string resets to writing directly in ``MODELS_PATH``.
        """
        # NOTE(review): the notebook already calls ``PathHandler.set_path('')``
        # but this method did not exist (AttributeError). Treating the argument
        # as a sub-directory is an assumption -- confirm the intended meaning.
        cls.__path = path.strip('/')

# Treinamento de Modelos

In [None]:
import sys
import joblib
import multiprocessing

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from pathlib import Path

from scipy.signal import argrelmin
from scipy.stats import uniform, randint

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GroupKFold, KFold, cross_val_predict
from sklearn.metrics import precision_score, recall_score, f1_score, precision_recall_curve

In [None]:
# Column holding the original class labels; the one-vs-one targets are derived from it.
TARGET_COLUMN = 'class'

# NOTE(review): presumably the first covariate column of the sample tables; it is
# not referenced by the visible cells -- confirm.
COVARIATE_START_COLUMN = 'ml_type'
# Column whose values are passed as ``groups`` to the grouped cross-validation.
SPATIAL_CROSS_VALIDATION_COLUMN = 'ml_cv_group'

# Parallel jobs and number of folds for cross-validation.
# NOTE(review): the cross-validation call in this notebook passes n_jobs=-1 rather
# than CROSS_VALIDATION_NJOBS -- confirm which one is intended.
CROSS_VALIDATION_NJOBS, CROSS_VALIDATION_FOLDS = 5, 5

# Seed for random number generation, to make runs repeatable.
RANDOM_STATE = 1989

In [8]:
def target_ovo(samples: pd.DataFrame, class_name: str, class_a: list[int], class_b: list[int]):
    """Add a binary one-vs-one target column to ``samples`` (modified in place).

    Labels of ``TARGET_COLUMN`` listed in ``class_a`` become 0 and those listed
    in ``class_b`` become 1. Labels that appear in neither list are left
    unmapped (missing) in the new column.

    Args:
        samples: Table containing ``TARGET_COLUMN``; receives the new column.
        class_name: Name of the column to create or overwrite.
        class_a: Original labels mapped to 0.
        class_b: Original labels mapped to 1.
    """
    # class_b is merged last, so it wins if a label is listed on both sides.
    label_to_binary = {
        **dict.fromkeys(class_a, 0),
        **dict.fromkeys(class_b, 1),
    }

    samples[class_name] = samples[TARGET_COLUMN].map(label_to_binary)


def create_ovo_class(samples: pd.DataFrame, class_name: list[str], class_values: list[tuple[list[int], list[int]]]):
    """Create one binary one-vs-one column per entry of ``class_name``.

    ``class_name`` and ``class_values`` are paired positionally; each value is
    a ``(labels mapped to 0, labels mapped to 1)`` pair handed to
    ``target_ovo``. ``samples`` is modified in place.
    """
    # Going through a dict means a repeated column name keeps only its last pair.
    groups_by_column = dict(zip(class_name, class_values))

    for column, label_groups in groups_by_column.items():
        target_ovo(samples, column, label_groups[0], label_groups[1])

## Random Forest

In [9]:
def get_optimal_threshold(y_true: pd.DataFrame, y_pred):
    """Return the decision threshold where precision and recall are closest.

    Args:
        y_true: True binary labels.
        y_pred: Scores/probabilities of the positive class, aligned with ``y_true``.

    Returns:
        The threshold from the precision-recall curve that minimises
        ``|precision - recall|``, considering only points where both are non-zero.

    Raises:
        ValueError: If no curve point has both non-zero precision and recall.
    """
    precision, recall, threshold = precision_recall_curve(y_true, y_pred)

    # precision/recall end with one extra point that has no matching threshold;
    # drop it so all three arrays share the same indexing.
    precision, recall = precision[:-1], recall[:-1]

    # Keep positions in the ORIGINAL arrays: indexing ``threshold`` with a
    # position inside the filtered arrays would pick the wrong element
    # whenever an excluded point precedes the optimum.
    candidate_idx = np.flatnonzero(np.logical_and(precision != 0.0, recall != 0.0))

    if candidate_idx.size == 0:
        raise ValueError(
            'No threshold with non-zero precision and recall; cannot pick an optimal threshold.'
        )

    gap = np.abs(precision[candidate_idx] - recall[candidate_idx])

    return threshold[candidate_idx[np.argmin(gap)]]

In [None]:
def get_estimator():
    """Return a new, unfitted random forest classifier.

    The forest uses every available CPU core (``n_jobs=-1``) and is seeded with
    the notebook-wide ``RANDOM_STATE`` so repeated runs build the same trees.
    """
    return RandomForestClassifier(n_jobs=-1, random_state=RANDOM_STATE)


def random_forest(samples: pd.DataFrame, target_column: str, covariates: list[str],
                  model_name: str = 'random_forest'):
    """Cross-validate, fit and save a random forest for one binary target.

    Rows whose ``target_column`` is missing are ignored. Out-of-fold class
    probabilities are obtained with grouped cross-validation, a decision
    threshold is chosen from them, and the estimator is then fitted on all
    remaining rows. Everything is written to a single file at
    ``PathHandler.generate_path(model_name)``.

    Args:
        samples: Table with the covariates, ``target_column`` and the
            ``SPATIAL_CROSS_VALIDATION_COLUMN`` group column.
        target_column: Name of the binary (0/1) target column.
        covariates: Names of the feature columns.
        model_name: File-name prefix handed to ``PathHandler.generate_path``.
            NOTE(review): the original call had no path argument at all
            (``os.path.join()``), so this default is an assumption -- confirm.
    """
    # Keep only the rows that take part in this one-vs-one problem.
    tc_samples = samples[samples[target_column].notna()]

    X = tc_samples[covariates]
    y = tc_samples[target_column]

    estimator = get_estimator()

    # Out-of-fold probabilities; folds are built so that rows sharing a group
    # value never end up on both the training and the test side of a split.
    cv_result = cross_val_predict(
        estimator, X, y,
        method='predict_proba',
        cv=GroupKFold(CROSS_VALIDATION_FOLDS),
        groups=tc_samples[SPATIAL_CROSS_VALIDATION_COLUMN],
        verbose=False,
        n_jobs=-1,
    )

    # Second column = probability of the larger label; assumes both 0 and 1 are present.
    positive_proba = cv_result[:, 1]

    # cross_val_predict works on clones, so the estimator itself is still
    # unfitted here; fit it on every available row for the saved model.
    estimator.fit(X, y)

    op_threshold = get_optimal_threshold(y, positive_proba)

    y_pred = (positive_proba >= op_threshold).astype(int)

    # compress='lz4' needs the optional ``lz4`` package to be installed.
    joblib.dump({
        'cv_result': pd.DataFrame({
            'predict_proba': positive_proba,
            'expected': y.to_numpy(),
        }),
        'threshold': op_threshold,
        'recall': recall_score(y, y_pred),
        'precision': precision_score(y, y_pred),
        'f1_score': f1_score(y, y_pred),
        'model': estimator,
    }, PathHandler.generate_path(model_name), compress='lz4')

### AlphaEarth Embeddings

In [None]:
# One-vs-one problems: each derived column name is paired positionally with
# (original labels mapped to 0, original labels mapped to 1).
class_name = ['other_vs_cultivated', 'other_vs_natural']
class_values = [([3], [1]), ([3], [2])]

samples = pd.read_parquet(os.path.join(SAMPLES_PATH, "alphaearth_train_samples.parquet"))

# Collected BEFORE the derived target columns are added, so those never become features.
# NOTE(review): assumes the first two columns of the file are not covariates -- confirm.
covariates = list(samples.columns)[2:]

create_ovo_class(samples, class_name, class_values)

for target_column in class_name:
    # The previous ``PathHandler.set_path('')`` call targeted a method that did
    # not exist. The dataset name is put in the file tag instead, because the
    # Landsat cell uses the same target names and would produce the same files.
    PathHandler.set_value(f'alphaearth_{target_column}')

    random_forest(samples, target_column, covariates)

: 

### Landsat

In [None]:
# One-vs-one problems: each derived column name is paired positionally with
# (original labels mapped to 0, original labels mapped to 1).
class_name = ['other_vs_cultivated', 'other_vs_natural']
class_values = [([3], [1]), ([3], [2])]

samples = pd.read_parquet(os.path.join(SAMPLES_PATH, "landsat_train_samples.parquet"))

# Collected BEFORE the derived target columns are added, so those never become features.
# NOTE(review): assumes the first two columns of the file are not covariates -- confirm.
covariates = list(samples.columns)[2:]

create_ovo_class(samples, class_name, class_values)

for target_column in class_name:
    # The previous ``PathHandler.set_path('')`` call targeted a method that did
    # not exist. The dataset name is put in the file tag instead, because the
    # AlphaEarth cell uses the same target names and would produce the same files.
    PathHandler.set_value(f'landsat_{target_column}')

    random_forest(samples, target_column, covariates)

# END