# TODO

- [ ] Avaliar o custo de tempo de inferência de cada modelo.

# Path

In [None]:
from pathlib import Path

# Directory the training sample files are read from.
INPUT_DIR = Path("../data")
# Directory the fitted models and evaluation results are written to.
OUTPUT_DIR = Path("../models")

# File-name template for the per-fold training samples; the {n_fold}
# placeholder is filled in with str.format (two-digit fold id such as "01").
SAMPLES_FN = "{n_fold}_samples_train.pq"

# Training

In [None]:
import time

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, precision_recall_curve
from sklearn.model_selection import GroupKFold, cross_val_predict
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

In [3]:
# Number of folds used by the grouped cross-validation.
N_SPLITS = 5
# Seed passed to the estimators and to the per-group sampling.
RANDOM_STATE = 0

# Column holding the original class label the binary targets are derived from.
TARGET_COLUMN = 'class'
# Column used as the group key when splitting cross-validation folds.
SPATIAL_CROSS_VALIDATION_COLUMN = 'tile_id'

In [None]:
def target_ovo(samples: pd.DataFrame, class_name: str, class_a: list[int], class_b: list[int]):
    """Add a binary target column derived from ``TARGET_COLUMN``, in place.

    Labels listed in ``class_a`` are mapped to ``0.0`` and labels listed in
    ``class_b`` to ``1.0``; a label present in both lists ends up as ``1.0``.
    Labels in neither list have no entry in the mapping, so ``Series.map``
    leaves them as NaN.

    Args:
        samples: Frame containing ``TARGET_COLUMN``; it is modified in place.
        class_name: Name of the column to create (or overwrite).
        class_a: Original labels that form the ``0.0`` side.
        class_b: Original labels that form the ``1.0`` side.
    """
    label_to_binary = dict.fromkeys(class_a, 0.0)
    label_to_binary.update(dict.fromkeys(class_b, 1.0))

    samples[class_name] = samples[TARGET_COLUMN].map(label_to_binary)


def create_ovo_class(samples: pd.DataFrame, class_name: list[str], class_values: list[tuple[list[int], list[int]]]):
    """Create one binary target column per entry of ``class_name``, in place.

    ``class_name`` and ``class_values`` are paired positionally; each value
    holds the two label groups handed to ``target_ovo``. When a name occurs
    more than once, only its last pairing is used.

    Args:
        samples: Frame that receives the new columns (modified in place).
        class_name: Names of the columns to create.
        class_values: For each name, the label groups at positions 0 and 1.
    """
    groups_by_column = dict(zip(class_name, class_values))

    for column, label_groups in groups_by_column.items():
        target_ovo(samples, column, label_groups[0], label_groups[1])

In [5]:
def get_optimal_threshold(y_true: pd.DataFrame, y_pred):
    """Return the score threshold at which precision and recall are closest.

    Points of the precision-recall curve where precision or recall is zero
    are not considered. Among the remaining points the one with the smallest
    absolute precision/recall gap wins (the first one on ties).

    Args:
        y_true: True binary labels.
        y_pred: Scores or probabilities of the positive class.

    Returns:
        The threshold belonging to the selected curve point.

    Raises:
        ValueError: If no curve point has both non-zero precision and
            non-zero recall.
    """
    precision, recall, threshold = precision_recall_curve(y_true, y_pred)

    # precision/recall carry one trailing point (precision=1, recall=0) that
    # has no threshold; drop it so all three arrays share the same indexing.
    precision = precision[:-1]
    recall = recall[:-1]

    # Keep the ORIGINAL positions of the usable points. Indexing `threshold`
    # with a position inside the masked arrays would be wrong as soon as a
    # masked-out point precedes the optimum.
    candidate_idx = np.flatnonzero((precision != 0.0) & (recall != 0.0))
    if candidate_idx.size == 0:
        raise ValueError(
            "no threshold with non-zero precision and recall; "
            "check that y_true contains positive samples"
        )

    gap = np.abs(precision[candidate_idx] - recall[candidate_idx])

    return threshold[candidate_idx[np.argmin(gap)]]

## Baseline

In [None]:
def get_estimator():
    """Build an unfitted random forest seeded with ``RANDOM_STATE`` (``n_jobs=-1``)."""
    forest_params = {'n_jobs': -1, 'random_state': RANDOM_STATE}
    return RandomForestClassifier(**forest_params)


def random_forest(samples: pd.DataFrame, target_column: str, covariates: list[str]):
    """Cross-validate the baseline estimator and store the out-of-fold results.

    Runs a grouped K-fold cross-validation (groups taken from
    ``SPATIAL_CROSS_VALIDATION_COLUMN``), picks the threshold that balances
    precision and recall on the out-of-fold probabilities, and dumps the
    probabilities, the threshold and the resulting recall / precision / F1
    to ``OUTPUT_DIR / 'bl_<target_column>.lz4'``.

    Args:
        samples: Training samples; must contain ``covariates``,
            ``target_column`` and ``SPATIAL_CROSS_VALIDATION_COLUMN``.
        target_column: Name of the binary target column.
        covariates: Names of the feature columns.

    Returns:
        None. The results are written to disk.
    """
    # Create the output directory up front so a missing folder cannot make the
    # dump fail after the expensive cross-validation has already run.
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    X = samples[covariates]
    y = samples[target_column]

    cv_result = cross_val_predict(
        get_estimator(), X, y,
        method='predict_proba',
        cv=GroupKFold(N_SPLITS),
        groups=samples[SPATIAL_CROSS_VALIDATION_COLUMN],
        verbose=True,
        n_jobs=-1,
    )

    # Second predict_proba column; presumably the probability of label 1.0
    # when y is the 0.0/1.0 target produced by target_ovo — TODO confirm.
    positive_proba = cv_result[:, 1]

    op_threshold = get_optimal_threshold(y, positive_proba)
    y_pred = (positive_proba >= op_threshold).astype(int)

    joblib.dump({
        'cv_result': pd.DataFrame({
            'predict_proba': positive_proba,
            'expected': y.to_numpy(),
        }),
        'threshold': op_threshold,
        'recall': recall_score(y, y_pred),
        'precision': precision_score(y, y_pred),
        'f1_score': f1_score(y, y_pred)
    }, OUTPUT_DIR / f'bl_{target_column}.lz4', compress='lz4')

In [None]:
# Binary targets: each name is paired with (labels mapped to 0.0, labels mapped to 1.0).
class_name = ['oxc', 'oxn']
class_values = [([0], [1]), ([0], [2])]

# Feature columns B01..B64, defined here so this cell can run on its own.
covariates = [f'B{n + 1:02}' for n in range(64)]

# NOTE(review): SAMPLES_FN is a template with a {n_fold} placeholder that is
# not filled in here, so the path contains the literal text "{n_fold}".
# Confirm which sample file the baseline cross-validation is meant to read.
samples = pd.read_parquet(INPUT_DIR / SAMPLES_FN)

# Both target columns are created in one call, so the file is read only once.
create_ovo_class(samples, class_name, class_values)

for target_column in class_name:
    # Rows whose original label belongs to neither side of the pair are NaN.
    labelled = samples[samples[target_column].notna()]

    random_forest(labelled, target_column, covariates)

## Random Forest

In [None]:
def get_estimator():
    """Build an unfitted random forest seeded with ``RANDOM_STATE`` (``n_jobs=-1``)."""
    forest_params = {'n_jobs': -1, 'random_state': RANDOM_STATE}
    return RandomForestClassifier(**forest_params)


def random_forest(samples: pd.DataFrame, target_column: str, covariates: list[str]):
    """Fit the estimator from ``get_estimator()`` on all given samples and time it.

    Args:
        samples: Training samples containing ``covariates`` and ``target_column``.
        target_column: Name of the target column.
        covariates: Names of the feature columns.

    Returns:
        A dict with the fitted estimator under ``'model'`` and the
        ``time.time()`` readings taken before estimator creation
        (``'t_start'``) and after fitting (``'t_end'``).
    """
    features, labels = samples[covariates], samples[target_column]

    started_at = time.time()

    classifier = get_estimator()
    classifier.fit(features, labels)

    finished_at = time.time()

    return {'model': classifier, 't_start': started_at, 't_end': finished_at}

In [None]:
# Binary targets: each name is paired with (labels mapped to 0.0, labels mapped to 1.0).
class_name = ['oxc', 'oxn']
class_values = [([0], [1]), ([0], [2])]

# Feature columns B01..B64.
covariates = [f'B{n + 1:02}' for n in range(64)]

for target_column in class_name:
    # One output file per target, holding the models of every fold.
    output_fn = OUTPUT_DIR / f'rf.{target_column}.lz4'

    # Skip targets that were already trained. The check uses the exact name
    # written below, and sits outside the fold loop so nothing is re-dumped.
    if output_fn.exists():
        continue

    models = []

    for n_fold in [f'{n:02}' for n in range(1, 6)]:
        samples = pd.read_parquet(INPUT_DIR / SAMPLES_FN.format(n_fold=n_fold))

        create_ovo_class(samples, class_name, class_values)

        # Rows whose original label belongs to neither side of the pair are NaN.
        samples = samples[np.logical_not(np.isnan(samples[target_column]))]

        model = random_forest(samples, target_column, covariates)

        model['#_fold'] = n_fold

        models.append(model)

    # Persist every fold's model, not just the one trained last.
    joblib.dump(models, output_fn, compress='lz4')

## KNeighborsClassifier

In [None]:
def get_estimator(n_neighbors=1, metric='euclidean'):
    """Build an unfitted k-nearest-neighbours classifier.

    Args:
        n_neighbors: Number of neighbours used for the vote.
        metric: Distance metric name passed to ``KNeighborsClassifier``.

    Returns:
        The unfitted ``KNeighborsClassifier``.
    """
    # KNeighborsClassifier has no random_state parameter; passing one makes
    # the constructor raise TypeError.
    return KNeighborsClassifier(n_neighbors=n_neighbors, metric=metric)

def knn_classifier(samples: pd.DataFrame, target_column: str, covariates: list[str], n_neighbors, metric, sample_ratio=0.001):
    """Fit a k-NN classifier on a per-tile, per-class subsample and time the fit.

    Args:
        samples: Training samples containing ``'tile_id'``, ``covariates``
            and ``target_column``.
        target_column: Name of the target column.
        covariates: Names of the feature columns.
        n_neighbors: Number of neighbours, forwarded to ``get_estimator``.
        metric: Distance metric, forwarded to ``get_estimator``.
        sample_ratio: Fraction of rows drawn from every
            ``('tile_id', target_column)`` group.

    Returns:
        A dict with the fitted estimator under ``'model'`` and the
        ``time.time()`` readings taken before estimator creation
        (``'t_start'``) and after fitting (``'t_end'``).
    """
    # Draw the subsample once and take features AND labels from it. Using the
    # full label column with subsampled features gives arrays of different
    # length, which the estimator's fit rejects.
    subsample = samples.groupby(['tile_id', target_column]).sample(
        frac=sample_ratio, random_state=RANDOM_STATE
    )
    x_train = subsample[covariates]
    y_train = subsample[target_column]

    t_start = time.time()

    estimator = get_estimator(n_neighbors, metric)
    estimator.fit(x_train, y_train)

    return {'model': estimator, 't_start': t_start, 't_end': time.time()}

In [None]:
# Binary targets: each name is paired with (labels mapped to 0.0, labels mapped to 1.0).
class_name = ['oxc', 'oxn']
class_values = [([0], [1]), ([0], [2])]

# Feature columns B01..B64.
covariates = [f'B{n + 1:02}' for n in range(64)]

for n_neighbors in [1, 3]:
    for sample_ratio in [0.002, 0.004, 0.008]:
        for metric in ['euclidean', 'manhattan', 'cosine']:
            for target_column in class_name:
                # Built once so the existence check and the dump cannot drift apart.
                output_fn = OUTPUT_DIR / f'knn.nn_{n_neighbors}.sr_{int(sample_ratio*1000)}.m_{metric}.{target_column}.lz4'

                # Skip configurations that were already trained.
                if output_fn.exists():
                    continue

                models = []

                for n_fold in [f'{n:02}' for n in range(1, 6)]:
                    samples = pd.read_parquet(INPUT_DIR / SAMPLES_FN.format(n_fold=n_fold))

                    create_ovo_class(samples, class_name, class_values)

                    # Rows whose original label belongs to neither side of the pair are NaN.
                    samples = samples[np.logical_not(np.isnan(samples[target_column]))]

                    model = knn_classifier(samples, target_column, covariates, n_neighbors, metric, sample_ratio)

                    model['#_fold'] = n_fold

                    models.append(model)

                # Persist every fold's model, not just the one trained last.
                joblib.dump(models, output_fn, compress='lz4')

  x_train = samples.iloc[train_idx].groupby(['tile_id', target_column], group_keys=False).apply(
  x_train = samples.iloc[train_idx].groupby(['tile_id', target_column], group_keys=False).apply(
  x_train = samples.iloc[train_idx].groupby(['tile_id', target_column], group_keys=False).apply(
  x_train = samples.iloc[train_idx].groupby(['tile_id', target_column], group_keys=False).apply(
  x_train = samples.iloc[train_idx].groupby(['tile_id', target_column], group_keys=False).apply(
  x_train = samples.iloc[train_idx].groupby(['tile_id', target_column], group_keys=False).apply(
  x_train = samples.iloc[train_idx].groupby(['tile_id', target_column], group_keys=False).apply(
  x_train = samples.iloc[train_idx].groupby(['tile_id', target_column], group_keys=False).apply(
  x_train = samples.iloc[train_idx].groupby(['tile_id', target_column], group_keys=False).apply(
  x_train = samples.iloc[train_idx].groupby(['tile_id', target_column], group_keys=False).apply(


## SVC

Para treinar com quantidades maiores de amostras, será importante migrar para abordagens paralelas, como as propostas pelo framework [cuML SVM](https://medium.com/rapids-ai/fast-support-vector-classification-with-rapids-cuml-6e49f4a7d89e).

In [None]:
def get_estimator(kernel="linear"):
    """Build an unfitted ``SVC`` with probability estimates enabled.

    Args:
        kernel: Kernel name passed to ``SVC``.

    Returns:
        The unfitted ``SVC`` seeded with ``RANDOM_STATE``.
    """
    svc_options = dict(kernel=kernel, probability=True, random_state=RANDOM_STATE)
    return SVC(**svc_options)


def linear_svc(samples: pd.DataFrame, target_column: str, covariates: list[str], kernel="linear", sample_ratio=0.01):
    """Fit an SVC on a per-tile, per-class subsample and time the fit.

    Args:
        samples: Training samples containing ``'tile_id'``, ``covariates``
            and ``target_column``.
        target_column: Name of the target column.
        covariates: Names of the feature columns.
        kernel: Kernel name, forwarded to ``get_estimator``.
        sample_ratio: Fraction of rows drawn from every
            ``('tile_id', target_column)`` group.

    Returns:
        A dict with the fitted estimator under ``'model'`` and the
        ``time.time()`` readings taken before estimator creation
        (``'t_start'``) and after fitting (``'t_end'``).
    """
    # Draw the subsample once and take features AND labels from it. Using the
    # full label column with subsampled features gives arrays of different
    # length, which the estimator's fit rejects.
    subsample = samples.groupby(['tile_id', target_column]).sample(
        frac=sample_ratio, random_state=RANDOM_STATE
    )
    x_train = subsample[covariates]
    y_train = subsample[target_column]

    t_start = time.time()

    estimator = get_estimator(kernel)
    estimator.fit(x_train, y_train)

    return {'model': estimator, 't_start': t_start, 't_end': time.time()}

In [None]:
# Binary targets: each name is paired with (labels mapped to 0.0, labels mapped to 1.0).
class_name = ['oxc', 'oxn']
class_values = [([0], [1]), ([0], [2])]

# Feature columns B01..B64.
covariates = [f'B{n + 1:02}' for n in range(64)]

# Set explicitly so this cell does not pick up whatever value an earlier
# loop left in `sample_ratio`; 0.01 is also linear_svc's own default.
sample_ratio = 0.01

for kernel in ['linear', 'poly', 'rbf']:
    for target_column in class_name:
        # Built once so the existence check and the dump cannot drift apart.
        output_fn = OUTPUT_DIR / f'svc.k_{kernel}.{target_column}.lz4'

        # Skip configurations that were already trained.
        if output_fn.exists():
            continue

        models = []

        for n_fold in [f'{n:02}' for n in range(1, 6)]:
            # Fill the {n_fold} placeholder; the bare template is not a real file name.
            samples = pd.read_parquet(INPUT_DIR / SAMPLES_FN.format(n_fold=n_fold))

            create_ovo_class(samples, class_name, class_values)

            # Rows whose original label belongs to neither side of the pair are NaN.
            samples = samples[np.logical_not(np.isnan(samples[target_column]))]

            model = linear_svc(samples, target_column, covariates, kernel, sample_ratio)

            model['#_fold'] = n_fold

            models.append(model)

        # Persist every fold's model, not just the one trained last.
        joblib.dump(models, output_fn, compress='lz4')

  x_train = samples.iloc[train_idx].groupby(['tile_id', target_column], group_keys=False).apply(
  x_train = samples.iloc[train_idx].groupby(['tile_id', target_column], group_keys=False).apply(
  x_train = samples.iloc[train_idx].groupby(['tile_id', target_column], group_keys=False).apply(
  x_train = samples.iloc[train_idx].groupby(['tile_id', target_column], group_keys=False).apply(
  x_train = samples.iloc[train_idx].groupby(['tile_id', target_column], group_keys=False).apply(
  x_train = samples.iloc[train_idx].groupby(['tile_id', target_column], group_keys=False).apply(
  x_train = samples.iloc[train_idx].groupby(['tile_id', target_column], group_keys=False).apply(
  x_train = samples.iloc[train_idx].groupby(['tile_id', target_column], group_keys=False).apply(
  x_train = samples.iloc[train_idx].groupby(['tile_id', target_column], group_keys=False).apply(
  x_train = samples.iloc[train_idx].groupby(['tile_id', target_column], group_keys=False).apply(
  x_train = samples.iloc[train

## LogisticRegression

In [None]:
def get_estimator():
    """Build an unfitted logistic regression seeded with ``RANDOM_STATE`` (``n_jobs=-1``)."""
    logreg_params = {'n_jobs': -1, 'random_state': RANDOM_STATE}
    return LogisticRegression(**logreg_params)


def random_forest(samples: pd.DataFrame, target_column: str, covariates: list[str]):
    """Fit the estimator from ``get_estimator()`` on all given samples and time it.

    The function fits whatever ``get_estimator()`` currently returns; the
    name itself does not select the model type.

    Args:
        samples: Training samples containing ``covariates`` and ``target_column``.
        target_column: Name of the target column.
        covariates: Names of the feature columns.

    Returns:
        A dict with the fitted estimator under ``'model'`` and the
        ``time.time()`` readings taken before estimator creation
        (``'t_start'``) and after fitting (``'t_end'``).
    """
    features, labels = samples[covariates], samples[target_column]

    started_at = time.time()

    classifier = get_estimator()
    classifier.fit(features, labels)

    finished_at = time.time()

    return {'model': classifier, 't_start': started_at, 't_end': finished_at}

In [None]:
# Binary targets: each name is paired with (labels mapped to 0.0, labels mapped to 1.0).
class_name = ['oxc', 'oxn']
class_values = [([0], [1]), ([0], [2])]

# Feature columns B01..B64.
covariates = [f'B{n + 1:02}' for n in range(64)]

# This cell used to be a verbatim copy of the SVC training loop (kernels,
# svc.* file names, linear_svc), so no logistic regression was ever trained.
# NOTE(review): the 'lr.<target>.lz4' output name follows the other model
# files' pattern — confirm it matches what downstream notebooks expect.
for target_column in class_name:
    output_fn = OUTPUT_DIR / f'lr.{target_column}.lz4'

    # Skip targets that were already trained.
    if output_fn.exists():
        continue

    models = []

    for n_fold in [f'{n:02}' for n in range(1, 6)]:
        samples = pd.read_parquet(INPUT_DIR / SAMPLES_FN.format(n_fold=n_fold))

        create_ovo_class(samples, class_name, class_values)

        # Rows whose original label belongs to neither side of the pair are NaN.
        samples = samples[np.logical_not(np.isnan(samples[target_column]))]

        # random_forest() is the trainer defined in the cell above; despite
        # its name it fits the LogisticRegression built by get_estimator().
        model = random_forest(samples, target_column, covariates)

        model['#_fold'] = n_fold

        models.append(model)

    # Persist every fold's model.
    joblib.dump(models, output_fn, compress='lz4')

## Neural Network

In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models

# Sequential model: a 512-unit ReLU dense layer on 64 inputs, followed by a
# 2-unit softmax output layer
model = models.Sequential()
model.add(layers.Dense(512, input_shape=(64,), activation='relu'))
model.add(layers.Dense(2, activation='softmax'))

# Compile the model
# NOTE(review): categorical_crossentropy expects one-hot encoded targets —
# confirm the labels are encoded that way before fitting.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Print the model summary
model.summary()