# Path

In [1]:
from pathlib import Path

INPUT_DIR = Path("../data")
OUTPUT_DIR = Path("../models")

SAMPLES_FN = "{n_fold}_samples_train.pq"

# Training

In [2]:
import time
import joblib
import numpy as np
import pandas as pd

import xgboost as xgb

import lightgbm as lgb

from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score, precision_recall_curve

In [3]:
N_SPLITS = 5
RANDOM_STATE = 0

TARGET_COLUMN = 'class'
SPATIAL_CROSS_VALIDATION_COLUMN = 'tile_id'

class_name = ['oxc', 'oxn']
class_values = [([0], [1]), ([0], [2])]

covariates = [f'B{n:02}' for n in range(1, 65)]

In [4]:
def target_ovo(samples: pd.DataFrame, class_name: str, class_a: list[int], class_b: list[int]):
    remap_dict = {}
    
    remap_dict.update({val: 0.0 for val in class_a})
    remap_dict.update({val: 1.0 for val in class_b})
    
    samples[class_name] = samples[TARGET_COLUMN].map(remap_dict)


def create_ovo_class(samples: pd.DataFrame, class_name: list[str], class_values: list[tuple[list[int], list[int]]]):
    class_data = dict(zip(class_name, class_values))
    
    for class_key in class_data:
        value_a = class_data[class_key][0]
        value_b = class_data[class_key][1]
        
        target_ovo(samples, class_key, value_a, value_b)

## Random Forest

In [5]:
def get_estimator():
    return RandomForestClassifier(n_jobs=-1, random_state=RANDOM_STATE)


def random_forest(samples: pd.DataFrame, target_column: str, covariates: list[str]):
    x_train = samples[covariates]
    y_train = samples[target_column]

    t_start = time.time()

    estimator = get_estimator()
    estimator.fit(x_train, y_train)

    return {'model': estimator, 't_start': t_start, 't_end': time.time()}

In [6]:
class_name = ['oxc', 'oxn']
class_values = [([0], [1]), ([0], [2])]

covariates = [f'B{n + 1:02}' for n in range(64)]

for target_column in class_name:
    if (OUTPUT_DIR / f'rf.{target_column}.lz4').exists():
        continue

    models = []

    for n_fold in [f'{n:02}' for n in range(1, 6)]:
        samples = pd.read_parquet(INPUT_DIR / SAMPLES_FN.format(n_fold=n_fold))

        create_ovo_class(samples, class_name, class_values)

        samples = samples[np.logical_not(np.isnan(samples[target_column]))]

        model = random_forest(samples, target_column, covariates)

        model['#_fold'] = n_fold

        models.append(model)

    joblib.dump(models, OUTPUT_DIR / f'rf.{target_column}.lz4', compress='lz4')

## XGBoost

In [None]:
def get_estimator():
    return xgb.XGBClassifier(n_jobs=-1, objective='binary:logistic', booster='gbtree', eval_metric='mlogloss', random_state=RANDOM_STATE)


def random_forest(samples: pd.DataFrame, target_column: str, covariates: list[str]):
    x_train = samples[covariates]
    y_train = samples[target_column]

    t_start = time.time()

    estimator = get_estimator()
    estimator.fit(x_train, y_train)

    return {'model': estimator, 't_start': t_start, 't_end': time.time()}

In [None]:
class_name = ['oxc', 'oxn']
class_values = [([0], [1]), ([0], [2])]

covariates = [f'B{n + 1:02}' for n in range(64)]

for target_column in class_name:
    filename = f'xgb.{target_column}.lz4'

    if (OUTPUT_DIR / filename).exists():
        continue

    models = []

    for n_fold in [f'{n:02}' for n in range(1, 6)]:
        samples = pd.read_parquet(INPUT_DIR / SAMPLES_FN.format(n_fold=n_fold))

        create_ovo_class(samples, class_name, class_values)

        samples = samples[np.logical_not(np.isnan(samples[target_column]))]

        model = random_forest(samples, target_column, covariates)

        model['#_fold'] = n_fold

        models.append(model)

    joblib.dump(models, OUTPUT_DIR / filename, compress='lz4')

## LightGBM

In [5]:
def get_estimator():
    return lgb.LGBMClassifier(n_jobs=-1, random_state=RANDOM_STATE)


def random_forest(samples: pd.DataFrame, target_column: str, covariates: list[str]):
    x_train = samples[covariates]
    y_train = samples[target_column]

    t_start = time.time()

    estimator = get_estimator()
    estimator.fit(x_train, y_train)

    return {'model': estimator, 't_start': t_start, 't_end': time.time()}

In [6]:
class_name = ['oxc', 'oxn']
class_values = [([0], [1]), ([0], [2])]

covariates = [f'B{n + 1:02}' for n in range(64)]

for frac in [10, 20, 30, 40]:
    for target_column in class_name:
        filename = f'lgbm.{target_column}.frac_{frac}.lz4'

        if (OUTPUT_DIR / filename).exists():
            continue

        models = []

        for n_fold in [f'{n:02}' for n in range(1, 6)]:
            samples = pd.read_parquet(INPUT_DIR / f'samples.split_{n_fold}.frac_{frac}.train.pq')

            create_ovo_class(samples, class_name, class_values)

            samples = samples[np.logical_not(np.isnan(samples[target_column]))]

            model = random_forest(samples, target_column, covariates)

            model['#_fold'] = n_fold

            models.append(model)

        joblib.dump(models, OUTPUT_DIR / filename, compress='lz4')

[LightGBM] [Info] Number of positive: 382162, number of negative: 2776846
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.144745 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10370
[LightGBM] [Info] Number of data points in the train set: 3159008, number of used features: 64
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.120975 -> initscore=-1.983226
[LightGBM] [Info] Start training from score -1.983226
[LightGBM] [Info] Number of positive: 396559, number of negative: 2770676
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.134604 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10398
[LightGBM] [Info] Number of data points in the train set: 3167235, number of used features: 64
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.125207 -> initscore=-1.944022
[LightGBM] [Info] Start training from score -1.944022


## KNeighborsClassifier

In [None]:
def get_estimator(n_neighbors, metric):
    return KNeighborsClassifier(n_neighbors=n_neighbors, metric=metric)

def knn_classifier(samples: pd.DataFrame, target_column: str, covariates: list[str], n_neighbors=3, metric='minkowski', sample_ratio=0.008):
    x_train = samples.groupby(['tile_id', target_column], group_keys=False).apply(
            lambda group: group.sample(frac=sample_ratio, random_state=RANDOM_STATE)
        )[covariates]
    y_train = samples.loc[x_train.index][target_column]

    t_start = time.time()

    estimator = get_estimator(n_neighbors, metric)
    estimator.fit(x_train, y_train)

    return {'model': estimator, 't_start': t_start, 't_end': time.time()}

In [6]:
for metric in ['minkowski', 'euclidean', 'manhattan', 'cosine']:
    for target_column in class_name:
        if (OUTPUT_DIR / f'knn.m_{metric}.{target_column}.lz4').exists():
            continue

        models = []

        for n_fold in [f'{n:02}' for n in range(1, 6)]:
            samples = pd.read_parquet(INPUT_DIR / SAMPLES_FN.format(n_fold=n_fold))

            create_ovo_class(samples, class_name, class_values)

            samples = samples[np.logical_not(np.isnan(samples[target_column]))]

            model = knn_classifier(samples, target_column, covariates, metric=metric)

            model['#_fold'] = n_fold

            models.append(model)

        joblib.dump(models, OUTPUT_DIR / f'knn.m_{metric}.{target_column}.lz4', compress='lz4')

  x_train = samples.groupby(['tile_id', target_column], group_keys=False).apply(
  x_train = samples.groupby(['tile_id', target_column], group_keys=False).apply(
  x_train = samples.groupby(['tile_id', target_column], group_keys=False).apply(
  x_train = samples.groupby(['tile_id', target_column], group_keys=False).apply(
  x_train = samples.groupby(['tile_id', target_column], group_keys=False).apply(
  x_train = samples.groupby(['tile_id', target_column], group_keys=False).apply(
  x_train = samples.groupby(['tile_id', target_column], group_keys=False).apply(
  x_train = samples.groupby(['tile_id', target_column], group_keys=False).apply(
  x_train = samples.groupby(['tile_id', target_column], group_keys=False).apply(
  x_train = samples.groupby(['tile_id', target_column], group_keys=False).apply(
  x_train = samples.groupby(['tile_id', target_column], group_keys=False).apply(
  x_train = samples.groupby(['tile_id', target_column], group_keys=False).apply(
  x_train = samples.groupby(

In [None]:
for n_neighbors in [1, 3]:
    for target_column in class_name:
        if (OUTPUT_DIR / f'knn.nn_{n_neighbors}.{target_column}.lz4').exists():
            continue

        models = []

        for n_fold in [f'{n:02}' for n in range(1, 6)]:
            samples = pd.read_parquet(INPUT_DIR / SAMPLES_FN.format(n_fold=n_fold))

            create_ovo_class(samples, class_name, class_values)

            samples = samples[np.logical_not(np.isnan(samples[target_column]))]

            model = knn_classifier(samples, target_column, covariates, n_neighbors=n_neighbors)

            model['#_fold'] = n_fold

            models.append(model)

        joblib.dump(models, OUTPUT_DIR / f'knn.nn_{n_neighbors}.{target_column}.lz4', compress='lz4')

In [None]:
# TODO: Identify best model hyperparameters and retrain only that configuration.

# Best model: model trained with n_neighbors=3 and metric='minkowski', observed to be the best performing configuration when isolated.

for target_column in class_name:
    if (OUTPUT_DIR / f'knn.nn_{n_neighbors}.{target_column}.lz4').exists():
        continue

    models = []

    for n_fold in [f'{n:02}' for n in range(1, 6)]:
        samples = pd.read_parquet(INPUT_DIR / SAMPLES_FN.format(n_fold=n_fold))

        create_ovo_class(samples, class_name, class_values)

        samples = samples[np.logical_not(np.isnan(samples[target_column]))]

        model = knn_classifier(samples, target_column, covariates, n_neighbors=3, metrics='')

        model['#_fold'] = n_fold

        models.append(model)

    joblib.dump(models, OUTPUT_DIR / f'knn.nn_{n_neighbors}.{target_column}.lz4', compress='lz4')

## SVC

Para treinamento em quantidades maiores de amostras sera importante migrar para abordagens paralelas como propostas pelo framework [cuML SVM](https://medium.com/rapids-ai/fast-support-vector-classification-with-rapids-cuml-6e49f4a7d89e).

In [5]:
def get_estimator(kernel="linear"):
    return SVC(kernel=kernel, probability=True, random_state=RANDOM_STATE)


def linear_svc(samples: pd.DataFrame, target_column: str, covariates: list[str], kernel="linear", sample_ratio=0.01):
    x_train = samples.groupby(['tile_id', target_column], group_keys=False).apply(
            lambda group: group.sample(frac=sample_ratio, random_state=RANDOM_STATE)
        )[covariates]
    y_train = samples.loc[x_train.index][target_column]

    t_start = time.time()

    estimator = get_estimator(kernel)
    estimator.fit(x_train, y_train)

    return {'model': estimator, 't_start': t_start, 't_end': time.time()}

In [None]:
# metrics = ['linear', 'poly', 'rbf']

class_name = ['oxc', 'oxn']
class_values = [([0], [1]), ([0], [2])]

for kernel in ['linear']:
    for target_column in class_name:
        if (OUTPUT_DIR / f'svc.k_{kernel}.{target_column}.lz4').exists():
            continue

        models = []

        for n_fold in [f'{n:02}' for n in range(1, 6)]:
            samples = pd.read_parquet(INPUT_DIR / SAMPLES_FN.format(n_fold=n_fold))

            create_ovo_class(samples, class_name, class_values)

            samples = samples[np.logical_not(np.isnan(samples[target_column]))]

            model = linear_svc(samples, target_column, covariates, kernel)

            model['#_fold'] =  n_fold

            models.append(model)

        joblib.dump(models, OUTPUT_DIR / f'svc.k_{kernel}.{target_column}.lz4', compress='lz4')

  x_train = samples.groupby(['tile_id', target_column], group_keys=False).apply(
  x_train = samples.groupby(['tile_id', target_column], group_keys=False).apply(
  x_train = samples.groupby(['tile_id', target_column], group_keys=False).apply(
  x_train = samples.groupby(['tile_id', target_column], group_keys=False).apply(
  x_train = samples.groupby(['tile_id', target_column], group_keys=False).apply(
  x_train = samples.groupby(['tile_id', target_column], group_keys=False).apply(


# LogisticRegression

In [None]:
def get_estimator():
    return LogisticRegression(n_jobs=-1, random_state=RANDOM_STATE)


def random_forest(samples: pd.DataFrame, target_column: str, covariates: list[str]):
    x_train = samples[covariates]
    y_train = samples[target_column]

    t_start = time.time()

    estimator = get_estimator()
    estimator.fit(x_train, y_train)

    return {'model': estimator, 't_start': t_start, 't_end': time.time()}

In [None]:
class_name = ['oxc', 'oxn']
class_values = [([0], [1]), ([0], [2])]

for kernel in ['linear', 'poly', 'rbf']:
    for target_column in class_name:
        if (OUTPUT_DIR / f'svc.k_{kernel}.{target_column}.lz4').exists():
            continue

        models = []

        for n_fold in [f'{n:02}' for n in range(1, 6)]:
            samples = pd.read_parquet(INPUT_DIR / SAMPLES_FN)

            create_ovo_class(samples, class_name, class_values)

            samples = samples[np.logical_not(np.isnan(samples[target_column]))]

            model = linear_svc(samples, target_column, covariates, kernel)

            model['#_fold'] =  n_fold

            models.append(model)

        joblib.dump(models, OUTPUT_DIR / f'svc.k_{kernel}.{target_column}.lz4', compress='lz4')

## Neural Network

In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models

# Modelo sequencial com uma única camada densa
model = models.Sequential([
    layers.Dense(512, input_shape=(64,), activation='relu'),
    layers.Dense(2, activation='softmax')
])

# Compila o modelo
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

# Exibe o resumo do modelo
model.summary()