# Tools

In [1]:
from pathlib import Path

# Input locations: the fitted models and the parquet sample files.
MODELS_DIR, SAMPLES_DIR = Path("../models"), Path("../data")

# Evaluation results are written here; the folder is created on first run
# (its parent must already exist).
OUTPUT_DIR = Path("../results")
OUTPUT_DIR.mkdir(exist_ok=True)

In [2]:
import time
import joblib
import numpy as np
import pandas as pd
import pyarrow.dataset as ds

from pathlib import Path

from sklearn.metrics import precision_score, recall_score, f1_score, precision_recall_curve

In [3]:
def load_samples(itter: int, class_name: str) -> pd.DataFrame:
    """Load the test samples of one iteration as a binary problem for one class.

    Reads ``{itter:02}_samples_test.pq`` from ``SAMPLES_DIR``, keeps only the
    rows whose ``class`` is 0 or the numeric code of ``class_name``, and
    rewrites ``class`` so that the requested class becomes 1 and class 0 stays
    0 (presumably the negative class -- confirm against the data dictionary).

    Parameters
    ----------
    itter : int
        Iteration number used in the file name (zero-padded to two digits).
    class_name : str
        One of ``'oxc'``, ``'oxn'`` or ``'oxs'``.

    Returns
    -------
    pd.DataFrame
        The filtered samples with a 0/1 ``class`` column.

    Raises
    ------
    KeyError
        If ``class_name`` is not one of the known class names.
    """
    target_value = {'oxc': 1, 'oxn': 2, 'oxs': 3}[class_name]

    dataset = ds.dataset(SAMPLES_DIR / f"{itter:02}_samples_test.pq", format="parquet")
    # The filter is handed to the dataset scan, so rows of the other classes
    # are dropped while reading instead of after loading everything.
    table = dataset.to_table(filter=ds.field('class').isin([0, target_value]))

    samples = table.to_pandas()

    # Only 0 and target_value are left, so floor division maps them to 0 and 1.
    samples['class'] = samples['class'] // target_value

    return samples

In [4]:
def get_optimal_threshold(y, y_hat_prob):
    """Return the decision threshold at which precision and recall are closest.

    Parameters
    ----------
    y : array-like
        True binary labels.
    y_hat_prob : array-like
        Predicted scores / probabilities of the positive class.

    Returns
    -------
    float
        The threshold from ``precision_recall_curve`` whose precision and
        recall differ the least, ignoring points where either one is zero.
        On ties the lowest such threshold is returned.

    Raises
    ------
    ValueError
        If no threshold has both a non-zero precision and a non-zero recall.
    """
    precision, recall, threshold = precision_recall_curve(y, y_hat_prob)

    # precision/recall carry one extra trailing point that has no matching
    # threshold; drop it so all three arrays share the same indexing.
    precision, recall = precision[:-1], recall[:-1]

    # Keep the positions in the *original* arrays so the chosen index can be
    # applied to `threshold` directly (an index into a masked copy would be
    # shifted by every point removed before it).
    candidates = np.flatnonzero((precision != 0.0) & (recall != 0.0))
    if candidates.size == 0:
        raise ValueError("no threshold with non-zero precision and recall")

    gap = np.abs(precision[candidates] - recall[candidates])

    return threshold[candidates[np.argmin(gap)]]

# Predições

In [5]:
# Feature columns passed to every model: 'B01' through 'B64'.
COVARIATES = list(map("B{:02}".format, range(1, 65)))

# Classes evaluated in the cells below.
CLASS_NAMES = ['oxc', 'oxn']

## RandomForest

In [None]:
# For each class: score the five stored RandomForest models on their matching
# test samples, pool the predictions, pick a threshold and save the metrics
# together with the per-iteration prediction times.
for class_name in CLASS_NAMES:
    model_path = MODELS_DIR / f"rf.{class_name}.lz4"
    # NOTE(review): joblib.load unpickles the file -- only load trusted models.
    model_dict = joblib.load(model_path)

    y = []
    y_hat_prob = []
    time_records = []

    for itter in range(5):
        model = model_dict[itter]['model']

        # Models are keyed from 0, sample files are numbered from 1.
        samples = load_samples(itter + 1, class_name)

        # Predict once: the timed call also supplies the probabilities used
        # for scoring, so the model is not evaluated a second time.
        s_time = time.perf_counter()
        proba = model.predict_proba(samples[COVARIATES])
        time_records.append(time.perf_counter() - s_time)

        y.extend(samples['class'].to_list())
        y_hat_prob.extend(proba[:, 1])

    # NOTE(review): the threshold is chosen on the same pooled test predictions
    # that are scored below, which makes the metrics optimistic -- confirm this
    # is intended.
    threshold = get_optimal_threshold(y, y_hat_prob)

    y_hat = (np.array(y_hat_prob) >= threshold).astype(int)

    joblib.dump({
        'threshold': threshold,
        'f1_score': f1_score(y, y_hat),
        'recall_score': recall_score(y, y_hat),
        'precision_score': precision_score(y, y_hat),
        'time_records': time_records
    }, OUTPUT_DIR / f"rf.{class_name}.lz4")

Exception ignored in: <_io.BufferedReader>
Traceback (most recent call last):
  File "c:\Users\Tiago\anaconda3\envs\gdal\Lib\site-packages\lz4\frame\__init__.py", line 753, in flush
    self._fp.flush()
ValueError: I/O operation on closed file.
Exception ignored in: <_io.BufferedReader>
Traceback (most recent call last):
  File "c:\Users\Tiago\anaconda3\envs\gdal\Lib\site-packages\lz4\frame\__init__.py", line 753, in flush
    self._fp.flush()
ValueError: I/O operation on closed file.


## KNeighborsClassifier

In [None]:
# Fraction of each test set scored by the distance-metric KNN models, and the
# seed that makes that subsample repeatable between runs.
KNN_SAMPLE_FRAC = 0.1
KNN_SAMPLE_SEED = 42

# For each class and distance metric: score the five stored KNN models on a
# subsample of their test samples and save the pooled metrics and timings.
# NOTE(review): these metrics come from a 10% subsample, while the other model
# families in this notebook are scored on the full test sets.
for class_name in CLASS_NAMES:
    for metric in ['minkowski', 'euclidean', 'manhattan', 'cosine']:
        filename = f"knn.m_{metric}.{class_name}.lz4"

        # Results that already exist are skipped, so an interrupted run can be
        # resumed simply by re-running the cell.
        if (OUTPUT_DIR / filename).exists():
            continue

        model_path = MODELS_DIR / filename
        # NOTE(review): joblib.load unpickles the file -- only load trusted models.
        model_dict = joblib.load(model_path)

        y = []
        y_hat = []
        time_records = []

        for itter in range(5):
            model = model_dict[itter]['model']

            # Models are keyed from 0, sample files are numbered from 1.
            samples = load_samples(itter + 1, class_name).sample(
                frac=KNN_SAMPLE_FRAC, random_state=KNN_SAMPLE_SEED
            )

            # Predict once: the timed call also supplies the labels used for
            # scoring, so the model is not evaluated a second time.
            s_time = time.perf_counter()
            predictions = model.predict(samples[COVARIATES])
            elapsed = time.perf_counter() - s_time

            time_records.append(elapsed)
            y.extend(samples['class'].to_list())
            y_hat.extend(predictions)

            print(elapsed, "Terminado - ", itter)

        joblib.dump({
            'threshold': None,
            'f1_score': f1_score(y, y_hat),
            'recall_score': recall_score(y, y_hat),
            'precision_score': precision_score(y, y_hat),
            'time_records': time_records
        }, OUTPUT_DIR / filename)

In [None]:
# For each class and neighbour count: score the five stored KNN models on
# their matching test samples and save the pooled metrics and timings.
for class_name in CLASS_NAMES:
    for n_neighbors in [1, 3]:
        filename = f"knn.nn_{n_neighbors}.{class_name}.lz4"

        model_path = MODELS_DIR / filename
        # NOTE(review): joblib.load unpickles the file -- only load trusted models.
        model_dict = joblib.load(model_path)

        y = []
        y_hat = []
        time_records = []

        for itter in range(5):
            model = model_dict[itter]['model']

            # Models are keyed from 0, sample files are numbered from 1.
            samples = load_samples(itter + 1, class_name)

            # Predict once: the timed call also supplies the labels used for
            # scoring, so the model is not evaluated a second time.
            s_time = time.perf_counter()
            predictions = model.predict(samples[COVARIATES])
            time_records.append(time.perf_counter() - s_time)

            y.extend(samples['class'].to_list())
            y_hat.extend(predictions)

        joblib.dump({
            'threshold': None,
            'f1_score': f1_score(y, y_hat),
            'recall_score': recall_score(y, y_hat),
            'precision_score': precision_score(y, y_hat),
            'time_records': time_records
        }, OUTPUT_DIR / filename)

## SVC

In [None]:
# For each class and kernel: score the five stored SVC models on their matching
# test samples, pool the predicted probabilities, pick a threshold and save the
# metrics together with the per-iteration prediction times.
for class_name in CLASS_NAMES:
    for kernel in ['linear', 'poly', 'rbf']:
        filename = f"svc.k_{kernel}.{class_name}.lz4"

        model_path = MODELS_DIR / filename
        # NOTE(review): joblib.load unpickles the file -- only load trusted models.
        model_dict = joblib.load(model_path)

        y = []
        y_hat_prob = []
        time_records = []

        for itter in range(5):
            model = model_dict[itter]['model']

            # Models are keyed from 0, sample files are numbered from 1.
            samples = load_samples(itter + 1, class_name)

            # Predict once: the timed call also supplies the probabilities
            # used for scoring, so the model is not evaluated a second time.
            s_time = time.perf_counter()
            proba = model.predict_proba(samples[COVARIATES])
            time_records.append(time.perf_counter() - s_time)

            y.extend(samples['class'].to_list())
            y_hat_prob.extend(proba[:, 1])

        # NOTE(review): the threshold is chosen on the same pooled test
        # predictions that are scored below, which makes the metrics
        # optimistic -- confirm this is intended.
        threshold = get_optimal_threshold(y, y_hat_prob)

        y_hat = (np.array(y_hat_prob) >= threshold).astype(int)

        joblib.dump({
            'threshold': threshold,
            'f1_score': f1_score(y, y_hat),
            'recall_score': recall_score(y, y_hat),
            'precision_score': precision_score(y, y_hat),
            'time_records': time_records
        }, OUTPUT_DIR / filename)