# Tools

In [1]:
from pathlib import Path

# Directory layout, expressed relative to the notebook's working directory.
MODELS_DIR = Path("..") / "models"    # model bundles loaded with joblib further down
SAMPLES_DIR = Path("..") / "data"     # sample files
OUTPUT_DIR = Path("..") / "results"   # evaluation summaries written by this notebook

# Make sure the output directory exists before any cell writes into it.
OUTPUT_DIR.mkdir(exist_ok=True)

In [2]:
import time
import joblib
import numpy as np
import pandas as pd
import pyarrow.dataset as ds

from joblib import Parallel, delayed

from sklearn.metrics import precision_score, recall_score, f1_score, precision_recall_curve

In [3]:
# Class names evaluated by the loops below.
CLASS_NAMES = ['oxc', 'oxn']

# Feature column names 'B01' ... 'B64' (two-digit, zero-padded).
COVARIATES = [f'B{band:02d}' for band in range(1, 64 + 1)]

In [4]:
def load_samples(itter: int, class_name: str) -> pd.DataFrame:
    """Load one test split and recode it as a binary problem for ``class_name``.

    Reads ``{itter:02}_samples_test.pq`` from ``SAMPLES_DIR``, keeps only the
    rows whose ``class`` is 0 or the integer code of ``class_name``, and then
    rewrites the ``class`` column so the target class becomes 1 while class 0
    stays 0.

    Parameters
    ----------
    itter : int
        Number of the split; it is zero-padded to two digits in the file name.
    class_name : str
        One of ``'oxc'``, ``'oxn'`` or ``'oxs'``.

    Returns
    -------
    pd.DataFrame
        The filtered samples with a 0/1 ``class`` column.

    Raises
    ------
    KeyError
        If ``class_name`` is not one of the supported names.
    """
    target_value = {'oxc': 1, 'oxn': 2, 'oxs': 3}[class_name]

    # Use the shared SAMPLES_DIR constant instead of repeating the literal path.
    dataset = ds.dataset(SAMPLES_DIR / f"{itter:02}_samples_test.pq", format="parquet")

    # Filter while reading so only class 0 and the target class are loaded.
    table = dataset.to_table(filter=ds.field('class').isin([0, target_value]))

    samples = table.to_pandas()

    # Vectorized recode: 0 // t == 0 and t // t == 1.
    samples['class'] = samples['class'] // target_value

    return samples

In [5]:
def get_optimal_threshold(y, y_hat_prob):
    """Return the decision threshold where precision and recall are closest.

    The precision-recall curve is computed for ``y_hat_prob`` against ``y``.
    Curve points where precision or recall is exactly zero are ignored, and
    among the remaining points the one maximising
    ``1 - |precision - recall|`` is selected.

    Parameters
    ----------
    y : array-like
        True binary labels.
    y_hat_prob : array-like
        Predicted scores or probabilities for the positive class.

    Returns
    -------
    The element of the thresholds array that belongs to the selected point.

    Raises
    ------
    ValueError
        If no curve point has both precision and recall different from zero.
    """
    precision, recall, threshold = precision_recall_curve(y, y_hat_prob)

    # Indices (in the full curve) of the points that are eligible.
    valid_idx = np.flatnonzero((precision != 0.0) & (recall != 0.0))
    if valid_idx.size == 0:
        raise ValueError(
            "No point on the precision-recall curve has both precision and "
            "recall different from zero."
        )

    balance = 1 - np.abs(precision[valid_idx] - recall[valid_idx])

    # argmax is a position inside the filtered subset; translate it back to an
    # index of the full curve so the threshold looked up is the one that
    # actually belongs to the selected point, whatever the mask removed.
    optimal_idx = valid_idx[np.argmax(balance)]

    return threshold[optimal_idx]

In [None]:
# Worker count passed to joblib.Parallel; also used to size the batches.
N_JOBS = 10

def predict_proba_batch(model, X_batch):
    """Score one batch and time the model call.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict_proba``.
    X_batch : pd.DataFrame
        Rows to score; must contain the ``COVARIATES`` columns and ``class``.

    Returns
    -------
    dict
        ``y``: the ``class`` column of the batch,
        ``y_hat_proba``: second column of ``predict_proba`` output,
        ``prediction_delay``: seconds spent in the model call,
        ``batch_size``: number of rows in the batch.
    """
    y = X_batch['class']

    # perf_counter is the clock meant for measuring short durations; only the
    # feature selection + predict_proba call is timed, not the label lookup.
    s_time = time.perf_counter()
    y_hat_proba = model.predict_proba(X_batch[COVARIATES])[:, 1]
    prediction_delay = time.perf_counter() - s_time

    return {
        'y': y,
        'y_hat_proba': y_hat_proba,
        'prediction_delay': prediction_delay,
        'batch_size': len(y)
    }

def predict_proba_parallel(model, X_valid):
    """Score ``X_valid`` in parallel batches and summarise the classification quality.

    The rows are split into at most ``N_JOBS`` contiguous batches, each batch is
    scored by ``predict_proba_batch`` in a joblib worker, and the pooled
    probabilities are binarised at the threshold chosen by
    ``get_optimal_threshold``.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict_proba``.
    X_valid : pd.DataFrame
        Samples to score; must contain the ``COVARIATES`` columns and ``class``.

    Returns
    -------
    dict
        ``threshold``, ``f1_score``, ``recall_score``, ``precision_score`` and
        ``prediction_delay`` (mean over batches of seconds per row).

    Raises
    ------
    ValueError
        If ``X_valid`` has no rows.
    """
    n_samples = len(X_valid)
    if n_samples == 0:
        raise ValueError("X_valid is empty; there is nothing to evaluate.")

    # Ceiling division: the step is never zero (floor division gave 0 whenever
    # there were fewer rows than N_JOBS) and there are never more than N_JOBS
    # batches (floor division left an extra remainder batch).
    rows_per_batch = (n_samples + N_JOBS - 1) // N_JOBS

    results = Parallel(n_jobs=N_JOBS)(
        delayed(predict_proba_batch)(model, X_valid.iloc[start:start + rows_per_batch])
        for start in range(0, n_samples, rows_per_batch)
    )

    y = np.concatenate([result['y'] for result in results])
    y_hat_proba = np.concatenate([result['y_hat_proba'] for result in results])

    threshold = get_optimal_threshold(y, y_hat_proba)

    y_hat = (y_hat_proba >= threshold).astype(int)

    # Seconds per row, computed per batch and then averaged across batches.
    per_row_delay = [
        result['prediction_delay'] / result['batch_size'] for result in results
    ]

    return {
        'threshold': threshold,
        'f1_score': f1_score(y, y_hat),
        'recall_score': recall_score(y, y_hat),
        'precision_score': precision_score(y, y_hat),
        'prediction_delay': np.mean(per_row_delay)
    }

## RandomForest

In [None]:
# Evaluate the stored random-forest models of every class and save a summary
# (mean of each metric across the five entries, plus the per-entry results).
for class_name in CLASS_NAMES:
    filename = f"rf.{class_name}.lz4"
    model_dict = joblib.load(MODELS_DIR / filename)

    # Entry i of the bundle is evaluated on sample file i + 1.
    results = [
        predict_proba_parallel(
            model_dict[split_idx]['model'],
            load_samples(split_idx + 1, class_name),
        )
        for split_idx in range(5)
    ]

    summary = {
        metric_name: np.mean([result[metric_name] for result in results])
        for metric_name in (
            'threshold',
            'f1_score',
            'recall_score',
            'precision_score',
            'prediction_delay',
        )
    }
    summary['results'] = results

    joblib.dump(summary, OUTPUT_DIR / filename)

Exception ignored in: <_io.BufferedReader>
Traceback (most recent call last):
  File "c:\Users\Tiago\anaconda3\envs\gdal\Lib\site-packages\lz4\frame\__init__.py", line 753, in flush
    self._fp.flush()
ValueError: I/O operation on closed file.
Exception ignored in: <_io.BufferedReader>
Traceback (most recent call last):
  File "c:\Users\Tiago\anaconda3\envs\gdal\Lib\site-packages\lz4\frame\__init__.py", line 753, in flush
    self._fp.flush()
ValueError: I/O operation on closed file.


## KNeighborsClassifier

In [None]:
# Fixed seed so the 10 % subsample contains the same rows on every run.
KNN_SAMPLE_SEED = 42

# Evaluate the stored KNN models (one bundle per distance metric and class) on
# a 10 % subsample of each sample file, pooling labels and predictions.
for class_name in CLASS_NAMES:
    for metric in ['minkowski', 'euclidean', 'manhattan', 'cosine']:
        filename = f"knn.m_{metric}.{class_name}.lz4"

        # Results that are already on disk are not recomputed.
        if (OUTPUT_DIR / filename).exists():
            continue

        model_path = MODELS_DIR / filename
        model_dict = joblib.load(model_path)

        y = []
        y_hat = []
        time_records = []

        for itter in range(5):
            model = model_dict[itter]['model']

            samples = load_samples(itter+1, class_name).sample(
                frac=0.1, random_state=KNN_SAMPLE_SEED
            )

            # Predict once: the timed call also supplies the labels that are
            # scored, instead of running the expensive KNN inference twice.
            s_time = time.perf_counter()
            predictions = model.predict(samples[COVARIATES])
            elapsed = time.perf_counter() - s_time

            time_records.append(elapsed)
            y.extend(samples['class'].to_list())
            y_hat.extend(predictions)

            print(elapsed, "Terminado - ", itter)

        joblib.dump({
            'threshold': None,
            'f1_score': f1_score(y, y_hat),
            'recall_score': recall_score(y, y_hat),
            'precision_score': precision_score(y, y_hat),
            'time_records': time_records
        }, OUTPUT_DIR / filename)

        # NOTE(review): this break stops after the first metric that was not
        # yet on disk, so the cell has to be re-run to cover the remaining
        # metrics of each class. Confirm this is intentional.
        break

In [None]:
# Evaluate the stored KNN models (one bundle per neighbour count and class) on
# every sample file, pooling labels and predictions across the five entries.
for class_name in CLASS_NAMES:
    for n_neighbors in [1, 3]:
        filename = f"knn.nn_{n_neighbors}.{class_name}.lz4"

        model_path = MODELS_DIR / filename
        model_dict = joblib.load(model_path)

        y = []
        y_hat = []
        time_records = []

        for itter in range(5):
            model = model_dict[itter]['model']

            samples = load_samples(itter+1, class_name)

            # Predict once: the timed call also supplies the labels that are
            # scored, instead of running the expensive KNN inference twice.
            s_time = time.perf_counter()
            predictions = model.predict(samples[COVARIATES])
            time_records.append(time.perf_counter() - s_time)

            y.extend(samples['class'].to_list())
            y_hat.extend(predictions)

        joblib.dump({
            'threshold': None,
            'f1_score': f1_score(y, y_hat),
            'recall_score': recall_score(y, y_hat),
            'precision_score': precision_score(y, y_hat),
            'time_records': time_records
        }, OUTPUT_DIR / filename)

## SVC

In [None]:
# Evaluate the stored SVC models (one bundle per kernel and class) and save a
# summary: mean of each metric across the five entries, plus per-entry results.
for class_name in CLASS_NAMES:
    for kernel in ['linear', 'poly', 'rbf']:
        filename = f"svc.k_{kernel}.{class_name}.lz4"
        model_dict = joblib.load(MODELS_DIR / filename)

        # Entry i of the bundle is evaluated on sample file i + 1.
        results = [
            predict_proba_parallel(
                model_dict[split_idx]['model'],
                load_samples(split_idx + 1, class_name),
            )
            for split_idx in range(5)
        ]

        summary = {
            metric_name: np.mean([result[metric_name] for result in results])
            for metric_name in (
                'threshold',
                'f1_score',
                'recall_score',
                'precision_score',
                'prediction_delay',
            )
        }
        summary['results'] = results

        joblib.dump(summary, OUTPUT_DIR / filename)