# Tools

In [9]:
from pathlib import Path

# Input/output locations, all relative to the notebook's working directory.
MODELS_DIR = Path("..") / "models"
SAMPLES_DIR = Path("..") / "data" / "sample_splits"
OUTPUT_DIR = Path("..", "results")

# Make sure the output directory is there; an already existing one is left untouched.
OUTPUT_DIR.mkdir(exist_ok=True)

In [2]:
import time
import joblib
import numpy as np
import pandas as pd
import pyarrow.dataset as ds

from joblib import Parallel, delayed

from sklearn.metrics import precision_score, recall_score, f1_score, r2_score, log_loss, precision_recall_curve

In [3]:
# Classes evaluated by the cells below.
CLASS_NAMES = ['oxc', 'oxn']

# Feature column names: B01, B02, ..., B64.
COVARIATES = ['B{:02d}'.format(band) for band in range(1, 65)]

In [11]:
def load_samples(itter: int, class_name: str) -> pd.DataFrame:
    """Read one test split and reduce its labels to a binary target for ``class_name``.

    Only rows whose ``class`` equals 0 or the numeric code of ``class_name`` are
    read. The ``class`` column is then floor-divided by that code, so the kept
    rows end up labelled 0 and 1.

    Args:
        itter: Split number used in the file name (``samples.split_<NN>.test.pq``).
        class_name: One of ``'oxc'``, ``'oxn'`` or ``'oxs'``.

    Returns:
        The filtered samples as a DataFrame with the rewritten ``class`` column.

    Raises:
        KeyError: If ``class_name`` is not one of the known class names.
    """
    class_codes = {'oxc': 1, 'oxn': 2, 'oxs': 3}
    target_value = class_codes[class_name]

    split_path = SAMPLES_DIR / f"samples.split_{itter:02}.test.pq"
    dataset = ds.dataset(split_path, format="parquet")

    # Push the row selection down to the parquet reader: keep class 0 and the target class.
    keep_rows = ds.field('class').isin([0, target_value])
    samples = dataset.to_table(filter=keep_rows).to_pandas()

    def to_binary(label):
        # 0 // code == 0 and code // code == 1
        return label // target_value

    samples['class'] = samples['class'].apply(to_binary)

    return samples

In [5]:
def get_optimal_threshold(y, y_hat_prob):
    """Return the decision threshold at which precision and recall are closest.

    The precision-recall curve is computed for ``y_hat_prob`` against ``y``; among
    the thresholds where both precision and recall are non-zero, the one with the
    smallest ``|precision - recall|`` is returned (first one on ties).

    Args:
        y: True binary labels.
        y_hat_prob: Predicted scores/probabilities for the positive class.

    Returns:
        The selected threshold, taken from the thresholds array returned by
        ``precision_recall_curve``.

    Raises:
        ValueError: If no threshold has both non-zero precision and non-zero recall.
    """
    precision, recall, threshold = precision_recall_curve(y, y_hat_prob)

    # precision/recall carry one extra trailing point (precision=1, recall=0) that has
    # no threshold; drop it so all three arrays share the same indexing.
    precision = precision[:-1]
    recall = recall[:-1]

    # Keep the positions in the *original* arrays, so the winning index can be used
    # on `threshold` directly instead of relying on masked positions lining up.
    candidates = np.flatnonzero((precision != 0.0) & (recall != 0.0))
    if candidates.size == 0:
        raise ValueError(
            "no threshold yields both non-zero precision and non-zero recall"
        )

    gap = np.abs(precision[candidates] - recall[candidates])

    return threshold[candidates[np.argmin(gap)]]

In [6]:
# Number of joblib workers; the parallel helpers also derive their batch size from it.
N_JOBS = 20

def predict_proba_batch(model, X_batch):
    """Score one batch with ``model.predict_proba`` and time the work.

    Args:
        model: Fitted classifier exposing ``predict_proba``; column 1 of its output
            is used as the score.
        X_batch: DataFrame holding the ``COVARIATES`` feature columns and a
            ``class`` label column.

    Returns:
        dict with ``y`` (the ``class`` column), ``y_hat_proba`` (column 1 of the
        predicted probabilities), ``prediction_time`` (elapsed seconds for this
        batch) and ``batch_size`` (number of rows).
    """
    # perf_counter is monotonic and high-resolution; time.time() follows the wall
    # clock, which can be adjusted while the batch is running.
    started = time.perf_counter()

    y = X_batch['class']
    y_hat_proba = model.predict_proba(X_batch[COVARIATES])[:, 1]

    return {
        'y': y,
        'y_hat_proba': y_hat_proba,
        'prediction_time': time.perf_counter() - started,
        'batch_size': len(y)
    }

def predict_proba_parallel(model, X_valid):
    """Score ``X_valid`` in parallel batches and compute thresholded metrics.

    The rows are split into at most ``N_JOBS`` contiguous batches, each scored by
    ``predict_proba_batch``. The decision threshold comes from
    ``get_optimal_threshold`` on the pooled scores.

    Args:
        model: Fitted classifier exposing ``predict_proba``.
        X_valid: DataFrame with the ``COVARIATES`` columns and a ``class`` column.

    Returns:
        dict with ``threshold``, ``f1_score``, ``recall_score``, ``precision_score``,
        ``log_loss_score``, ``r2_score`` and ``prediction_time`` (mean over batches
        of the per-row prediction time in seconds).

    Raises:
        ValueError: If ``X_valid`` has no rows.
    """
    n_samples = len(X_valid)
    if n_samples == 0:
        raise ValueError("X_valid is empty; nothing to evaluate")

    # Ceiling division: never a zero step (fewer rows than N_JOBS) and never an
    # extra undersized batch beyond N_JOBS.
    chunk_size = -(-n_samples // N_JOBS)

    results = Parallel(n_jobs=N_JOBS)(
        delayed(predict_proba_batch)(model, X_valid.iloc[start:start + chunk_size])
        for start in range(0, n_samples, chunk_size)
    )

    y = np.concatenate([result['y'] for result in results])
    y_hat_proba = np.concatenate([result['y_hat_proba'] for result in results])
    batch_times = [result['prediction_time'] for result in results]
    batch_sizes = [result['batch_size'] for result in results]

    threshold = get_optimal_threshold(y, y_hat_proba)

    y_hat = (y_hat_proba >= threshold).astype(int)

    return {
        'threshold': threshold,
        'f1_score': f1_score(y, y_hat),
        'recall_score': recall_score(y, y_hat),
        'precision_score': precision_score(y, y_hat),
        # log loss is defined on predicted probabilities, not on thresholded labels
        'log_loss_score': log_loss(y, y_hat_proba),
        'r2_score': r2_score(y, y_hat),
        'prediction_time': np.mean([t / b for t, b in zip(batch_times, batch_sizes)])
    }

In [7]:
# Number of joblib workers; the parallel helpers also derive their batch size from it.
# Same value as in the predict_proba cell, repeated so this cell can run on its own.
N_JOBS = 20

def predict_batch(model, X_batch):
    """Predict labels for one batch with ``model.predict`` and time the work.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_batch: DataFrame holding the ``COVARIATES`` feature columns and a
            ``class`` label column.

    Returns:
        dict with ``y`` (the ``class`` column), ``y_hat`` (the predictions),
        ``prediction_time`` (elapsed seconds for this batch) and ``batch_size``
        (number of rows).
    """
    # perf_counter is monotonic and high-resolution; time.time() follows the wall
    # clock, which can be adjusted while the batch is running.
    started = time.perf_counter()

    y = X_batch['class']
    y_hat = model.predict(X_batch[COVARIATES])

    return {
        'y': y,
        'y_hat': y_hat,
        'prediction_time': time.perf_counter() - started,
        'batch_size': len(y)
    }

def predict_parallel(model, X_valid):
    """Predict labels for ``X_valid`` in parallel batches and compute metrics.

    The rows are split into at most ``N_JOBS`` contiguous batches, each handled by
    ``predict_batch``; metrics are computed on the pooled predictions.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_valid: DataFrame with the ``COVARIATES`` columns and a ``class`` column.

    Returns:
        dict with ``f1_score``, ``recall_score``, ``precision_score`` and
        ``prediction_time`` (mean over batches of the per-row prediction time in
        seconds).

    Raises:
        ValueError: If ``X_valid`` has no rows.
    """
    n_samples = len(X_valid)
    if n_samples == 0:
        raise ValueError("X_valid is empty; nothing to evaluate")

    # Ceiling division: never a zero step (fewer rows than N_JOBS) and never an
    # extra undersized batch beyond N_JOBS.
    chunk_size = -(-n_samples // N_JOBS)

    results = Parallel(n_jobs=N_JOBS)(
        delayed(predict_batch)(model, X_valid.iloc[start:start + chunk_size])
        for start in range(0, n_samples, chunk_size)
    )

    y = np.concatenate([result['y'] for result in results])
    y_hat = np.concatenate([result['y_hat'] for result in results])
    batch_times = [result['prediction_time'] for result in results]
    batch_sizes = [result['batch_size'] for result in results]

    return {
        'f1_score': f1_score(y, y_hat),
        'recall_score': recall_score(y, y_hat),
        'precision_score': precision_score(y, y_hat),
        'prediction_time': np.mean([t / b for t, b in zip(batch_times, batch_sizes)])
    }

## RandomForest

In [10]:
# Evaluate every `rf.frac_<frac>.<class>.lz4` model file and write one summary per file.
for frac in [10, 20, 30, 40]:
    for class_name in CLASS_NAMES:
        filename = f"rf.frac_{frac}.{class_name}.lz4"

        # NOTE: joblib.load unpickles the file; only load model files from a trusted source.
        model_path = MODELS_DIR / filename
        model_dict = joblib.load(model_path)

        results = []

        # Entries 0..4 of model_dict are scored against test splits 01..05.
        for itter in range(5):
            model = model_dict[itter]['model']

            samples = load_samples(itter+1, class_name)

            result = predict_proba_parallel(model, samples)
            result['training_time'] = model_dict[itter]['training_time']

            results.append(result)

        # Mean of each per-split metric, plus the raw per-split results.
        # log_loss_score and r2_score are averaged too, so this summary carries the
        # same metrics as the LightGBM one.
        joblib.dump({
            **{
                key: np.mean([run[key] for run in results])
                for key in (
                    'threshold', 'f1_score', 'recall_score', 'precision_score',
                    'log_loss_score', 'r2_score', 'training_time', 'prediction_time',
                )
            },
            'results': results,
        }, OUTPUT_DIR / filename)



## XGBoost

In [None]:
# Evaluate every `xgb.<class>.lz4` model file and write one summary per file.
for class_name in CLASS_NAMES:
    filename = f"xgb.{class_name}.lz4"

    # NOTE: joblib.load unpickles the file; only load model files from a trusted source.
    model_path = MODELS_DIR / filename
    model_dict = joblib.load(model_path)

    results = []

    # Entries 0..4 of model_dict are scored against test splits 01..05.
    for itter in range(5):
        model = model_dict[itter]['model']

        samples = load_samples(itter+1, class_name)

        results.append(predict_proba_parallel(model, samples))

    # Mean of each per-split metric, plus the raw per-split results.
    # log_loss_score and r2_score are averaged too, so this summary carries the
    # same metrics as the LightGBM one.
    joblib.dump({
        **{
            key: np.mean([run[key] for run in results])
            for key in (
                'threshold', 'f1_score', 'recall_score', 'precision_score',
                'log_loss_score', 'r2_score', 'prediction_time',
            )
        },
        'results': results,
    }, OUTPUT_DIR / filename)

## LightGBM

In [12]:
# Evaluate every `lgbm.<class>.lz4` model file and write one summary per file.
for class_name in CLASS_NAMES:
    filename = f"lgbm.{class_name}.lz4"
    model_path = MODELS_DIR / filename
    model_dict = joblib.load(model_path)

    # Entries 0..4 of model_dict are scored against test splits 01..05.
    results = []
    for itter in range(5):
        model = model_dict[itter]['model']
        samples = load_samples(itter+1, class_name)
        results.append(predict_proba_parallel(model, samples))

    # Mean of each per-split metric, followed by the raw per-split results.
    joblib.dump({
        **{
            key: np.mean([run[key] for run in results])
            for key in (
                'threshold', 'f1_score', 'recall_score', 'precision_score',
                'log_loss_score', 'r2_score', 'prediction_time',
            )
        },
        'results': results,
    }, OUTPUT_DIR / filename)

## KNeighborsClassifier

In [None]:
# Evaluate every `knn.m_<metric>.<class>.lz4` model file; files that already have a
# result in OUTPUT_DIR are skipped.
for class_name in CLASS_NAMES:
    for metric in ['minkowski', 'euclidean', 'manhattan', 'cosine']:
        filename = f"knn.m_{metric}.{class_name}.lz4"

        if (OUTPUT_DIR / filename).exists():
            continue

        model_path = MODELS_DIR / filename
        model_dict = joblib.load(model_path)

        # Entries 0..4 of model_dict are scored against test splits 01..05.
        results = []
        for itter in range(5):
            model = model_dict[itter]['model']
            samples = load_samples(itter+1, class_name)
            results.append(predict_parallel(model, samples))

        # Hard predictions only, so there is no threshold to report.
        joblib.dump({
            'threshold': None,
            **{
                key: np.mean([run[key] for run in results])
                for key in ('f1_score', 'recall_score', 'precision_score', 'prediction_time')
            },
            'results': results,
        }, OUTPUT_DIR / filename)

In [None]:
# Evaluate every `knn.nn_<n_neighbors>.<class>.lz4` model file; files that already have
# a result in OUTPUT_DIR are skipped.
for class_name in CLASS_NAMES:
    for n_neighbors in [1, 3]:
        filename = f"knn.nn_{n_neighbors}.{class_name}.lz4"

        if (OUTPUT_DIR / filename).exists():
            continue

        model_path = MODELS_DIR / filename
        model_dict = joblib.load(model_path)

        # Entries 0..4 of model_dict are scored against test splits 01..05.
        results = []
        for itter in range(5):
            model = model_dict[itter]['model']
            samples = load_samples(itter+1, class_name)
            results.append(predict_parallel(model, samples))

        # Hard predictions only, so there is no threshold to report.
        joblib.dump({
            'threshold': None,
            **{
                key: np.mean([run[key] for run in results])
                for key in ('f1_score', 'recall_score', 'precision_score', 'prediction_time')
            },
            'results': results,
        }, OUTPUT_DIR / filename)

## SVC

In [None]:
# Evaluate every `svc.k_<kernel>.<class>.lz4` model file and write one summary per file.
for class_name in CLASS_NAMES:
    for kernel in ['linear', 'poly', 'rbf']:
        filename = f"svc.k_{kernel}.{class_name}.lz4"

        # NOTE: joblib.load unpickles the file; only load model files from a trusted source.
        model_path = MODELS_DIR / filename
        model_dict = joblib.load(model_path)

        results = []

        # Entries 0..4 of model_dict are scored against test splits 01..05.
        # NOTE(review): predict_proba_parallel calls model.predict_proba, so the stored
        # models must support it - confirm they were trained with probability estimates.
        for itter in range(5):
            model = model_dict[itter]['model']

            samples = load_samples(itter+1, class_name)

            results.append(predict_proba_parallel(model, samples))

        # Mean of each per-split metric, plus the raw per-split results.
        # log_loss_score and r2_score are averaged too, so this summary carries the
        # same metrics as the LightGBM one.
        joblib.dump({
            **{
                key: np.mean([run[key] for run in results])
                for key in (
                    'threshold', 'f1_score', 'recall_score', 'precision_score',
                    'log_loss_score', 'r2_score', 'prediction_time',
                )
            },
            'results': results,
        }, OUTPUT_DIR / filename)