In [None]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before each cell executes so that edits to imported code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import logging
import warnings
from pathlib import Path
from typing import Any

from sklearn.metrics import accuracy_score, f1_score, fbeta_score, recall_score
import pandas as pd
import numpy as np

from websearchclassifier.config import (
    DatasetConfig,
    FastTextConfig,
    TFIDFConfig,
    HerBERTConfig,
    CrossValidationEvaluatorConfig,
    LogisticRegressionConfig,
    SVMConfig,
    MLPConfig,
    WebSearchClassifierConfig,
)
from websearchclassifier.dataset import Labels, Predictions
from websearchclassifier.evaluation import CrossValidationEvaluator
from websearchclassifier.model import WebSearchClassifier
from websearchclassifier.pipeline import Pipeline
from websearchclassifier.utils import logger

In [None]:
# Suppress every Python warning for the rest of the session.
# NOTE(review): a blanket "ignore" may also hide deprecation or convergence
# warnings raised during training — consider narrowing the filter.
warnings.filterwarnings("ignore")

# Training Pipeline Example

This notebook demonstrates how to use the Pipeline class to train and evaluate web search classifiers.

## 1. Initialize Pipeline

Create a pipeline instance with `DatasetConfig`.

In [None]:
# Describe the training CSV: its location and the columns that hold the
# prompt text and the label.
dataset_config = DatasetConfig(
    label_column="search",
    prompt_column="prompt",
    dataset_path=Path("data/train.csv"),
)

# The pipeline is bound to this dataset description; the loaded dataset is
# kept in `dataset` for the cross-validation cells later in the notebook.
pipeline = Pipeline(dataset_config=dataset_config)
dataset = pipeline.load_data()

## 2. Prepare Configuration

Create configurations for each of the following:
* baseline models (`tfidf`, `fasttext`, `herbert`)
* binary classifiers (`logistic_regression`, `mlp` or `svm`)

In [None]:
# Settings shared by the configs in this cell and the next one.
RANDOM_STATE = 137
BATCH_SIZE = 64
DEVICE = "auto"

# Logistic-regression classifier settings.
logistic_regression_config = LogisticRegressionConfig(
    regularization_strength=1.0,
    random_state=RANDOM_STATE,
)

# MLP classifier settings: architecture first, then optimisation and
# early-stopping options.
mlp_config = MLPConfig(
    hidden_layer_sizes=(128, 64),
    activation="tanh",
    dropout_rate=0.3,
    learning_rate=0.001,
    batch_size=BATCH_SIZE,
    max_iterations=200,
    early_stopping=True,
    validation_fraction=0.1,
    random_state=RANDOM_STATE,
)

# SVM classifier settings with an RBF kernel.
# `probability=True` presumably lets the model emit class probabilities,
# which the metric functions later in the notebook index into — TODO confirm.
svm_config = SVMConfig(
    kernel="rbf",
    probability=True,
    regularization_strength=1.0,
    random_state=RANDOM_STATE,
)

In [None]:
# TF-IDF baseline: n-grams of length 1 to 3, vocabulary capped at 5000
# features, with lower/upper document-frequency cut-offs.
tfidf_config = TFIDFConfig(
    ngram_range=(1, 3),
    max_features=5000,
    min_document_frequency=2,
    max_document_frequency=0.95,
)

# FastText baseline backed by a local embeddings file; the path is relative
# to the notebook's working directory.
fasttext_config = FastTextConfig(
    embeddings_path=Path("cc.pl.300.bin"),
    embedding_dim=300,
)

# HerBERT baseline; reuses the shared batch size and device selection.
herbert_config = HerBERTConfig(
    model_name="allegro/herbert-base-cased",
    device=DEVICE,
    batch_size=BATCH_SIZE,
)

## 3. Train TF-IDF Model

Train a TF-IDF-based classifier using a config object.

In [None]:
# Pair the TF-IDF baseline with the logistic-regression classifier.
tfidf_logistic_regression_config = WebSearchClassifierConfig(
    classifier=logistic_regression_config,
    baseline=tfidf_config,
)

# Train the model and store it under models/; the "Load Saved Model" section
# later reads it back from the same path.
tfidf_logistic_regression = pipeline.train_and_save(
    output_path=Path("models/tfidf_classifier.pkl"),
    config=tfidf_logistic_regression_config,
)

## 4. Test TF-IDF Model

Test the trained model on example prompts.

In [None]:
# Run the pipeline's prediction check against the trained TF-IDF model.
pipeline.test_predictions(tfidf_logistic_regression)

## 5. Train FastText Model

Train a FastText-based classifier using a config object (requires pre-downloaded embeddings).

In [None]:
# FastText baseline paired with the SVM classifier.
# `svm_config` (not `mlp_config`) is used so the trained model matches the
# `fasttext_svm*` names used throughout the notebook; passing the MLP config
# here would leave `svm_config` unused and make those names misleading.
fasttext_svm_config = WebSearchClassifierConfig(
    baseline=fasttext_config,
    classifier=svm_config,
)


In [None]:
# Train the FastText-based model and store it alongside the other saved
# classifiers under models/.
fasttext_svm = pipeline.train_and_save(
    output_path=Path("models/fasttext_classifier.pkl"),
    config=fasttext_svm_config,
)

In [None]:
# Run the pipeline's prediction check against the trained FastText model.
pipeline.test_predictions(fasttext_svm)

## 6. Use Pipeline Object with YAML File

Load the configuration from a YAML file with `Pipeline.load_config`, then train a model from the resulting config object.

In [None]:
# `Pipeline.load_config` yields two values: the config selected by the
# baseline/classifier names and the output path to save the model to.
tfidf_config_from_yaml, output_path = Pipeline.load_config(
    config_path=Path("config.yaml"),
    baseline="tfidf",
    classifier="mlp",
)

# Train with the YAML-derived config and save to the YAML-derived path.
model_from_config = pipeline.train_and_save(
    output_path=output_path,
    config=tfidf_config_from_yaml,
)

## 7. HerBERT Model

In [None]:
# Pair the HerBERT baseline with the MLP classifier.
herbert_mlp_config = WebSearchClassifierConfig(
    classifier=mlp_config,
    baseline=herbert_config,
)

# Train the model and store it under models/; the "Load Saved Model" section
# later reads it back from the same path.
herbert_mlp = pipeline.train_and_save(
    output_path=Path("models/herbert_classifier.pkl"),
    config=herbert_mlp_config,
)

In [None]:
# Run the pipeline's prediction check against the trained HerBERT model.
pipeline.test_predictions(herbert_mlp)

## 8. Load Saved Model

Load a previously trained model from disk.

In [None]:
# Restore both persisted classifiers from the paths written by the training
# cells above.
loaded_tfidf = WebSearchClassifier.load("models/tfidf_classifier.pkl")
loaded_herbert = WebSearchClassifier.load("models/herbert_classifier.pkl")

# Query both models with the same Polish prompt; index 0 of the returned
# value is logged (presumably the prediction for this single prompt — confirm).
sample_prompt = "aktualna cena bitcoina"

result = loaded_tfidf.predict(sample_prompt)
logger.info(f"Loaded TF-IDF prediction: {result[0]}")

result = loaded_herbert.predict(sample_prompt)
logger.info(f"Loaded HerBERT prediction: {result[0]}")

## 9. Cross-Validation Evaluation

Evaluate model performance using cross-validation.

In [None]:
# 5-fold, stratified cross-validation over the same dataset description used
# for training.
# NOTE(review): the seed here (42) differs from RANDOM_STATE used by the
# classifier configs — confirm this is intentional.
evaluator_config = CrossValidationEvaluatorConfig(
    folds=5,
    stratify=True,
    random_seed=42,
    dataset_config=dataset_config,
)

evaluator = CrossValidationEvaluator(config=evaluator_config)

In [None]:
# Scores strictly above this value are treated as the positive class.
DECISION_THRESHOLD = 0.5


def _positive_class_decisions(
    predictions: Predictions, threshold: float = DECISION_THRESHOLD
) -> np.ndarray:
    """Convert two-column prediction scores into hard 0/1 decisions.

    Column 1 of ``predictions`` is read as the positive-class score
    (presumably a probability — confirm against the evaluator). Entries
    strictly greater than ``threshold`` become 1, all others 0.
    """
    return (predictions[:, 1] > threshold).astype(int)


def accuracy_metric(labels: Labels, predictions: Predictions) -> float:
    """Return the accuracy of the thresholded predictions against ``labels``."""
    return float(accuracy_score(labels, _positive_class_decisions(predictions)))


def f1_metric(labels: Labels, predictions: Predictions) -> float:
    """Return the F1 score of the thresholded predictions against ``labels``."""
    return float(f1_score(labels, _positive_class_decisions(predictions)))


def recall_metric(labels: Labels, predictions: Predictions) -> float:
    """Return the recall of the thresholded predictions against ``labels``."""
    return float(recall_score(labels, _positive_class_decisions(predictions)))


def f2_metric(labels: Labels, predictions: Predictions) -> float:
    """Return the F-beta score with ``beta=2.0`` (recall weighted above precision)."""
    return float(fbeta_score(labels, _positive_class_decisions(predictions), beta=2.0))

In [None]:
# Quieten the project logger while the evaluator runs, and always restore it.
logger.setLevel(logging.WARNING)
try:
    accuracy = evaluator(tfidf_logistic_regression, dataset, accuracy_metric)
    f1 = evaluator(tfidf_logistic_regression, dataset, f1_metric)
    recall = evaluator(tfidf_logistic_regression, dataset, recall_metric)
    f2 = evaluator(tfidf_logistic_regression, dataset, f2_metric)
finally:
    # Only the level reset belongs in `finally`. Reporting from here would
    # raise NameError (or print stale values) when an evaluation fails, and
    # would mask the original exception.
    logger.setLevel(logging.INFO)

# Reached only when all four evaluations succeeded.
logger.info(
    "TF-IDF Model Evaluation (5-fold CV):\n"
    f"  Accuracy: {accuracy:.4f}\n"
    f"  F1 Score: {f1:.4f}\n"
    f"  Recall:   {recall:.4f}\n"
    f"  F2 Score: {f2:.4f}\n"
)

## 10. Compare Models

Compare TF-IDF, FastText and HerBERT models using cross-validation.

In [None]:
# Each entry maps a display name to (trained model, extra keyword arguments
# forwarded to the evaluator for that model).
models = {
    "TFIDF": (tfidf_logistic_regression, {}),
    "fasttext": (fasttext_svm, {"embeddings_path": fasttext_config.embeddings_path}),
    "herbert": (herbert_mlp, {}),
}

# Metric name -> scoring function, evaluated for every model.
comparison_metrics = {
    "accuracy": accuracy_metric,
    "f1": f1_metric,
    "recall": recall_metric,
    "f2": f2_metric,
}

# Evaluate with the project logger quietened, collecting scores per model.
results = {}
logger.setLevel(logging.WARNING)
try:
    for name, (model, init_kwargs) in models.items():
        results[name] = {
            metric_name: evaluator(model, dataset, metric, **init_kwargs)
            for metric_name, metric in comparison_metrics.items()
        }
finally:
    logger.setLevel(logging.INFO)

# Report only after the level is back at INFO: messages logged with
# `logger.info` while the level is WARNING are discarded, so logging inside
# the evaluation loop would never produce any output.
logger.info("Model Comparison (5-fold Cross-Validation):")
logger.info("=" * 50)
for name, scores in results.items():
    logger.info(f"\n{name}:")
    logger.info(f"  Accuracy: {scores['accuracy']:.4f}")
    logger.info(f"  F1 Score: {scores['f1']:.4f}")
    logger.info(f"  Recall:   {scores['recall']:.4f}")
    logger.info(f"  F2 Score: {scores['f2']:.4f}")

In [None]:
# Blank line followed by a 50-character rule ahead of the summary.
logger.info("\n" + "=" * 50)

# Report the models in a fixed order, skipping any that have no results.
for model_name in ("TFIDF", "fasttext", "herbert"):
    if model_name not in results:
        continue

    model_results = results[model_name]
    logger.info(
        f"{model_name}:\n"
        f"Accuracy: {model_results['accuracy']:.4f}\n"
        f"Recall:   {model_results['recall']:.4f}\n"
        f"F1 Score: {model_results['f1']:.4f}\n"
        f"F2 Score: {model_results['f2']:.4f}\n"
    )