In [None]:
from pathlib import Path

from fertilizer_recommender.infrastructure.repositories.model_repository_impl import JoblibModelRepository
from fertilizer_recommender.infrastructure.utils.config_loader import load_yaml_config
from fertilizer_recommender.infrastructure.repositories.dataset_repository_impl import CsvDatasetRepository
from fertilizer_recommender.domain.entities.fertilizer_features import FertilizerFeaturesSchema
from fertilizer_recommender.application.use_cases.prepare_dataset import PrepareDatasetUseCase

from fertilizer_recommender.infrastructure.ml.ensemble.probability_ensemble import ProbabilityEnsemble
from fertilizer_recommender.application.use_cases.predict_ensemble_topk import PredictEnsembleTopKUseCase
from fertilizer_recommender.domain.services.metric_service import map_at_k

# Sanity check for the saved ensemble: load the persisted pipelines, predict
# the top-k labels for the prepared training frame and print MAP@k against
# the target column.
#
# NOTE: every path below is relative, so it resolves against the current
# working directory of the kernel running this cell.
cfg = load_yaml_config("../configs/training.yaml")

# Read the settings that are used more than once a single time, so that the
# prediction call, the metric and the printed label can never disagree.
target_col = cfg["data"]["target_col"]
top_k = cfg["training"]["top_k"]

# Column names the feature frame is built from (see `schema.all_features`).
schema = FertilizerFeaturesSchema(
    numeric_features=[
        "Temperature", "Humidity", "Moisture",
        "Nitrogen", "Potassium", "Phosphorous",
    ],
    categorical_features=["Soil Type", "Crop Type"],
)

repo = CsvDatasetRepository(
    data_dir=Path("../data/raw"),
    train_file=cfg["data"]["train_file"],
    test_file=cfg["data"]["test_file"],
)

# `execute()` returns a pair; only the first element is needed here.
# NOTE(review): the discarded second element is presumably the prepared test
# frame -- confirm against PrepareDatasetUseCase.
train_df, _ = PrepareDatasetUseCase(repo, schema, target_col).execute()

X = train_df[schema.all_features]
y = train_df[target_col]

model_repo = JoblibModelRepository(Path("../artifacts/models"))

# The ensemble members, loaded by their stored artifact names.
pipelines = [
    model_repo.load("catboost_final"),
    model_repo.load("xgboost_final"),
    model_repo.load("lightgbm_final"),
]

ensemble = ProbabilityEnsemble(pipelines)

topk_preds = PredictEnsembleTopKUseCase(
    ensemble=ensemble,
    top_k=top_k,
).execute(X)

score = map_at_k(y.tolist(), topk_preds, k=top_k)
# The label is derived from the configured k instead of a hard-coded "3", so
# the printed metric name always matches the value that was computed.
print(f"Ensemble MAP@{top_k} (train check): {score:.5f}")