In [2]:
from pathlib import Path

from fertilizer_recommender.infrastructure.utils.config_loader import load_yaml_config
from fertilizer_recommender.infrastructure.repositories.dataset_repository_impl import CsvDatasetRepository
from fertilizer_recommender.infrastructure.repositories.model_repository_impl import JoblibModelRepository

from fertilizer_recommender.domain.entities.fertilizer_features import FertilizerFeaturesSchema
from fertilizer_recommender.application.use_cases.prepare_dataset import PrepareDatasetUseCase
from fertilizer_recommender.application.use_cases.train_final_model import TrainFinalModelUseCase
from fertilizer_recommender.application.use_cases.build_submission import BuildSubmissionUseCase

from fertilizer_recommender.infrastructure.ml.preprocessors.sklearn_transformer import SklearnFeatureTransformer
from fertilizer_recommender.infrastructure.ml.models.baseline_logreg import BaselineLogisticRegression
from fertilizer_recommender.infrastructure.ml.pipelines.training_pipeline import TrainingPipeline

# Load the training configuration; the path is given relative to the notebook.
cfg = load_yaml_config("../configs/training.yaml")
data_cfg = cfg["data"]

# Column names are passed through verbatim (including the "Temparature"
# spelling) -- presumably they mirror the CSV headers; confirm before renaming.
numeric_columns = [
    "Temparature",
    "Humidity",
    "Moisture",
    "Nitrogen",
    "Potassium",
    "Phosphorous",
]
categorical_columns = ["Soil Type", "Crop Type"]
schema = FertilizerFeaturesSchema(
    numeric_features=numeric_columns,
    categorical_features=categorical_columns,
)

# The repository reads the train/test files named in the config from data/raw.
raw_data_dir = Path("../data/raw")
repo = CsvDatasetRepository(
    data_dir=raw_data_dir,
    train_file=data_cfg["train_file"],
    test_file=data_cfg["test_file"],
)

# The use case returns the training and test frames as a pair.
prepare_dataset = PrepareDatasetUseCase(repo, schema, data_cfg["target_col"])
train_df, test_df = prepare_dataset.execute()

# Split the prepared training frame into feature columns and the target column.
X_train = train_df[schema.all_features]
target_col = cfg["data"]["target_col"]
y_train = train_df[target_col]

# Assemble the pipeline step by step: feature transformer first, then the
# baseline model seeded from the project config.
feature_transformer = SklearnFeatureTransformer(
    schema.numeric_features,
    schema.categorical_features,
)
baseline_model = BaselineLogisticRegression(
    random_state=cfg["project"]["seed"],
)
pipeline = TrainingPipeline(
    transformer=feature_transformer,
    model=baseline_model,
)

# Repository rooted at the artifacts/models directory.
models_dir = Path("../artifacts/models")
model_repo = JoblibModelRepository(models_dir=models_dir)

# Run the final-training use case; its return value is intentionally unused.
final_training = TrainFinalModelUseCase(
    pipeline=pipeline,
    model_repository=model_repo,
)
final_training.execute(
    X_df=X_train,
    y=y_train,
    model_name="baseline_logreg_final",
)

# Build the submission from the model stored under "baseline_logreg_final".
id_col = cfg["data"]["id_col"]
submission_builder = BuildSubmissionUseCase(
    model_repository=model_repo,
    id_col=id_col,
    top_k=cfg["training"]["top_k"],
)

# The use case receives the feature columns plus the id column of the test set.
submission_columns = schema.all_features + [id_col]
submission = submission_builder.execute(
    model_name="baseline_logreg_final",
    test_df=test_df[submission_columns],
    output_path="../data/submissions/submission_baseline.csv",
)

# Last expression of the cell: displays the first rows of the submission.
submission.head()

Unnamed: 0,id,Fertilizer Name
0,750000,20-20 10-26-26 DAP
1,750001,10-26-26 20-20 14-35-14
2,750002,17-17-17 20-20 28-28
3,750003,14-35-14 28-28 20-20
4,750004,17-17-17 14-35-14 20-20
