In [1]:
import json
import logging
import os
from datetime import datetime, timezone
from glob import glob

import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
from mlflow.models.signature import infer_signature
from mlflow.tracking import MlflowClient
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from finkvra.utils.features import make_features as fvra_make_features
from finkvra.utils.labels import cli_label_one_object as fvra_cli_label_one_object

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.WARNING, 
                    format="%(asctime)s [%(levelname)s] %(name)s.%(funcName)s: %(message)s")

In [26]:
from finkvra.utils.env_utils import generate_conda_yaml_from_imports

PKG_DIR="/home/stevance/software/finkvra/finkvra/utils/"
dependencies_path = generate_conda_yaml_from_imports(PKG_DIR, 
                                             "/tmp/conda_from_imports.yaml", 
                                             include_self=False, extra_pip=["mlflow"])


In [12]:
from finkvra.utils.env_utils import generate_requirements_txt_from_imports

PKG_DIR="/home/stevance/software/finkvra/finkvra/utils/"
dependencies_path = generate_requirements_txt_from_imports(PKG_DIR, 
                                             "/tmp/requirements.txt", 
                                             include_self=False)


In [3]:
root_dir="/home/stevance/Science/fink-vra-notebooks/fVRA_prototype/"

experiment_ID="308259852637280151/"
run_ID="bcb101d3d0164495917b57c5d282869e/"

In [4]:
path_X = root_dir+"mlartifacts/"+experiment_ID+run_ID+"artifacts/X_train.csv"
path_y = root_dir+"mlartifacts/"+experiment_ID+run_ID+"artifacts/y_train.csv"

In [5]:
X = pd.read_csv(path_X, index_col=0)
y = pd.read_csv(path_y, index_col=0)

In [6]:
PARAMS = {'l2_regularization': 10,
          'learning_rate': 0.1,
          'random_state': 42}

In [7]:
"""Setup MLflow tracking server."""
mlflow.set_tracking_uri("https://mlflow-dev.fink-broker.org")
mlflow.set_experiment("fVRA1_OXtest")

client = MlflowClient()
#experiment = self.client.get_experiment_by_name(self.EXPERIMENT)

In [8]:
y

Unnamed: 0_level_0,label
candid,Unnamed: 1_level_1
3178177552415015010,0.0
3178186165515015016,0.0
3178185216315015014,1.0
3178189391715015017,1.0
3178190341115015020,0.0
3178189393415015025,0.0
3178191772015015003,0.0
3178191295515015003,0.0
3178191773615015032,0.0
3178189870415015017,1.0


In [9]:
dat = pd.read_parquet( "../../../Data/fVRA1/parquet/20251008_133001_alerts.parquet")

In [10]:
dat[:10].to_parquet("./raw.parquet")

In [11]:
with mlflow.start_run(run_name=f"test"):
    
    # Log metadata
    meta_info = {
        "timestamp": datetime.utcnow().isoformat(),
        "n_train": int(X.shape[0]),
        "sampling_strategy": str('bla'),
        "model_tag": str('toto')
    }

    with open("meta.json", "w") as f:
        json.dump(meta_info, f, indent=2)
    mlflow.log_artifact("meta.json")

    # Train model
    mlflow.log_params(PARAMS)
    clf_new = HistGradientBoostingClassifier(**PARAMS)
    clf_new.fit(X.values, y.values)
    y_pred_new = clf_new.predict(X.values)

    # Evaluate on training set
    acc = accuracy_score(y, y_pred_new)
    mlflow.log_metric("accuracy", acc)

    prec = precision_score(y, y_pred_new)
    mlflow.log_metric("precision", prec)

    recall = recall_score(y, y_pred_new)
    mlflow.log_metric("recall", recall)

    f1 = f1_score(y, y_pred_new)
    mlflow.log_metric("f1-score", f1)

    # Log model
    signature = infer_signature(X, y_pred_new)
    mlflow.sklearn.log_model(
        clf_new,
        name="testOx",
        #artifact_path=self.model_subpath, #deprecated
        signature=signature,
        #code_paths=[PKG_DIR],
        input_example=X.iloc[:2],
        #conda_env=yaml_path
        )

    #mlflow.log_artifact(yaml_path, 
    #                    artifact_path="env")

    # Save training state
    mlflow.log_artifact(dependencies_path)
    mlflow.log_artifact('./raw.parquet')
    #mlflow.log_artifacts(PKG_DIR, artifact_path="code")
    mlflow.log_artifact(os.path.join(PKG_DIR, 'features.py'), artifact_path="code")
    #mlflow.log_artifact(self.training_ids_path)
    mlflow.log_table(X, "X_train.parquet")
    mlflow.log_table(y, "y_train.parquet")


  y = column_or_1d(y, warn=True)
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
2025/10/08 16:52:25 INFO mlflow.models.model: Found the following environment variables used during model inference: [OPENAI_API_KEY]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.


🏃 View run test at: https://mlflow-dev.fink-broker.org/#/experiments/361852637698994883/runs/79e6fbfb2b5343f68dd141577346aa73
🧪 View experiment at: https://mlflow-dev.fink-broker.org/#/experiments/361852637698994883


In [44]:
os.path.join(PKG_DIR, 'features.py')

'/home/stevance/software/finkvra/finkvra/utils/features.py'