In [1]:
#!/usr/bin/env python3
"""
Data acquisition script for Credit Card Fraud Detection MLOps Pipeline.

This script:
1. Downloads the Credit Card Fraud Detection dataset
2. Initializes DVC
3. Adds the raw data to DVC tracking
4. Pushes to the DVC remote
"""

import os
import sys
import logging
import requests
import hashlib
import subprocess
from pathlib import Path

# Setup logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[logging.StreamHandler(sys.stdout)]
)
logger = logging.getLogger('data-acquisition')

# Constants
DATA_URL = "https://nextcloud.scopicsoftware.com/s/bo5PTKgpngWymGE/download/creditcard-data.csv"
DATA_DIR = Path("data")
RAW_DATA_DIR = DATA_DIR / "raw"
RAW_DATA_FILE = RAW_DATA_DIR / "creditcard-data.csv"
GIT_IGNORE = "data/raw/.gitignore"
# Expected SHA256 checksum of the file (optional for validation)
EXPECTED_SHA256 = None  # Replace with actual SHA256 if known

def setup_directories():
    """Create necessary directories if they don't exist."""
    logger.info(f"Creating directory {RAW_DATA_DIR}")
    RAW_DATA_DIR.mkdir(parents=True, exist_ok=True) 

def download_data():
    """Download the dataset from the source URL."""
    if RAW_DATA_FILE.exists():
        logger.info(f"Data file already exists at {RAW_DATA_FILE}, skipping download.")
        return

    logger.info(f"Downloading data from {DATA_URL}")
    response = requests.get(DATA_URL, stream=True)
    response.raise_for_status()

    with open(RAW_DATA_FILE, 'wb') as f:
        for chunk in response.iter_content(chunk_size=8192):
            f.write(chunk)

    logger.info(f"Download complete: {RAW_DATA_FILE}")

def compute_sha256(filepath):
    """Compute SHA256 checksum of a file."""
    sha256 = hashlib.sha256()
    with open(filepath, 'rb') as f:
        for chunk in iter(lambda: f.read(8192), b''):
            sha256.update(chunk)
    return sha256.hexdigest()

def validate_data():
    """Validate the downloaded data file integrity."""
    if not RAW_DATA_FILE.exists():
        logger.error("Data file does not exist.")
        sys.exit(1)

    if EXPECTED_SHA256:
        logger.info("Validating data file checksum...")
        checksum = compute_sha256(RAW_DATA_FILE)
        if checksum != EXPECTED_SHA256:
            logger.error("Checksum does not match. File may be corrupted.")
            sys.exit(1)
        logger.info("Checksum validated.")
    else:
        logger.warning("No expected checksum provided. Skipping validation.")

def has_staged_changes():
    result = subprocess.run(
        ["git", "diff", "--cached", "--quiet"]
    )
    return result.returncode != 0  # True if there are staged changes

def initialize_dvc():
    """Initialize DVC and add data to tracking."""
    if not Path(".dvc").exists():
        logger.info("Initializing DVC...")
        subprocess.run(["dvc", "init"], check=True)

    logger.info(f"Adding {RAW_DATA_FILE} to DVC tracking...")
    subprocess.run(["dvc", "add", str(RAW_DATA_FILE)], check=True)

    logger.info("Committing DVC changes to Git...")
    subprocess.run(["git", "add", str(RAW_DATA_FILE) + ".dvc"], check=True)
    subprocess.run(["git", "add", GIT_IGNORE], check=True)
    
    if has_staged_changes():
        subprocess.run(["git", "commit", "-m", "Add raw dataset to DVC"], check=True)
    else:
        print("Nothing to commit — working tree clean.")

    logger.info("Pushing data to DVC remote...")
    subprocess.run(["dvc", "push"], check=True)

def main():
    """Main function to orchestrate the data acquisition process."""
    logger.info("Starting data acquisition process")

    setup_directories()
    download_data()
    validate_data()
    initialize_dvc()

    logger.info("Data acquisition completed successfully")

if __name__ == "__main__":
    main()

2025-05-16 14:21:35,910 - data-acquisition - INFO - Starting data acquisition process
2025-05-16 14:21:35,910 - data-acquisition - INFO - Creating directory data\raw
2025-05-16 14:21:35,913 - data-acquisition - INFO - Data file already exists at data\raw\creditcard-data.csv, skipping download.
2025-05-16 14:21:35,915 - data-acquisition - INFO - Adding data\raw\creditcard-data.csv to DVC tracking...
2025-05-16 14:21:39,623 - data-acquisition - INFO - Committing DVC changes to Git...
Nothing to commit — working tree clean.
2025-05-16 14:21:39,831 - data-acquisition - INFO - Pushing data to DVC remote...
2025-05-16 14:21:42,896 - data-acquisition - INFO - Data acquisition completed successfully


In [None]:
#!/usr/bin/env python3
"""
Data preprocessing script for Credit Card Fraud Detection MLOps Pipeline.

This script:
1. Loads a specific version of raw data from DVC
2. Splits data into train/validation/test sets
3. Normalizes features
4. Handles class imbalance
5. Saves processed datasets back to DVC
6. Logs preprocessing steps to MLflow

Usage:
    python preprocess.py --data-rev <DVC_REVISION>
"""

import os
import sys
import time
import logging
import argparse
import subprocess
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.under_sampling import RandomUnderSampler
import mlflow

# Setup logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[logging.StreamHandler(sys.stdout)]
)
logger = logging.getLogger('data-preprocessing')

# Constants
DATA_DIR = Path("data")
RAW_DATA_DIR = DATA_DIR / "raw"
PROCESSED_DATA_DIR = DATA_DIR / "processed"
RAW_DATA_FILE = RAW_DATA_DIR / "creditcard-data.csv"

def parse_args():
    logger.info(f"Check parser")
    parser = argparse.ArgumentParser(description='Data preprocessing script')
    logger.info(f"Check parser 1")
    parser.add_argument('--data-rev', type=str, required=False, default="HEAD", help='(Optional) DVC revision/version of the raw data to use. Defaults to HEAD.')
    logger.info(f"Check parser 2")
    return parser.parse_known_args()[0]

def setup_directories():
    logger.info(f"Creating processed data directory: {PROCESSED_DATA_DIR}")
    PROCESSED_DATA_DIR.mkdir(parents=True, exist_ok=True)

def setup_mlflow():
    mlflow.set_tracking_uri("http://localhost:5000")
    mlflow.set_experiment("Preprocessing")

def load_data(data_rev):
    logger.info(f"Checking out raw data at DVC revision: {data_rev}")
    # subprocess.run(["dvc", "checkout", RAW_DATA_FILE.as_posix(), "--rev", data_rev], check=True)
    subprocess.run(["git", "checkout", data_rev], check=True)
    subprocess.run(["dvc", "pull", RAW_DATA_FILE.as_posix()], check=True)
    logger.info(f"Loading dataset from {RAW_DATA_FILE}")
    return pd.read_csv(RAW_DATA_FILE)

def analyze_data(df):
    stats = {
        "num_rows": len(df),
        "num_features": df.shape[1],
        "num_fraud": df[df["Class"] == 1].shape[0],
        "num_normal": df[df["Class"] == 0].shape[0],
    }
    mlflow.log_metrics(stats)
    logger.info(f"Data summary: {stats}")
    return stats

def preprocess_data(df):
    logger.info("Splitting features and labels...")
    X = df.drop(columns=["Class"])
    y = df["Class"]

    logger.info("Splitting into train/validation/test...")
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.22, random_state=42)

    logger.info("Normalizing features...")
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_val =  scaler.transform(X_val)
    X_test = scaler.transform(X_test)

    logger.info("Applying Downsampling to balance the dataset...")
    undersampler = RandomUnderSampler(sampling_strategy=0.1, random_state=42)
    X_resampled, y_resampled = undersampler.fit_resample(X_train, y_train)

    train_df = pd.DataFrame(X_resampled)
    train_df["Class"] = y_resampled.values

    val_df = pd.DataFrame(X_val)
    val_df["Class"] = y_val.values

    test_df = pd.DataFrame(X_test)
    test_df["Class"] = y_test.values

    return train_df, val_df, test_df

def save_processed_data(train_df, val_df, test_df):
    logger.info("Saving processed datasets...")
    train_path = PROCESSED_DATA_DIR / "train.csv"
    val_path = PROCESSED_DATA_DIR / "val.csv"
    test_path = PROCESSED_DATA_DIR / "test.csv"

    train_df.to_csv(train_path, index=False)
    val_df.to_csv(val_path, index=False)
    test_df.to_csv(test_path, index=False)

    logger.info("Tracking processed data with DVC...")
    # subprocess.run(["dvc", "commit", train_path.as_posix()], check=True)
    # subprocess.run(["dvc", "commit", str(val_path)], check=True)
    # subprocess.run(["dvc", "commit", str(test_path)], check=True)
    subprocess.run(["dvc", "commit", "preprocess", "--force"], check=True)
    time.sleep(10)
    subprocess.run(["git", "add", "."], check=True)
    subprocess.run(["git", "commit", "-m", "Add processed datasets"], check=True)
    subprocess.run(["dvc", "push"], check=True)

def log_to_mlflow(stats, train_df, val_df, test_df):
    mlflow.log_param("train_size", len(train_df))
    mlflow.log_param("val_size", len(val_df))
    mlflow.log_param("test_size", len(test_df))
    mlflow.log_metrics({
        "class_ratio_train": train_df["Class"].mean(),
        "class_ratio_val": val_df["Class"].mean(),
        "class_ratio_test": test_df["Class"].mean()
    })

def main():
    args = parse_args()
    logger.info(f"Starting data preprocessing pipeline with data revision: {args.data_rev}")
    setup_directories()
    setup_mlflow()
    with mlflow.start_run():
        df = load_data(args.data_rev)
        stats = analyze_data(df)
        train_df, val_df, test_df = preprocess_data(df)
        save_processed_data(train_df, val_df, test_df)
        log_to_mlflow(stats, train_df, val_df, test_df)
    logger.info("Data preprocessing completed successfully")

if __name__ == "__main__":
    main()

2025-05-16 12:08:38,437 - data-preprocessing - INFO - Check parser
2025-05-16 12:08:38,437 - data-preprocessing - INFO - Check parser 1
2025-05-16 12:08:38,437 - data-preprocessing - INFO - Check parser 2
2025-05-16 12:08:38,437 - data-preprocessing - INFO - Starting data preprocessing pipeline with data revision: HEAD
2025-05-16 12:08:38,437 - data-preprocessing - INFO - Creating processed data directory: data\processed
2025-05-16 12:08:38,661 - data-preprocessing - INFO - Checking out raw data at DVC revision: HEAD
2025-05-16 12:08:40,353 - data-preprocessing - INFO - Loading dataset from data\raw\creditcard-data.csv
2025-05-16 12:08:41,751 - data-preprocessing - INFO - Data summary: {'num_rows': 284807, 'num_features': 31, 'num_fraud': 492, 'num_normal': 284315}
2025-05-16 12:08:41,752 - data-preprocessing - INFO - Splitting features and labels...
2025-05-16 12:08:41,765 - data-preprocessing - INFO - Splitting into train/validation/test...
2025-05-16 12:08:41,882 - data-preprocessin

In [3]:
#!/usr/bin/env python3
"""
Model training script for Credit Card Fraud Detection MLOps Pipeline.

This script:
1. Loads preprocessed data from a specific DVC version
2. Trains a Gradient Boosting model (XGBoost)
3. Performs hyperparameter tuning
4. Tracks experiments with MLflow
5. Registers the best model

Usage:
    python train.py --data-rev <DVC_REVISION>
"""

import os
import sys
import logging
import argparse
import numpy as np
import pandas as pd
from pathlib import Path
import joblib
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, average_precision_score, confusion_matrix
)
import mlflow
import mlflow.xgboost
from mlflow.tracking import MlflowClient
import subprocess
from dotenv import load_dotenv

# Setup logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[logging.StreamHandler(sys.stdout)]
)
logger = logging.getLogger('model-training')

# Constants
DATA_DIR = Path("data")
PROCESSED_DATA_DIR = DATA_DIR / "processed"
PROCESSED_DATA_FILE_TRAIN = PROCESSED_DATA_DIR / "train.csv"
PROCESSED_DATA_FILE_VAL = PROCESSED_DATA_DIR / "val.csv"
MODELS_DIR = Path("models")
load_dotenv()

print("AWS_ACCESS_KEY_ID:", os.getenv("AWS_ACCESS_KEY_ID"))
print("AWS_SECRET_ACCESS_KEY:", os.getenv("AWS_SECRET_ACCESS_KEY"))
print("MLFLOW_S3_ENDPOINT_URL:", os.getenv("MLFLOW_S3_ENDPOINT_URL"))

def parse_args():
    parser = argparse.ArgumentParser(description='Model training script')
    parser.add_argument('--data-rev', type=str, required=False, default="HEAD",
                        help='(Optional) DVC revision/version of the processed data to use')
    return parser.parse_known_args()[0]


def setup_directories():
    MODELS_DIR.mkdir(parents=True, exist_ok=True)


def setup_mlflow():
    mlflow.set_tracking_uri("http://localhost:5000")
    mlflow.set_experiment("credit-card-fraud-detection")


def load_data(data_rev):
    logger.info(f"Pulling data from DVC revision: {data_rev}")
    subprocess.run(["dvc", "pull", "--force", PROCESSED_DATA_FILE_TRAIN.as_posix()], check=True)
    subprocess.run(["dvc", "pull", "--force", PROCESSED_DATA_FILE_VAL.as_posix()], check=True)
    train_df = pd.read_csv(PROCESSED_DATA_DIR / "train.csv")
    val_df = pd.read_csv(PROCESSED_DATA_DIR / "val.csv")
    X_train = train_df.drop(columns=["Class"])
    y_train = train_df["Class"]
    X_val = val_df.drop(columns=["Class"])
    y_val = val_df["Class"]
    return X_train, y_train, X_val, y_val


def train_model(X_train, y_train, X_val, y_val):
    model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

    param_dist = {
        "n_estimators": [100, 200, 300],
        "max_depth": [3, 5, 7],
        "learning_rate": [0.01, 0.1, 0.2],
        "subsample": [0.6, 0.8, 1.0]
    }

    logger.info("Starting hyperparameter tuning with RandomizedSearchCV...")
    search = RandomizedSearchCV(model, param_distributions=param_dist, n_iter=10,
                                scoring='roc_auc', cv=3, verbose=1, n_jobs=-1, random_state=42)
    search.fit(X_train, y_train)
    best_model = search.best_estimator_
    logger.info(f"Best hyperparameters: {search.best_params_}")
    return best_model, search.best_params_


def evaluate_model(model, X_val, y_val):
    logger.info("Evaluating model on validation data...")
    y_pred = model.predict(X_val)
    y_proba = model.predict_proba(X_val)[:, 1]
    metrics = {
        "accuracy": accuracy_score(y_val, y_pred),
        "precision": precision_score(y_val, y_pred),
        "recall": recall_score(y_val, y_pred),
        "f1": f1_score(y_val, y_pred),
        "roc_auc": roc_auc_score(y_val, y_proba),
        "avg_precision": average_precision_score(y_val, y_proba)
    }
    logger.info(f"Evaluation metrics: {metrics}")
    return metrics


def log_to_mlflow(model, params, metrics):
    with mlflow.start_run() as run:
        mlflow.log_params(params)
        mlflow.log_metrics(metrics)
        mlflow.xgboost.log_model(model, "model")

        run_id = run.info.run_id
        model_uri = f"runs:/{run_id}/model"
        # Register the model to the MLflow Model Registry
        registered_model = mlflow.register_model(model_uri, "fraud-detection-model")

        # Transition the newly registered model version to "Staging"
        client = MlflowClient()
        client.transition_model_version_stage(
            name="fraud-detection-model",
            version=registered_model.version,
            stage="Staging",
            archive_existing_versions=True
        )

        logger.info(f"Model version {registered_model.version} registered and transitioned to Staging.")


def save_model(model):
    logger.info("Saving model to disk...")
    joblib.dump(model, MODELS_DIR / "model.joblib")


def main():
    args = parse_args()
    logger.info(f"Starting model training pipeline with data revision: {args.data_rev}")

    setup_directories()
    setup_mlflow()
    X_train, y_train, X_val, y_val = load_data(args.data_rev)
    model, best_params = train_model(X_train, y_train, X_val, y_val)
    metrics = evaluate_model(model, X_val, y_val)
    log_to_mlflow(model, best_params, metrics)
    save_model(model)

    logger.info("Model training completed successfully")


if __name__ == "__main__":
    main()

AWS_ACCESS_KEY_ID: minio
AWS_SECRET_ACCESS_KEY: minio123
MLFLOW_S3_ENDPOINT_URL: http://localhost:9000
2025-05-16 12:09:03,462 - model-training - INFO - Starting model training pipeline with data revision: HEAD
2025-05-16 12:09:03,483 - model-training - INFO - Pulling data from DVC revision: HEAD
2025-05-16 12:09:07,147 - model-training - INFO - Starting hyperparameter tuning with RandomizedSearchCV...
Fitting 3 folds for each of 10 candidates, totalling 30 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


2025-05-16 12:09:14,391 - model-training - INFO - Best hyperparameters: {'subsample': 0.6, 'n_estimators': 200, 'max_depth': 5, 'learning_rate': 0.01}
2025-05-16 12:09:14,392 - model-training - INFO - Evaluating model on validation data...
2025-05-16 12:09:14,494 - model-training - INFO - Evaluation metrics: {'accuracy': 0.9990069513406157, 'precision': 0.7165354330708661, 'recall': 0.8198198198198198, 'f1': 0.7647058823529411, 'roc_auc': np.float64(0.9793036582361577), 'avg_precision': np.float64(0.6733270646807173)}


  self.get_booster().save_model(fname)


2025-05-16 12:09:23,275 - botocore.credentials - INFO - Found credentials in environment variables.


Registered model 'fraud-detection-model' already exists. Creating a new version of this model...
2025/05/16 12:09:23 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: fraud-detection-model, version 3


2025-05-16 12:09:23,851 - model-training - INFO - Model version 3 registered and transitioned to Staging.
🏃 View run hilarious-wolf-168 at: http://localhost:5000/#/experiments/2/runs/6aeffda7346047eaa7e412a23ff57f2c
🧪 View experiment at: http://localhost:5000/#/experiments/2
2025-05-16 12:09:23,891 - model-training - INFO - Saving model to disk...
2025-05-16 12:09:23,895 - model-training - INFO - Model training completed successfully


Created version '3' of model 'fraud-detection-model'.
  client.transition_model_version_stage(


In [4]:
#!/usr/bin/env python3
"""
Model evaluation and validation script for Credit Card Fraud Detection MLOps Pipeline.
"""

import os
import sys
import logging
import argparse
import subprocess
from pathlib import Path
from typing import Dict
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import mlflow
import mlflow.pyfunc
from fastapi import FastAPI, HTTPException

from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, average_precision_score, confusion_matrix,
    roc_curve
)

from fastapi import FastAPI
from pydantic import BaseModel

# Setup logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger("model-validation")

# Constants
DATA_DIR = Path("data")
PROCESSED_DATA_DIR = DATA_DIR / "processed"
PROCESSED_DATA_FILE_TEST = PROCESSED_DATA_DIR / "test.csv"

MODELS_DIR = Path("models")
VALIDATION_DIR = MODELS_DIR / "validation"
VALIDATION_DIR.mkdir(parents=True, exist_ok=True)

PERFORMANCE_REQUIREMENTS = {
    "min_accuracy": 0.98,
    "min_precision": 0.85,
    "min_recall": 0.70,
    "min_f1": 0.75,
    "min_roc_auc": 0.95
}

class InferenceInput(BaseModel):
    inputs: Dict[str, float]

def parse_args():
    parser = argparse.ArgumentParser(description='Model validation script')
    parser.add_argument('--model-version', type=str, required=False, default="Staging")
    parser.add_argument('--data-rev', type=str, required=False, default="HEAD")
    parser.add_argument('--start-api', action='store_true')
    return parser.parse_args()

def setup_mlflow():
    mlflow.set_tracking_uri("http://localhost:5000")
    mlflow.set_experiment("credit-card-fraud-detection")

def load_model(model_version: str):
    logger.info(f"Loading model version '{model_version}' from MLflow registry...")
    return mlflow.xgboost.load_model(f"models:/fraud-detection-model/{model_version}")

def load_test_data(data_rev: str):
    logger.info(f"Pulling test data from DVC revision: {data_rev}")
    subprocess.run(["dvc", "pull", "--force", PROCESSED_DATA_FILE_TEST.as_posix()], check=True)

    test_df = pd.read_csv(PROCESSED_DATA_DIR / "test.csv")
    X_test = test_df.drop(columns=["Class"])
    y_test = test_df["Class"]
    return X_test, y_test

def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]

    metrics = {
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred),
        "recall": recall_score(y_test, y_pred),
        "f1_score": f1_score(y_test, y_pred),
        "roc_auc": roc_auc_score(y_test, y_proba),
        "avg_precision": average_precision_score(y_test, y_proba),
    }

    return metrics, y_pred, y_proba

def validate_performance(metrics: Dict[str, float]):
    failed_metrics = [
        key for key, val in PERFORMANCE_REQUIREMENTS.items()
        if metrics.get(key.replace("min_", ""), 0) < val
    ]
    if failed_metrics:
        logger.warning(f"Model failed to meet performance requirements for: {failed_metrics}")
        return False
    logger.info("Model passed all performance requirements.")
    return True

def create_visualizations(y_test, y_pred, y_proba):
    cm = confusion_matrix(y_test, y_pred)
    fig, ax = plt.subplots(figsize=(5, 4))
    ax.matshow(cm, cmap=plt.cm.Blues, alpha=0.6)
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(x=j, y=i, s=cm[i, j], va='center', ha='center')
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    cm_path = VALIDATION_DIR / "confusion_matrix.png"
    plt.savefig(cm_path)
    plt.close()

    fpr, tpr, _ = roc_curve(y_test, y_proba)
    plt.figure()
    plt.plot(fpr, tpr, label="ROC Curve")
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("ROC Curve")
    plt.legend()
    roc_path = VALIDATION_DIR / "roc_curve.png"
    plt.savefig(roc_path)
    plt.close()

    return [str(cm_path), str(roc_path)]

def log_to_mlflow(metrics, artifacts, model_version, requirements_passed):
    mlflow.log_param("validated_model_version", model_version)
    mlflow.log_metrics(metrics)
    for artifact in artifacts:
        mlflow.log_artifact(artifact, artifact_path="validation")
    mlflow.set_tag("validation_passed", requirements_passed)

def setup_api(model):
    app = FastAPI()

    @app.get("/")
    def root():
        return {"message": "Fraud Detection Model Inference API"}

    @app.post("/predict")
    def predict(input_data: InferenceInput):
        try:
            X = pd.DataFrame([input_data.inputs])
            prediction = model.predict(X)[0]
            probability = model.predict_proba(X)[0][1]
            return {"prediction": int(prediction), "probability": float(probability)}
        except Exception as e:
            raise HTTPException(status_code=500, detail=str(e))

    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)

def main():
    import sys
    sys.argv = [sys.argv[0]]
    args = parse_args()
    setup_mlflow()

    with mlflow.start_run(run_name="validation"):
        model = load_model(args.model_version)
        X_test, y_test = load_test_data(args.data_rev)
        metrics, y_pred, y_proba = evaluate_model(model, X_test, y_test)
        requirements_passed = validate_performance(metrics)
        artifact_paths = create_visualizations(y_test, y_pred, y_proba)
        log_to_mlflow(metrics, artifact_paths, args.model_version, requirements_passed)

        logger.info("Validation metrics:")
        for k, v in metrics.items():
            logger.info(f"{k}: {v:.4f}")

        if args.start_api:
            logger.info("Starting model inference API...")
            setup_api(model)

    logger.info("Model validation pipeline completed.")

if __name__ == "__main__":
    main()

2025-05-16 12:10:13,205 - model-validation - INFO - Loading model version 'Staging' from MLflow registry...


  from .autonotebook import tqdm as notebook_tqdm
Downloading artifacts: 100%|██████████| 5/5 [00:00<00:00, 2407.48it/s]

2025-05-16 12:10:13,436 - model-validation - INFO - Pulling test data from DVC revision: HEAD





2025-05-16 12:10:15,746 - model-validation - INFO - Validation metrics:
2025-05-16 12:10:15,748 - model-validation - INFO - accuracy: 0.9992
2025-05-16 12:10:15,748 - model-validation - INFO - precision: 0.7115
2025-05-16 12:10:15,748 - model-validation - INFO - recall: 0.8043
2025-05-16 12:10:15,749 - model-validation - INFO - f1_score: 0.7551
2025-05-16 12:10:15,749 - model-validation - INFO - roc_auc: 0.9843
2025-05-16 12:10:15,749 - model-validation - INFO - avg_precision: 0.6824
🏃 View run validation at: http://localhost:5000/#/experiments/2/runs/253cc7e42e894c4b985e6f9eac1d12a0
🧪 View experiment at: http://localhost:5000/#/experiments/2
2025-05-16 12:10:15,827 - model-validation - INFO - Model validation pipeline completed.
