## 1. Import Required Libraries

In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

from mlvern.core.forge import Forge

print("✓ All libraries imported successfully")

✓ All libraries imported successfully


## 2. Initialize MLVern Project

In [2]:
# Get examples directory as base
base_dir = str(Path.cwd())

# Initialize Forge
forge = Forge(project="iris_classification", base_dir=base_dir)
forge.init()

print(f"✓ Project initialized at: {forge.mlvern_dir}")
print(f"  - Datasets directory: {forge.mlvern_dir}/datasets")
print(f"  - Runs directory: {forge.mlvern_dir}/runs")

✓ Project initialized at: d:\ml-vern\examples\.mlvern_iris_classification
  - Datasets directory: d:\ml-vern\examples\.mlvern_iris_classification/datasets
  - Runs directory: d:\ml-vern\examples\.mlvern_iris_classification/runs


## 3. Load and Prepare Dataset

In [3]:
# Load Iris dataset
data = load_iris(as_frame=True)
df = data.frame
target = "target"

print(f"Dataset shape: {df.shape}")
print(f"\nFeatures: {list(df.columns[:-1])}")
print(f"Target: {target}")
print(f"\nFirst few rows:")
df.head()

Dataset shape: (150, 5)

Features: ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
Target: target

First few rows:


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


## 4. Register Dataset (with Automatic EDA)

In [4]:
# Register dataset - this will automatically:
# - Fingerprint the dataset
# - Inspect data quality
# - Compute statistics
# - Run risk checks
# - Generate EDA plots

dataset_fp, is_new = forge.register_dataset(df, target)

print(f"Dataset Registration:")
print(f"  - Hash: {dataset_fp['dataset_hash']}")
print(f"  - Rows: {dataset_fp['rows']}")
print(f"  - Columns: {dataset_fp['columns']}")
print(f"  - New registration: {is_new}")
print(f"\nDataset artifacts stored at:")
print(f"  {forge.mlvern_dir}/datasets/{dataset_fp['dataset_hash']}/")

Dataset Registration:
  - Hash: 42c467344f49
  - Rows: 150
  - Columns: 5
  - New registration: True

Dataset artifacts stored at:
  d:\ml-vern\examples\.mlvern_iris_classification/datasets/42c467344f49/


## 5. List Registered Datasets

In [5]:
datasets = forge.list_datasets()

print(f"Total registered datasets: {len(datasets)}")
print(f"\nDataset Registry:")
for ds_hash, info in datasets.items():
    print(f"\n  Hash: {ds_hash}")
    print(f"    - Rows: {info['rows']}")
    print(f"    - Columns: {info['columns']}")
    print(f"    - Target: {info['target']}")
    print(f"    - Created: {info['created_at'][:19]}")

Total registered datasets: 1

Dataset Registry:

  Hash: 42c467344f49
    - Rows: 150
    - Columns: 5
    - Target: target
    - Created: 2025-12-29T17:50:29


## 6. Prepare Train/Validation Split

In [6]:
# Split data
X = df.drop(columns=[target])
y = df[target]

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f"Data Split:")
print(f"  - Training set: {X_train.shape[0]} samples, {X_train.shape[1]} features")
print(f"  - Validation set: {X_val.shape[0]} samples, {X_val.shape[1]} features")
print(f"  - Class distribution (train): {y_train.value_counts().to_dict()}")
print(f"  - Class distribution (val): {y_val.value_counts().to_dict()}")

Data Split:
  - Training set: 120 samples, 4 features
  - Validation set: 30 samples, 4 features
  - Class distribution (train): {1: 41, 0: 40, 2: 39}
  - Class distribution (val): {2: 11, 0: 10, 1: 9}


## 7. Train Model 1: Logistic Regression

In [7]:
# Create and train model
lr_model = LogisticRegression(max_iter=200, random_state=42)

config_lr = {
    "model_type": "LogisticRegression",
    "max_iter": 200,
    "solver": "lbfgs",
    "random_state": 42,
}

run_id_1, metrics_1 = forge.run(
    lr_model, X_train, y_train, X_val, y_val, config_lr, dataset_fp
)

print(f"✓ Model 1 Trained (Logistic Regression)")
print(f"  - Run ID: {run_id_1}")
print(f"  - Validation Accuracy: {metrics_1['accuracy']:.4f}")
print(f"  - Run stored at: {forge.mlvern_dir}/runs/{run_id_1}/")

✓ Model 1 Trained (Logistic Regression)
  - Run ID: run_2025-12-29_17-51-40-049918
  - Validation Accuracy: 1.0000
  - Run stored at: d:\ml-vern\examples\.mlvern_iris_classification/runs/run_2025-12-29_17-51-40-049918/


## 8. Train Model 2: Random Forest

In [8]:
# Create and train second model
rf_model = RandomForestClassifier(n_estimators=50, random_state=42)

config_rf = {
    "model_type": "RandomForestClassifier",
    "n_estimators": 50,
    "max_depth": 10,
    "random_state": 42,
}

run_id_2, metrics_2 = forge.run(
    rf_model, X_train, y_train, X_val, y_val, config_rf, dataset_fp
)

print(f"✓ Model 2 Trained (Random Forest)")
print(f"  - Run ID: {run_id_2}")
print(f"  - Validation Accuracy: {metrics_2['accuracy']:.4f}")
print(f"  - Run stored at: {forge.mlvern_dir}/runs/{run_id_2}/")

✓ Model 2 Trained (Random Forest)
  - Run ID: run_2025-12-29_17-51-58-816913
  - Validation Accuracy: 1.0000
  - Run stored at: d:\ml-vern\examples\.mlvern_iris_classification/runs/run_2025-12-29_17-51-58-816913/


## 9. Track and Compare Runs

In [9]:
runs = forge.list_runs()

print(f"Total training runs: {len(runs)}\n")

run_data = []
for run_id, run_info in runs.items():
    run_data.append({
        "Run ID": run_id[:20] + "...",
        "Model": run_info['model'],
        "Accuracy": f"{run_info['metrics']['accuracy']:.4f}",
        "Created": run_info['created_at'][:19],
        "Full ID": run_id
    })

runs_df = pd.DataFrame(run_data)
print("Run Registry:")
display(runs_df[['Run ID', 'Model', 'Accuracy', 'Created']])

Total training runs: 2

Run Registry:


Unnamed: 0,Run ID,Model,Accuracy,Created
0,run_2025-12-29_17-51...,LogisticRegression,1.0,2025-12-29T17:51:40
1,run_2025-12-29_17-51...,RandomForestClassifier,1.0,2025-12-29T17:51:58


## 10. Test Dataset Caching

In [10]:
# Register same dataset again
dataset_fp_2, is_new_2 = forge.register_dataset(df, target)

print(f"Dataset Caching Test:")
print(f"  - First hash: {dataset_fp['dataset_hash']}")
print(f"  - Second hash: {dataset_fp_2['dataset_hash']}")
print(f"  - Hashes match: {dataset_fp['dataset_hash'] == dataset_fp_2['dataset_hash']}")
print(f"  - New registration required: {is_new_2}")
print(f"\n✓ Dataset caching works! Duplicate datasets are detected via hashing.")

Dataset Caching Test:
  - First hash: 42c467344f49
  - Second hash: 42c467344f49
  - Hashes match: True
  - New registration required: False

✓ Dataset caching works! Duplicate datasets are detected via hashing.


## 11. Register Different Dataset

In [11]:
# Create and register synthetic dataset
np.random.seed(42)
df_synthetic = pd.DataFrame({
    "feature1": np.random.randn(100),
    "feature2": np.random.randn(100),
    "feature3": np.random.randn(100),
    "target": np.random.randint(0, 3, 100),
})

dataset_fp_3, is_new_3 = forge.register_dataset(df_synthetic, "target")

print(f"Synthetic Dataset Registered:")
print(f"  - Hash: {dataset_fp_3['dataset_hash']}")
print(f"  - Different from Iris: {dataset_fp['dataset_hash'] != dataset_fp_3['dataset_hash']}")
print(f"  - Shape: {df_synthetic.shape}")

Synthetic Dataset Registered:
  - Hash: e47a0b0f549d
  - Different from Iris: True
  - Shape: (100, 4)


## 12. Final Summary

In [12]:
datasets_final = forge.list_datasets()
runs_final = forge.list_runs()

print("="*70)
print("MLVERN WORKFLOW SUMMARY")
print("="*70)
print(f"\nProject: iris_classification")
print(f"Location: {forge.mlvern_dir}")
print(f"\nProject Statistics:")
print(f"  - Registered datasets: {len(datasets_final)}")
print(f"  - Training runs: {len(runs_final)}")
print(f"\nBest Performing Model:")
best_run = max(runs_final.items(), key=lambda x: x[1]["metrics"]["accuracy"])
print(f"  - Model: {best_run[1]['model']}")
print(f"  - Accuracy: {best_run[1]['metrics']['accuracy']:.4f}")
print(f"  - Run ID: {best_run[0]}")
print(f"\nDataset Artifacts Generated:")
print(f"  ✓ Data inspection reports")
print(f"  ✓ Statistical analyses")
print(f"  ✓ Risk assessment reports")
print(f"  ✓ EDA plots (distributions, correlations, etc.)")
print(f"  ✓ Trained model artifacts")
print(f"\n✓ Complete mlvern workflow demonstrated successfully!")
print("="*70)

MLVERN WORKFLOW SUMMARY

Project: iris_classification
Location: d:\ml-vern\examples\.mlvern_iris_classification

Project Statistics:
  - Registered datasets: 2
  - Training runs: 2

Best Performing Model:
  - Model: LogisticRegression
  - Accuracy: 1.0000
  - Run ID: run_2025-12-29_17-51-40-049918

Dataset Artifacts Generated:
  ✓ Data inspection reports
  ✓ Statistical analyses
  ✓ Risk assessment reports
  ✓ EDA plots (distributions, correlations, etc.)
  ✓ Trained model artifacts

✓ Complete mlvern workflow demonstrated successfully!
