# HEK Dataset Preparation

Prepare a small evaluation dataset, compute its SHA-256 hash, and attach the dataset metadata to an MLflow run as tags.

In [None]:
import hashlib
from pathlib import Path

import pandas as pd

In [None]:
# Build a two-row evaluation set and persist it as CSV. The bytes of this
# file are what the hashing cell reads, so the on-disk form must be stable.
df = pd.DataFrame(
    [
        {"inputs": "Reset my password", "expectations": "Follow password reset flow", "segment": "support"},
        {"inputs": "What is your refund policy?", "expectations": "Explain 30-day refund policy", "segment": "billing"},
    ]
)
path = Path("dataset_eval.csv")
# Pin the line terminator: pandas defaults to os.linesep, so without this the
# file bytes (and any hash of them) would differ between Windows and POSIX.
# index=False keeps the positional row index out of the file.
df.to_csv(path, index=False, lineterminator="\n")
df.head()

In [None]:
# Describe the dataset by a fixed identifier, its row count, and a SHA-256
# digest of the CSV bytes on disk (prefixed with the algorithm name).
dataset_id = "support-eval-v1"
num_samples = len(df.index)
dataset_hash = f"sha256:{hashlib.sha256(path.read_bytes()).hexdigest()}"
# Trailing bare expression: the notebook renders it as the cell output.
(dataset_hash, dataset_id, num_samples)

In [None]:
import mlflow

# Point the client at the tracking server on localhost:5000 and select the
# experiment that the metadata run is recorded under.
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("hek-notebook-dataset")

with mlflow.start_run(run_name="dataset-metadata"):
    # Attach all metadata in one batched set_tags call instead of four
    # separate set_tag calls; keys and values are unchanged.
    mlflow.set_tags(
        {
            "hokusai.dataset.id": dataset_id,
            "hokusai.dataset.hash": dataset_hash,
            # Stringified explicitly so the stored value is the plain count.
            "hokusai.dataset.num_samples": str(num_samples),
            "hokusai.primary_metric": "accuracy",
        }
    )

# Printed after the `with` block exits, i.e. once the run has been ended.
print("dataset metadata logged")