# 03 — DagsHub Tracking

This notebook loads the cleaned dataset (`data/processed/survey_cleaned.csv`), rebuilds the preprocessing pipeline, trains multiple models, and logs runs & registered models to **DagsHub's MLflow**.

**Prereqs**

- You have already created `survey_cleaned.csv` in `01_eda.ipynb`.

- Place the file under `data/processed/survey_cleaned.csv`.

- You have a DagsHub repo with MLflow enabled and a Personal Access Token (PAT).



## 0) Environment & installs

In [14]:
# If needed, install these (uncomment)
# !pip install mlflow xgboost

import os, json, tempfile
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

from xgboost import XGBClassifier

import mlflow
import mlflow.sklearn
import mlflow.xgboost

print('Versions -> pandas', pd.__version__, '| sklearn loaded | mlflow', mlflow.__version__)

Versions -> pandas 2.3.2 | sklearn loaded | mlflow 3.3.1


## 1) Configure DagsHub MLflow Tracking

In [None]:
# --- Set your DagsHub credentials here (recommended to keep in env vars) ---



print('MLFLOW_TRACKING_URI =', os.environ.get('MLFLOW_TRACKING_URI'))
print('MLFLOW_TRACKING_USERNAME =', os.environ.get('MLFLOW_TRACKING_USERNAME'))
print('MLFLOW_TRACKING_PASSWORD set =', bool(os.environ.get('MLFLOW_TRACKING_PASSWORD')))

mlflow.set_tracking_uri(os.environ.get('MLFLOW_TRACKING_URI', ''))

# Optional: name your experiment
mlflow.set_experiment('price_range_models')

In [12]:
import mlflow
print("mlflow version:", mlflow.__version__)

mlflow.set_tracking_uri(os.environ["MLFLOW_TRACKING_URI"])
mlflow.set_experiment("price_range_models")

with mlflow.start_run(run_name="ping"):
    mlflow.log_metric("ping", 1.0)
print("Run OK")


mlflow version: 3.3.1
🏃 View run ping at: https://dagshub.com/mon.mamilla/ml-project-3.mlflow/#/experiments/0/runs/a84e0105943a4601ba88ae43c6ae7e78
🧪 View experiment at: https://dagshub.com/mon.mamilla/ml-project-3.mlflow/#/experiments/0
Run OK


## 2) Load cleaned dataset

In [5]:
DATA_PATH = '../data/processed/survey_cleaned.csv'  # updated path
df = pd.read_csv(DATA_PATH)
print(df.shape)
df.head()

(29956, 20)


Unnamed: 0,respondent_id,gender,zone,occupation,income_levels,consume_frequency(weekly),current_brand,preferable_consumption_size,awareness_of_other_brands,reasons_for_choosing_brands,flavor_preference,purchase_channel,packaging_preference,health_concerns,typical_consumption_situations,price_range,age_group,cf_ab_score,zas_score,bsi
0,R00001,M,Urban,Working Professional,<10L,3-4 times,Newcomer,Medium (500 ml),0 to 1,Price,Traditional,Online,Simple,Medium (Moderately health-conscious),"Active (eg. Sports, gym)",100-150,26-35,0.67,3,1
1,R00002,F,Metro,Working Professional,> 35L,5-7 times,Established,Medium (500 ml),2 to 4,Quality,Exotic,Retail Store,Premium,Medium (Moderately health-conscious),Social (eg. Parties),200-250,46-55,0.6,20,0
2,R00003,F,Rural,Working Professional,> 35L,3-4 times,Newcomer,Medium (500 ml),2 to 4,Availability,Traditional,Retail Store,Premium,Medium (Moderately health-conscious),"Active (eg. Sports, gym)",200-250,36-45,0.5,5,0
3,R00004,F,Urban,Working Professional,16L - 25L,5-7 times,Newcomer,Medium (500 ml),0 to 1,Brand Reputation,Exotic,Online,Eco-Friendly,Low (Not very concerned),"Active (eg. Sports, gym)",150-200,26-35,0.75,9,0
4,R00005,M,Metro,Student,Not Reported,3-4 times,Established,Medium (500 ml),0 to 1,Availability,Traditional,Online,Premium,Medium (Moderately health-conscious),"Active (eg. Sports, gym)",50-100,18-25,0.67,0,0


## 3) Define preprocessing pipeline (must match 02_model_development)

In [6]:
# Target and columns
target_col = 'price_range'
drop_cols = ['respondent_id', target_col]

X = df.drop(columns=drop_cols)
y = df[target_col]

# Columns that you wanted ordinal/label-encoding
label_cols_explicit = ['age_group', 'consume_frequency(weekly)']
label_cols_auto     = ['income_levels', 'health_concerns', 'preferable_consumption_size']

# Everything else categorical -> One-Hot Encode
all_cats = X.select_dtypes(include=['object', 'category']).columns.tolist()
nominal_cols = [c for c in all_cats if c not in (label_cols_explicit + label_cols_auto)]

# Numeric passthrough (if present)
numeric_cols = [c for c in X.columns if c not in (label_cols_explicit + label_cols_auto + nominal_cols)]

print('Ordinal explicit :', label_cols_explicit)
print('Ordinal auto     :', label_cols_auto)
print('One-hot nominal  :', nominal_cols)
print('Numeric passthru :', numeric_cols)

Ordinal explicit : ['age_group', 'consume_frequency(weekly)']
Ordinal auto     : ['income_levels', 'health_concerns', 'preferable_consumption_size']
One-hot nominal  : ['gender', 'zone', 'occupation', 'current_brand', 'awareness_of_other_brands', 'reasons_for_choosing_brands', 'flavor_preference', 'purchase_channel', 'packaging_preference', 'typical_consumption_situations']
Numeric passthru : ['cf_ab_score', 'zas_score', 'bsi']


In [7]:
# Ordinal categories (explicit) — adjust if your labels differ
age_group_order = ["18-25", "26-35", "36-45", "46-55", "56-70", "70+"]
consume_freq_order = ["0-2 times", "3-4 times", "5-7 times"]

ord_explicit = OrdinalEncoder(
    categories=[age_group_order, consume_freq_order],
    handle_unknown='use_encoded_value',
    unknown_value=-1
)

ord_auto = OrdinalEncoder(
    categories='auto',
    handle_unknown='use_encoded_value',
    unknown_value=-1
)

ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

preprocessor = ColumnTransformer(
    transformers=[
        ('ord_explicit', ord_explicit, label_cols_explicit),
        ('ord_auto',     ord_auto,     label_cols_auto),
        ('onehot',       ohe,          nominal_cols),
        # numeric columns pass through via remainder
    ],
    remainder='passthrough'
)

preprocessor

## 4) Encode target and train/test split

In [8]:
# Encode y (price_range) once for consistent classes across models
le_y = LabelEncoder()
y_enc = le_y.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(
    X, y_enc, test_size=0.25, random_state=42, stratify=y_enc
)

print('Train:', X_train.shape, ' Test:', X_test.shape)
print('Classes:', list(le_y.classes_))

Train: (22467, 18)  Test: (7489, 18)
Classes: ['100-150', '150-200', '200-250', '50-100']


## 5) Helper functions for MLflow logging

In [9]:
def log_confusion_matrix(y_true, y_pred, labels, title):
    cm = confusion_matrix(y_true, y_pred, labels=range(len(labels)))
    fig, ax = plt.subplots()
    im = ax.imshow(cm, aspect='auto')
    ax.figure.colorbar(im, ax=ax)
    ax.set_xticks(range(len(labels))); ax.set_yticks(range(len(labels)))
    ax.set_xticklabels(labels, rotation=45, ha='right'); ax.set_yticklabels(labels)
    ax.set_title(title); ax.set_xlabel('Predicted'); ax.set_ylabel('True')
    fig.tight_layout()
    tmp = tempfile.NamedTemporaryFile(suffix='.png', delete=False)
    plt.savefig(tmp.name, bbox_inches='tight'); plt.close(fig)
    mlflow.log_artifact(tmp.name, artifact_path='plots')

def train_and_log(name, estimator):
    mlflow.sklearn.autolog(disable=False, log_input_examples=False)
    mlflow.xgboost.autolog(disable=False)

    with mlflow.start_run(run_name=name):
        pipe = Pipeline([('prep', preprocessor), ('model', estimator)])
        pipe.fit(X_train, y_train)

        y_pred = pipe.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        report = classification_report(
            le_y.inverse_transform(y_test),
            le_y.inverse_transform(y_pred),
            target_names=le_y.classes_,
            output_dict=True
        )

        mlflow.log_metric('accuracy', acc)
        mlflow.log_text(json.dumps(report, indent=2), 'reports/classification_report.json')
        log_confusion_matrix(y_test, y_pred, labels=list(le_y.classes_), title=f'Confusion Matrix — {name}')

        model_name = f'price-range-{name}'
        mlflow.sklearn.log_model(
            sk_model=pipe,
            artifact_path='model',
            registered_model_name=model_name
        )

        mlflow.set_tag('framework', 'sklearn')
        mlflow.set_tag('dataset', 'survey_cleaned')
        mlflow.set_tag('target', 'price_range')
        mlflow.set_tag('algo', name)

        print(f'{name}: accuracy={acc:.4f} | registered as {model_name}')

## 6) Train and log multiple models

In [10]:
models = {
    'LogReg': LogisticRegression(max_iter=1000, n_jobs=-1),
    'RF': RandomForestClassifier(n_estimators=300, random_state=42, n_jobs=-1),
    'GB': GradientBoostingClassifier(random_state=42),
    'SVM': SVC(kernel='rbf', probability=False, random_state=42),
    'GNB': GaussianNB(),
    'XGBoost': XGBClassifier(
        n_estimators=200, learning_rate=0.1, max_depth=6,
        subsample=0.8, colsample_bytree=0.8,
        random_state=42, n_jobs=-1, eval_metric='mlogloss'
    )
}

for name, est in models.items():
    train_and_log(name, est)



🏃 View run LogReg at: https://dagshub.com/mon.mamilla/ml-project-3.mlflow/#/experiments/0/runs/2e3a4a30f02647658b7b0f7d7d9451d6
🧪 View experiment at: https://dagshub.com/mon.mamilla/ml-project-3.mlflow/#/experiments/0


RestException: INTERNAL_ERROR: Response: {'error': 'unsupported endpoint, please contact support@dagshub.com'}

## 7) View results in DagsHub

Open your MLflow UI:
```
https://dagshub.com/<USERNAME>/<REPO>.mlflow
```
- **Experiments** → each model run with metrics and artifacts
- **Models** → each registered model (e.g., `price-range-RF`) → promote a version to Staging/Production

To serve a production model locally:
```bash
mlflow models serve -m "models:/price-range-RF@Production" -p 8080 --env-manager=local
```