In [41]:
import mlflow
import mlflow.sklearn

from mlflow.tracking import MlflowClient

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import r2_score, mean_absolute_error


In [42]:
df = pd.read_csv('data/used_car_canada_clean.csv')
df.head()

Unnamed: 0,miles,year,make,model,engine_size,state,price
0,9966.0,2017.0,acura,NSX,3.5,NB,179999.0
1,5988.0,2017.0,acura,NSX,3.5,QC,179995.0
2,24242.0,2017.0,acura,NSX,3.5,BC,168528.0
3,6637.0,2020.0,acura,NSX,3.5,ON,220000.0
4,6637.0,2020.0,acura,NSX,3.5,ON,220000.0


In [43]:
df["miles"]=df["miles"].astype(int)
df["year"]=df["year"].astype(int)

In [44]:
df.isna().sum()

miles          0
year           0
make           0
model          0
engine_size    0
state          0
price          0
dtype: int64

In [45]:
df.dtypes

miles            int64
year             int64
make            object
model           object
engine_size    float64
state           object
price          float64
dtype: object

### MLflow setup


### Run mlflow with 
mlflow ui -p 5000 --host 0.0.0.0

In [None]:
experiment_description = (
    "Fast baseline models with numeric scaling and categorical encoding "
    "for Used Cars in Canada price prediction."
)

experiment_tags = {
    "project_name": "Used Car Analysis in Canada",
    "dataset": "used_car_canada",
    "team": "MLOps",
    "mlflow.note.content": experiment_description,
}

client = MlflowClient(tracking_uri="http://127.0.0.1:5000")
mlflow.set_tracking_uri("http://127.0.0.1:5000")

mlflow.set_experiment("Used-Car-Price-Prediction-Fast")


2025/12/30 16:16:39 INFO mlflow.tracking.fluent: Experiment with name 'Used-Car-Price-Prediction-Fast' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/138320259653279386', creation_time=1767107799226, experiment_id='138320259653279386', last_update_time=1767107799226, lifecycle_stage='active', name='Used-Car-Price-Prediction-Fast', tags={}>

RUN MLflow mlflow ui -p 600


In [47]:
from sklearn.model_selection import train_test_split

X = df.drop(['price'], axis=1)
y = df['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=df[['make']], test_size=0.2, shuffle=True, random_state=42)

In [48]:
categorical_cols = X.select_dtypes(include=['object', 'category']).columns
numeric_cols = X.select_dtypes(exclude=['object', 'category']).columns

In [49]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols),
    ],
    remainder='drop'
)



In [50]:
models = {
    "LinearRegression": LinearRegression(),
    "Ridge": Ridge(alpha=1.0, random_state=42),
}



## Train, evaluate & log to MLflow 

In [51]:
from mlflow.models.signature import infer_signature

results = {}

for model_name, regressor in models.items():

    with mlflow.start_run(run_name=model_name):

        pipeline = Pipeline(
            steps=[
                ('preprocessor', preprocessor),
                ('regressor', regressor)
            ]
        )

        # Train
        pipeline.fit(X_train, y_train)

        # Predictions
        y_train_pred = pipeline.predict(X_train)
        y_test_pred = pipeline.predict(X_test)

        # Metrics
        metrics = {
            "train_r2": r2_score(y_train, y_train_pred),
            "test_r2": r2_score(y_test, y_test_pred),
            "train_mae": mean_absolute_error(y_train, y_train_pred),
            "test_mae": mean_absolute_error(y_test, y_test_pred)
        }

        # Log params & metrics
        mlflow.log_param("model_type", model_name)
        mlflow.log_metrics(metrics)

        # Create model signature
        signature = infer_signature(X_test, y_test_pred)
        input_example = X_test.head(3)

        # Log model with name, signature, and input example
        mlflow.sklearn.log_model(
            pipeline,
            name="model",
            signature=signature,
            input_example=input_example
        )

        results[model_name] = metrics["test_r2"]

        print(f"{model_name} | Test R¬≤: {metrics['test_r2']:.4f}")




LinearRegression | Test R¬≤: 0.8473
üèÉ View run LinearRegression at: http://127.0.0.1:600/#/experiments/138320259653279386/runs/829b4a724c2e4f76b46cd24a0e5539ad
üß™ View experiment at: http://127.0.0.1:600/#/experiments/138320259653279386




Ridge | Test R¬≤: 0.8465
üèÉ View run Ridge at: http://127.0.0.1:600/#/experiments/138320259653279386/runs/5439b05b48204b209a3ae3c6bf4d7bcc
üß™ View experiment at: http://127.0.0.1:600/#/experiments/138320259653279386


In [52]:
best_model = max(results, key=results.get)

print("\nFast model comparison:")
for name, score in results.items():
    print(f"{name}: {score:.4f}")

print(f"\nBest fast model: {best_model}")



Fast model comparison:
LinearRegression: 0.8473
Ridge: 0.8465

Best fast model: LinearRegression


In [None]:
from mlflow.tracking import MlflowClient

client = MlflowClient(tracking_uri="http://127.0.0.1:5000")

# 1. Identify the best model
best_model_name = max(results, key=results.get)
best_run = mlflow.search_runs(
    filter_string=f"tags.mlflow.runName = '{best_model_name}'"
).iloc[0]

run_id = best_run.run_id

# 2. Register the model
model_registry_name = "UsedCarPricePredictor"  # choose your registry name

model_uri = f"runs:/{run_id}/model"

registered_model = mlflow.register_model(
    model_uri=model_uri,
    name=model_registry_name
)

print(f"Model '{best_model_name}' registered as '{model_registry_name}'")


Successfully registered model 'UsedCarPricePredictor'.


2025/12/30 16:18:32 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: UsedCarPricePredictor, version 1


Model 'LinearRegression' registered as 'UsedCarPricePredictor'


Created version '1' of model 'UsedCarPricePredictor'.


In [54]:
client.transition_model_version_stage(
    name=model_registry_name,
    version=registered_model.version,
    stage="Production",
    archive_existing_versions=True
)

print(f"Model version {registered_model.version} is now in Production")


Model version 1 is now in Production


  client.transition_model_version_stage(


## Load the model from the registry

In [59]:
import mlflow.pyfunc


model_name = "UsedCarPricePredictor"
stage = "Production"

model_uri = f"models:/{model_name}/{stage}"

# Load the model
model = mlflow.pyfunc.load_model(model_uri)


Downloading artifacts: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 7/7 [00:00<00:00, 98.62it/s] 


In [60]:
dummy_input = pd.DataFrame([{
    "miles": 12000,      
    "year": 2018,        
    "engine_size": 2.0,     
    "make": "acura",      
    "model": "NSX",        
    "state": "ON"        
}])

In [61]:
predicted_price = model.predict(dummy_input)
print(f"Predicted price: {predicted_price[0]:.2f}")


Predicted price: 175831.69
