In [1]:
import mlflow
import mlflow.sklearn
import joblib
import datetime
import numpy as np
import pandas as pd
import os
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
from mlflow.models import infer_signature

# Set MLflow tracking URI
MLFLOW_TRACKING_URI = "http://127.0.0.1:5000/"
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

# Load the latest model from MLflow
MODEL_URI = "models:/linear_regression_model/latest"
model = mlflow.pyfunc.load_model(MODEL_URI)

# Load original training dataset
df_original = pd.read_csv("../data/processed/2024/train.csv", parse_dates=['Trip_Start', 'Trip_End'])

# Load stored new data from API inputs
new_data_file = "../api/saved_inputs/prediction_inputs.csv"

if os.path.exists(new_data_file):
    df_new = pd.read_csv(new_data_file)
    print(f"Loaded {len(df_new)} new data points for retraining.")
else:
    df_new = None
    print("No new data found. Skipping retraining.")

# Sample 10% of original data for fairness
df_original = df_original.sample(frac=0.1, random_state=42)

# Feature Engineering on original dataset
df_original['Trip_Duration'] = (df_original['Trip_End'] - df_original['Trip_Start']).dt.total_seconds()
df_original['Start_Hour'] = df_original['Trip_Start'].dt.hour
df_original['Start_DayOfWeek'] = df_original['Trip_Start'].dt.dayofweek

df_original = df_original.dropna(subset=['Trip_Duration', 'Year_of_Birth', 'Gender', 'Origin_Id', 'Destination_Id'])

  latest = client.get_latest_versions(name, None if stage is None else [stage])


Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

 - mlflow (current: 2.20.0, required: mlflow==2.21.2)
 - numpy (current: 1.26.4, required: numpy==2.1.2)
 - pandas (current: 2.2.2, required: pandas==2.2.3)
 - psutil (current: 5.9.0, required: psutil==6.0.0)
 - scikit-learn (current: 1.6.0, required: scikit-learn==1.5.2)
 - scipy (current: 1.15.0, required: scipy==1.14.1)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Loaded 3 new data points for retraining.


In [2]:
df_original.head()

Unnamed: 0,Trip_Id,User_Id,Gender,Year_of_Birth,Trip_Start,Trip_End,Origin_Id,Destination_Id,Trip_Duration,Start_Hour,Start_DayOfWeek
3825046,36217695,4052665,1,1972.0,2024-08-31 23:59:16,2024-09-01 00:08:19,201,195,543.0,23,5
4059787,36485768,132245,1,1979.0,2024-11-12 08:47:14,2024-11-12 08:58:41,249,82,687.0,8,1
2667596,34900510,620240,1,1996.0,2024-07-28 11:11:52,2024-07-28 11:42:16,48,94,1824.0,11,6
1188885,33170807,2563762,1,1967.0,2024-04-02 18:58:31,2024-04-02 19:16:39,196,52,1088.0,18,1
2680072,34914717,1173753,1,1974.0,2024-07-29 16:53:25,2024-07-29 17:17:25,11,11,1440.0,16,0


In [3]:
df_new.head()

Unnamed: 0,Year_of_Birth,Gender,Origin_Id,Destination_Id,Start_Hour,Start_DayOfWeek,Prediction,Timestamp
0,1995.0,1,10,50,12,4,609.097837,2025-04-01 23:57:36
1,1995.0,1,10,50,12,4,609.097837,2025-04-01 23:58:56
2,1995.0,1,10,50,12,4,609.097837,2025-04-01 23:59:01


In [4]:
# **Standardizing `df_new` to match `df_original`**
if df_new is not None:
    # Rename 'Prediction' to 'Trip_Duration'
    df_new = df_new.rename(columns={'Prediction': 'Trip_Duration'})

    # Add missing columns with placeholders
    df_new['Trip_Id'] = -1  # Dummy ID
    df_new['User_Id'] = -1  # Dummy User

    # **Reconstruct `Trip_Start` based on Start_Hour & Start_DayOfWeek**
    current_year = datetime.datetime.now().year
    df_new['Trip_Start'] = df_new.apply(
        lambda row: pd.Timestamp(
            year=current_year,
            month=1,  # Placeholder month
            day=row['Start_DayOfWeek'] + 1,  # Assuming Monday=0, Sunday=6
            hour=row['Start_Hour']
        ),
        axis=1
    )

    # **Calculate `Trip_End`**
    df_new['Trip_End'] = df_new['Trip_Start'] + pd.to_timedelta(df_new['Trip_Duration'], unit='s')

    # Reorder columns to match `df_original`
    df_new = df_new[df_original.columns]

    # Concatenate original and new data
    df_combined = pd.concat([df_original, df_new], ignore_index=True)

    # Drop rows where `Trip_Duration` is NaN
    df_combined = df_combined.dropna(subset=['Trip_Duration'])

else:
    df_combined = df_original



In [5]:
df_combined

Unnamed: 0,Trip_Id,User_Id,Gender,Year_of_Birth,Trip_Start,Trip_End,Origin_Id,Destination_Id,Trip_Duration,Start_Hour,Start_DayOfWeek
0,36217695,4052665,1,1972.0,2024-08-31 23:59:16,2024-09-01 00:08:19.000000000,201,195,543.000000,23,5
1,36485768,132245,1,1979.0,2024-11-12 08:47:14,2024-11-12 08:58:41.000000000,249,82,687.000000,8,1
2,34900510,620240,1,1996.0,2024-07-28 11:11:52,2024-07-28 11:42:16.000000000,48,94,1824.000000,11,6
3,33170807,2563762,1,1967.0,2024-04-02 18:58:31,2024-04-02 19:16:39.000000000,196,52,1088.000000,18,1
4,34914717,1173753,1,1974.0,2024-07-29 16:53:25,2024-07-29 17:17:25.000000000,11,11,1440.000000,16,0
...,...,...,...,...,...,...,...,...,...,...,...
430583,35885685,166714,1,1996.0,2024-08-31 23:59:16,2024-09-01 00:08:19.000000000,321,193,543.000000,23,5
430584,34744662,2522660,1,1993.0,2024-07-16 11:05:25,2024-07-16 11:19:53.000000000,160,73,868.000000,11,1
430585,-1,-1,1,1995.0,2025-01-05 12:00:00,2025-01-05 12:10:09.097836533,10,50,609.097837,12,4
430586,-1,-1,1,1995.0,2025-01-05 12:00:00,2025-01-05 12:10:09.097836533,10,50,609.097837,12,4


In [6]:
# **Define Features and Target**
X = df_combined[['Year_of_Birth', 'Gender', 'Origin_Id', 'Destination_Id', 'Start_Hour', 'Start_DayOfWeek']]
y = df_combined['Trip_Duration']

# Ensure correct data types
X = X.astype({
    "Year_of_Birth": np.float64,
    "Gender": np.int64,
    "Origin_Id": np.int64,
    "Destination_Id": np.int64,
    "Start_Hour": np.int32,
    "Start_DayOfWeek": np.int32
})

# Preprocessing pipeline
categorical_features = ['Gender']
numerical_features = ['Year_of_Birth', 'Origin_Id', 'Destination_Id', 'Start_Hour', 'Start_DayOfWeek']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(drop='first'), categorical_features)
    ]
)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define model pipeline
lr_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

# Evaluate current model performance
y_pred_existing = model.predict(X_test)
mae_existing = mean_absolute_error(y_test, y_pred_existing)

# Define degradation threshold
THRESHOLD = 5.0  # Adjust based on acceptable performance drop


In [7]:
# If performance degrades, retrain
if mae_existing > THRESHOLD:
    print(f"Performance degraded (MAE: {mae_existing:.4f}). Retraining model...")

    mlflow.set_experiment("trip_duration_prediction")

    # **Generate timestamp for run name**
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    run_name = f"linear_regression_retrain_{timestamp}"

    with mlflow.start_run(run_name=run_name):
        # Train new model
        lr_pipeline.fit(X_train, y_train)
        y_pred_new = lr_pipeline.predict(X_test)

        # Compute new metrics
        mae_new = mean_absolute_error(y_test, y_pred_new)
        rmse_new = np.sqrt(mean_squared_error(y_test, y_pred_new))

        # Infer schema
        signature = infer_signature(X_train, y_pred_new)

        # Log new metrics
        mlflow.log_metric("mae", mae_new)
        mlflow.log_metric("rmse", rmse_new)

        # Save new model
        mlflow.sklearn.log_model(lr_pipeline, "linear_regression_model", signature=signature)

        print(f"New model retrained and logged (MAE: {mae_new:.4f}).")

    # Delete old input data to avoid duplicate training
    os.remove(new_data_file)
    print("Old input data deleted.")
else:
    print(f"No retraining needed. Current MAE: {mae_existing:.4f} is within acceptable range.")

Performance degraded (MAE: 286.4386). Retraining model...




New model retrained and logged (MAE: 286.5348).
🏃 View run linear_regression_retrain_20250402_000353 at: http://127.0.0.1:5000/#/experiments/507713236911629796/runs/0922d7eeb76643548fda2044973d6277
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/507713236911629796
Old input data deleted.
