In [None]:
import os
import mlflow
import pandas as pd

In [None]:
!mlflow --version

mlflow, version 2.7.1


In [None]:
# Current working directory at the time this cell runs. perform_prediction
# joins the configured model/scaler locations onto the PARENT of this directory.
cwd = os.getcwd()

In [None]:


# Point the MLflow client at the tracking server listening on 127.0.0.1:1234.
mlflow.set_tracking_uri('http://127.0.0.1:1234')

# Registered model name and stage used to build the registry URI below.
model_name = "GradientBoostingRegressor"
model_stage = "Staging"

# Load the registered model as a generic pyfunc model. ModelPredictor reads
# this module-level `mlflow_model` instead of loading a model file itself.
model_uri = f"models:/{model_name}/{model_stage}"
mlflow_model = mlflow.pyfunc.load_model(model_uri=model_uri)

# Show the loaded model's metadata summary.
print(mlflow_model)


mlflow.pyfunc.loaded_model:
  artifact_path: model
  flavor: mlflow.sklearn
  run_id: b33f0afd99c449d699ad35a803e71e6d



In [None]:
# Two hand-written sample trips used to exercise the prediction pipeline.
# Each trip is one flat record (raw field name -> value); wrapping the record
# in a one-element list produces a single-row DataFrame.

input_data_1 = pd.DataFrame([{
    # trip measurements
    'trip_miles': 2.71,
    'trip_time': 796.0,
    'access_a_ride_flag': ' ',
    # request timestamp parts
    'request_datetime_hour': 11,
    'request_datetime_day': 'Friday',
    'request_datetime_month': 'January',
    # derived durations
    'duration_minutes': 13.266666666666667,
    'wait_time_minutes': 1.3833333333333333,
    'service_time_minutes': 16.366666666666667,
    # on-scene timestamp parts
    'on_scene_datetime_hour': 11,
    'on_scene_datetime_day': 'Friday',
    'on_scene_datetime_month': 'January',
    # pickup timestamp parts
    'pickup_datetime_hour': 11,
    'pickup_datetime_day': 'Friday',
    'pickup_datetime_month': 'January',
    # dropoff timestamp parts
    'dropoff_datetime_hour': 11,
    'dropoff_datetime_day': 'Friday',
    'dropoff_datetime_month': 'January',
    'average_speed': 0.20427135678391958,
}])

input_data_2 = pd.DataFrame([{
    # trip measurements
    'trip_miles': 3.5,
    'trip_time': 850.0,
    'access_a_ride_flag': 'Y',
    # request timestamp parts
    'request_datetime_hour': 14,
    'request_datetime_day': 'Monday',
    'request_datetime_month': 'February',
    # derived durations
    'duration_minutes': 14.166666666666666,
    'wait_time_minutes': 1.6,
    'service_time_minutes': 17.5,
    # on-scene timestamp parts
    'on_scene_datetime_hour': 14,
    'on_scene_datetime_day': 'Monday',
    'on_scene_datetime_month': 'February',
    # pickup timestamp parts
    'pickup_datetime_hour': 14,
    'pickup_datetime_day': 'Monday',
    'pickup_datetime_month': 'February',
    # dropoff timestamp parts
    'dropoff_datetime_hour': 15,
    'dropoff_datetime_day': 'Monday',
    'dropoff_datetime_month': 'February',
    'average_speed': 0.24705882352941178,
}])


In [None]:
import os
import yaml
import pickle
import pandas as pd
import logging

import warnings
warnings.simplefilter(action='ignore', category=Warning)


class Files:
    """File-loading helpers for configuration and serialized objects."""

    @staticmethod
    def read_yaml(file_path):
        """Parse the YAML document at ``file_path`` and return its contents."""
        with open(file_path, 'r') as stream:
            parsed = yaml.safe_load(stream)
        return parsed

    @staticmethod
    def load_pickle(pickle_path):
        """Unpickle and return the object stored at ``pickle_path``.

        Unpickling can execute arbitrary code, so only point this at
        files from a trusted source.
        """
        with open(pickle_path, 'rb') as stream:
            restored = pickle.load(stream)
        return restored


class ModelPredictor:
    """Scale the continuous features, run the model, and un-scale its output."""

    def __init__(self, _, X_scaler_path, y_scaler_path):
        """Attach the model and load both scalers from disk.

        Args:
            _: Ignored. The parameter is kept so callers that pass a model
                path keep working; the model itself is taken from the
                module-level ``mlflow_model``.
            X_scaler_path: Path to the pickled scaler applied to the
                ``continuous_cols`` feature columns.
            y_scaler_path: Path to the pickled scaler whose
                ``inverse_transform`` is applied to the model output.
        """
        self.model = mlflow_model
        self.X_scaler = Files.load_pickle(X_scaler_path)
        self.y_scaler = Files.load_pickle(y_scaler_path)

    def predict(self, data):
        """Return predictions for ``data`` on the target's original scale.

        Args:
            data: DataFrame containing every column in ``continuous_cols``
                plus whatever other columns the model consumes. It is not
                modified.

        Returns:
            The result of ``y_scaler.inverse_transform`` applied to the model
            output reshaped to a single column.
        """
        # Scale a copy: writing the scaled values back into the caller's
        # frame would make a second predict() on the same frame scale the
        # already-scaled values again.
        scaled = data.copy()
        scaled[continuous_cols] = self.X_scaler.transform(scaled[continuous_cols])
        prediction = self.model.predict(scaled)
        # inverse_transform is given a 2-D column, hence the reshape.
        return self.y_scaler.inverse_transform(prediction.reshape(-1, 1))


def map_data_to_df(input_data):
    """Map one raw trip record onto a single-row frame laid out as ``keys_list``.

    Args:
        input_data: A single-row DataFrame, or a dict-like whose values are
            scalars or one-element lists/tuples/Series, keyed by raw field
            name. Only the first element of each field is used.

    Returns:
        A one-row DataFrame (index ``[0]``) with exactly the columns in
        ``keys_list``. Fields whose name matches a column are copied over,
        the matching ``<field>_<weekday>`` column is set to 1 for each of the
        four ``*_datetime_day`` fields, and every other column is 0.
    """
    def _first(value):
        """Unwrap a Series/list/tuple to its first element; pass scalars through."""
        if isinstance(value, pd.Series):
            return value.iloc[0]
        if isinstance(value, (list, tuple)):
            return value[0]
        return value

    # One row with every expected column pre-filled with 0.
    df_mapped = pd.DataFrame(0, index=[0], columns=keys_list)

    # Copy over fields that are model columns as-is. Assigning the scalar
    # (not the Series) avoids index alignment, which would produce NaN for an
    # input frame whose row label is not 0.
    for key, value in input_data.items():
        if key in df_mapped.columns:
            df_mapped[key] = _first(value)

    # One-hot encode days
    days_columns = [
        'request_datetime_day', 'on_scene_datetime_day',
        'pickup_datetime_day', 'dropoff_datetime_day'
    ]
    for col in days_columns:
        if col in input_data:
            # input_data[col] is a Series for DataFrame input; formatting it
            # directly into the name yields the Series repr, which never
            # matches a column, so the weekday must be unwrapped first.
            column_name = f"{col}_{_first(input_data[col])}"
            if column_name in df_mapped.columns:
                df_mapped[column_name] = 1

    # Set data types
    for col in continuous_cols:
        if col in df_mapped.columns:
            df_mapped[col] = df_mapped[col].astype(float)
    for col in categorical_cols:
        if col in df_mapped.columns:
            df_mapped[col] = df_mapped[col].astype(str) if "day" in col else df_mapped[col].astype(int)

    return df_mapped




def perform_prediction(data, config_path='/mnt/hgfs/DS/NYC MLOPS/parameters.yaml'):
    """Load the configured scalers, score ``data`` and return the prediction.

    Args:
        data: Feature frame passed to ``ModelPredictor.predict``.
        config_path: YAML file holding a ``prediction_app`` mapping with
            ``model`` and ``scaler`` entries. Both entries are joined onto
            the parent directory of the module-level ``cwd``. Defaults to
            the path this notebook was originally written against.

    Returns:
        Whatever ``ModelPredictor.predict`` returns for ``data``.

    Raises:
        Exception: Any error from reading the config, loading the scalers or
            predicting is logged and re-raised unchanged.
    """
    try:
        app_config = Files.read_yaml(config_path)['prediction_app']
        base_dir = os.path.dirname(cwd)

        # ModelPredictor currently ignores the model path, but it is still
        # resolved and passed so the constructor call stays unchanged.
        model_file_path = os.path.join(base_dir, app_config['model'])
        scaler_dir = os.path.join(base_dir, app_config['scaler'])
        X_scaler_path = os.path.join(scaler_dir, "X_scaler.pkl")
        y_scaler_path = os.path.join(scaler_dir, "y_scaler.pkl")

        predictor = ModelPredictor(model_file_path, X_scaler_path, y_scaler_path)
        prediction = predictor.predict(data)

        logging.info("Prediction result: %s", prediction)
        return prediction
    except Exception as e:
        logging.error("Error in perform_prediction: %s", e)
        raise


# Feature schema shared by map_data_to_df and ModelPredictor.

# Trip events that each contribute hour / day / month fields.
_events = ('request', 'on_scene', 'pickup', 'dropoff')
# Weekday labels, in the order their one-hot columns appear in keys_list.
_weekdays = ('Friday', 'Monday', 'Saturday', 'Sunday', 'Thursday', 'Tuesday', 'Wednesday')

# Numeric columns that ModelPredictor passes through the X scaler.
continuous_cols = [
    'trip_miles', 'trip_time', 'duration_minutes',
    'wait_time_minutes', 'service_time_minutes', 'average_speed',
]

# Raw categorical fields: hour, day and month for every event.
categorical_cols = [
    f'{event}_datetime_{part}'
    for event in _events
    for part in ('hour', 'day', 'month')
]

# Column layout of the frame built by map_data_to_df: the continuous columns,
# then one hour column per event, then the weekday one-hot columns per event.
keys_list = (
    list(continuous_cols)
    + [f'{event}_datetime_hour' for event in _events]
    + [f'{event}_datetime_day_{day}' for event in _events for day in _weekdays]
)

In [None]:
# Lay the first sample trip out in the model's column layout, then score it.
df_mapped = map_data_to_df(input_data_1)
pred = perform_prediction(df_mapped)
print(pred)

[[14.36490738]]


In [None]:
# Lay the second sample trip out in the model's column layout, then score it.
df_mapped = map_data_to_df(input_data_2)
pred = perform_prediction(df_mapped)
print(pred)

[[16.4006366]]
