In [1]:
!python -V

Python 3.12.3


In [2]:
import pandas as pd

In [3]:
import numpy as np

In [4]:
import pickle

In [5]:
import seaborn as sns
import matplotlib.pyplot as plt

In [6]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

from sklearn.metrics import mean_squared_error

In [8]:
import mlflow


mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("nyc-taxi-experiment-mlops")

<Experiment: artifact_location='/workspaces/MLops/experiment_tracking/mlruns/2', creation_time=1768645123600, experiment_id='2', last_update_time=1768645123600, lifecycle_stage='active', name='nyc-taxi-experiment-mlops', tags={'mlflow.experimentKind': 'custom_model_development'}>

In [8]:
def read_dataframe(filename):
    df = pd.read_parquet(filename)

    df.lpep_dropoff_datetime = pd.to_datetime(df.lpep_dropoff_datetime)
    df.lpep_pickup_datetime = pd.to_datetime(df.lpep_pickup_datetime)

    df['duration'] = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df.duration = df.duration.apply(lambda td: td.total_seconds() / 60)

    df = df[(df.duration >= 1) & (df.duration <= 60)]

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)
    
    return df
    

In [9]:
df_train = read_dataframe('/workspaces/MLops/experiment_tracking/green_tripdata_2021-01.parquet')
df_val = read_dataframe('/workspaces/MLops/experiment_tracking/green_tripdata_2021-02.parquet')

In [10]:
categorical = ['duration'] 
numerical = ['trip_distance']

dv = DictVectorizer()

train_dicts = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

val_dicts = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dicts)

In [11]:
from sklearn.metrics import mean_squared_error
y_train = df_train['duration'].values
y_val = df_val['duration'].values
lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_val)
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)

print(f"RMSE: {rmse:.2f} minutes")

RMSE: 0.00 minutes


In [12]:
with open('models/lin_reg.bin', 'wb') as f_out:
    pickle.dump((dv, lr), f_out)

In [13]:
import numpy as np
import mlflow

with mlflow.start_run():
    mlflow.set_tag("developer", "Tom")
    mlflow.log_param("train_data_path", "green_tripdata_2021-01.parquet")
    mlflow.log_param("valid_data_path", "green_tripdata_2021-02.parquet")
    
    lr = LinearRegression()
    lr.fit(X_train, y_train)
    
    y_pred = lr.predict(X_val)
    
    # Calculate RMSE without squared parameter
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    mlflow.log_metric("rmse", rmse)
    
    mlflow.log_artifact(local_path="models/lin_reg.bin", artifact_path="models_pickle")
    
    print(f"RMSE: {rmse:.2f} minutes")

RMSE: 0.00 minutes


In [14]:
import xgboost as xgb

In [15]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

  import pkg_resources


In [16]:
train = xgb.DMatrix(X_train, label=y_train)
valid = xgb.DMatrix(X_val, label=y_val)

In [17]:
def objective(params):
    with mlflow.start_run():
        mlflow.set_tag("model", "xgboost")
        mlflow.log_params(params)
        booster = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=1000,
            evals=[(valid, 'validation')],
            early_stopping_rounds=50
        )
        y_pred = booster.predict(valid)
        rmse = mean_squared_error(y_val, y_pred, squared=False)
        mlflow.log_metric("rmse", rmse)

    return {'loss': rmse, 'status': STATUS_OK}

In [18]:
import numpy as np
import mlflow
import xgboost as xgb
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
from hyperopt.pyll import scope
from sklearn.metrics import mean_squared_error


train = xgb.DMatrix(X_train, label=y_train)
valid = xgb.DMatrix(X_val, label=y_val)


def objective(params):
    with mlflow.start_run(nested=True):
        mlflow.log_params(params)
        
        booster = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=100,
            evals=[(valid, 'validation')],
            early_stopping_rounds=10,
            verbose_eval=False
        )
        
        y_pred = booster.predict(valid)
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))
        
        mlflow.log_metric("rmse", rmse)
    
    return {'loss': rmse, 'status': STATUS_OK}


search_space = {
    'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
    'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
    'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
    'objective': 'reg:squarederror',
    'seed': 42
}


with mlflow.start_run():
    best_result = fmin(
        fn=objective,
        space=search_space,
        algo=tpe.suggest,
        max_evals=50,
        trials=Trials()
    )
    
    print(f"Best parameters: {best_result}")
    

  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?]

100%|██████████| 50/50 [00:51<00:00,  1.02s/trial, best loss: 0.16345258310222]   
Best parameters: {'learning_rate': np.float64(0.07384162503406404), 'max_depth': np.float64(5.0), 'min_child_weight': np.float64(1.5886767669774438), 'reg_alpha': np.float64(0.03042435402875251), 'reg_lambda': np.float64(0.035197802210846806)}


In [None]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.svm import LinearSVR

mlflow.sklearn.autolog()

for model_class in (RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor, LinearSVR):

    with mlflow.start_run():

        mlflow.log_param("train-data-path", "./data/green_tripdata_2021-01.csv")
        mlflow.log_param("valid-data-path", "./data/green_tripdata_2021-02.csv")
        mlflow.log_artifact("/workspaces/MLops/experiment_tracking/models/preprocessor.b", artifact_path="preprocessor")

        mlmodel = model_class()
        mlmodel.fit(X_train, y_train)
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))
        y_pred = mlmodel.predict(X_val)
        rmse = mean_squared_error(y_val, y_pred, squared=np.sqrt())
        mlflow.log_metric("rmse", rmse)
        

In [20]:
import numpy as np
import mlflow
import pickle
import os

os.makedirs("models", exist_ok=True)


with open("models/preprocessor.b", "wb") as f_out:
    pickle.dump(dv, f_out)

with mlflow.start_run():
    mlflow.set_tag("developer", "Tom")
    mlflow.log_param("train-data-path", "./data/green_tripdata_2021-01.csv")
    mlflow.log_param("valid-data-path", "./data/green_tripdata_2021-02.csv")
    
    mlflow.log_artifact("models/preprocessor.b", artifact_path="preprocessor")
    
    mlmodel = model_class()
    mlmodel.fit(X_train, y_train)
    
    y_pred = mlmodel.predict(X_val)
    
   
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    
    mlflow.log_metric("rmse", rmse)
    
    print(f"RMSE: {rmse:.4f}")
    mlflow.log_artifact(local_path="/workspaces/MLops/experiment_tracking/models/lin_reg.bin",artifact_path="models_pickle")

RMSE: 0.0020


In [21]:
params={
    'bootstrap': True,
                 
'ccp_alpha':0.0,

'criterion':'squared_error',

'max_depth':None,

'max_features':1.0,

'max_leaf_nodes':None,

'max_samples':None,

'min_impurity_decrease':0.0,

'min_samples_leaf':1,

'min_samples_split':2,

'min_weight_fraction_leaf':0.0,

'monotonic_cst':None,
'n_estimators':100,
'n_jobs':None,
'oob_score':False,
'random_state':None,
'train-data-path':'./data/green_tripdata_2021-01.csv',
'valid-data-path':'./data/green_tripdata_2021-02.csv',
'verbose':0,
'warm_start':False
}

mlflow.xgboost.autolog()
        
        
booster = xgb.train(
params=params,
dtrain=train,
num_boost_round=100,
evals=[(valid, 'validation')],
early_stopping_rounds=10,
verbose_eval=False
        )
        


2026/01/17 18:38:54 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'e3f8f9553e5447f2ae7292cb5a632d2d', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current xgboost workflow
Parameters: { "bootstrap", "ccp_alpha", "criterion", "max_features", "min_impurity_decrease", "min_samples_leaf", "min_samples_split", "min_weight_fraction_leaf", "n_estimators", "oob_score", "train-data-path", "valid-data-path", "verbose", "warm_start" } are not used.

  self.starting_round = model.num_boosted_rounds()


In [22]:
with mlflow.start_run():
    best_params={
        'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
    'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
    'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
    'objective': 'reg:squarederror',
    'seed': 42
    }
    mlflow.log_params(best_params)
    booster = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=100,
            evals=[(valid, 'validation')],
            early_stopping_rounds=10,
            verbose_eval=False
        )
        
    y_pred = booster.predict(valid)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
        
    mlflow.log_metric("rmse", rmse)
    print({'loss': rmse, 'status': STATUS_OK})
  
        
       
    mlflow.log_artifact("models/preprocessor.b",artifact_path="preprocessor")

mlflow.xgboost.log_model(booster, artifact_path="models_mlflow")


Parameters: { "bootstrap", "ccp_alpha", "criterion", "max_features", "min_impurity_decrease", "min_samples_leaf", "min_samples_split", "min_weight_fraction_leaf", "n_estimators", "oob_score", "train-data-path", "valid-data-path", "verbose", "warm_start" } are not used.

  self.starting_round = model.num_boosted_rounds()


{'loss': np.float64(0.16484454502637327), 'status': 'ok'}


<mlflow.models.model.ModelInfo at 0x7a5a9e9219d0>

In [29]:


mlflow.set_tracking_uri("http://127.0.0.1:5000")

logged_model = mlflow.pyfunc.load_model(
    "models:/nyc-taxi-regressor@staging"
)


In [30]:
logged_model

mlflow.pyfunc.loaded_model:
  artifact_path: /workspaces/MLops/experiment_tracking/mlruns/2/models/m-310fa30d777c46c09d1db7d344310554/artifacts
  flavor: mlflow.xgboost

In [34]:
xgboost_model = mlflow.xgboost.load_model(
    "models:/nyc-taxi-regressor@staging"
)


In [35]:
y_pred=xgboost_model.predict(valid)

In [36]:
y_pred[:5]

array([17.771467 ,  6.5437827, 15.257699 , 18.020634 ,  8.960706 ],
      dtype=float32)

model registry

In [10]:
import mlflow
from mlflow.tracking import MlflowClient

MLFLOW_TRACKING_URI = "sqlite:///mlflow.db"
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

client = MlflowClient()


In [11]:
experiments = client.search_experiments()

for exp in experiments:
    print(exp.experiment_id, exp.name)



3 regressor
2 nyc-taxi-experiment-mlops
1 nyc-taxi-experiment
0 Default


In [12]:
from mlflow.entities import ViewType
runs=client.search_runs(
    experiment_ids='2',
    filter_string="",
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=5,
    order_by=["metrics.rmse ASC"]
)

In [13]:
for run in runs:
    print(f"run id:{run.info.run_id},rmse:{run.data.metrics['rmse']:.4f}")

run id:cbdafb1c1ee041e5b7c2c3f154afb0a9,rmse:0.0000
run id:232f394774fb4f2887841d3b3489f2f0,rmse:0.0000
run id:692ecdbeb3d44ecb9169bf92abbdd6c5,rmse:0.0020
run id:7764fc24b1fa443fab2f8b2e705807a5,rmse:0.0020
run id:33160cef36ae4b36bb13c5c3cee2c618,rmse:0.0020


In [14]:
run_id="33160cef36ae4b36bb13c5c3cee2c618"
model_uri=f"runs:/{run_id}/model"
mlflow.register_model(model_uri , name="nyc-taxi")

Registered model 'nyc-taxi' already exists. Creating a new version of this model...


Created version '2' of model 'nyc-taxi'.


<ModelVersion: aliases=[], creation_timestamp=1768727748119, current_stage='None', deployment_job_state=None, description=None, last_updated_timestamp=1768727748119, metrics=None, model_id=None, name='nyc-taxi', params=None, run_id='33160cef36ae4b36bb13c5c3cee2c618', run_link=None, source='models:/m-3f3b5c1aa9d14325b96d76681c0a9266', status='READY', status_message=None, tags={}, user_id=None, version=2>

In [None]:
mv = client.get_model_version_by_alias(
    name="nyc-taxi",
    alias="staging"
)

print(f"version {mv.version}, model_id {mv.model_id}")


In [21]:
model_version=2
new_stage="staging"
client.transition_model_version_stage(
    name='nyc-taxi',
    version=model_version,
    stage=new_stage,
    archive_existing_versions=False
)

  client.transition_model_version_stage(


<ModelVersion: aliases=[], creation_timestamp=1768727748119, current_stage='Staging', deployment_job_state=None, description=None, last_updated_timestamp=1768728107617, metrics=None, model_id=None, name='nyc-taxi', params=None, run_id='33160cef36ae4b36bb13c5c3cee2c618', run_link=None, source='models:/m-3f3b5c1aa9d14325b96d76681c0a9266', status='READY', status_message=None, tags={}, user_id=None, version=2>

In [22]:

client.set_registered_model_alias(
    name="nyc-taxi",
    alias="production",
    version=2
)


In [24]:
from datetime import datetime

date=datetime.today().date()

client.update_model_version(
    name="nyc-taxi",
    version=model_version,
    description=f"the model version{model_version} was transitioned to {new_stage}on {date}"
)

<ModelVersion: aliases=['production'], creation_timestamp=1768727748119, current_stage='Staging', deployment_job_state=None, description='the model version2 was transitioned to stagingon 2026-01-18', last_updated_timestamp=1768728333716, metrics=None, model_id=None, name='nyc-taxi', params=None, run_id='33160cef36ae4b36bb13c5c3cee2c618', run_link=None, source='models:/m-3f3b5c1aa9d14325b96d76681c0a9266', status='READY', status_message=None, tags={}, user_id=None, version=2>

In [25]:
from sklearn.metrics import mean_squared_error
import pandas as pd


def read_dataframe(filename):
    df = pd.read_csv(filename)

    df.lpep_dropoff_datetime = pd.to_datetime(df.lpep_dropoff_datetime)
    df.lpep_pickup_datetime = pd.to_datetime(df.lpep_pickup_datetime)

    df['duration'] = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df.duration = df.duration.apply(lambda td: td.total_seconds() / 60)

    df = df[(df.duration >= 1) & (df.duration <= 60)]

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)
    
    return df

In [26]:
def preprocess(df, dv):
    df['PU_DO'] = df['PULocationID'] + '_' + df['DOLocationID']
    categorical = ['PU_DO']
    numerical = ['trip_distance']
    train_dicts = df[categorical + numerical].to_dict(orient='records')
    return dv.transform(train_dicts)


def test_model(name, stage, X_test, y_test):
    model = mlflow.pyfunc.load_model(f"models:/{name}/{stage}")
    y_pred = model.predict(X_test)
    return {"rmse": mean_squared_error(y_test, y_pred, squared=False)}

In [34]:
!pip install pyarrow



In [62]:


import pandas as pd

df = pd.read_parquet("/workspaces/MLops/experiment_tracking/green_tripdata_2021-02.parquet", engine="pyarrow")


In [63]:
client.download_artifacts(run_id=run_id, path='preprocessor',dst_path='.')

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

'/workspaces/MLops/experiment_tracking/preprocessor'

In [64]:
import pickle 
with open("/workspaces/MLops/experiment_tracking/preprocessor/preprocessor.b","rb" )as f_in:
    dv=pickle.load(f_in)

In [65]:
print(df.columns.tolist())


['VendorID', 'lpep_pickup_datetime', 'lpep_dropoff_datetime', 'store_and_fwd_flag', 'RatecodeID', 'PULocationID', 'DOLocationID', 'passenger_count', 'trip_distance', 'fare_amount', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount', 'ehail_fee', 'improvement_surcharge', 'total_amount', 'payment_type', 'trip_type', 'congestion_surcharge']


In [66]:
df["mta_tax"] = pd.to_numeric(df["mta_tax"], errors="coerce")


In [67]:
df["mta_tax"].dtype
df["mta_tax"].unique()[:10]


array([ 0.5,  0. , -0.5])

In [68]:
import numpy as np
df["mta_tax"] = (
    df["mta_tax"]
    .astype(str)
    .str.strip()
    .replace("", np.nan)
    .pipe(pd.to_numeric, errors="coerce")
)


In [69]:
print(df["mta_tax"].dtype)


float64


In [70]:
df["total"] = df["extra"] + df["mta_tax"]


In [71]:
numeric_cols = [
    
    "mta_tax",
    "fare_amount",
    "extra",
    "tip_amount"
]

df[numeric_cols] = df[numeric_cols].apply(
    pd.to_numeric, errors="coerce"
)


In [None]:
X_test = preprocess(df, dv)

In [75]:
target = "PULocationID"
y_test = df[target].values


In [88]:

preds = model.predict(X_test)


MlflowException: Failed to enforce schema of data '         VendorID tpep_pickup_datetime tpep_dropoff_datetime  passenger_count  \
624715          2  2021-03-12 09:21:46   2021-03-12 09:33:10              2.0   
1462627         2  2021-03-26 11:59:36   2021-03-26 12:05:37              1.0   
553107          2  2021-03-11 06:44:41   2021-03-11 06:56:43              1.0   
874215          2  2021-03-16 15:26:31   2021-03-16 16:04:09              1.0   
1226844         2  2021-03-22 13:13:08   2021-03-22 13:21:26              1.0   
...           ...                  ...                   ...              ...   
128904          2  2021-03-03 08:21:14   2021-03-03 08:25:20              1.0   
1640804         1  2021-03-29 13:05:46   2021-03-29 13:24:17              2.0   
1336521         1  2021-03-24 11:57:32   2021-03-24 12:27:01              1.0   
979486          1  2021-03-18 10:46:54   2021-03-18 11:00:43              1.0   
1472259         1  2021-03-26 13:16:23   2021-03-26 13:30:13              1.0   

         trip_distance  RatecodeID store_and_fwd_flag  DOLocationID  \
624715            2.00         1.0                  N           161   
1462627           0.83         1.0                  N           246   
553107            3.58         1.0                  N           161   
874215           19.50         1.0                  N            42   
1226844           1.70         1.0                  N           161   
...                ...         ...                ...           ...   
128904            0.87         1.0                  N           163   
1640804           3.80         1.0                  N            68   
1336521           4.20         1.0                  N           114   
979486            3.00         1.0                  N            90   
1472259           2.20         1.0                  N           246   

         payment_type  fare_amount  extra  mta_tax  tip_amount  tolls_amount  \
624715              1          9.5    0.0      0.5        1.70          0.00   
1462627             1          5.5    0.0      0.5        2.20          0.00   
553107              1         12.5    0.0      0.5        1.00          0.00   
874215              2         53.0    0.0      0.5        0.00          6.12   
1226844             2          7.5    0.0      0.5        0.00          0.00   
...               ...          ...    ...      ...         ...           ...   
128904              2          5.0    0.0      0.5        0.00          0.00   
1640804             1         15.5    2.5      0.5        4.70          0.00   
1336521             1         20.0    2.5      0.5        4.65          0.00   
979486              1         12.0    2.5      0.5        3.05          0.00   
1472259             1         10.5    2.5      0.5        2.75          0.00   

         improvement_surcharge  total_amount  congestion_surcharge  \
624715                     0.3         14.50                   2.5   
1462627                    0.3         11.00                   2.5   
553107                     0.3         16.80                   2.5   
874215                     0.3         59.92                   0.0   
1226844                    0.3         10.80                   2.5   
...                        ...           ...                   ...   
128904                     0.3          8.30                   2.5   
1640804                    0.3         23.50                   2.5   
1336521                    0.3         27.95                   2.5   
979486                     0.3         18.35                   2.5   
1472259                    0.3         16.55                   2.5   

         airport_fee  
624715           NaN  
1462627          NaN  
553107           NaN  
874215           NaN  
1226844          NaN  
...              ...  
128904           NaN  
1640804          0.0  
1336521          NaN  
979486           NaN  
1472259          NaN  

[385031 rows x 18 columns]' with schema '[Tensor('float64', (-1, 2))]'. Error: This model contains a model signature with an unnamed input. Since the input data is a pandas DataFrame containing multiple columns, the input shape must be of the structure (-1, number_of_dataframe_columns). Instead, the input DataFrame passed had 18 columns and an input shape of (-1, 2) with all values within the DataFrame of scalar type. Please adjust the passed in DataFrame to match the expected structure

In [81]:
from sklearn.model_selection import train_test_split



df = pd.read_parquet("/workspaces/MLops/experiment_tracking/data/yellow_tripdata_2021-03.parquet")

X = df.drop("PULocationID", axis=1)
y = df["PULocationID"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [92]:
%time test_model(name="nyc-taxi", stage="staging", X_test=X_test, y_test=y_test)

CPU times: user 54.6 ms, sys: 37.6 ms, total: 92.2 ms
Wall time: 94.1 ms


MlflowException: Failed to enforce schema of data '         VendorID tpep_pickup_datetime tpep_dropoff_datetime  passenger_count  \
624715          2  2021-03-12 09:21:46   2021-03-12 09:33:10              2.0   
1462627         2  2021-03-26 11:59:36   2021-03-26 12:05:37              1.0   
553107          2  2021-03-11 06:44:41   2021-03-11 06:56:43              1.0   
874215          2  2021-03-16 15:26:31   2021-03-16 16:04:09              1.0   
1226844         2  2021-03-22 13:13:08   2021-03-22 13:21:26              1.0   
...           ...                  ...                   ...              ...   
128904          2  2021-03-03 08:21:14   2021-03-03 08:25:20              1.0   
1640804         1  2021-03-29 13:05:46   2021-03-29 13:24:17              2.0   
1336521         1  2021-03-24 11:57:32   2021-03-24 12:27:01              1.0   
979486          1  2021-03-18 10:46:54   2021-03-18 11:00:43              1.0   
1472259         1  2021-03-26 13:16:23   2021-03-26 13:30:13              1.0   

         trip_distance  RatecodeID store_and_fwd_flag  DOLocationID  \
624715            2.00         1.0                  N           161   
1462627           0.83         1.0                  N           246   
553107            3.58         1.0                  N           161   
874215           19.50         1.0                  N            42   
1226844           1.70         1.0                  N           161   
...                ...         ...                ...           ...   
128904            0.87         1.0                  N           163   
1640804           3.80         1.0                  N            68   
1336521           4.20         1.0                  N           114   
979486            3.00         1.0                  N            90   
1472259           2.20         1.0                  N           246   

         payment_type  fare_amount  extra  mta_tax  tip_amount  tolls_amount  \
624715              1          9.5    0.0      0.5        1.70          0.00   
1462627             1          5.5    0.0      0.5        2.20          0.00   
553107              1         12.5    0.0      0.5        1.00          0.00   
874215              2         53.0    0.0      0.5        0.00          6.12   
1226844             2          7.5    0.0      0.5        0.00          0.00   
...               ...          ...    ...      ...         ...           ...   
128904              2          5.0    0.0      0.5        0.00          0.00   
1640804             1         15.5    2.5      0.5        4.70          0.00   
1336521             1         20.0    2.5      0.5        4.65          0.00   
979486              1         12.0    2.5      0.5        3.05          0.00   
1472259             1         10.5    2.5      0.5        2.75          0.00   

         improvement_surcharge  total_amount  congestion_surcharge  \
624715                     0.3         14.50                   2.5   
1462627                    0.3         11.00                   2.5   
553107                     0.3         16.80                   2.5   
874215                     0.3         59.92                   0.0   
1226844                    0.3         10.80                   2.5   
...                        ...           ...                   ...   
128904                     0.3          8.30                   2.5   
1640804                    0.3         23.50                   2.5   
1336521                    0.3         27.95                   2.5   
979486                     0.3         18.35                   2.5   
1472259                    0.3         16.55                   2.5   

         airport_fee  
624715           NaN  
1462627          NaN  
553107           NaN  
874215           NaN  
1226844          NaN  
...              ...  
128904           NaN  
1640804          0.0  
1336521          NaN  
979486           NaN  
1472259          NaN  

[385031 rows x 18 columns]' with schema '[Tensor('float64', (-1, 2))]'. Error: This model contains a model signature with an unnamed input. Since the input data is a pandas DataFrame containing multiple columns, the input shape must be of the structure (-1, number_of_dataframe_columns). Instead, the input DataFrame passed had 18 columns and an input shape of (-1, 2) with all values within the DataFrame of scalar type. Please adjust the passed in DataFrame to match the expected structure

In [90]:
from mlflow.tracking import MlflowClient

client = MlflowClient()

model_name = "nyc-taxi"
model_version = 2

client.set_registered_model_alias(
    name=model_name,
    alias="production",
    version=model_version
)


In [91]:
import mlflow.pyfunc

model = mlflow.pyfunc.load_model(
    model_uri="models:/nyc-taxi@production"
)
