In [17]:
import os
import pickle
import click
import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error
from sklearn.feature_extraction import DictVectorizer

import mlflow
import mlflow.sklearn

In [2]:
# Point the MLflow client at the tracking server and select the experiment
# that the runs started later in this notebook are logged under.
# NOTE(review): the tracking URI is hard-coded to a local server — presumably
# one must already be listening on port 5000 before this cell runs; confirm.
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("nyc-taxi-experiment")  

<Experiment: artifact_location='file:///C:/Users/rober/Desktop/MLOps/VSCode/Homework 3/artifacts/1', creation_time=1748465400930, experiment_id='1', last_update_time=1748465400930, lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>

In [3]:
def dump_pickle(obj, filename: str):
    """Serialize ``obj`` with pickle and write it to ``filename``.

    The file is opened in binary write mode, so an existing file at that
    path is overwritten. The return value is whatever ``pickle.dump``
    returns (``None``).
    """
    sink = open(filename, "wb")
    try:
        outcome = pickle.dump(obj, sink)
    finally:
        # Always release the handle, even if pickling fails part-way.
        sink.close()
    return outcome

In [4]:
def read_dataframe(filename: str):
    """Load a trip-record parquet file and prepare it for modelling.

    Steps performed:
      1. Read the parquet file at ``filename``.
      2. Add a ``duration`` column: ``tpep_dropoff_datetime`` minus
         ``tpep_pickup_datetime``, expressed in minutes.
      3. Keep only rows whose duration is between 1 and 60 minutes
         (both bounds inclusive).
      4. Cast ``PULocationID`` and ``DOLocationID`` to strings.

    Returns:
        A new DataFrame holding the filtered rows; the original row index
        labels are kept (the index is not reset).
    """
    df = pd.read_parquet(filename)

    elapsed = df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']
    # Vectorised conversion to minutes instead of a Python call per row.
    df['duration'] = elapsed.dt.total_seconds() / 60

    # .copy() makes the filtered frame independent of the frame it was sliced
    # from, so the column assignment below is not a chained assignment
    # (which pandas reports as SettingWithCopyWarning).
    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

In [5]:
def preprocess(df: pd.DataFrame, dv: DictVectorizer, fit_dv: bool = False):
    """Turn a prepared trip frame into a feature matrix via ``dv``.

    A combined ``PU_DO`` column (pickup ID, underscore, drop-off ID) is
    written onto ``df`` itself, so the caller's frame gains that column.
    The ``PU_DO`` and ``trip_distance`` columns are then converted to one
    dict per row and passed to the vectorizer.

    Args:
        df: Frame with ``PULocationID``, ``DOLocationID`` and
            ``trip_distance`` columns.
        dv: Vectorizer used to encode the row dicts.
        fit_dv: When true, ``dv.fit_transform`` is used; otherwise
            ``dv.transform``.

    Returns:
        ``(X, dv)`` — the vectorizer output and the same ``dv`` object.
    """
    df['PU_DO'] = df['PULocationID'] + '_' + df['DOLocationID']

    feature_columns = ['PU_DO', 'trip_distance']
    records = df[feature_columns].to_dict(orient='records')

    # Pick the vectorizer method once, then apply it.
    vectorize = dv.fit_transform if fit_dv else dv.transform
    return vectorize(records), dv

In [6]:
def run_data_prep(raw_data_path: str, dest_path: str, dataset: str = "yellow"):
    """Build and pickle the train / validation / test datasets.

    Reads three monthly parquet files from ``raw_data_path`` (2023-03 for
    train, 2023-04 for validation, 2023-05 for test, each named
    ``{dataset}_tripdata_<month>.parquet``), fits a ``DictVectorizer`` on the
    training frame only, encodes all three frames with it, and writes
    ``dv.pkl``, ``train.pkl``, ``val.pkl`` and ``test.pkl`` into
    ``dest_path``. Each dataset pickle holds an ``(X, y)`` tuple where ``y``
    is the ``duration`` column.
    """
    # Which month of trip data backs each split (order matters: train first).
    split_months = {"train": "2023-03", "val": "2023-04", "test": "2023-05"}

    # Load one parquet file per split.
    frames = {
        split: read_dataframe(
            os.path.join(raw_data_path, f"{dataset}_tripdata_{month}.parquet")
        )
        for split, month in split_months.items()
    }

    # Pull the target out of every split.
    targets = {split: frame['duration'].values for split, frame in frames.items()}

    # Fit the vectorizer on the training frame, then reuse it unchanged.
    dv = DictVectorizer()
    X_train, dv = preprocess(frames["train"], dv, fit_dv=True)
    features = {"train": X_train}
    for split in ("val", "test"):
        features[split], _ = preprocess(frames[split], dv, fit_dv=False)

    # Make sure the destination folder is there before writing.
    os.makedirs(dest_path, exist_ok=True)

    # Persist the vectorizer first, then each (X, y) pair.
    dump_pickle(dv, os.path.join(dest_path, "dv.pkl"))
    for split in split_months:
        dump_pickle(
            (features[split], targets[split]),
            os.path.join(dest_path, f"{split}.pkl"),
        )

In [7]:
def load_pickle(filename: str):
    """Open ``filename`` in binary mode and return the unpickled object.

    Security note: unpickling can execute arbitrary code, so this should
    only be pointed at files from a trusted source.
    """
    source = open(filename, "rb")
    try:
        return pickle.load(source)
    finally:
        # Close the handle whether or not unpickling succeeded.
        source.close()

In [8]:
def run_train(data_path: str):
    """Train a random forest on the pickled splits and log it to MLflow.

    Loads ``train.pkl`` and ``val.pkl`` from ``data_path`` (each unpacked as
    a features/target pair), fits a ``RandomForestRegressor`` inside a single
    MLflow run, prints the validation RMSE, then logs the fitted model under
    the artifact path ``"model"`` and the RMSE as the metric ``"rmse"``.
    """
    train_file = os.path.join(data_path, "train.pkl")
    val_file = os.path.join(data_path, "val.pkl")
    X_train, y_train = load_pickle(train_file)
    X_val, y_val = load_pickle(val_file)

    # Full, explicit hyperparameter set handed to the regressor.
    params = dict(
        bootstrap=True,
        ccp_alpha=0.0,
        criterion='squared_error',
        max_depth=10,
        max_features=1.0,
        max_leaf_nodes=None,
        max_samples=None,
        min_impurity_decrease=0.0,
        min_samples_leaf=1,
        min_samples_split=2,
        min_weight_fraction_leaf=0.0,
        monotonic_cst=None,
        n_estimators=100,
        n_jobs=None,
        oob_score=False,
        random_state=0,
        verbose=0,
        warm_start=False,
    )

    with mlflow.start_run():
        forest = RandomForestRegressor(**params)
        forest.fit(X_train, y_train)

        # Score on the validation split.
        val_predictions = forest.predict(X_val)
        rmse = root_mean_squared_error(y_val, val_predictions)
        print(f"RMSE: {rmse}")

        # Model first, then the metric, both attached to the active run.
        mlflow.sklearn.log_model(forest, "model")
        mlflow.log_metric("rmse", rmse)


In [14]:
# Load the March 2023 trip file as-is (no duration column, no filtering) so
# its raw schema can be inspected.
df = pd.read_parquet('../data/yellow_tripdata_2023-03.parquet')

In [12]:
def read_dataframe(filename):
    """Load a trip-record parquet file and prepare it for modelling.

    Note: ``read_dataframe`` is also defined in an earlier cell of this
    notebook; whichever of the two cells ran last is the one in effect.

    Steps performed:
      1. Read the parquet file at ``filename``.
      2. Add a ``duration`` column: drop-off minus pick-up time, in minutes.
      3. Keep only rows whose duration is between 1 and 60 minutes
         (both bounds inclusive).
      4. Cast ``PULocationID`` and ``DOLocationID`` to strings.

    Returns:
        A new DataFrame holding the filtered rows; the original row index
        labels are kept (the index is not reset).
    """
    df = pd.read_parquet(filename)

    df['duration'] = df.tpep_dropoff_datetime - df.tpep_pickup_datetime
    df.duration = df.duration.dt.total_seconds() / 60

    # .copy() detaches the filtered rows from the frame they were sliced
    # from, so the column assignment below is not a chained assignment
    # (which pandas reports as SettingWithCopyWarning).
    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

In [11]:
# Show the column names and dtypes of the unprocessed frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3403766 entries, 0 to 3403765
Data columns (total 19 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   VendorID               int32         
 1   tpep_pickup_datetime   datetime64[ns]
 2   tpep_dropoff_datetime  datetime64[ns]
 3   passenger_count        float64       
 4   trip_distance          float64       
 5   RatecodeID             float64       
 6   store_and_fwd_flag     object        
 7   PULocationID           int32         
 8   DOLocationID           int32         
 9   payment_type           int64         
 10  fare_amount            float64       
 11  extra                  float64       
 12  mta_tax                float64       
 13  tip_amount             float64       
 14  tolls_amount           float64       
 15  improvement_surcharge  float64       
 16  total_amount           float64       
 17  congestion_surcharge   float64       
 18  Airport_fee           

In [13]:
# Run the same March 2023 file through read_dataframe (adds `duration`,
# filters rows, casts the location IDs to strings).
df_prep =  read_dataframe('../data/yellow_tripdata_2023-03.parquet')

In [15]:
# Inspect the prepared frame's columns and dtypes for comparison with the
# raw frame's summary.
df_prep.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3316216 entries, 0 to 3403765
Data columns (total 20 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   VendorID               int32         
 1   tpep_pickup_datetime   datetime64[ns]
 2   tpep_dropoff_datetime  datetime64[ns]
 3   passenger_count        float64       
 4   trip_distance          float64       
 5   RatecodeID             float64       
 6   store_and_fwd_flag     object        
 7   PULocationID           object        
 8   DOLocationID           object        
 9   payment_type           int64         
 10  fare_amount            float64       
 11  extra                  float64       
 12  mta_tax                float64       
 13  tip_amount             float64       
 14  tolls_amount           float64       
 15  improvement_surcharge  float64       
 16  total_amount           float64       
 17  congestion_surcharge   float64       
 18  Airport_fee           

In [19]:
# Load the pickled training data from the output folder.
# NOTE(review): if this file was produced by run_data_prep, it holds an
# (X_train, y_train) tuple rather than a DataFrame, so the name `df_train`
# is misleading — confirm what this file contains.
df_train = pd.read_pickle('../output/train.pkl')

In [16]:
from sklearn.linear_model import LinearRegression

In [24]:
# Load the training and validation pickles from the output folder; each is
# unpacked as a (features, target) pair.
X_train, y_train = load_pickle(os.path.join('../output', "train.pkl"))
X_val, y_val = load_pickle(os.path.join('../output', "val.pkl"))

In [25]:
# Fit an ordinary linear regression (default settings) on the training split.
model = LinearRegression()
model.fit(X_train, y_train)

In [27]:
print(model.intercept_)  # float: the fitted intercept (bias term)

27.16020996868074


In [28]:
 # Log the model
# Log inside an explicit run. Called with no active run, the fluent API opens
# one implicitly and leaves it open, which makes a later plain
# `mlflow.start_run()` fail until `mlflow.end_run()` is called. The context
# manager closes the run when the block exits.
with mlflow.start_run():
    model_info = mlflow.sklearn.log_model(model, "model")

# Keep the ModelInfo as the cell's displayed result.
model_info



<mlflow.models.model.ModelInfo at 0x1f4e0f9f2d0>