In [1]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import pickle
import mlflow
import xgboost as xgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

mlflow.set_tracking_url("http://127.0.0.1:5000")
mlflow.set_experimen("green-taxi-duration")

AttributeError: module 'mlflow' has no attribute 'set_tracking_url'

In [None]:
df= pd.read_parquet("./data/yellow_tripdata_2023-01.parquet")
df.head()

: 

In [None]:
df.info()

: 

There are 19 columns

In [None]:
# deriving trip duration in minutes
df['duration']= df['tpep_dropoff_datetime']-df['tpep_pickup_datetime']
df['duration']=df['duration'].apply(lambda x: x.total_seconds()/60)

: 

In [None]:
# standard deviation of trip duration
df['duration'].std()

: 

In [None]:
#Filter dataset to contain only trips with duration between 1 and 60 minutes
new_df=df[(df['duration']>= 1) & (df['duration']<= 60)]

# fraction of trips within selected duration
len(new_df)/len(df)

: 

In [None]:
variables=new_df[['PULocationID' ,'DOLocationID']].astype(str)
train_dict=variables.to_dict(orient='records')

: 

In [None]:
dv=DictVectorizer()
x_train=dv.fit_transform(train_dict)
y_train=new_df['duration']

: 

In [None]:
x_train.shape

: 

In [None]:
mlflow.set_tag("developer","Jane")
mlflow.log_param("train_data_path", "./data/green_tripdata_2023-01.csv")
mlflow.log_param("validation_data_path", "./data/green_tripdata_2023-01.csv")


model= LinearRegression()
model.fit(x_train, y_train)
y_pred=model.predict(x_train)



: 

In [None]:
with open('model/lin_reg.bin', 'wb') as f_out:
    pickle.dump((model,dv), f_out)

: 

In [None]:
import math
#root mean squared error= root of mean squared error
rmse=math.sqrt(mean_squared_error(y_train, y_pred))
rmse

: 

In [None]:
df2= pd.read_parquet("./data/yellow_tripdata_2023-02.parquet")

: 

In [None]:
# deriving trip duration in minutes
df2['duration']= df2['tpep_dropoff_datetime']-df2['tpep_pickup_datetime']
df2['duration']=df2['duration'].apply(lambda x: x.total_seconds()/60)

: 

In [None]:
#Filter dataset to contain only trips with duration between 1 and 60 minutes
new_df2=df2[(df2['duration']>= 1) & (df2['duration']<= 60)]


: 

In [None]:
variables=new_df2[['PULocationID' ,'DOLocationID']].astype(str)
test_dict=variables.to_dict(orient='records')

: 

In [None]:

x_test=dv.transform(test_dict)
x_test.shape

: 

In [None]:
y_test=new_df2['duration']
y_pred=model.predict(x_test)

#root mean square on february data
rmse=math.sqrt(mean_squared_error(y_test, y_pred))
mlflow.log_metric("rmse", rmse)
mlflow.log_artifact(local_path="model/lin_reg.bin", artifact_path="models_pickle/")
rmse

: 

In [None]:
search_space={
    'max_depth':scope.int(hp.quniform('max_depth', 4, 100, 1)),
    'learning_rate':hp.loguniform('learning_rate',-3, 0),
    'min_child_weight':hp.loguniform('min_child_weight',-1, 3),
    'objective':'reg:linear',
    'reg_alpha':hp.loguniform('min_child_weight',-5, 1),
    'reg_lambda':hp.loguniform('min_child_weight',-6, 1),
    'seed':42
}

best_result=fmin(objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=Trials()
)

: 

In [None]:
train= xgb.DMatrix(x_train, label=y_train)
valid= xgb.DMatrix(x_test, label=y_test)

with mlflow.start_run():
    best_params={
        'learning_rate':0.09585355,
        'max_depth':30,
        'min_child_weight':1.06059705,
        'objective':'reg:linear',
        'reg_alpha':0.018060244,
        'reg_lambda':0.016587,
        'seed':42
    }
    mlflow.log_params(best_params)

    booster=xgb.train(
        params=best_params,
        dtrain=train,
        num_boost_round=1000,
        evals=[(valid, 'validation')],
        early_stopping_round=50
    )

    y_pred=booster.predict(valid)
    rmse=mean_squared_error(y_val, y_pred, squared=False)
    mlflow.log_metric('rmse', rmse)

    with open('models/preprocessor.b', 'wb') as f_out:
        pickle.dump(dv, f_out)

    mlflow.log_artifact('models/preprocessor.b', artifact_path='preprocessor')
    mlflow.xgboost.log_model(booster, artifact_path="models_mlflow")

: 