In [2]:
import os
import pickle

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
def _format_float(value):
    """Render a float with exactly three decimal places for pandas display."""
    return '%.3f' % value


# Applies to every float pandas prints or renders from here on.
pd.set_option('display.float_format', _format_float)

In [15]:
# Show the kernel's working directory: the relative paths used in this
# notebook ('sqlite:///mlflow.db', 'model/lin_reg.bin') resolve against it.
# os.getcwd() is plain Python, so it does not depend on IPython's automagic
# (which only rewrites a bare `pwd` in a single-line cell).
os.getcwd()

'/home/juliandry/mlops-zoomcamp/03-training'

In [14]:
import mlflow

# Point MLflow at the SQLite tracking store in the working directory, then
# select the experiment that the runs below are recorded under. The call
# to set_experiment stays last so its return value is displayed by the cell.
mlflow.set_tracking_uri(uri='sqlite:///mlflow.db')
mlflow.set_experiment(experiment_name='nyc-taxi-experiment')


<Experiment: artifact_location='/home/juliandry/mlops-zoomcamp/03-training/mlruns/1', creation_time=1719481670513, experiment_id='1', last_update_time=1719481670513, lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>

In [16]:
def read_dataframe(filename):
    """Load yellow-taxi trip records and prepare them for duration modelling.

    Parameters
    ----------
    filename : str or path-like
        Location of a parquet file, passed straight to ``pd.read_parquet``.
        The file must provide the columns ``tpep_pickup_datetime``,
        ``tpep_dropoff_datetime``, ``PULocationID`` and ``DOLocationID``.

    Returns
    -------
    pandas.DataFrame
        The trips whose duration is between 1 and 60 minutes (inclusive),
        with an added ``duration`` column expressed in minutes and the two
        location-ID columns cast to ``str``.
    """
    df = pd.read_parquet(filename)

    # Vectorised timedelta -> minutes conversion (no Python call per row).
    trip_time = df.tpep_dropoff_datetime - df.tpep_pickup_datetime
    df['duration'] = trip_time.dt.total_seconds() / 60

    # Take an explicit copy of the filtered rows: the column assignment
    # below then writes to an independent frame instead of a slice of the
    # unfiltered one, which would raise SettingWithCopyWarning.
    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    # Location IDs are identifiers, not quantities; as strings they are
    # handled as categories by the DictVectorizer used later in the notebook.
    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype('str')

    return df

In [17]:
# The January 2023 file is the training set; the February 2023 file is
# held out for validation.
train_url = 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet'
valid_url = 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet'

df_train = read_dataframe(train_url)
df_valid = read_dataframe(valid_url)

In [18]:
# Feature groups. Only the categorical location IDs are vectorised below;
# `numerical` is declared but is not part of the feature matrix.
categorical = ['PULocationID', 'DOLocationID']
numerical = ['trip_distance']

# DictVectorizer consumes one {column: value} dict per trip.
train_dict = df_train.loc[:, categorical].to_dict(orient='records')
valid_dict = df_valid.loc[:, categorical].to_dict(orient='records')

# The feature mapping is learned from the training records only and then
# applied unchanged to the validation records.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dict)
X_valid = dv.transform(valid_dict)

In [19]:
# `target` is a list, so the selection yields a one-column DataFrame;
# ravel() flattens it to the 1-D array passed to the estimators as y.
target = ['duration']
y_train = df_train[target].to_numpy().ravel()
y_valid = df_valid[target].to_numpy().ravel()

In [20]:
# Baseline model: linear regression on the vectorised location features.
lr = LinearRegression().fit(X_train, y_train)

# Predictions for the month the model was fit on and for the held-out month.
y_pred, y_pred_valid = lr.predict(X_train), lr.predict(X_valid)

In [21]:
# RMSE in minutes (the unit of `duration`): training month first, then the
# validation month. The square root is taken explicitly because the
# `squared` argument of mean_squared_error is deprecated in scikit-learn 1.4
# and removed in 1.6; this form works on every version.
print(mean_squared_error(y_train, y_pred) ** 0.5)
print(mean_squared_error(y_valid, y_pred_valid) ** 0.5)

7.6492624397080675
7.81181211389241




In [22]:
# plt.figure(figsize=(12,8))
# sns.kdeplot(y_pred_valid, label='prediction', fill=True)
# sns.kdeplot(y_valid, label='actual', fill=True)

# plt.legend()
# plt.show()

In [30]:
# Train a Lasso model and record its configuration and scores as one MLflow run.
with mlflow.start_run():
    mlflow.set_tag("developer", 'jul')

    # Record which files the run was trained and validated on.
    mlflow.log_param("train-data-path", "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet")
    mlflow.log_param("valid-data-path", "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet")

    # The regularisation strength gets its own key so it is not confused
    # with (or stored in place of) the validation data path.
    alpha = 0.01
    mlflow.log_param("alpha", alpha)

    ls = Lasso(alpha=alpha)
    ls.fit(X_train, y_train)

    # NOTE: these names are shared with the LinearRegression cell, so after
    # this cell runs they hold the Lasso predictions.
    y_pred = ls.predict(X_train)
    y_pred_valid = ls.predict(X_valid)

    # Explicit square root instead of `squared=False`, which is deprecated
    # in scikit-learn 1.4 and removed in 1.6.
    rmse_train = mean_squared_error(y_train, y_pred) ** 0.5
    rmse = mean_squared_error(y_valid, y_pred_valid) ** 0.5

    # "rmse" is the validation score; the training score is logged next to
    # it so the two can be compared in the MLflow UI.
    mlflow.log_metric("rmse_train", rmse_train)
    mlflow.log_metric("rmse", rmse)

In [91]:
# Persist the fitted DictVectorizer together with the LinearRegression model
# (`lr`, not the Lasso `ls`) so both come back from a single pickle.load.
# open() does not create missing directories, so make sure 'model/' exists.
os.makedirs('model', exist_ok=True)
with open('model/lin_reg.bin', 'wb') as f_out:
    pickle.dump((dv, lr), f_out)