In [68]:
!python -V

Python 3.9.12


In [69]:
import mlflow
import pickle
import zipfile
import requests
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from typing import List
from hyperopt.pyll import scope
from mlflow.tracking import MlflowClient
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [70]:
def read_data(url: str):
    """
    Capital Bikeshare datasets are zipped
    We need to download then extract the csv
    """
    zip_path = url.split('/')[-1] 
    file_name = zip_path.split('.')[0] + '.csv'

    req = requests.get(url)

    with open(zip_path, 'wb') as f_out:
        f_out.write(req.content)

    with zipfile.ZipFile(zip_path) as z:
        with z.open(file_name) as f:
            df = pd.read_csv(f, parse_dates=True)
            
    categorical_cols = ['rideable_type', 'start_station_id', 'end_station_id']
    date_cols = ['started_at', 'ended_at']
    
    df[categorical_cols] = df[categorical_cols].astype(str)
    df[date_cols] = df[date_cols].apply(pd.to_datetime, format='%Y/%m/%d %H:%M:%S')
    
    df['duration'] = df['ended_at'] - df['started_at']
    df['duration'] = df['duration'].apply(lambda x: round(x.total_seconds() / 60, 0))
    df['start_end'] = df['start_station_id'] + '_' + df['end_station_id']

    df = df[df['duration'] <= 120]

    categorical_cols = ['rideable_type', 'start_end']
    target = 'duration'
    
    return df, categorical_cols, target

In [72]:
df, categorical_cols, target = read_data('https://s3.amazonaws.com/capitalbikeshare-data/202204-capitalbikeshare-tripdata.zip')

In [73]:
def create_train_val_sets(df: pd.DataFrame, categorical_cols: List, target: str):
    dv = DictVectorizer()
    dicts = df[categorical_cols].to_dict(orient='records')
    
    x = dv.fit_transform(dicts)
    y = df[target].values

    x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2, shuffle=True, random_state=42)
    print(x_train.shape, x_val.shape, y_train.shape, y_val.shape)
    print(dv)

    return x_train, x_val, y_train, y_val, dv

In [75]:
# categorical_cols = ['rideable_type', 'start_end']
# target = 'duration'
x_train, x_val, y_train, y_val, dv = create_train_val_sets(df, categorical_cols, target)

(243756, 54362) (60939, 54362) (243756,) (60939,)
DictVectorizer()


In [76]:
lr = Ridge().fit(x_train, y_train)
y_pred = lr.predict(x_val)
mean_squared_error(y_val, y_pred, squared=False)

14.405431269954295

In [77]:
with open('ridge.bin', 'wb') as f_out:
    pickle.dump((dv, lr), f_out)

In [78]:
MLFLOW_TRACKING_URI = 'http://127.0.0.1:5000'
client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)

In [79]:
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment('bikeshare-ride-duration-prediction')

<Experiment: artifact_location='./mlruns/1', experiment_id='1', lifecycle_stage='active', name='bikeshare-ride-duration-prediction', tags={}>

In [80]:
with mlflow.start_run():
    categorical_cols = ['rideable_type', 'start_end']
    target = 'duration'
    x_train, x_val, y_train, y_val, dv = create_train_val_sets(df, categorical_cols, target)

    params = {
        'alpha': 1,
        'random_state': 0
    }
    mlflow.log_params(params)

    lr = Ridge(**params).fit(x_train, y_train)
    y_pred = lr.predict(x_val)
    mlflow.log_metric('mean_squared_error', mean_squared_error(y_val, y_pred, squared=False))

    mlflow.sklearn.log_model(lr, artifact_path='models')
    print(f"default artifacts URI: '{mlflow.get_artifact_uri()}'")

(243756, 54362) (60939, 54362) (243756,) (60939,)
DictVectorizer()
default artifacts URI: './mlruns/1/aee2c1513c1744329d41da48772c60bf/artifacts'


In [83]:
def run(data_path, num_trials):
    
    df, categorical_cols, target = read_data(data_path)
    x_train, x_val, y_train, y_val, dv = create_train_val_sets(df, categorical_cols, target)


    def objective(params):

        # rf = RandomForestRegressor(**params)
        # rf.fit(x_train, y_train)
        # y_pred = rf.predict(x_val)
        # rmse = mean_squared_error(y_val, y_pred, squared=False)

        lr = Ridge(**params)
        lr.fit(x_train, y_train)
        y_pred = lr.predict(x_val)
        rmse = mean_squared_error(y_val, y_pred, squared=False)

        return {'loss': rmse, 'status': STATUS_OK}


    # search_space = {
    #     'max_depth': scope.int(hp.quniform('max_depth', 1, 20, 1)),
    #     'n_estimators': scope.int(hp.quniform('n_estimators', 10, 50, 1)),
    #     'min_samples_split': scope.int(hp.quniform('min_samples_split', 2, 10, 1)),
    #     'min_samples_leaf': scope.int(hp.quniform('min_samples_leaf', 1, 4, 1)),
    #     'random_state': 0
    # }

    search_space = {
        'alpha': scope.int(hp.uniform('alpha', 0.1, 1))
    }

    rstate = np.random.default_rng(0)  # for reproducible results
    best_result = fmin(
        fn=objective,
        space=search_space,
        algo=tpe.suggest,
        max_evals=num_trials,
        trials=Trials(),
        rstate=rstate
    )

    return best_result

In [84]:
best_result = run(data_path='https://s3.amazonaws.com/capitalbikeshare-data/202204-capitalbikeshare-tripdata.zip', num_trials=1)

(243756, 54362) (60939, 54362) (243756,) (60939,)
DictVectorizer()
100%|██████████| 1/1 [00:00<00:00,  1.31trial/s, best loss: 14.5391502413888]
