In [1]:
import pandas as pd 
import numpy as np 
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import tqdm as notebook_tqdm

In [2]:
df = pd.read_parquet('/home/jagac/projects/taxi-tip-mlapp/Research/yellow_tripdata_2023-04.parquet')
df.columns

Index(['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
       'passenger_count', 'trip_distance', 'RatecodeID', 'store_and_fwd_flag',
       'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra',
       'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge',
       'total_amount', 'congestion_surcharge', 'Airport_fee'],
      dtype='object')

In [3]:
df['tipped'] = (df['tip_amount'] > 0).astype("int")
df['tipped'].value_counts()

tipped
1    2524116
0     764134
Name: count, dtype: int64

In [4]:
df['trip_time'] = (df.tpep_pickup_datetime - df.tpep_dropoff_datetime).astype('timedelta64[s]') / np.timedelta64(1, "s")
df = df.drop(['tpep_pickup_datetime', 'tpep_dropoff_datetime'], axis= 1)

In [5]:
one_hot_enc = OneHotEncoder()
arr = one_hot_enc.fit_transform(df[['store_and_fwd_flag']])
store_and_fwd_flag = pd.DataFrame(arr, columns=['store_and_fwd_flag_ohe'])

In [6]:
store_and_fwd_flag

Unnamed: 0,store_and_fwd_flag_ohe
0,"(0, 0)\t1.0"
1,"(0, 0)\t1.0"
2,"(0, 0)\t1.0"
3,"(0, 0)\t1.0"
4,"(0, 0)\t1.0"
...,...
3288245,"(0, 2)\t1.0"
3288246,"(0, 2)\t1.0"
3288247,"(0, 2)\t1.0"
3288248,"(0, 2)\t1.0"


In [7]:
df_merge = pd.merge(df, store_and_fwd_flag, left_index=True, right_index=True)
df_merge = df_merge.reset_index(drop=True)
df_merge = df_merge.drop('store_and_fwd_flag', axis = 1)
df_merge

Unnamed: 0,VendorID,passenger_count,trip_distance,RatecodeID,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,tipped,trip_time,store_and_fwd_flag_ohe
0,1,2.0,4.90,1.0,48,223,1,28.90,3.5,0.5,6.00,0.00,1.0,39.90,2.5,0.00,1,-1812.0,"(0, 0)\t1.0"
1,2,1.0,21.89,2.0,132,43,2,70.00,0.0,0.5,0.00,6.55,1.0,81.80,2.5,1.25,0,-3355.0,"(0, 0)\t1.0"
2,1,2.0,1.30,1.0,148,113,1,11.40,3.5,0.5,2.00,0.00,1.0,18.40,2.5,0.00,1,-652.0,"(0, 0)\t1.0"
3,1,1.0,1.50,1.0,249,79,1,10.00,3.5,0.5,1.00,0.00,1.0,16.00,2.5,0.00,1,-490.0,"(0, 0)\t1.0"
4,2,2.0,1.49,1.0,158,246,1,11.40,1.0,0.5,1.00,0.00,1.0,17.40,2.5,0.00,1,-616.0,"(0, 0)\t1.0"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3288245,2,,2.02,,246,79,0,11.41,0.0,0.5,3.08,0.00,1.0,18.49,,,1,-952.0,"(0, 2)\t1.0"
3288246,2,,2.36,,114,68,0,13.19,0.0,0.5,0.00,0.00,1.0,17.19,,,0,-600.0,"(0, 2)\t1.0"
3288247,2,,4.64,,137,74,0,17.90,0.0,0.5,0.00,0.00,1.0,21.90,,,0,-1130.0,"(0, 2)\t1.0"
3288248,1,,0.00,,162,151,0,16.00,1.0,0.5,4.20,0.00,1.0,27.20,,,1,-860.0,"(0, 2)\t1.0"


In [8]:
y = df_merge.tipped
X = df_merge.drop('tipped', axis=1)
X = df_merge.drop('store_and_fwd_flag_ohe', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2)

In [9]:
oversample = RandomOverSampler(sampling_strategy="all")
X_over, y_over = oversample.fit_resample(X_train, y_train)

print(X_train.shape)
print(y_train.shape)
print(X_over.shape)
print(y_over.shape)

: 

: 

In [None]:
import mlflow
from pathlib import Path

# Set tracking URI
MODEL_REGISTRY = Path("mlruns")
Path(MODEL_REGISTRY).mkdir(exist_ok=True) # create experiments dir
mlflow.set_tracking_uri("file://" + str(MODEL_REGISTRY.absolute()))


In [None]:
import optuna
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

def objective(trial):
    """Define the objective function"""
    params = {
        'max_depth': trial.suggest_int('max_depth', 1, 9),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0),
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_float('gamma', 1e-8, 1.0),
        'subsample': trial.suggest_float('subsample', 0.01, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.01, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 1.0),
        'eval_metric': 'mlogloss',
    }

    optuna_model = XGBClassifier(**params)
    optuna_model.fit(X_over, y_over)

    y_pred = optuna_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
        
        
    return accuracy

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
from optuna.integration.mlflow import MLflowCallback

pruner = optuna.pruners.MedianPruner(n_startup_trials=5, n_warmup_steps=5)
study = optuna.create_study(study_name="xgboost_optimization", direction="maximize", pruner=pruner)
mlflow_callback = MLflowCallback(
    tracking_uri=mlflow.get_tracking_uri(), metric_name="accuracy")

study.optimize(objective,
            n_trials=2,
            callbacks=[mlflow_callback]
            )

[I 2023-07-24 11:35:15,399] A new study created in memory with name: xgboost_optimization
  mlflow_callback = MLflowCallback(


: 

: 

In [None]:
import json

print (f"Best value (f1): {study.best_trial.value}")
print (f"Best hyperparameters: {json.dumps(study.best_trial.params, indent=2)}")


Best value (f1): 1.0
Best hyperparameters: {
  "max_depth": 9,
  "learning_rate": 0.615636797851735,
  "n_estimators": 109,
  "min_child_weight": 4,
  "gamma": 0.30684047192142766,
  "subsample": 0.4548792950777823,
  "colsample_bytree": 0.527417518459473,
  "reg_alpha": 0.46500430027760475,
  "reg_lambda": 0.6955690012443138
}


In [None]:

params = study.best_trial.params
mlflow.autolog()

optuna_model = XGBClassifier(**params, n_jobs = -1)
optuna_model.fit(X_over, y_over)

y_pred = optuna_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

2023/07/24 11:29:27 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2023/07/24 11:29:27 INFO mlflow.tracking.fluent: Autologging successfully enabled for xgboost.
2023/07/24 11:29:27 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '63cbc09b88ba4d3da2da39b6a69c5ce7', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current xgboost workflow


: 

: 