In [1]:
import pandas as pd 
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm, tree
import xgboost
from sklearn.model_selection import train_test_split

In [2]:
import pandas as pd
import numpy as np

import sys
sys.path.insert(1, "../src/features")
import data_cleaning

# allows all columns to be displayed
pd.set_option('display.max_columns', None)

OFFER_DATA_DIR = "../data/offer_acceptance_offers.csv"
ORDER_DATA_DIR = "../data/offer_acceptance_orders.csv"

offers = pd.read_csv(OFFER_DATA_DIR, low_memory=False)[["CARRIER_ID", "REFERENCE_NUMBER", "CREATED_ON_HQ", "RATE_USD", "OFFER_TYPE", "LOAD_DELIVERED_FROM_OFFER"]]
orders = pd.read_csv(ORDER_DATA_DIR, low_memory=False)[["REFERENCE_NUMBER", "ORDER_DATETIME_PST", "PICKUP_DEADLINE_PST", "ORIGIN_3DIGIT_ZIP", "DESTINATION_3DIGIT_ZIP", "APPROXIMATE_DRIVING_ROUTE_MILEAGE", "PALLETIZED_LINEAR_FEET"]]

offers = data_cleaning.change_to_date(offers, ["CREATED_ON_HQ"])
orders = data_cleaning.change_to_date(orders, ["ORDER_DATETIME_PST", "PICKUP_DEADLINE_PST"])

orders = data_cleaning.parse_zipcode(orders)

orders = data_cleaning.parse_datetime(orders)

offers = data_cleaning.flatten_ref_num(offers)
orders = data_cleaning.flatten_ref_num(orders)

merged = data_cleaning.join_offers_orders(offers, orders, how="inner")

merged = data_cleaning.get_remaining_time(merged)

merged = data_cleaning.during_business_hours(merged)

pooled = data_cleaning.get_prorated_rate(merged)

merged = data_cleaning.impute_mileage(merged)

merged = data_cleaning.get_business_hours(merged)

In [3]:
 # filter for delivered offers
 merged = merged[merged["LOAD_DELIVERED_FROM_OFFER"]].reset_index(drop=True)

# drop unnecessary columns
 merged = merged.drop(["CARRIER_ID", "REFERENCE_NUMBER", "REMAINIG_TIME", "RATE_USD", "LOAD_DELIVERED_FROM_OFFER", "CREATED_ON_HQ", "ORIGIN_3DIGIT_ZIP", "DESTINATION_3DIGIT_ZIP"], axis=1)

In [11]:

df_y = df["POOLED"]
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MaxAbsScaler
from sklearn.compose import make_column_transformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

MAX_CATEGORIES = 30

df = merged.drop(["ORDER_DATETIME_PST", "PICKUP_DEADLINE_PST"], axis=1)
df["POOLED"] = df["OFFER_TYPE"] == "pool"
df = df.drop("OFFER_TYPE", axis=1)

df_X = df.drop("POOLED", axis=1)
df_X_train, df_X_test, df_y_train, df_y_test = train_test_split(df_X, df_y, test_size=0.3, random_state=42)

num_feat = ["BUSINESS_HOURS", "APPROXIMATE_DRIVING_ROUTE_MILEAGE", "PALLETIZED_LINEAR_FEET", "BUSINESS_HOURS_ORDER_PICKUP"]
num_transformer = Pipeline(steps=[
    ('scaler', MaxAbsScaler()) # z-scale
])

cat_feat = ["ORIGIN_CITY", "DESTINATION_CITY", "ORDER_DAY", "ORDER_MONTH", "ORDER_HOUR", "PICKUP_DAY", "PICKUP_MONTH", "PICKUP_HOUR"]
cat_transformer = Pipeline(steps=[
    ("onehot", OneHotEncoder(max_categories = 30, handle_unknown="ignore"))     # output from Ordinal becomes input to OneHot
])

# preprocessing pipeline (put them together)
preproc = ColumnTransformer(
    transformers=[
        ("numerical", num_transformer, num_feat),
        ("categorization", cat_transformer, cat_feat)
    ])

pipeline = Pipeline(steps=[
    ('preprocessor', preproc), 
    ("xgb", xgboost.XGBClassifier())
])

param_grid = {
    'xgb__max_depth': [2, 3, 5, 7, 10],
    'xgb__n_estimators': [10, 100, 500],
    "xgb__reg_alpha": [0, 0.5, 1, 5],
    "xgb__reg_lambda": [0, 0.5, 1, 5]
}

from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(
    pipeline,
    param_grid = param_grid,
    scoring = 'accuracy',
    n_jobs = -1,
    cv = 5,
    verbose = 3,
)

grid.fit(df_X_train, df_y_train)

Fitting 5 folds for each of 240 candidates, totalling 1200 fits
[CV 1/5] END xgb__max_depth=2, xgb__n_estimators=10, xgb__reg_alpha=0, xgb__reg_lambda=0;, score=0.785 total time=   0.6s
[CV 2/5] END xgb__max_depth=2, xgb__n_estimators=10, xgb__reg_alpha=0, xgb__reg_lambda=0;, score=0.783 total time=   0.6s
[CV 3/5] END xgb__max_depth=2, xgb__n_estimators=10, xgb__reg_alpha=0, xgb__reg_lambda=0;, score=0.789 total time=   0.8s
[CV 4/5] END xgb__max_depth=2, xgb__n_estimators=10, xgb__reg_alpha=0, xgb__reg_lambda=0;, score=0.766 total time=   0.8s
[CV 1/5] END xgb__max_depth=2, xgb__n_estimators=10, xgb__reg_alpha=0, xgb__reg_lambda=0.5;, score=0.785 total time=   0.8s
[CV 2/5] END xgb__max_depth=2, xgb__n_estimators=10, xgb__reg_alpha=0, xgb__reg_lambda=0.5;, score=0.783 total time=   0.8s
[CV 5/5] END xgb__max_depth=2, xgb__n_estimators=10, xgb__reg_alpha=0, xgb__reg_lambda=0;, score=0.782 total time=   1.0s
[CV 3/5] END xgb__max_depth=2, xgb__n_estimators=10, xgb__reg_alpha=0, xgb__re

In [13]:
print(grid.best_params_)

{'xgb__max_depth': 10, 'xgb__n_estimators': 100, 'xgb__reg_alpha': 1, 'xgb__reg_lambda': 5}


In [14]:
pipeline = Pipeline(steps=[
    ('preprocessor', preproc), 
    ("xgb", xgboost.XGBClassifier(max_depth=10, n_estimators=100, reg_alpha=1, reg_lambda=5))
])

pipeline.fit(df_X_train, df_y_train)
y_preds = pipeline.predict(df_X_test)
acc = accuracy_score(y_preds, df_y_test)
acc

0.8260964607221637

In [10]:
# classifiers = []

# xgb_model = xgboost.XGBClassifier()
# classifiers.append(xgb_model)

# # svm_model = svm.SVC()
# # classifiers.append(svm_model)

# # tree_model = tree.DecisionTreeClassifier()
# # classifiers.append(tree_model)

# # forest_model = RandomForestClassifier()
# # classifiers.append(forest_model)

# for clf in classifiers:
#     pl = Pipeline(steps=[('preprocessor', preproc), (str(clf), clf)])
    
#     pl.fit(df_X_train, df_y_train)
#     y_preds = pl.predict(df_X_test)
#     acc = accuracy_score(y_preds, df_y_test)
#     print("Accuracy of %s is %s"%(clf, acc))

Accuracy of XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, feature_types=None, gamma=0, gpu_id=-1,
              grow_policy='depthwise', importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_bin=256, max_cat_threshold=64, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0, ...) is 0.8223803183561534


In [None]:
Change DateTime columns to unix timestamp

SyntaxError: invalid syntax (353716839.py, line 1)