In [1]:
import pandas as pd 
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm, tree
import xgboost
from sklearn.model_selection import train_test_split

In [2]:
import pandas as pd
import numpy as np

import sys
sys.path.insert(1, "../src/features")
import data_cleaning

# allows all columns to be displayed
pd.set_option('display.max_columns', None)

OFFER_DATA_DIR = "../data/offer_acceptance_offers.csv"
ORDER_DATA_DIR = "../data/offer_acceptance_orders.csv"

offers = pd.read_csv(OFFER_DATA_DIR, low_memory=False)[["CARRIER_ID", "REFERENCE_NUMBER", "CREATED_ON_HQ", "RATE_USD", "OFFER_TYPE", "LOAD_DELIVERED_FROM_OFFER"]]
orders = pd.read_csv(ORDER_DATA_DIR, low_memory=False)[["REFERENCE_NUMBER", "ORDER_DATETIME_PST", "PICKUP_DEADLINE_PST", "ORIGIN_3DIGIT_ZIP", "DESTINATION_3DIGIT_ZIP", "APPROXIMATE_DRIVING_ROUTE_MILEAGE", "PALLETIZED_LINEAR_FEET"]]

offers = data_cleaning.change_to_date(offers, ["CREATED_ON_HQ"])
orders = data_cleaning.change_to_date(orders, ["ORDER_DATETIME_PST", "PICKUP_DEADLINE_PST"])

orders = data_cleaning.parse_zipcode(orders)

offers = data_cleaning.flatten_ref_num(offers)
orders = data_cleaning.flatten_ref_num(orders)

merged = data_cleaning.join_offers_orders(offers, orders, how="inner")

merged = data_cleaning.get_remaining_time(merged)

merged = data_cleaning.during_business_hours(merged)

pooled = data_cleaning.get_prorated_rate(merged)

merged = data_cleaning.impute_mileage(merged)

In [None]:
 # filter for delivered offers
 merged = merged[merged["LOAD_DELIVERED_FROM_OFFER"]].reset_index(drop=True)

# drop unnecessary columns
 merged = merged.drop(["CARRIER_ID", "REFERENCE_NUMBER", "RATE_USD", "LOAD_DELIVERED_FROM_OFFER", "CREATED_ON_HQ", "REMAINING_TIME", "ORIGIN_3DIGIT_ZIP", "DESTINATION_3DIGIT_ZIP"], axis=1)

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MaxAbsScaler
from sklearn.compose import make_column_transformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

MAX_CATEGORIES = 30

df = merged.drop(["ORDER_DATETIME_PST", "PICKUP_DEADLINE_PST", "REMAINIG_TIME"], axis=1)
df["POOLED"] = df["OFFER_TYPE"] == "pool"
df = df.drop("OFFER_TYPE", axis=1)

df_X = df.drop("POOLED", axis=1)
df_y = df["POOLED"]

df_X_train, df_X_test, df_y_train, df_y_test = train_test_split(df_X, df_y, test_size=0.3, random_state=42)

num_feat = ["BUSINESS_HOURS", "APPROXIMATE_DRIVING_ROUTE_MILEAGE", "PALLETIZED_LINEAR_FEET"]
num_transformer = Pipeline(steps=[
    ('scaler', MaxAbsScaler()) # z-scale
])

cat_feat = ['ORIGIN_CITY', 'DESTINATION_CITY']
cat_transformer = Pipeline(steps=[
    ("onehot", OneHotEncoder(max_categories = 30))     # output from Ordinal becomes input to OneHot
])

# preprocessing pipeline (put them together)
preproc = ColumnTransformer(
    transformers=[
        ("numerical", num_transformer, num_feat),
        ("categorization", cat_transformer, cat_feat)
    ])

pl = Pipeline(steps=[('preprocessor', preproc), ("tree", RandomForestClassifier())])

In [None]:
classifiers = []

xgb_model = xgboost.XGBClassifier()
classifiers.append(xgb_model)

svm_model = svm.SVC()
classifiers.append(svm_model)

tree_model = tree.DecisionTreeClassifier()
classifiers.append(tree_model)

forest_model = RandomForestClassifier()
classifiers.append(forest_model)

for clf in classifiers:
    pl = Pipeline(steps=[('preprocessor', preproc), (str(clf), clf)])
    
    pl.fit(df_X_train, df_y_train)
    y_preds = pl.predict(df_X_test)
    acc = accuracy_score(y_preds, df_y_test)
    print("Accuracy of %s is %s"%(clf, acc))

Accuracy of XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, feature_types=None, gamma=0, gpu_id=-1,
              grow_policy='depthwise', importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_bin=256, max_cat_threshold=64, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0, ...) is 0.8017328749428668
Accuracy of SVC() is 0.7876830746606784
Accuracy of DecisionTreeClassifier() is 0.77456727807476
Accuracy of RandomForestClassifier() is 0.7814431350728324


In [None]:
Change DateTime columns to unix timestamp
Filter out Remaining_Time < 1

SyntaxError: invalid syntax (353716839.py, line 1)

In [None]:
merged["ORDER_DAY"] = merged["ORDER_DATETIME_PST"].dt.dayofweek
merged["ORDER_MONTH"] = merged["ORDER_DATETIME_PST"].dt.month
merged["ORDER_HOUR"] = merged["ORDER_DATETIME_PST"].dt.hour

merged["PICKUP_DAY"] = merged["PICKUP_DEADLINE_PST"].dt.dayofweek
merged["PICKUP_MONTH"] = merged["PICKUP_DEADLINE_PST"].dt.month
merged["PICKUP_HOUR"] = merged["PICKUP_DEADLINE_PST"].dt.hour

In [None]:
merged

Unnamed: 0,OFFER_TYPE,ORDER_DATETIME_PST,PICKUP_DEADLINE_PST,APPROXIMATE_DRIVING_ROUTE_MILEAGE,PALLETIZED_LINEAR_FEET,ORIGIN_CITY,DESTINATION_CITY,REMAINIG_TIME,BUSINESS_HOURS,ORDER_DAY,ORDER_MONTH,ORDER_HOUR
0,pool,2021-11-02 12:56:49,2021-11-03 16:00:00,2140.0,4.0,Oakland CA,Chicago IL,0 days 07:02:33,True,1,11,12
1,pool,2021-11-03 06:00:00,2021-11-04 15:00:00,1748.0,44.0,Other,Other,1 days 06:02:33,True,2,11,6
2,quote,2022-07-21 06:17:07,2022-07-28 13:00:00,325.0,14.0,Rancho Cucamonga CA,Glendale AZ,5 days 10:45:21,False,3,7,6
3,quote,2022-07-26 10:22:11,2022-07-28 14:00:00,361.0,31.0,Whittier CA,Phoenix AZ,1 days 00:46:38,True,1,7,10
4,pool,2022-08-18 06:59:27,2022-08-24 16:30:00,344.0,8.0,Fresno CA,San Diego CA,5 days 06:50:19,True,3,8,6
...,...,...,...,...,...,...,...,...,...,...,...,...
167730,quote,2022-04-06 08:09:25,2022-04-14 14:00:00,33.0,12.0,Bakersfield CA,Bakersfield CA,6 days 19:11:28,True,2,4,8
167731,quote,2021-02-24 11:20:54,2021-03-01 15:00:00,392.0,53.0,Other,Other,0 days 03:12:01,True,2,2,11
167732,quote,2020-12-11 10:24:04,2020-12-17 11:00:00,535.0,43.0,Greensboro NC,Indianapolis IN,1 days 03:09:31,True,4,12,10
167733,quote,2021-02-04 10:18:13,2021-02-12 11:00:00,277.0,37.0,Dallas TX,San Antonio TX,0 days 02:32:28,True,3,2,10
