In [1]:
import pandas as pd 
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm, tree
import xgboost
from sklearn.model_selection import train_test_split

In [2]:
import pandas as pd
import numpy as np

import sys
sys.path.insert(1, "../src/features")
import data_cleaning

# Show every column when a DataFrame is rendered.
pd.set_option('display.max_columns', None)

# Despite the *_DIR suffix, both constants are paths to CSV files.
OFFER_DATA_DIR = "../data/offer_acceptance_offers.csv"
ORDER_DATA_DIR = "../data/offer_acceptance_orders.csv"

# Columns kept from each raw table.
offer_columns = [
    "CARRIER_ID",
    "REFERENCE_NUMBER",
    "CREATED_ON_HQ",
    "RATE_USD",
    "OFFER_TYPE",
    "LOAD_DELIVERED_FROM_OFFER",
]
order_columns = [
    "REFERENCE_NUMBER",
    "ORDER_DATETIME_PST",
    "PICKUP_DEADLINE_PST",
    "ORIGIN_3DIGIT_ZIP",
    "DESTINATION_3DIGIT_ZIP",
    "APPROXIMATE_DRIVING_ROUTE_MILEAGE",
    "PALLETIZED_LINEAR_FEET",
]

# Read both CSVs and narrow them to the columns listed above.
offers = pd.read_csv(OFFER_DATA_DIR, low_memory=False)[offer_columns]
orders = pd.read_csv(ORDER_DATA_DIR, low_memory=False)[order_columns]

# Per-table cleaning through the project helpers in data_cleaning.
# The call order is kept exactly as written: the helpers are defined outside
# this notebook, so their side effects cannot be checked from here.
offers = data_cleaning.change_to_date(offers, ["CREATED_ON_HQ"])
orders = data_cleaning.change_to_date(
    orders, ["ORDER_DATETIME_PST", "PICKUP_DEADLINE_PST"]
)
orders = data_cleaning.parse_zipcode(orders)
orders = data_cleaning.parse_datetime(orders)
offers = data_cleaning.flatten_ref_num(offers)
orders = data_cleaning.flatten_ref_num(orders)

# Inner join of offers onto orders, followed by the merged-frame helpers.
merged = data_cleaning.join_offers_orders(offers, orders, how="inner")
merged = data_cleaning.get_remaining_time(merged)
merged = data_cleaning.during_business_hours(merged)

# `pooled` is derived from `merged` before mileage imputation and the
# business-hours step are applied to `merged`.
pooled = data_cleaning.get_prorated_rate(merged)

merged = data_cleaning.impute_mileage(merged)
merged = data_cleaning.get_business_hours(merged)

In [3]:
 # Keep only the offers whose load was delivered, then renumber the rows from 0.
merged = merged[merged["LOAD_DELIVERED_FROM_OFFER"]].reset_index(drop=True)

# Drop columns that are not used as model inputs further down the notebook.
# NOTE(review): "REMAINIG_TIME" is spelled as the column exists in `merged`;
# presumably it is created by data_cleaning.get_remaining_time — confirm before
# correcting the spelling here.
# This cell is not idempotent: re-running it after the drop raises KeyError
# because LOAD_DELIVERED_FROM_OFFER is no longer present.
merged = merged.drop(
    columns=[
        "CARRIER_ID",
        "REFERENCE_NUMBER",
        "REMAINIG_TIME",
        "RATE_USD",
        "LOAD_DELIVERED_FROM_OFFER",
        "CREATED_ON_HQ",
        "ORIGIN_3DIGIT_ZIP",
        "DESTINATION_3DIGIT_ZIP",
    ]
)

In [7]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MaxAbsScaler
from sklearn.compose import make_column_transformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Upper bound on one-hot output columns per categorical feature. It is passed
# to OneHotEncoder below so the cap is defined in exactly one place.
MAX_CATEGORIES = 30

# Modelling frame: remove the raw timestamp columns and replace OFFER_TYPE
# with the boolean target POOLED (True when the offer type equals "pool").
df = merged.drop(["ORDER_DATETIME_PST", "PICKUP_DEADLINE_PST"], axis=1)
df["POOLED"] = df["OFFER_TYPE"] == "pool"
df = df.drop("OFFER_TYPE", axis=1)

# 70/30 train/test split; the fixed random_state makes the split repeatable.
df_X = df.drop("POOLED", axis=1)
df_y = df["POOLED"]
df_X_train, df_X_test, df_y_train, df_y_test = train_test_split(df_X, df_y, test_size=0.3, random_state=42)

# Numeric features are divided by their maximum absolute value, which maps
# them into [-1, 1] without centering. This is NOT a z-score standardisation.
num_feat = ["BUSINESS_HOURS", "APPROXIMATE_DRIVING_ROUTE_MILEAGE", "PALLETIZED_LINEAR_FEET", "BUSINESS_HOURS_ORDER_PICKUP"]
num_transformer = Pipeline(steps=[
    ('scaler', MaxAbsScaler())
])

# Categorical features are one-hot encoded directly (there is no ordinal step).
# max_categories caps the encoded width per feature, and categories unseen
# during fit are ignored instead of raising at transform time.
cat_feat = ["ORIGIN_CITY", "DESTINATION_CITY", "ORDER_DAY", "ORDER_MONTH", "ORDER_HOUR", "PICKUP_DAY", "PICKUP_MONTH", "PICKUP_HOUR"]
cat_transformer = Pipeline(steps=[
    ("onehot", OneHotEncoder(max_categories=MAX_CATEGORIES, handle_unknown="ignore"))
])

# Route each column group through its own transformer. Columns listed in
# neither group are dropped (ColumnTransformer's default remainder).
preproc = ColumnTransformer(
    transformers=[
        ("numerical", num_transformer, num_feat),
        ("categorization", cat_transformer, cat_feat)
    ])

# Full model: preprocessing followed by an XGBoost classifier. The step name
# "xgb" is what the "xgb__" prefixes in param_grid refer to.
pipeline = Pipeline(steps=[
    ('preprocessor', preproc),
    ("xgb", xgboost.XGBClassifier())
])

# 5 * 3 * 4 * 4 = 240 hyper-parameter combinations.
param_grid = {
    'xgb__max_depth': [2, 3, 5, 7, 10],
    'xgb__n_estimators': [10, 100, 500],
    "xgb__reg_alpha": [0, 0.5, 1, 5],
    "xgb__reg_lambda": [0, 0.5, 1, 5]
}

from sklearn.model_selection import GridSearchCV

# Exhaustive search with 3-fold cross-validation (240 candidates -> 720 fits),
# scored on accuracy and run on all available cores.
grid = GridSearchCV(
    pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    n_jobs=-1,
    cv=3,
    verbose=3,
)

grid.fit(df_X_train, df_y_train)

Fitting 3 folds for each of 240 candidates, totalling 720 fits
[CV 2/3] END xgb__max_depth=2, xgb__n_estimators=10, xgb__reg_alpha=0, xgb__reg_lambda=0;, score=0.783 total time=   0.6s
[CV 1/3] END xgb__max_depth=2, xgb__n_estimators=10, xgb__reg_alpha=0, xgb__reg_lambda=0;, score=0.785 total time=   0.6s
[CV 3/3] END xgb__max_depth=2, xgb__n_estimators=10, xgb__reg_alpha=0, xgb__reg_lambda=0;, score=0.787 total time=   0.7s
[CV 1/3] END xgb__max_depth=2, xgb__n_estimators=10, xgb__reg_alpha=0, xgb__reg_lambda=0.5;, score=0.785 total time=   0.7s
[CV 2/3] END xgb__max_depth=2, xgb__n_estimators=10, xgb__reg_alpha=0, xgb__reg_lambda=0.5;, score=0.783 total time=   0.8s
[CV 3/3] END xgb__max_depth=2, xgb__n_estimators=10, xgb__reg_alpha=0, xgb__reg_lambda=0.5;, score=0.787 total time=   0.7s
[CV 1/3] END xgb__max_depth=2, xgb__n_estimators=10, xgb__reg_alpha=0, xgb__reg_lambda=1;, score=0.785 total time=   0.7s
[CV 2/3] END xgb__max_depth=2, xgb__n_estimators=10, xgb__reg_alpha=0, xgb__r

In [None]:
import pandas as pd 
import xgboost
from sklearn.model_selection import train_test_split

import pandas as pd

import sys
sys.path.insert(1, "../src/features")
import data_cleaning

# Render all columns of displayed DataFrames.
pd.set_option('display.max_columns', None)

OFFER_DATA_DIR = "../data/offer_acceptance_offers.csv"
ORDER_DATA_DIR = "../data/offer_acceptance_orders.csv"

# Load each CSV and keep the listed subset of columns.
offers = pd.read_csv(OFFER_DATA_DIR, low_memory=False)[
    ["CARRIER_ID", "REFERENCE_NUMBER", "CREATED_ON_HQ", "RATE_USD", "OFFER_TYPE", "LOAD_DELIVERED_FROM_OFFER"]
]
orders = pd.read_csv(ORDER_DATA_DIR, low_memory=False)[
    ["REFERENCE_NUMBER", "ORDER_DATETIME_PST", "PICKUP_DEADLINE_PST", "ORIGIN_3DIGIT_ZIP", "DESTINATION_3DIGIT_ZIP", "APPROXIMATE_DRIVING_ROUTE_MILEAGE", "PALLETIZED_LINEAR_FEET"]
]

# Date handling for both tables (project helper).
offers = data_cleaning.change_to_date(offers, ["CREATED_ON_HQ"])
orders = data_cleaning.change_to_date(orders, ["ORDER_DATETIME_PST", "PICKUP_DEADLINE_PST"])

# Order-only cleaning steps, applied one after another.
for order_step in (data_cleaning.parse_zipcode, data_cleaning.parse_datetime):
    orders = order_step(orders)

# Reference-number flattening: offers first, then orders.
offers = data_cleaning.flatten_ref_num(offers)
orders = data_cleaning.flatten_ref_num(orders)

merged = data_cleaning.join_offers_orders(offers, orders, how="inner")

# Steps applied to the joined frame before `pooled` is derived from it.
for merged_step in (data_cleaning.get_remaining_time, data_cleaning.during_business_hours):
    merged = merged_step(merged)

pooled = data_cleaning.get_prorated_rate(merged)

# Steps applied to the joined frame after `pooled` has been derived.
for merged_step in (data_cleaning.impute_mileage, data_cleaning.get_business_hours):
    merged = merged_step(merged)

# Restrict to delivered offers and reset the row index.
delivered = merged["LOAD_DELIVERED_FROM_OFFER"]
merged = merged[delivered].reset_index(drop=True)

# Remove the columns that are not needed downstream.
unneeded_columns = [
    "CARRIER_ID",
    "REFERENCE_NUMBER",
    "REMAINIG_TIME",
    "RATE_USD",
    "LOAD_DELIVERED_FROM_OFFER",
    "CREATED_ON_HQ",
    "ORIGIN_3DIGIT_ZIP",
    "DESTINATION_3DIGIT_ZIP",
]
merged = merged.drop(unneeded_columns, axis=1)

In [10]:
# Rebuild the pipeline with fixed XGBoost hyper-parameters and score it on the
# held-out split.
# NOTE(review): these values are presumably the winners of the grid search
# above — confirm against grid.best_params_.
pipeline = Pipeline(steps=[
    ('preprocessor', preproc),
    ("xgb", xgboost.XGBClassifier(max_depth=7, n_estimators=100, reg_alpha=5, reg_lambda=0))
])

pipeline.fit(df_X_train, df_y_train)
y_preds = pipeline.predict(df_X_test)

# accuracy_score expects (y_true, y_pred). Accuracy is symmetric in its two
# arguments, so the value matches the previous call with the order swapped.
acc = accuracy_score(df_y_test, y_preds)
acc

0.8227632033488448

In [10]:
# Disabled experiment, kept for reference. When active, it fits every model in
# `classifiers` behind the shared `preproc` step and prints its accuracy on the
# held-out split. Only the XGBoost entry is singly commented; the SVM, decision
# tree and random forest entries are doubly commented, i.e. they were already
# switched off inside the experiment.
# The text recorded under this cell matches the print format on the last line,
# so it appears to come from a run made before the code was commented out.

# classifiers = []

# xgb_model = xgboost.XGBClassifier()
# classifiers.append(xgb_model)

# # svm_model = svm.SVC()
# # classifiers.append(svm_model)

# # tree_model = tree.DecisionTreeClassifier()
# # classifiers.append(tree_model)

# # forest_model = RandomForestClassifier()
# # classifiers.append(forest_model)

# for clf in classifiers:
#     pl = Pipeline(steps=[('preprocessor', preproc), (str(clf), clf)])
    
#     pl.fit(df_X_train, df_y_train)
#     y_preds = pl.predict(df_X_test)
#     acc = accuracy_score(y_preds, df_y_test)
#     print("Accuracy of %s is %s"%(clf, acc))

Accuracy of XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, feature_types=None, gamma=0, gpu_id=-1,
              grow_policy='depthwise', importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_bin=256, max_cat_threshold=64, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0, ...) is 0.8223803183561534


In [2]:
import pandas as pd 
from sklearn import svm, tree
import xgboost
from sklearn.model_selection import train_test_split

import pandas as pd

# import sys
# sys.path.insert(1, "../src/features")
# import data_cleaning

# # allows all columns to be displayed
# pd.set_option('display.max_columns', None)

# OFFER_DATA_DIR = "../data/offer_acceptance_offers.csv"
# ORDER_DATA_DIR = "../data/offer_acceptance_orders.csv"

# offers = pd.read_csv(OFFER_DATA_DIR, low_memory=False)[["CARRIER_ID", "REFERENCE_NUMBER", "CREATED_ON_HQ", "RATE_USD", "OFFER_TYPE", "LOAD_DELIVERED_FROM_OFFER"]]
# orders = pd.read_csv(ORDER_DATA_DIR, low_memory=False)[["REFERENCE_NUMBER", "ORDER_DATETIME_PST", "PICKUP_DEADLINE_PST", "ORIGIN_3DIGIT_ZIP", "DESTINATION_3DIGIT_ZIP", "APPROXIMATE_DRIVING_ROUTE_MILEAGE", "PALLETIZED_LINEAR_FEET"]]

# offers = data_cleaning.change_to_date(offers, ["CREATED_ON_HQ"])
# orders = data_cleaning.change_to_date(orders, ["ORDER_DATETIME_PST", "PICKUP_DEADLINE_PST"])

# orders = data_cleaning.parse_zipcode(orders)

# orders = data_cleaning.parse_datetime(orders)

# offers = data_cleaning.flatten_ref_num(offers)
# orders = data_cleaning.flatten_ref_num(orders)

# merged = data_cleaning.join_offers_orders(offers, orders, how="inner")

# merged = data_cleaning.get_remaining_time(merged)

# merged = data_cleaning.during_business_hours(merged)

# pooled = data_cleaning.get_prorated_rate(merged)

# merged = data_cleaning.impute_mileage(merged)

# merged = data_cleaning.get_business_hours(merged)

# merged.to_pickle("../data/pickels/merged.pkl")
# pooled.to_pickle("../data/pickels/pooled.pkl")
# orders.to_pickle("../data/pickels/orders.pkl")
# offers.to_pickle("../data/pickels/offers.pkl")

# `sys` is used for the path tweak below. The only `import sys` in this cell is
# commented out, so without this import a fresh kernel raises NameError.
import sys

# Reload the cached frames. The paths are the same ones used by the
# commented-out to_pickle calls earlier in this cell.
# NOTE: unpickling can execute arbitrary code — only load files you produced.
merged = pd.read_pickle("../data/pickels/merged.pkl")
pooled = pd.read_pickle("../data/pickels/pooled.pkl")
orders = pd.read_pickle("../data/pickels/orders.pkl")
offers = pd.read_pickle("../data/pickels/offers.pkl")

# Make the project's model package importable, then build the model.
sys.path.insert(1, "../src/models")
import pooled_proba_model

# Returns the model together with a (features, target) pair, unpacked here as
# the test split. This rebinds any df_X_test / df_y_test from earlier cells.
model, (df_X_test, df_y_test) = pooled_proba_model.generate_probability_pipeline(merged)

In [3]:
# Display the feature frame returned by generate_probability_pipeline.
# NOTE(review): the recorded output includes LOAD_DELIVERED_FROM_OFFER, which is
# True in every row shown — confirm whether it should be a model input.
df_X_test

Unnamed: 0,LOAD_DELIVERED_FROM_OFFER,APPROXIMATE_DRIVING_ROUTE_MILEAGE,PALLETIZED_LINEAR_FEET,ORIGIN_CITY,DESTINATION_CITY,ORDER_DAY,ORDER_MONTH,ORDER_HOUR,PICKUP_DAY,PICKUP_MONTH,PICKUP_HOUR,BUSINESS_HOURS,BUSINESS_HOURS_ORDER_PICKUP
129486,True,825.0,52.0,Dallas TX,Atlanta GA,5,8,4,1,8,11,True,13.000000
103753,True,2482.0,14.0,Spokane WA,Utica NY,2,4,14,3,4,17,True,12.673611
92224,True,1164.0,21.0,Kansas City KS,Syracuse NY,4,12,11,4,12,15,True,53.149167
123221,True,287.0,16.0,Winston-Salem NC,Athens GA,2,1,10,3,1,19,True,17.702222
161637,True,2702.0,10.0,Riverside CA,Miami FL,1,9,15,4,10,17,True,31.401389
...,...,...,...,...,...,...,...,...,...,...,...,...,...
48836,True,1957.0,7.0,Riverside CA,Chicago IL,1,1,12,3,1,12,True,19.118333
24151,True,2040.0,7.0,Atlanta GA,Las Vegas NV,3,9,15,3,10,20,True,52.567500
128109,True,397.0,52.0,Atlanta GA,Ocala FL,2,1,14,0,1,10,True,25.400278
106977,True,574.0,42.0,Columbia SC,Paoli PA,0,1,12,2,1,11,True,18.432500


In [None]:
# predict_proba needs the samples to score; calling it with no arguments raises
# TypeError on scikit-learn style estimators.
# NOTE(review): assumes `model` follows the scikit-learn estimator API and that
# the held-out features are the intended input — confirm against
# pooled_proba_model.generate_probability_pipeline.
model.predict_proba(df_X_test)