In [1]:
import os
import pandas as pd
import numpy as np
from haversine import haversine, Unit
import xgboost
import pickle as pk
from sklearn import preprocessing
from sklearn import pipeline
from sklearn import model_selection
from sklearn import linear_model
from sklearn import tree
from sklearn import svm
from sklearn import naive_bayes
from sklearn import neighbors
from sklearn import neural_network
from sklearn import ensemble
from sklearn import metrics

from data import feature_engineering

import matplotlib.pyplot as plt
import seaborn as sns

# Use seaborn's 'paper' plotting context with a slightly enlarged font scale for every figure.
sns.set_theme(context='paper', font_scale=1.15)

# Folder holding the raw CSV inputs; the path is relative to the notebook's working directory.
data_dir = '../data/datathon_SC_ACN_22/'

  from pandas import MultiIndex, Int64Index


### Data overview

In [2]:
# Orders table: semicolon-separated, indexed by order_id.
orders_data = pd.read_csv(
    f'{data_dir}orders.csv',
    sep=';',
    index_col='order_id',
)

# Cities table: semicolon-separated, default integer index.
cities_data = pd.read_csv(
    f'{data_dir}cities_data.csv',
    sep=';',
)

# Product attributes: comma-separated, indexed by product_id.
product_data = pd.read_csv(
    f'{data_dir}product_attributes.csv',
    sep=',',
    index_col='product_id',
)

In [3]:
# Turn the three raw tables into a feature matrix and a target vector.
# NOTE(review): presumably training_data=True is what makes the helper return
# the targets as well -- confirm in data/feature_engineering.
orders_features, orders_targets = feature_engineering.engineer_features(
    orders_data,
    cities_data,
    product_data,
    training_data=True,
)

In [4]:
# Show the names of the engineered feature columns that the models will be trained on.
orders_features.columns

Index(['units', 'start_lat', 'start_long', 'hub_lat', 'hub_long', 'end_lat',
       'end_long', 'distance_on_water', 'distance_on_land',
       'distance_on_land_log', 'distance_on_land_squared', 'units_log',
       'units_squared', 'weight', 'weight_log', 'weight_squared',
       'weight_deviation', 'missing_product_info', '3pl_v_001', '3pl_v_002',
       '3pl_v_003', '3pl_v_004', 'customs_procedures_CRF',
       'customs_procedures_DTD', 'customs_procedures_DTP',
       'material_handling_c0.0', 'material_handling_c1.0',
       'material_handling_c2.0', 'material_handling_c3.0',
       'material_handling_c4.0', 'material_handling_c5.0',
       'material_handling_cnan'],
      dtype='object')

#### Train/test split

In [5]:
# Hold out 20% of the orders for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    orders_features,
    orders_targets,
    test_size=0.2,
    random_state=0,
)

### Model

In [6]:
# Shared seed for every stochastic learner below so that Restart & Run All
# reproduces the same fitted ensemble (same value as the train/test split seed).
RANDOM_STATE = 0

# Boosted ensemble with scikit-learn's default base estimator.
model_ada = ensemble.AdaBoostClassifier(
    n_estimators=100,
    learning_rate=1.5,
    random_state=RANDOM_STATE
)

# Random forest with tree depth capped at 20.
model_rf = ensemble.RandomForestClassifier(
    max_depth=20,
    random_state=RANDOM_STATE
)

# Gradient-boosted trees.
# NOTE(review): scale_pos_weight below 1 down-weights the positive class;
# confirm that 0.4 is a deliberate choice for this target.
model_xgb = xgboost.XGBClassifier(
    scale_pos_weight=0.4,
    n_estimators=300,
    max_depth=5,
    learning_rate=0.15,
    subsample=1.0,
    colsample_bytree=0.8,
    eval_metric='logloss',
    use_label_encoder=False,
    random_state=RANDOM_STATE
)

# k-nearest neighbours with default settings (deterministic, so no seed is needed).
model_nnb = neighbors.KNeighborsClassifier()

# Small multi-layer perceptron with three hidden layers of 10, 30 and 10 units.
model_nn = neural_network.MLPClassifier(
    hidden_layer_sizes=(10,30,10),
    learning_rate='adaptive',
    max_iter=1000,
    random_state=RANDOM_STATE
)

In [7]:
# The five base learners, combined by soft voting (voting='soft').
base_models = [
    ('ada', model_ada),
    ('rf', model_rf),
    ('xgb', model_xgb),
    ('nnb', model_nnb),
    ('nn', model_nn),
]
voting_classifier = ensemble.VotingClassifier(estimators=base_models, voting='soft')

# Standardise the features before they reach the ensemble, then train on the training split.
scaler = preprocessing.StandardScaler()
pipe = pipeline.make_pipeline(scaler, voting_classifier)
pipe.fit(X_train, y_train)

# Score of the fitted pipeline on the held-out split.
pipe.score(X_test, y_test)

0.821184809240462

In [8]:
# Predicted class probabilities for the held-out orders (one row per order, one column per class).
pipe.predict_proba(X_test)

array([[0.66808931, 0.3319107 ],
       [0.59558693, 0.40441307],
       [0.64957887, 0.35042113],
       ...,
       [0.84766091, 0.15233909],
       [0.80460728, 0.19539272],
       [0.78253813, 0.21746187]])

In [9]:
# Confusion-matrix counts for the held-out split.
y_pred = pipe.predict(X_test)
tn, fp, fn, tp = metrics.confusion_matrix(y_test, y_pred).ravel()

# Displayed as (tn, tp, fn, fp) -- note this differs from the unpacking order above.
tn, tp, fn, fp

(17017, 1752, 3610, 477)

#### Save model

In [10]:
# Persist the fitted pipeline (scaler + voting ensemble) as a pickle file.
model_path = '../models/ensemble_classifier_v001.pkl'

# Create the directory the file is actually written to. The previous guard
# created 'models' in the working directory while the file went to '../models',
# so the write failed whenever '../models' did not already exist.
os.makedirs(os.path.dirname(model_path), exist_ok=True)

with open(model_path, 'wb') as f:
    pk.dump(pipe, f)