In [326]:
from sklearn import preprocessing
from sklearn import pipeline
from sklearn import model_selection
from sklearn import linear_model # 63%
from sklearn import tree # 76%
from sklearn import svm # Linear 65%
from sklearn import naive_bayes # 63%
from sklearn import neighbors # 73% first try
from sklearn import neural_network
from sklearn import ensemble
from sklearn import metrics

In [295]:
import pandas as pd
import numpy as np
from haversine import haversine, Unit
import xgboost

import matplotlib.pyplot as plt
import seaborn as sns

sns.set_theme(context='paper', font_scale=1.15)

data_dir = '../data/datathon_SC_ACN_22/'

### Data overview

In [296]:
orders_data = pd.read_csv(data_dir + 'orders.csv', delimiter=';', index_col='order_id')
cities_data = pd.read_csv(data_dir + 'cities_data.csv', delimiter=';')
product_data = pd.read_csv(data_dir + 'product_attributes.csv', delimiter=',', index_col='product_id')

### Feature engineering

- Lat long for all three locations
- Distance on water
- Distance on land
- Distance on land **log**
- Units
- Units **log**
- Third party **ONH**
- Customs **ONH**
- Material handling **ONH**
- Weight
- Weight **log**

In [297]:
# Replace ATHENAS with Athens and BCN with Barcelona in the origin_port column
orders_data['origin_port'] = orders_data['origin_port'].replace('ATHENAS', 'Athens')
orders_data['origin_port'] = orders_data['origin_port'].replace('BCN', 'Barcelona')

In [298]:
cities_data1 = cities_data[['city_from_name', 'city_from_coord']]
cities_data2 = cities_data[['city_to_name', 'city_to_coord']]
cities_data1 = cities_data1.rename(columns={'city_from_name': 'city_name', 'city_from_coord': 'city_coord'})
cities_data2 = cities_data2.rename(columns={'city_to_name': 'city_name', 'city_to_coord': 'city_coord'})

cities_data = pd.concat([cities_data1, cities_data2]).drop_duplicates().set_index('city_name')

# Transform lat long tuple as string to two columns
cities_data['lat'] = cities_data['city_coord'].apply(lambda x: float(x[1:-1].split(',')[0]))
cities_data['long'] = cities_data['city_coord'].apply(lambda x: float(x[1:-1].split(',')[1]))
cities_data = cities_data.drop(columns=['city_coord'])

In [299]:
# Filter only for those extracted columns
city_lat_longs = cities_data[['lat', 'long']].drop_duplicates()

# Add to order df
orders_data = orders_data.join(city_lat_longs.rename(columns={'lat': 'start_lat', 'long': 'start_long'}), on='origin_port')
orders_data = orders_data.join(city_lat_longs.rename(columns={'lat': 'hub_lat', 'long': 'hub_long'}), on='logistic_hub')
orders_data = orders_data.join(city_lat_longs.rename(columns={'lat': 'end_lat', 'long': 'end_long'}), on='customer')

orders_data.loc[orders_data['logistic_hub'].isna(), 'hub_lat'] = orders_data.loc[orders_data['logistic_hub'].isna(), 'start_lat']
orders_data.loc[orders_data['logistic_hub'].isna(), 'hub_long'] = orders_data.loc[orders_data['logistic_hub'].isna(), 'start_long']

In [300]:
# Calculate distances
orders_data['distance_on_water'] = orders_data.apply(lambda x: haversine((x['start_lat'], x['start_long']), (x['hub_lat'], x['hub_long'])), axis=1)
orders_data['distance_on_land'] = orders_data.apply(lambda x: haversine((x['hub_lat'], x['hub_long']), (x['end_lat'], x['end_long'])), axis=1)
orders_data.loc[orders_data['logistic_hub'].isna(), 'distance_on_land'] = orders_data.loc[orders_data['logistic_hub'].isna()].apply(lambda x: haversine((x['start_lat'], x['start_long']), (x['end_lat'], x['end_long'])), axis=1)

# Log version of distance on land
orders_data['distance_on_land_log'] = orders_data['distance_on_land'].apply(lambda x: np.log(x))

In [301]:
# Log version of units of order
orders_data['units_log'] = orders_data['units'].apply(lambda x: np.log(x))

orders_data = orders_data.join(product_data, on='product_id')

# Log version of product weight
orders_data['weight_log'] = orders_data['weight'].apply(lambda x: np.log(x))

# Convert material handling class as int to string class
orders_data['material_handling'] = orders_data['material_handling'].apply(lambda x: "c" + str(x))

In [302]:
# Handle no hub orders
# if hub == nan then replace sea_distance with 0
orders_data['distance_on_water'] = orders_data['distance_on_water'].fillna(0)

In [303]:
orders_data.loc[orders_data['distance_on_land'] == 0, 'distance_on_land_log'] = 0

In [304]:
# Drop nas, which in this case should only be products without information
orders_data = orders_data.dropna()

In [305]:
orders_target = orders_data['late_order']
orders_features = orders_data.drop(columns=['origin_port', 'logistic_hub', 'customer', 'late_order', 'product_id'])

orders_features = pd.get_dummies(orders_features)

In [306]:
orders_features.columns

Index(['units', 'start_lat', 'start_long', 'hub_lat', 'hub_long', 'end_lat',
       'end_long', 'distance_on_water', 'distance_on_land',
       'distance_on_land_log', 'units_log', 'weight', 'weight_log',
       '3pl_v_001', '3pl_v_002', '3pl_v_003', '3pl_v_004',
       'customs_procedures_CRF', 'customs_procedures_DTD',
       'customs_procedures_DTP', 'material_handling_c0.0',
       'material_handling_c1.0', 'material_handling_c2.0',
       'material_handling_c3.0', 'material_handling_c4.0',
       'material_handling_c5.0'],
      dtype='object')

#### Basic test model

In [316]:
orders_features_not_delayed = orders_features.loc[orders_data['late_order'] == False].sample(frac=0.4)
orders_features_delayed = orders_features.loc[orders_data['late_order'] == True]

#Combine
order_features_balanced = pd.concat([orders_features_not_delayed, orders_features_delayed]).sample(frac=1)
order_target_balanced = order_features_balanced.join(orders_target)['late_order']

In [317]:
orders_target_int = orders_target * 1

In [323]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(orders_features, orders_target_int, test_size=0.2, random_state=0)

### Ensemble performance

In [337]:
model_ada = ensemble.AdaBoostClassifier(n_estimators=100, learning_rate=1.5)
model_rf = ensemble.RandomForestClassifier(max_depth=20)
model_xgb = xgboost.XGBClassifier(
    scale_pos_weight=0.4,
    n_estimators=300,
    max_depth=5,
    learning_rate=0.15,
    subsample=1.0,
    colsample_bytree=0.8,
    eval_metric='logloss',
    use_label_encoder=False
)
model_nnb = neighbors.KNeighborsClassifier()
model_nn = neural_network.MLPClassifier(hidden_layer_sizes=(10,30,10), learning_rate='adaptive', max_iter=1000)

voting_classifier = ensemble.VotingClassifier(estimators=[
    ('ada', model_ada),
    ('rf', model_rf),
    ('xgb', model_xgb),
    ('nnb', model_nnb),
    ('nn', model_nn)], voting='hard')

pipe = pipeline.make_pipeline(preprocessing.StandardScaler(), voting_classifier)
pipe.fit(X_train, y_train)
pipe.score(X_test, y_test)

0.8194235699761294

In [338]:
tn, fp, fn, tp = metrics.confusion_matrix(y_test, pipe.predict(X_test)).ravel()
tn, tp, fn, fp

(16718, 1819, 3560, 525)