In [2]:
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

# Plot styling: seaborn "paper" context with slightly enlarged fonts.
sns.set_theme(
    context='paper',
    font_scale=1.15,
)

# Directory holding the datathon CSV files. Keep the trailing slash: file
# names are appended to this string by plain concatenation.
data_dir = '../data/datathon_SC_ACN_22/'

### Data overview

In [3]:
# Load the order records; the file is semicolon-separated and `order_id`
# becomes the index.
orders_data = pd.read_csv(data_dir + 'orders.csv', delimiter=';', index_col='order_id')

# Preview a handful of rows. A fixed `random_state` keeps the displayed sample
# identical across "Restart Kernel -> Run All" executions.
orders_data.sample(n=5, random_state=0)

Unnamed: 0_level_0,origin_port,3pl,customs_procedures,logistic_hub,customer,product_id,units,late_order
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
a3ac8de6ae23,Rotterdam,v_003,CRF,Warsaw,Malmö,1689788,492,True
0372b2528798,Athens,v_002,CRF,Venlo,Marseille,1681890,479,False
75efcf2c730e,Rotterdam,v_001,DTD,Hamburg,Cologne,1666317,647,True
e30a2d0101a6,Rotterdam,v_004,CRF,Venlo,Valencia,1700869,425,False
037246d6acd3,Rotterdam,v_003,CRF,Warsaw,Vienna,1692949,394,False


#### Adding shipment distance

In [4]:
# City-pair lookup table (semicolon-separated), indexed by the
# (city_from_name, city_to_name) pair.
cities_data = pd.read_csv(
    data_dir + 'cities_data.csv',
    sep=';',
    index_col=['city_from_name', 'city_to_name'],
)

In [5]:
# Look up the distance of each leg of the route by joining on the
# (from, to) city pair, then add both legs into the total distance.
# `join` is a left join, so an order whose pair is absent from `cities_data`
# gets NaN for that leg (and therefore NaN for the total).
orders_data = (
    orders_data
    .join(cities_data['distance'].rename('port_to_hub'), on=['origin_port', 'logistic_hub'])
    .join(cities_data['distance'].rename('hub_to_customer'), on=['logistic_hub', 'customer'])
    .assign(total_shipping_distance=lambda df: df['port_to_hub'] + df['hub_to_customer'])
)

#### Adding material handling and weight

In [6]:
# Per-product attributes (comma-separated file), indexed by `product_id`.
product_data = pd.read_csv(
    data_dir + 'product_attributes.csv',
    sep=',',
    index_col='product_id',
)

# Attach the attribute columns to every order via its `product_id` column.
orders_data = orders_data.join(product_data, on='product_id')

#### Adding countries

In [7]:
# City name -> country lookup taken from the external world-cities file.
# NOTE(review): the joins below match on city name only; if a name occurs more
# than once in the lookup, the matching orders would be duplicated (one row per
# candidate country) -- confirm that `name` is unique in world-cities.csv.
cities = pd.read_csv('../data/external/world-cities.csv', index_col='name')['country']

# Add one country column per stop of the route: origin port, hub, customer.
orders_data = (
    orders_data
    .join(cities.rename('origin_country'), on='origin_port')
    .join(cities.rename('hub_country'), on='logistic_hub')
    .join(cities.rename('customer_country'), on='customer')
)

In [10]:
# Number of distinct countries an order touches (origin port, hub, customer).
# `nunique` is pandas' built-in row-wise distinct count and replaces the
# hand-written `len(set(row))` lambda. `dropna=False` keeps a missing country
# counted as one distinct value, deterministically (a Python set compares NaN
# by object identity, so its count for missing values is not guaranteed).
orders_data['no_diff_countries'] = (
    orders_data[['origin_country', 'hub_country', 'customer_country']]
    .nunique(axis='columns', dropna=False)
)

### Testing for basic correlation

In [11]:
# One-hot encode the non-numeric columns, then take every feature's
# correlation (pandas default: Pearson) with the `late_order` target.
correlations = (
    pd.get_dummies(orders_data)
    .corr()
    .loc[:, 'late_order']
)

# Strongest positive correlations first, strongest negative last.
correlations.sort_values(ascending=False)

late_order                    1.000000
origin_port_Athens            0.182718
units                         0.162393
total_shipping_distance       0.108047
logistic_hub_Lille            0.096930
                                ...   
hub_country_Netherlands      -0.057122
customs_procedures_CRF       -0.083705
3pl_v_002                    -0.089864
origin_country_Netherlands   -0.103892
origin_port_Rotterdam        -0.182620
Name: late_order, Length: 91, dtype: float64