In [11]:
import pandas as pd
import numpy as np
import time
import datetime
import pickle
import random
import os
import logging
LOG_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
# logging.basicConfig(level=logging.INFO, filename='log_Model', format=LOG_FORMAT)
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)

from IPython.display import display, HTML
def show_df(df):
    display(HTML(df.to_html()))

# 加载数据

In [12]:
logging.info('start read data')

pickle_path = './0_data_sort_out/train_dataset_mp.pickle'
with open(pickle_path, 'rb') as f:
    train_dataset_mp = pickle.load(f)
key_list_train, mp_action_train, mp_distance_train, mp_order_train, mp_distance_train_detail = \
train_dataset_mp['key_list'], train_dataset_mp['mp_action'], train_dataset_mp['mp_distance'], train_dataset_mp['mp_order'], train_dataset_mp['mp_distance_detail']

pickle_path = './0_data_sort_out/testA_dataset_mp.pickle'
with open(pickle_path, 'rb') as f:
    testA_dataset_mp = pickle.load(f)
key_list_testA, mp_action_testA, mp_distance_testA, mp_order_testA, mp_distance_testA_detail = \
testA_dataset_mp['key_list'], testA_dataset_mp['mp_action'], testA_dataset_mp['mp_distance'], testA_dataset_mp['mp_order'], testA_dataset_mp['mp_distance_detail']

pickle_path = './0_data_sort_out/train_dataset_df.pickle'
with open(pickle_path, 'rb') as f:
    train_dataset_df = pickle.load(f)
df_actions_train, df_couriers_train, df_distances_train, df_orders_train = \
train_dataset_df['df_actions'], train_dataset_df['df_couriers'], train_dataset_df['df_distances'], train_dataset_df['df_orders']
    
pickle_path = './0_data_sort_out/testA_dataset_df.pickle'
with open(pickle_path, 'rb') as f:
    testA_dataset_df = pickle.load(f)
df_actions_testA, df_couriers_testA, df_distances_testA, df_orders_testA = \
testA_dataset_df['df_actions'], testA_dataset_df['df_couriers'], testA_dataset_df['df_distances'], testA_dataset_df['df_orders']

logging.info('finish read data')

2021-01-19 16:27:00,936 - INFO - start read data
2021-01-19 16:27:23,986 - INFO - finish read data


In [13]:
pickle_path = './1_build_dataset/df_train_info.pickle'
with open(pickle_path, 'rb') as f:
    df_train_info = pickle.load(f)

pickle_path = './1_build_dataset/df_testA_info.pickle'
with open(pickle_path, 'rb') as f:
    df_testA_info = pickle.load(f)

In [14]:
delay_feature_list = []
logging.info('start delay feature build')

for i, (date, courier, wave_idx) in enumerate(key_list_train):
    if (i + 1) % 10000 == 0:
        logging.info('build %d train sample' % i)
    
    df_a_action = mp_action_train[date][courier][wave_idx]
    df_a_order = mp_order_train[date][courier][wave_idx]
    know_len = df_train_info.iloc[i].know_lens
    cnt_pickup_delay, cnt_deliver_delay = 0, 0
    delay_feature_dict = {}
    
    for j in range(know_len):
        action_row = df_a_action.iloc[j]
        tracking_id = action_row.tracking_id
        cur_time = action_row.expect_time
        df_order_info = df_a_order.query('tracking_id == @tracking_id')
        
        estimate_pick_time = df_order_info.estimate_pick_time
        promise_deliver_time = df_order_info.promise_deliver_time
        delay_feature_dict['courier_id'] = courier

        if action_row.action_type == 'PICKUP':
            delay_feature_dict['delay_type'] = 'PICKUP'
            delay_feature_dict['delay'] = (cur_time > estimate_pick_time).iloc[0]
            delay_feature_dict['delay_time'] = (cur_time - estimate_pick_time).iloc[0]
        elif action_row.action_type == 'DELIVERY':
            delay_feature_dict['delay_type'] = 'DELIVERY'
            delay_feature_dict['delay'] = (cur_time > promise_deliver_time).iloc[0]
            delay_feature_dict['delay_time'] = (cur_time - promise_deliver_time).iloc[0]
        delay_feature_list.append(delay_feature_dict)

for i, (date, courier, wave_idx) in enumerate(key_list_testA):
    if (i + 1) % 1000 == 0:
        logging.info('build %d testA sample' % i)
        
    df_a_action = mp_action_testA[date][courier][wave_idx]
    df_a_order = mp_order_testA[date][courier][wave_idx]
    know_len = df_testA_info.iloc[i].know_lens
    cnt_pickup_delay, cnt_deliver_delay = 0, 0
    delay_feature_dict = {}
    
    for j in range(know_len):
        action_row = df_a_action.iloc[j]
        tracking_id = action_row.tracking_id
        cur_time = action_row.expect_time
        df_order_info = df_a_order.query('tracking_id == @tracking_id')
        
        estimate_pick_time = df_order_info.estimate_pick_time
        promise_deliver_time = df_order_info.promise_deliver_time
        delay_feature_dict['courier_id'] = courier
        
        if action_row.action_type == 'PICKUP':
            delay_feature_dict['delay_type'] = 'PICKUP'
            delay_feature_dict['delay'] = (cur_time > estimate_pick_time).iloc[0]
            delay_feature_dict['delay_time'] = (cur_time - estimate_pick_time).iloc[0]
        elif action_row.action_type == 'DELIVERY':
            delay_feature_dict['delay_type'] = 'DELIVERY'
            delay_feature_dict['delay'] = (cur_time > promise_deliver_time).iloc[0]
            delay_feature_dict['delay_time'] = (cur_time - promise_deliver_time).iloc[0]
        
        delay_feature_list.append(delay_feature_dict)

logging.info('finish delay feature build')


def apply_delay(df):
    df_delivery = df.query('delay_type == "DELIVERY"')
    df_pickup = df.query('delay_type == "PICKUP"')
    feature_dict = {'delivery_num' : df_delivery.shape[0], 'pickup_num' : df_pickup.shape[0]}
    feature_dict['delivery_delay_count'] = sum(df_delivery['delay'])
    feature_dict['pickup_delay_count'] = sum(df_pickup['delay'])
    
    if feature_dict['pickup_num'] == 0:
        feature_dict['pickup_delay_time_avg'] = 0
        feature_dict['pickup_delay_rate'] = 0
    else:
        feature_dict['pickup_delay_time_avg'] = df_pickup['delay_time'].apply(lambda x : x if x > 0 else 0).mean()
        feature_dict['pickup_delay_rate'] = feature_dict['pickup_delay_count'] / feature_dict['pickup_num']
    
    if feature_dict['delivery_num'] == 0:
        feature_dict['delivery_delay_time_avg'] = 0
        feature_dict['delivery_delay_rate'] = 0
    else:
        feature_dict['delivery_delay_time_avg'] = df_delivery['delay_time'].apply(lambda x : x if x > 0 else 0).mean()
        feature_dict['delivery_delay_rate'] = feature_dict['delivery_delay_count'] / feature_dict['delivery_num']


    return pd.Series(feature_dict)
#增加骑手的延误特征

df_delay_info = pd.DataFrame(delay_feature_list)
df_courier_delay_info = df_delay_info.groupby('courier_id').apply(apply_delay)

courier_delay_features = ['pickup_delay_rate', 'delivery_delay_rate', 'pickup_delay_time_avg', 'delivery_delay_time_avg',
                         'delivery_delay_count', 'pickup_delay_count']

for delay_feature in courier_delay_features:
    df_couriers_train[delay_feature] = df_couriers_train['courier_id'].map(df_courier_delay_info[delay_feature])
    df_couriers_testA[delay_feature] = df_couriers_testA['courier_id'].map(df_courier_delay_info[delay_feature])  
df_couriers_train.head()

#映射为字典[courier_id][date]
def build_couriers_features(df_couriers, key_list):
    
    df_index_couriers = df_couriers.groupby('courier_id').apply(lambda x : x.groupby('date').apply(lambda x:x) )
    couriers_set = set(df_couriers['courier_id'])
    mp_couriers_features = {}
    for courier in couriers_set:
        df_a_couriers = df_index_couriers.loc[courier]
        mp_couriers_features[courier] = {}
        for idx in df_a_couriers.index:
            date = df_a_couriers.at[idx, 'date']
            mp_couriers_features[courier][date] = df_a_couriers.loc[idx]
    return mp_couriers_features

logging.info('start building couriers features')
mp_couriers_features_train = build_couriers_features(df_couriers_train, key_list_train)
mp_couriers_features_testA = build_couriers_features(df_couriers_testA, key_list_testA)


logging.info('finish building couriers features')

2021-01-19 16:27:24,040 - INFO - start delay feature build
2021-01-19 16:28:05,592 - INFO - finish delay feature build
2021-01-19 16:28:09,307 - INFO - start building couriers features
2021-01-19 16:28:16,256 - INFO - finish building couriers features


In [15]:
mp_couriers_features_train

{100681730: {Timestamp('2020-02-01 00:00:00'): courier_id                           100681730
  level                                        2
  speed                                  4.48985
  max_load                                     8
  date                       2020-02-01 00:00:00
  pickup_delay_rate                            1
  delivery_delay_rate                          0
  pickup_delay_time_avg                  452.571
  delivery_delay_time_avg                      0
  delivery_delay_count                         0
  pickup_delay_count                          21
  Name: 631, dtype: object},
 122220553: {Timestamp('2020-02-01 00:00:00'): courier_id                           122220553
  level                                        3
  speed                                  4.64289
  max_load                                    10
  date                       2020-02-01 00:00:00
  pickup_delay_rate                     0.346154
  delivery_delay_rate                          0

# 保存数据

In [16]:

pickle_path = './2_generate_train_test_courier_feature/mp_couriers_features_train.pickle'
with open(pickle_path, 'wb') as f:
    pickle.dump(mp_couriers_features_train, f)

pickle_path = './2_generate_train_test_courier_feature/mp_couriers_features_testA.pickle'
with open(pickle_path, 'wb') as f:
    pickle.dump(mp_couriers_features_testA, f)
    