In [1]:
import numpy as np
import pandas as pd
import warnings
import os
from tqdm import tqdm
from sklearn import preprocessing, metrics
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns
from joblib import Parallel, delayed

%matplotlib inline

# Lift pandas' display truncation for both axes so wide/long frames render in full.
for _display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_display_option, None)

# Silence every warning for the rest of the session.
warnings.filterwarnings('ignore')

In [2]:
# Load the five pickled tables from ./temp, in the same order as before.
# NOTE(review): unpickling can execute code embedded in the file — only load
# pickles that were generated by a trusted step of this project.
(df_history_action,
 df_feature,
 df_courier,
 df_order,
 df_distance) = map(pd.read_pickle, (
    './temp/action_history.plk',
    './temp/base_feature.plk',
    './temp/courier.plk',
    './temp/order.plk',
    './temp/distance.plk',
))

In [3]:
# Seed constant; none of the cells visible in this section read it.
seed = 2020

# 历史订单信息

In [4]:
df_history_action.head()

Unnamed: 0,courier_id,wave_index,tracking_id,courier_wave_start_lng,courier_wave_start_lat,action_type,expect_time,date,type,group
120,10007871,0,2100074548854622111,121.630997,39.142343,PICKUP,1580527779,20200201,train,20200201100078710
121,10007871,0,2100074548854622111,121.630997,39.142343,DELIVERY,1580528077,20200201,train,20200201100078710
122,10007871,0,2100074550065333539,121.630997,39.142343,PICKUP,1580528622,20200201,train,20200201100078710
126,10007871,1,2100074553896437081,121.631208,39.142519,PICKUP,1580530391,20200201,train,20200201100078711
127,10007871,1,2100074553896437081,121.631208,39.142519,DELIVERY,1580531150,20200201,train,20200201100078711


In [5]:
# Attach the last recorded step of every wave (`group`) to the feature table.
# "Last" means last in row order inside the group.
# NOTE(review): this presumes df_history_action is already ordered
# chronologically within each group — confirm against the upstream step.

# history column -> name it takes in df_feature
_LAST_STEP_COLUMNS = {
    'expect_time': 'current_time',
    'tracking_id': 'last_tracking_id',
    'action_type': 'last_action_type',
}

# One vectorised pass replaces three separate groupby/apply passes that each
# ran a Python lambda per group. Rows with a missing `group` are removed first
# because groupby excludes NaN keys by default, so the result stays the same.
df_temp = (
    df_history_action
    .dropna(subset=['group'])
    .drop_duplicates(subset='group', keep='last')
    [['group', *_LAST_STEP_COLUMNS]]
    .rename(columns=_LAST_STEP_COLUMNS)
    .reset_index(drop=True)
)

# Explicit join key. Dropping any previous copy of the three columns first
# keeps the cell safe to re-run (no `_x` / `_y` suffixed duplicates).
df_feature = (
    df_feature
    .drop(columns=list(_LAST_STEP_COLUMNS.values()), errors='ignore')
    .merge(df_temp, on='group', how='left')
)

# distance 表相关特征

In [6]:
df_distance.head()

Unnamed: 0,courier_id,wave_index,tracking_id,source_type,source_lng,source_lat,target_tracking_id,target_type,target_lng,target_lat,grid_distance,date,group
0,100002543,0,2100074683194934900,ASSIGN,121.62614,39.134413,2100074669847643396,ASSIGN,121.62388,39.133604,211.0,20200204,202002041000025430
1,100002543,0,2100074683194934900,ASSIGN,121.62614,39.134413,2100074669847643396,PICKUP,121.62723,39.134409,118.0,20200204,202002041000025430
2,100002543,0,2100074683194934900,ASSIGN,121.62614,39.134413,2100074683510555680,PICKUP,121.627417,39.133745,421.0,20200204,202002041000025430
3,100002543,0,2100074683194934900,ASSIGN,121.62614,39.134413,2100074669847643396,DELIVERY,121.648664,39.138861,3015.0,20200204,202002041000025430
4,100002543,0,2100074683194934900,ASSIGN,121.62614,39.134413,2100074683510555680,ASSIGN,121.62523,39.133738,205.0,20200204,202002041000025430


In [7]:
# Join grid distances onto the feature table.
# The distance table's "source" side is relabelled to match the last completed
# step (last_tracking_id / last_action_type) and its "target" side to match the
# candidate action (tracking_id / action_type) already present in df_feature.
_DISTANCE_RENAME = {
    'tracking_id': 'last_tracking_id',
    'source_type': 'last_action_type',
    'target_tracking_id': 'tracking_id',
    'target_type': 'action_type',
}

# Rename only while the original labels are still present. Re-running the
# unguarded rename would map the (already renamed) target `tracking_id` to
# `last_tracking_id` again and leave duplicate column labels.
if 'target_tracking_id' in df_distance.columns:
    df_distance = df_distance.rename(columns=_DISTANCE_RENAME)

# Columns shared by both frames, i.e. the keys pandas previously inferred.
# NOTE(review): derived from the column headers displayed in this notebook —
# re-check if either table's schema changes.
_DISTANCE_KEYS = ['group', 'last_tracking_id', 'last_action_type',
                  'tracking_id', 'action_type']
# Identifier columns that must not take part in the join.
_DISTANCE_DROP = ['courier_id', 'wave_index', 'date']
# Everything else in the distance table is a feature value to bring over.
_distance_value_cols = [
    col for col in df_distance.columns
    if col not in _DISTANCE_KEYS and col not in _DISTANCE_DROP
]

# Removing earlier copies of the value columns keeps the cell re-runnable.
df_feature = (
    df_feature
    .drop(columns=_distance_value_cols, errors='ignore')
    .merge(df_distance.drop(columns=_DISTANCE_DROP),
           on=_DISTANCE_KEYS, how='left')
)

In [8]:
df_feature.head()

Unnamed: 0,courier_id,wave_index,tracking_id,courier_wave_start_lng,courier_wave_start_lat,action_type,expect_time,date,type,target,group,id,current_time,last_tracking_id,last_action_type,source_lng,source_lat,target_lng,target_lat,grid_distance
0,10007871,0,2100074550065333539,121.630997,39.142343,DELIVERY,1580528963,20200201,train,1.0,20200201100078710,0,1580528622,2100074550065333539,PICKUP,121.631219,39.141811,121.632084,39.146201,707.0
1,10007871,0,2100074550779577850,121.630997,39.142343,PICKUP,1580529129,20200201,train,0.0,20200201100078710,1,1580528622,2100074550065333539,PICKUP,121.631219,39.141811,121.631574,39.142231,152.0
2,10007871,0,2100074550779577850,121.630997,39.142343,DELIVERY,1580529444,20200201,train,0.0,20200201100078710,2,1580528622,2100074550065333539,PICKUP,121.631219,39.141811,121.635154,39.143561,671.0
3,10007871,1,2100074555638285402,121.631208,39.142519,PICKUP,1580532225,20200201,train,1.0,20200201100078711,3,1580532113,2100074554932692192,DELIVERY,121.636904,39.142721,121.636701,39.141801,160.0
4,10007871,1,2100074554118800474,121.631208,39.142519,PICKUP,1580532227,20200201,train,0.0,20200201100078711,4,1580532113,2100074554932692192,DELIVERY,121.636904,39.142721,121.636701,39.141801,160.0


# order 表相关特征

In [9]:
df_order.head()

Unnamed: 0,courier_id,wave_index,tracking_id,weather_grade,pick_lng,pick_lat,deliver_lng,deliver_lat,create_time,confirm_time,assigned_time,promise_deliver_time,estimate_pick_time,aoi_id,shop_id,date
0,100002543,0,2100075423059314102,正常天气,121.631386,39.134184,121.621114,39.133431,1582256837,1582257379,1582257409,1582259597,1582258457,3f3df1b8862af65746bb49609eeb57c7,5c9c9ca9271ff717255d2068c1bb79ed,20200221
1,100002543,0,2100075422789371468,正常天气,121.629854,39.134711,121.607654,39.128001,1582257790,1582257791,1582257830,1582260310,1582258691,427956eae5d8bf0cda42593f3ac4d8fc,18e2305e99dbc89aaec8dac3d89c5a85,20200221
2,100002652,0,2100075415175234624,正常天气,121.566356,39.149935,121.548444,39.145661,1582252549,1582252550,1582252569,1582255249,1582253990,4670fa58c5d3942faf437a008b4cc934,adfe4aafa9da1cc27b7762cc082be000,20200221
3,100002652,0,2100075416130749977,正常天气,121.54995,39.150131,121.545994,39.146361,1582254125,1582254126,1582254190,1582257365,1582255206,ad8222e53243dd2a7f9440278ca6a5f9,efe575b81c922d8cc0e1da84d1639ccc,20200221
4,100002652,0,2100075418091685090,正常天气,121.549914,39.150209,121.545834,39.150411,1582254391,1582254402,1582254430,1582256191,1582255122,f56e126110ecda18f474a71a2b2a8604,c1602f198a89acf920c147f98b17bad9,20200221


In [10]:
# Order-level attributes to bring into the feature table. The join key is not
# spelled out: pandas uses the column names shared by both frames, which for
# the headers shown in this notebook is `tracking_id`.
_order_columns = [
    'tracking_id',
    'weather_grade',
    'aoi_id',
    'shop_id',
    'promise_deliver_time',
    'estimate_pick_time',
]
df_feature = pd.merge(df_feature, df_order[_order_columns], how='left')

In [11]:
df_feature.head()

Unnamed: 0,courier_id,wave_index,tracking_id,courier_wave_start_lng,courier_wave_start_lat,action_type,expect_time,date,type,target,group,id,current_time,last_tracking_id,last_action_type,source_lng,source_lat,target_lng,target_lat,grid_distance,weather_grade,aoi_id,shop_id,promise_deliver_time,estimate_pick_time
0,10007871,0,2100074550065333539,121.630997,39.142343,DELIVERY,1580528963,20200201,train,1.0,20200201100078710,0,1580528622,2100074550065333539,PICKUP,121.631219,39.141811,121.632084,39.146201,707.0,正常天气,b71df7214347524a0f5f0c79dfdf2f4e,88ac051764fe348382e6529948de8015,1580530276,1580529019
1,10007871,0,2100074550779577850,121.630997,39.142343,PICKUP,1580529129,20200201,train,0.0,20200201100078710,1,1580528622,2100074550065333539,PICKUP,121.631219,39.141811,121.631574,39.142231,152.0,正常天气,753c1911e8a294c5db901f8555faff0c,92ec52685bd511da262ee6f7a0adaa87,1580530236,1580529399
2,10007871,0,2100074550779577850,121.630997,39.142343,DELIVERY,1580529444,20200201,train,0.0,20200201100078710,2,1580528622,2100074550065333539,PICKUP,121.631219,39.141811,121.635154,39.143561,671.0,正常天气,753c1911e8a294c5db901f8555faff0c,92ec52685bd511da262ee6f7a0adaa87,1580530236,1580529399
3,10007871,1,2100074555638285402,121.631208,39.142519,PICKUP,1580532225,20200201,train,1.0,20200201100078711,3,1580532113,2100074554932692192,DELIVERY,121.636904,39.142721,121.636701,39.141801,160.0,正常天气,898aef0932f6aaecda27aba8e9903991,1af41a72adbdb2abd7a2dab03e357bcf,1580533463,1580532384
4,10007871,1,2100074554118800474,121.631208,39.142519,PICKUP,1580532227,20200201,train,0.0,20200201100078711,4,1580532113,2100074554932692192,DELIVERY,121.636904,39.142721,121.636701,39.141801,160.0,正常天气,0d109cc0999fa3a108c67cb748b1931f,1af41a72adbdb2abd7a2dab03e357bcf,1580533598,1580532339


# courier 表相关特征

In [12]:
# Left-join the courier table; pandas infers the key from the column names the
# two frames share (df_courier's schema is not displayed in this notebook).
df_feature = pd.merge(df_feature, df_courier, how='left')

In [13]:
# Write the merged feature table to ./temp/part1_feature.plk.
df_feature.to_pickle('./temp/part1_feature.plk')

In [14]:
df_feature.shape

(223104, 28)