In [22]:
import datetime
import os

import pandas as pd
from IPython.display import display, HTML
def show_df(df):
    """Render ``df`` in the notebook output as an HTML table."""
    table_markup = df.to_html()
    display(HTML(table_markup))

# 读取数据

In [23]:
def read_a_day(dataset_path, read_date):
    """Load the four per-day CSV tables of a dataset for one date.

    Each table is read from ``<dataset_path>/<table>/<table>_<YYYYMMDD>.txt``
    for the tables ``action``, ``courier``, ``distance`` and ``order``.

    Args:
        dataset_path: Root directory of the dataset. A trailing path
            separator is optional.
        read_date: Object providing ``strftime`` (e.g. ``datetime.datetime``)
            that selects the day to load.

    Returns:
        Tuple ``(df_action, df_courier, df_distance, df_order)`` of DataFrames
        parsed by ``pd.read_csv`` with its default options.
    """
    day_stamp = read_date.strftime('%Y%m%d')

    def _read_table(table):
        # os.path.join only inserts a separator when dataset_path lacks one,
        # so './data' and './data/' resolve to the same files.
        file_name = '%s_%s.txt' % (table, day_stamp)
        return pd.read_csv(os.path.join(dataset_path, table, file_name))

    return tuple(_read_table(table) for table in ('action', 'courier', 'distance', 'order'))


def read_days(start_date, end_date, data_dir):
    """Load and stack the daily tables for every day in ``[start_date, end_date)``.

    Every daily frame gets a ``date`` column holding the day it was read for,
    then the frames of each table are concatenated row-wise with a fresh index.

    Args:
        start_date: First day to load (inclusive).
        end_date: Day at which loading stops (exclusive).
        data_dir: Dataset root handed to ``read_a_day``.

    Returns:
        Tuple ``(df_actions, df_couriers, df_distances, df_orders)``.

    Raises:
        ValueError: From ``pd.concat`` when the date range contains no day.
    """
    one_day = datetime.timedelta(days=1)
    # One bucket per table, in the order read_a_day returns them:
    # action, courier, distance, order.
    buckets = ([], [], [], [])

    day = start_date
    while day < end_date:
        daily_frames = read_a_day(data_dir, day)
        for frame in daily_frames:
            frame['date'] = day
        for bucket, frame in zip(buckets, daily_frames):
            bucket.append(frame)
        day += one_day

    return tuple(pd.concat(bucket, axis=0, ignore_index=True) for bucket in buckets)

In [24]:
# Dataset roots; read_a_day looks for action/, courier/, distance/ and order/
# sub-folders below each of them.
train_dataset_path = './eleme_round1_train/'
test_dataset_path = './eleme_round1_testA/'

# read_days stops before the end date, so each call below loads a single day.
(df_actions_train,
 df_couriers_train,
 df_distances_train,
 df_orders_train) = read_days(
    datetime.datetime(2020, 2, 1),
    datetime.datetime(2020, 2, 2),
    train_dataset_path,
)

(df_actions_testA,
 df_couriers_testA,
 df_distances_testA,
 df_orders_testA) = read_days(
    datetime.datetime(2020, 3, 1),
    datetime.datetime(2020, 3, 2),
    test_dataset_path,
)

# 把 ASSIGN 动作并入 action 表，并删除波次起点经纬度列

In [25]:
def add_assign(df_a_order,df_a_action):
    """Append one ASSIGN row per order to the action table and drop the wave-start coordinates.

    Neither input frame is modified; the previous implementation deleted the
    coordinate columns from the caller's ``df_a_action`` in place.

    Args:
        df_a_order: Order table with the columns ``courier_id``, ``wave_index``,
            ``tracking_id``, ``assigned_time`` and ``date``.
        df_a_action: Action table containing ``courier_wave_start_lng`` and
            ``courier_wave_start_lat``.

    Returns:
        New DataFrame holding the action rows (without the two coordinate
        columns) followed by the ASSIGN rows, whose ``expect_time`` is the
        order's ``assigned_time``. Columns are sorted alphabetically and the
        original row labels are kept, so callers usually reset the index.

    Raises:
        KeyError: If a required column is missing from either input.
    """
    assign_df = (
        df_a_order[['courier_id', 'wave_index', 'tracking_id', 'assigned_time', 'date']]
        .rename(columns={'assigned_time': 'expect_time'})
        .assign(action_type='ASSIGN')
    )
    # drop() returns a new frame, leaving the caller's action table untouched.
    actions = df_a_action.drop(columns=['courier_wave_start_lng', 'courier_wave_start_lat'])
    return pd.concat([actions, assign_df], sort=True)
def sort_row(df):
    """Return ``df`` ordered by date, courier and wave, re-indexed from 0."""
    ordering = ['date', 'courier_id', 'wave_index']
    ordered = df.sort_values(ordering)
    return ordered.reset_index(drop=True)

# Training split: merge the ASSIGN rows into the actions, then order every
# table by (date, courier, wave); actions are additionally ordered by expect_time.
df_actions_train = (
    add_assign(df_orders_train, df_actions_train)
    .sort_values(['date', 'courier_id', 'wave_index', 'expect_time'])
    .reset_index(drop=True)
)
df_distances_train = sort_row(df_distances_train)
df_orders_train = sort_row(df_orders_train)

# Test split A.
# NOTE(review): unlike the training actions, these are not ordered by
# expect_time within a wave - confirm this difference is intended.
df_actions_testA = sort_row(add_assign(df_orders_testA, df_actions_testA))
df_distances_testA = sort_row(df_distances_testA)
df_orders_testA = sort_row(df_orders_testA)

show_df(df_couriers_testA.head())

Unnamed: 0,courier_id,level,speed,max_load,date
0,10513697,3,5.29532,9,2020-03-01
1,10656566,3,5.630058,10,2020-03-01
2,10711180,3,6.223964,8,2020-03-01
3,107672506,2,6.010422,6,2020-03-01
4,10788423,3,6.084984,9,2020-03-01


# 将数据映射成字典

In [94]:
def get_map(df):
    """
    Split ``df`` into consecutive (date, courier_id, wave_index) runs and index them.

    The resulting mapping is accessed as
        mp[date][courier_id][wave_index]
    and every leaf is the ``df.iloc`` slice holding the rows of that run.

    ``df`` is expected to be sorted so that rows sharing the same
    (date, courier_id, wave_index) key are adjacent. Rows are addressed by
    position, so the index labels of ``df`` do not matter.

    A new run starts whenever any of the three key columns changes. The earlier
    implementation did not split when only the date changed (same courier and
    wave index on both sides of a day boundary), which merged rows of two days.

    Args:
        df: DataFrame with the columns ``date``, ``courier_id`` and ``wave_index``.

    Returns:
        Tuple ``(mp_date, key_list)``: the nested dict described above and the
        list of ``[date, courier_id, wave_index]`` keys in row order.
        An empty ``df`` yields ``({}, [])``.
    """
    n_rows = df.shape[0]
    if n_rows == 0:
        return {}, []

    date_col, courier_col, wave_col = df['date'], df['courier_id'], df['wave_index']

    mp_date = {}
    key_list = []

    def _store(key, start, stop):
        # Register rows [start, stop) as the run identified by `key`.
        date, courier, wave = key
        mp_date.setdefault(date, {}).setdefault(courier, {})[wave] = df.iloc[start:stop]
        key_list.append([date, courier, wave])

    last_key = (date_col.iat[0], courier_col.iat[0], wave_col.iat[0])
    # Kept from the original: echoes the first key as a quick sanity check.
    print(*last_key)

    run_start = 0
    for i in range(1, n_rows):
        cur_key = (date_col.iat[i], courier_col.iat[i], wave_col.iat[i])
        if cur_key != last_key:
            _store(last_key, run_start, i)
            run_start, last_key = i, cur_key

    # Flush the trailing run, which no key change closes.
    _store(last_key, run_start, n_rows)

    return mp_date, key_list


def get_map_distance_detail(mp_distance, key_list):
    """Index each wave's distance rows by source and target tracking id.

    Args:
        mp_distance: Nested mapping ``mp[date][courier_id][wave_index]`` whose
            leaves are DataFrames with the columns ``tracking_id`` and
            ``target_tracking_id``.
        key_list: Iterable of ``(date, courier_id, wave_index)`` triples
            naming the waves to process.

    Returns:
        Nested dict accessed as
        ``detail[date][courier_id][wave_index][tracking_id][target_tracking_id]``
        whose leaves are the matching sub-frames.

    Raises:
        KeyError: If a key from ``key_list`` is absent from ``mp_distance``.
    """
    detail = {}
    for date, courier, wave_idx in key_list:
        wave_detail = (
            detail.setdefault(date, {})
                  .setdefault(courier, {})
                  .setdefault(wave_idx, {})
        )
        wave_rows = mp_distance[date][courier][wave_idx]
        for tracking_id, source_rows in wave_rows.groupby('tracking_id'):
            wave_detail[tracking_id] = {
                target_id: target_rows
                for target_id, target_rows in source_rows.groupby('target_tracking_id')
            }
    return detail

In [95]:
# Training split: nested date -> courier_id -> wave_index lookups plus key lists.
(mp_action_train, action_key_list_train) = get_map(df=df_actions_train)
(mp_distance_train, distance_key_list_train) = get_map(df=df_distances_train)
(mp_order_train, order_key_list_train) = get_map(df=df_orders_train)

# Test split A: same structures.
(mp_action_testA, action_key_list_testA) = get_map(df=df_actions_testA)
(mp_distance_testA, distance_key_list_testA) = get_map(df=df_distances_testA)
(mp_order_testA, order_key_list_testA) = get_map(df=df_orders_testA)

2020-02-01 00:00:00 10007871 0
2020-02-01 00:00:00 10007871 0
2020-02-01 00:00:00 10007871 0
2020-03-01 00:00:00 10007871 0
2020-03-01 00:00:00 10007871 0
2020-03-01 00:00:00 10007871 0


In [96]:
# Break every wave's distance rows down by tracking_id and target_tracking_id.
# The waves to process are taken from the action key lists, so each of those
# keys must also exist in the corresponding distance map.
mp_distance_detail_train = get_map_distance_detail(
    mp_distance=mp_distance_train,
    key_list=action_key_list_train,
)
mp_distance_detail_testA = get_map_distance_detail(
    mp_distance=mp_distance_testA,
    key_list=action_key_list_testA,
)

In [97]:
# Spot-check: show the distance-detail entry of courier 123679329, wave 0,
# on 2020-03-01 in test split A (the bare expression is the cell's output).
date = datetime.datetime(2020, 3, 1)
mp_distance_detail_testA[date][123679329][0]

{2100076032706380918: {2100076032706380918:         courier_id  wave_index          tracking_id source_type  source_lng  \
  150333   123679329           0  2100076032706380918      ASSIGN  121.568537   
  150334   123679329           0  2100076032706380918      ASSIGN  121.568537   
  150340   123679329           0  2100076032706380918      PICKUP  121.569744   
  150343   123679329           0  2100076032706380918      PICKUP  121.569744   
  150360   123679329           0  2100076032706380918    DELIVERY  121.565254   
  150361   123679329           0  2100076032706380918    DELIVERY  121.565254   
  
          source_lat   target_tracking_id target_type  target_lng  target_lat  \
  150333   39.149642  2100076032706380918    DELIVERY  121.565254   39.144061   
  150334   39.149642  2100076032706380918      PICKUP  121.569744   39.150219   
  150340   39.150219  2100076032706380918      ASSIGN  121.568537   39.149642   
  150343   39.150219  2100076032706380918    DELIVERY  121.56525

In [80]:
# NOTE(review): this cell repeats the preceding spot-check verbatim (courier
# 123679329, wave 0, 2020-03-01, test split A); candidate for removal.
date = datetime.datetime(2020, 3, 1)
mp_distance_detail_testA[date][123679329][0]

{2100076032706380918: {2100076032706380918:         courier_id  wave_index          tracking_id source_type  source_lng  \
  150333   123679329           0  2100076032706380918      ASSIGN  121.568537   
  150334   123679329           0  2100076032706380918      ASSIGN  121.568537   
  150340   123679329           0  2100076032706380918      PICKUP  121.569744   
  150343   123679329           0  2100076032706380918      PICKUP  121.569744   
  150360   123679329           0  2100076032706380918    DELIVERY  121.565254   
  150361   123679329           0  2100076032706380918    DELIVERY  121.565254   
  
          source_lat   target_tracking_id target_type  target_lng  target_lat  \
  150333   39.149642  2100076032706380918    DELIVERY  121.565254   39.144061   
  150334   39.149642  2100076032706380918      PICKUP  121.569744   39.150219   
  150340   39.150219  2100076032706380918      ASSIGN  121.568537   39.149642   
  150343   39.150219  2100076032706380918    DELIVERY  121.56525

# 保存变量

In [98]:
import pickle
# Directory that receives the pickled datasets. It is created up front so the
# dump cells below do not fail with FileNotFoundError on a fresh checkout.
PICKLE_PATH = './0_data_sort_out'
os.makedirs(PICKLE_PATH, exist_ok=True)

In [99]:

# Bundle the training-split lookup structures and persist them as one pickle.
train_dataset_mp = dict(
    key_list=action_key_list_train,
    mp_action=mp_action_train,
    mp_distance=mp_distance_train,
    mp_order=mp_order_train,
    mp_distance_detail=mp_distance_detail_train,
)
with open('%s/train_dataset_mp.pickle' % PICKLE_PATH, 'wb') as f:
    pickle.dump(train_dataset_mp, f)


# In[ ]:


# Bundle the test-split-A lookup structures and persist them as one pickle.
testA_dataset_mp = dict(
    key_list=action_key_list_testA,
    mp_action=mp_action_testA,
    mp_distance=mp_distance_testA,
    mp_order=mp_order_testA,
    mp_distance_detail=mp_distance_detail_testA,
)
with open('%s/testA_dataset_mp.pickle' % PICKLE_PATH, 'wb') as f:
    pickle.dump(testA_dataset_mp, f)


# In[ ]:


# Persist the sorted training-split tables themselves.
train_dataset_df = dict(
    df_actions=df_actions_train,
    df_couriers=df_couriers_train,
    df_distances=df_distances_train,
    df_orders=df_orders_train,
)
with open('%s/train_dataset_df.pickle' % PICKLE_PATH, 'wb') as f:
    pickle.dump(train_dataset_df, f)


# In[ ]:


# Persist the sorted test-split-A tables themselves.
testA_dataset_df = dict(
    df_actions=df_actions_testA,
    df_couriers=df_couriers_testA,
    df_distances=df_distances_testA,
    df_orders=df_orders_testA,
)
with open('%s/testA_dataset_df.pickle' % PICKLE_PATH, 'wb') as f:
    pickle.dump(testA_dataset_df, f)


# In[ ]:



