In [1]:
import numpy as np
import pandas as pd
import warnings
import os
from tqdm import tqdm
from sklearn import preprocessing, metrics
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns
from joblib import Parallel, delayed

%matplotlib inline

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

warnings.filterwarnings('ignore')

In [2]:
# Dataset root directories. The loading cells in this notebook read the
# 'courier', 'order', 'distance' and 'action' sub-folders of each root.
train_path = './dataset/eleme_round1_train_20200313'
test_path = './dataset/eleme_round1_testB_20200413'

In [3]:
# courier data: read every per-day file from the train and test folders, tag
# each frame with the date parsed from its file name, then cache the result.
courier_list = []
for root in (train_path, test_path):
    courier_dir = os.path.join(root, 'courier')
    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order of the concatenated frame reproducible across runs and machines.
    for f in sorted(os.listdir(courier_dir)):
        # File names are expected to look like '<prefix>_<date>.<ext>'.
        date = f.split('.')[0].split('_')[1]
        df = pd.read_csv(os.path.join(courier_dir, f))
        df['date'] = date
        courier_list.append(df)

df_courier = pd.concat(courier_list, sort=False)
# to_pickle does not create missing parent directories.
os.makedirs('./temp', exist_ok=True)
df_courier.to_pickle('./temp/courier.plk')

In [4]:
# order data: read every per-day file from the train and test folders, tag
# each frame with the date parsed from its file name, then cache the result.
order_list = []
for root in (train_path, test_path):
    order_dir = os.path.join(root, 'order')
    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order of the concatenated frame reproducible across runs and machines.
    for f in sorted(os.listdir(order_dir)):
        # File names are expected to look like '<prefix>_<date>.<ext>'.
        date = f.split('.')[0].split('_')[1]
        df = pd.read_csv(os.path.join(order_dir, f))
        df['date'] = date
        order_list.append(df)

df_order = pd.concat(order_list, sort=False)
# to_pickle does not create missing parent directories.
os.makedirs('./temp', exist_ok=True)
df_order.to_pickle('./temp/order.plk')

In [5]:
# distance data: read every per-day file from the train and test folders, tag
# each frame with the date parsed from its file name, add the wave-level
# 'group' key, then cache the result.
distance_list = []
for root in (train_path, test_path):
    distance_dir = os.path.join(root, 'distance')
    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order of the concatenated frame reproducible across runs and machines.
    for f in sorted(os.listdir(distance_dir)):
        # File names are expected to look like '<prefix>_<date>.<ext>'.
        date = f.split('.')[0].split('_')[1]
        df = pd.read_csv(os.path.join(distance_dir, f))
        df['date'] = date
        distance_list.append(df)

df_distance = pd.concat(distance_list, sort=False)
# 'group' = date + courier_id + wave_index, concatenated as strings.
# NOTE(review): there is no separator between the parts, so the key is only
# unambiguous if courier_id has a fixed width — confirm. The format is kept
# unchanged because the same key is built for the action tables.
df_distance['group'] = df_distance['date'].astype(
    'str') + df_distance['courier_id'].astype('str') + df_distance['wave_index'].astype('str')
# to_pickle does not create missing parent directories.
os.makedirs('./temp', exist_ok=True)
df_distance.to_pickle('./temp/distance.plk')

In [6]:
# Fraction of each wave's trailing actions that is held out as prediction
# candidates for training files.
# NOTE(review): the original comment said the last 55% are to be predicted,
# but the value in use is 0.5 — confirm which one is intended.
ratio = 0.5


def read_feat(df, label_ratio=None):
    """Split one action table into candidate ("label") rows and history rows.

    Rows are grouped by (courier_id, wave_index) and every group is split:

    * ``type == 'train'``: the last ``int(len(group) * label_ratio)`` rows are
      the candidates and the remaining rows are history. Groups that yield
      fewer than 3 candidates are dropped entirely (no label and no history
      rows). Candidates are re-indexed from 0; the first candidate gets
      ``target = 1`` and the others ``target = 0``.
      NOTE(review): this presumably relies on rows being in chronological
      order within a wave — confirm against the raw files.
    * any other ``type``: rows with ``expect_time == 0`` are the candidates
      (``target = None``, original index kept) and the rest is history.

    Args:
        df: action DataFrame with at least the columns ``courier_id``,
            ``wave_index`` and ``type`` (plus ``expect_time`` for non-train
            data). The split mode is taken from the first row's ``type``.
        label_ratio: fraction of trailing rows used as candidates for train
            data. ``None`` (default) uses the module-level ``ratio``.

    Returns:
        Tuple ``(label_df, history_df)``. Both are empty frames (``label_df``
        still carries a ``target`` column) when ``df`` is empty or no group
        qualifies, instead of failing inside ``pd.concat``.
    """
    if label_ratio is None:
        label_ratio = ratio

    def _empty_result():
        # Zero-row frames with the input schema, so callers can concat them.
        empty = df.iloc[:0]
        return empty.assign(target=None), empty.copy()

    if df.shape[0] == 0:
        return _empty_result()

    label_list = []
    history_list = []
    # Named data_type to avoid shadowing the builtin `type`.
    data_type = df['type'].values[0]

    # Split every (courier, wave) group into candidates and history.
    groups = df.groupby(['courier_id', 'wave_index'])
    for name, group in tqdm(groups):
        if data_type == 'train':
            label_data = group.tail(int(group.shape[0] * label_ratio))
            if label_data.shape[0] < 3:
                # Too few candidates to form a useful ranking sample.
                continue

            history_data = group.drop(label_data.index)

            # The first candidate is the positive sample, the rest negatives.
            # reset_index returns a new frame, so the writes below never
            # touch a view of `group`.
            label_data = label_data.reset_index(drop=True)
            label_data['target'] = 0
            label_data.loc[0, 'target'] = 1
        else:
            # Copy before assigning so the write does not target a slice.
            label_data = group[group['expect_time'] == 0].copy()
            history_data = group.drop(label_data.index)
            label_data['target'] = None

        label_list.append(label_data)
        history_list.append(history_data)

    if not label_list:
        # Every group was skipped; pd.concat([]) would raise ValueError.
        return _empty_result()

    return pd.concat(label_list, sort=False), pd.concat(history_list, sort=False)

In [7]:
# action data: load every per-day file, tag it with its date and with the
# split it came from; read_feat branches on the 'type' column.
df_actions = []
for root, split in ((train_path, 'train'), (test_path, 'test')):
    action_dir = os.path.join(root, 'action')
    # os.listdir returns entries in arbitrary order. Sorting keeps the
    # concatenated row order reproducible, which matters because a
    # positional 'id' column is assigned to df_feature afterwards.
    for f in sorted(os.listdir(action_dir)):
        # File names are expected to look like '<prefix>_<date>.<ext>'.
        date = f.split('.')[0].split('_')[1]
        df = pd.read_csv(os.path.join(action_dir, f))
        df['date'] = date
        df['type'] = split
        df_actions.append(df)

# Split each daily file into (candidates, history) in parallel; joblib
# returns the results in the same order as the inputs.
res = Parallel(n_jobs=12)(delayed(read_feat)(day_df) for day_df in tqdm(df_actions))

# Bind each name once, directly to the final DataFrame, rather than first to
# an intermediate list and then to a frame.
df_feature = pd.concat([item[0] for item in res], sort=False)
df_history = pd.concat([item[1] for item in res], sort=False)

100%|██████████████████████████████████████████████████████████████████████████████████| 35/35 [00:24<00:00,  1.44it/s]


In [9]:
# Build the wave-level 'group' key (date + courier_id + wave_index, joined as
# strings without a separator) on both the candidate and the history table.
key_columns = ['date', 'courier_id', 'wave_index']
for frame in (df_feature, df_history):
    parts = [frame[col].astype('str') for col in key_columns]
    frame['group'] = parts[0] + parts[1] + parts[2]

# Cast the target column to float and give every candidate row a positional
# id in current row order.
df_feature['target'] = df_feature['target'].astype('float')
df_feature['id'] = range(df_feature.shape[0])

In [10]:
# Preview the first rows of the candidate table as a sanity check.
df_feature.head()

Unnamed: 0,courier_id,wave_index,tracking_id,courier_wave_start_lng,courier_wave_start_lat,action_type,expect_time,date,type,target,group,id
0,10007871,0,2100074550065333539,121.630997,39.142343,DELIVERY,1580528963,20200201,train,1.0,20200201100078710,0
1,10007871,0,2100074550779577850,121.630997,39.142343,PICKUP,1580529129,20200201,train,0.0,20200201100078710,1
2,10007871,0,2100074550779577850,121.630997,39.142343,DELIVERY,1580529444,20200201,train,0.0,20200201100078710,2
0,10007871,1,2100074555638285402,121.631208,39.142519,PICKUP,1580532225,20200201,train,1.0,20200201100078711,3
1,10007871,1,2100074554118800474,121.631208,39.142519,PICKUP,1580532227,20200201,train,0.0,20200201100078711,4


In [11]:
# (rows, columns) of the candidate table.
df_feature.shape

(221396, 12)

In [12]:
# Persist the history rows and the candidate rows as pickles under ./temp.
# to_pickle does not create missing parent directories, so make sure the
# folder exists first.
os.makedirs('./temp', exist_ok=True)
df_history.to_pickle('./temp/action_history.plk')
df_feature.to_pickle('./temp/base_feature.plk')