In [1]:
import numpy as np
import pandas as pd
import warnings
import os
from tqdm import tqdm
from sklearn import preprocessing, metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GroupKFold, KFold, StratifiedKFold
import gc
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns
from joblib import Parallel, delayed
from sklearn.utils import shuffle

%matplotlib inline

# Remove pandas' display limits so rendered frames show every column and row.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# NOTE(review): this silences every warning for the rest of the notebook,
# including pandas chained-assignment warnings and library deprecation
# notices — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [2]:
# Seed passed to the training-set shuffle and to the classifier's random_state.
seed = 2020

In [3]:
# Load the feature table from a pickled DataFrame.
# NOTE(review): unpickling can execute arbitrary code — presumably this file is
# written by an earlier step of the same pipeline; only load trusted files.
df_feature = pd.read_pickle('./temp/part1_feature.plk')

In [4]:
# sub_columns = ['courier_id', 'wave_index', 'tracking_id',
#                'courier_wave_start_lng', 'courier_wave_start_lat', 'action_type', 'expect_time']

In [5]:
# Integer-encode every object-dtype column except 'date' and 'type'.
# Values are cast to str first, so missing values become their own category.
object_cols = [name for name in df_feature.select_dtypes('object')
               if name not in ['date', 'type']]
for name in object_cols:
    print(name)
    encoder = LabelEncoder()
    df_feature[name] = encoder.fit_transform(df_feature[name].astype(str))

action_type
group
last_action_type
weather_grade
aoi_id
shop_id


In [6]:
# Preview the first rows of the feature table after label encoding.
df_feature.head()

Unnamed: 0,courier_id,wave_index,tracking_id,courier_wave_start_lng,courier_wave_start_lat,action_type,expect_time,date,type,target,group,id,current_time,last_tracking_id,last_action_type,source_lng,source_lat,target_lng,target_lat,grid_distance,weather_grade,aoi_id,shop_id,promise_deliver_time,estimate_pick_time,level,speed,max_load
0,10007871,0,2100074550065333539,121.630997,39.142343,0,1580528963,20200201,train,1.0,9,0,1580528622,2100074550065333539,1,121.631219,39.141811,121.632084,39.146201,707.0,2,19922,5653,1580530276,1580529019,3,4.751832,11
1,10007871,0,2100074550779577850,121.630997,39.142343,1,1580529129,20200201,train,0.0,9,1,1580528622,2100074550065333539,1,121.631219,39.141811,121.631574,39.142231,152.0,2,12666,6037,1580530236,1580529399,3,4.751832,11
2,10007871,0,2100074550779577850,121.630997,39.142343,0,1580529444,20200201,train,0.0,9,2,1580528622,2100074550065333539,1,121.631219,39.141811,121.635154,39.143561,671.0,2,12666,6037,1580530236,1580529399,3,4.751832,11
3,10007871,1,2100074555638285402,121.631208,39.142519,1,1580532225,20200201,train,1.0,10,3,1580532113,2100074554932692192,0,121.636904,39.142721,121.636701,39.141801,160.0,2,14953,1113,1580533463,1580532384,3,4.751832,11
4,10007871,1,2100074554118800474,121.631208,39.142519,1,1580532227,20200201,train,0.0,10,4,1580532113,2100074554932692192,0,121.636904,39.142721,121.636701,39.141801,160.0,2,1404,1113,1580533598,1580532339,3,4.751832,11


In [7]:
# Split on the 'type' column into independent copies; only the training
# rows are shuffled (reproducibly, via `seed`).
is_test_row = df_feature['type'] == 'test'
is_train_row = df_feature['type'] == 'train'
df_test = df_feature.loc[is_test_row].copy()
df_train = shuffle(df_feature.loc[is_train_row].copy(), random_state=seed)

In [8]:
ycol = 'target'

# Columns kept out of the model inputs: the label itself plus the columns
# listed below. Names that are absent from df_train are simply ignored.
non_feature_cols = [ycol, 'id', 'wave_index', 'tracking_id', 'expect_time', 'date', 'type', 'group',
                    'courier_wave_start_lng', 'courier_wave_start_lat', 'shop_id', 'current_time_date']
feature_names = [col for col in df_train.columns if col not in non_feature_cols]

# n_estimators is set very high; the fit call in the training loop relies on
# early stopping to choose the actual number of boosting rounds.
# NOTE(review): LightGBM documents that row subsampling only takes effect when
# subsample_freq (bagging_freq) is non-zero, and it is not set here — confirm
# that subsample=0.8 does what was intended.
lgb_params = dict(
    num_leaves=64,
    max_depth=10,
    learning_rate=0.1,
    n_estimators=10000000,
    subsample=0.8,
    feature_fraction=0.8,
    reg_alpha=0.5,
    reg_lambda=0.5,
    random_state=seed,
    metric=None,
)
model = lgb.LGBMClassifier(**lgb_params)


# Grouped K-fold training.
# For each fold: fit on the training split, score the validation split
# (kept as out-of-fold predictions), score the test set and add its share to
# the running average, and record the feature importances.

# Fold count, used by both the splitter and the test-prediction average so
# the two cannot drift apart.
n_splits = 5

oof = []
# .copy() gives an independent frame; assigning columns into a bare slice of
# df_test is chained assignment, which pandas only reports through a warning
# (and warnings are suppressed in this notebook).
prediction = df_test[['id', 'group']].copy()
prediction['target'] = 0.0  # float accumulator for the fold-averaged probability
df_importance_list = []

# Splitting on 'group' keeps all rows of a group inside a single fold.
kfold = GroupKFold(n_splits=n_splits)
for fold_id, (trn_idx, val_idx) in enumerate(kfold.split(df_train[feature_names], df_train[ycol], df_train['group'])):
    X_train = df_train.iloc[trn_idx][feature_names]
    Y_train = df_train.iloc[trn_idx][ycol]

    X_val = df_train.iloc[val_idx][feature_names]
    Y_val = df_train.iloc[val_idx][ycol]

    print('\nFold_{} Training ================================\n'.format(fold_id+1))

    # NOTE(review): the very same X_train / Y_train objects are passed both as
    # the fit data and as the first eval_set entry; LightGBM's sklearn wrapper
    # appears to rely on that identity to treat the entry as training data
    # (and so exclude it from early stopping) — keep it that way; confirm
    # against the installed LightGBM version.
    lgb_model = model.fit(X_train,
                          Y_train,
                          eval_names=['train', 'valid'],
                          eval_set=[(X_train, Y_train), (X_val, Y_val)],
                          verbose=500,
                          eval_metric='auc',
                          early_stopping_rounds=50)

    # Out-of-fold probabilities of the positive class at the best iteration.
    pred_val = lgb_model.predict_proba(
        X_val, num_iteration=lgb_model.best_iteration_)[:, 1]
    df_oof = df_train.iloc[val_idx][['id', 'group', ycol]].copy()
    df_oof['pred'] = pred_val
    oof.append(df_oof)

    # Each fold contributes an equal share to the averaged test prediction.
    pred_test = lgb_model.predict_proba(
        df_test[feature_names], num_iteration=lgb_model.best_iteration_)[:, 1]
    prediction['target'] += pred_test / n_splits

    df_importance = pd.DataFrame({
        'column': feature_names,
        'importance': lgb_model.feature_importances_,
    })
    df_importance_list.append(df_importance)

    # Release the per-fold objects before the next fold is materialised.
    del lgb_model, pred_val, pred_test, X_train, Y_train, X_val, Y_val
    gc.collect()



Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[303]	train's auc: 0.840105	valid's auc: 0.770636


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[310]	train's auc: 0.840489	valid's auc: 0.770277


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[287]	train's auc: 0.835554	valid's auc: 0.769252


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[346]	train's auc: 0.846797	valid's auc: 0.77001


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[244]	train's auc: 0.827138	valid's auc: 0.768532


In [9]:
# Average each feature's importance over the folds, most important first.
df_importance = (
    pd.concat(df_importance_list)
    .groupby('column')['importance']
    .mean()
    .sort_values(ascending=False)
    .reset_index()
)
df_importance

Unnamed: 0,column,importance
0,aoi_id,2507.6
1,grid_distance,2468.6
2,target_lat,1594.6
3,speed,1542.8
4,current_time,1442.2
5,target_lng,1412.6
6,source_lng,1337.8
7,courier_id,1279.8
8,source_lat,1206.8
9,promise_deliver_time,1047.2


In [10]:
def wave_label_func(group):
    """Tell whether the top-scored row of a group is a true positive.

    Args:
        group: DataFrame with a 'target' column and a 'pred' column.

    Returns:
        1 if the row with the largest 'pred' (the first one on ties) has
        'target' equal to 1, otherwise 0.
    """
    targets = group['target'].values.tolist()
    scores = group['pred'].values.tolist()
    # Position of the first maximal score.
    top_pos = max(range(len(scores)), key=scores.__getitem__)
    return int(targets[top_pos] == 1)

In [11]:
    # Stack the per-fold validation predictions into one out-of-fold frame.
    df_oof = pd.concat(oof)
    # One row per group: 1 when the group's top-scored row is a positive.
    df_temp = df_oof.groupby(['group']).apply(wave_label_func).reset_index()
    df_temp.columns = ['group', 'label']
    # Share of groups whose top-scored row is a positive.
    n_hit_groups = len(df_temp[df_temp['label'] == 1])
    acc = n_hit_groups / len(df_temp)
    print('acc:', acc)

acc: 0.6243134050569703


In [12]:
def label_func(group):
    """One-hot mark the largest value of a group.

    Args:
        group: Series of scores belonging to one group.

    Returns:
        A float NumPy array of the same length as ``group`` holding 1 at the
        position of the largest score (the first one on ties) and 0 elsewhere.
    """
    scores = group.values.tolist()
    # Position of the first maximal score.
    top_pos = max(range(len(scores)), key=scores.__getitem__)
    one_hot = np.zeros(len(scores))
    one_hot[top_pos] = 1
    return one_hot


# Within every test group, flag the row with the highest averaged score.
test_scores_by_group = prediction.groupby(['group'])['target']
prediction['label'] = test_scores_by_group.transform(label_func)
sub_part1 = prediction.loc[prediction['label'] == 1]

# For the training rows the true positives are used rather than predictions.
df_oof = df_oof.loc[df_oof['target'] == 1]

# Write the ids of both parts as one file.
next_action = pd.concat([df_oof[['id']], sub_part1[['id']]])
next_action.to_csv('./temp/next_action.csv', index=False)