In [1]:
import numpy as np
import pandas as pd
import warnings
import os
from tqdm import tqdm
from sklearn import preprocessing, metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GroupKFold, KFold, StratifiedKFold
import gc
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns
from joblib import Parallel, delayed
from sklearn.utils import shuffle

%matplotlib inline

# Lift pandas' display truncation so frames are shown with every column and row.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

# Silence all Python warnings for the remainder of the session.
warnings.filterwarnings('ignore')

In [2]:
# Random seed; passed as `random_state` to the LightGBM regressor further down.
seed = 2020

In [3]:
# Load the feature table; train and test rows live in the same frame and are
# told apart by the `type` column.
# NOTE(review): unpickling can execute arbitrary code — only load files you trust.
df_feature = pd.read_pickle('./temp/part2_feature.plk')

In [4]:
# Preview the first rows of the loaded feature table.
df_feature.head()

Unnamed: 0,id,courier_id,wave_index,tracking_id,courier_wave_start_lng,courier_wave_start_lat,action_type,expect_time,date,type,target,group
0,98263,10330725,9,2100075923187730175,121.481429,39.299365,PICKUP,1582888651,20200228,train,1.0,20200228103307259
1,116276,10053442,1,2100075314534647435,121.479587,39.248115,PICKUP,1582084880,20200219,train,1.0,20200219100534421
2,153284,118787313,4,2100075078536791439,121.440498,39.203471,DELIVERY,1581671584,20200214,train,1.0,202002141187873134
3,16134,116706233,3,2100074825841346124,121.54301,39.258822,DELIVERY,1581075896,20200207,train,1.0,202002071167062333
4,23009,118333873,5,2100074746653279446,121.406669,39.364738,PICKUP,1580906387,20200205,train,1.0,202002051183338735


In [5]:
# Separate the combined feature table into independent train and test frames,
# using the `type` column as the partition marker.
row_type = df_feature['type']
df_train = df_feature.loc[row_type == 'train'].copy()
df_test = df_feature.loc[row_type == 'test'].copy()

In [6]:
# Submission frame: the identifying columns of the test rows plus an
# `expect_time` column that the cross-validation loop fills in.
# `.copy()` makes this an independent frame, so the assignment below does not
# write into a slice of `df_test` (which would raise SettingWithCopyWarning).
prediction = df_test[['courier_id', 'wave_index', 'tracking_id',
                      'courier_wave_start_lng', 'courier_wave_start_lat', 'action_type', 'expect_time', 'date']].copy()
# Start from a float zero: fold predictions are accumulated into this column.
prediction['expect_time'] = 0.0

In [7]:
# Integer-encode every object-typed column except `date` and `type`.
# The encoder is fitted on the string values of train and test together so
# that both frames share one label mapping.
for col in df_feature.select_dtypes('object').columns:
    if col in ('date', 'type'):
        continue
    print(col)
    train_as_str = df_train[col].astype(str)
    test_as_str = df_test[col].astype(str)
    encoder = LabelEncoder().fit(
        train_as_str.values.tolist() + test_as_str.values.tolist())
    df_train[col] = encoder.transform(train_as_str)
    df_test[col] = encoder.transform(test_as_str)

action_type
group


In [8]:
# Grouped K-fold training of a LightGBM regressor on `expect_time`.
#
# Produces:
#   oof                 - list of per-fold frames (id, target, out-of-fold prediction)
#   df_importance_list  - list of per-fold feature-importance frames
#   prediction['expect_time'] - mean of the per-fold test predictions
ycol = 'expect_time'

# Columns that are never fed to the model: the target itself, identifiers,
# bookkeeping columns and the wave start coordinates.
excluded_cols = [ycol, 'id', 'wave_index', 'tracking_id', 'target', 'date', 'type', 'group',
                 'courier_wave_start_lng', 'courier_wave_start_lat']
feature_names = [c for c in df_train.columns if c not in excluded_cols]

# n_estimators is effectively unbounded; the real number of trees is decided
# by early stopping on the validation fold (see fit() below).
model = lgb.LGBMRegressor(num_leaves=64,
                          max_depth=10,
                          learning_rate=0.1,
                          n_estimators=10000000,
                          subsample=0.8,
                          feature_fraction=0.8,
                          reg_alpha=0.5,
                          reg_lambda=0.5,
                          random_state=seed,
                          metric=None
                          )


oof = []
df_importance_list = []

# Single source of truth for the fold count: it drives both the splitter and
# the averaging of the test predictions, so the two can never disagree.
n_splits = 5
kfold = GroupKFold(n_splits=n_splits)

# The test feature matrix does not change between folds; slice it once.
X_test = df_test[feature_names]

# Rows sharing a `group` value are kept in the same fold.
for fold_id, (trn_idx, val_idx) in enumerate(kfold.split(df_train[feature_names], df_train[ycol], df_train['group'])):
    X_train = df_train.iloc[trn_idx][feature_names]
    Y_train = df_train.iloc[trn_idx][ycol]

    X_val = df_train.iloc[val_idx][feature_names]
    Y_val = df_train.iloc[val_idx][ycol]

    print('\nFold_{} Training ================================\n'.format(fold_id+1))

    # Stop once the validation MAE has not improved for 50 rounds.
    # NOTE(review): the `verbose` / `early_stopping_rounds` fit arguments belong
    # to the older LightGBM sklearn API; newer releases presumably need the
    # callback equivalents instead — confirm against the installed version.
    lgb_model = model.fit(X_train,
                          Y_train,
                          eval_names=['train', 'valid'],
                          eval_set=[(X_train, Y_train), (X_val, Y_val)],
                          verbose=500,
                          eval_metric='mae',
                          early_stopping_rounds=50)

    # Out-of-fold predictions, kept next to the row id and the true target.
    pred_val = lgb_model.predict(
        X_val, num_iteration=lgb_model.best_iteration_)
    df_oof = df_train.iloc[val_idx][['id', ycol]].copy()
    df_oof['pred'] = pred_val
    oof.append(df_oof)

    # Each fold contributes an equal share to the averaged test prediction.
    pred_test = lgb_model.predict(
        X_test, num_iteration=lgb_model.best_iteration_)
    prediction['expect_time'] += pred_test / n_splits

    df_importance = pd.DataFrame({
        'column': feature_names,
        'importance': lgb_model.feature_importances_,
    })
    df_importance_list.append(df_importance)

    # Release per-fold objects before the next iteration.
    del lgb_model, pred_val, pred_test, X_train, Y_train, X_val, Y_val
    gc.collect()



Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[59]	train's l1: 609530	valid's l1: 614571


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[88]	train's l1: 609419	valid's l1: 613779


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[99]	train's l1: 608459	valid's l1: 613229


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[203]	train's l1: 607344	valid's l1: 611899


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[203]	train's l1: 608302	valid's l1: 611439


In [9]:
# Average each feature's importance over the folds and rank from most to
# least important.
importance_all_folds = pd.concat(df_importance_list)
mean_importance = importance_all_folds.groupby(['column'])['importance'].mean()
df_importance = mean_importance.sort_values(ascending=False).reset_index()
df_importance

Unnamed: 0,column,importance
0,courier_id,5506.6
1,action_type,1916.4


In [10]:
# Stack the per-fold out-of-fold frames and score them with mean absolute error.
df_oof = pd.concat(oof)
oof_target, oof_pred = df_oof[ycol], df_oof['pred']
mae = metrics.mean_absolute_error(oof_target, oof_pred)
print('mae:', mae)

mae: 612983.2239464395


In [11]:
# Preview the first rows of the submission frame after the fold predictions were accumulated.
prediction.head()

Unnamed: 0,courier_id,wave_index,tracking_id,courier_wave_start_lng,courier_wave_start_lat,action_type,expect_time,date
44058,10012508,1,2100076472664917076,121.549495,39.289891,DELIVERY,1581864000.0,20200306
44059,10012508,5,2100076518544770291,121.529386,39.297263,DELIVERY,1581864000.0,20200306
44060,10021791,1,2100076474713671215,121.451866,39.219517,PICKUP,1581948000.0,20200306
44061,10037225,1,2100076471997432842,121.485061,39.150327,PICKUP,1581948000.0,20200306
44062,10037225,3,2100076506867894139,121.525111,39.151175,DELIVERY,1581911000.0,20200306


In [12]:
import zipfile

# Write one `action_<date>.txt` CSV per test date and bundle them into a zip.
# Both the output directory and the archive are named after the integer part
# of the out-of-fold MAE.
sub_name = int(mae)
os.makedirs('./sub/{}'.format(sub_name), exist_ok=True)

# The context manager guarantees the archive is finalised and its handle
# closed even if writing one of the per-date files raises.
with zipfile.ZipFile('./sub/{}.zip'.format(sub_name), 'w', zipfile.ZIP_DEFLATED) as archive:
    for date in prediction['date'].unique():
        # drop() returns a new frame, so `prediction` itself is never mutated
        # through a slice.
        df_temp = prediction[prediction['date'] == date].drop(columns='date')
        txt_path = './sub/{}/action_{}.txt'.format(sub_name, date)
        df_temp.to_csv(txt_path, index=False)
        # Store the file at the archive root, without the directory prefix.
        archive.write(txt_path, 'action_{}.txt'.format(date))

In [13]:
# NOTE(review): free-form notes kept as a bare string expression; its only
# effect is being echoed as the cell output. Presumably one value per test
# date — what the numbers measure is not shown here, TODO confirm (and
# consider moving this to a markdown cell).
'''
20200306 639
20200303 804
20200302 582
20200304 552
20200305 793
20200301 849
'''

'\n20200306 639\n20200303 804\n20200302 582\n20200304 552\n20200305 793\n20200301 849\n'