In [26]:
import pandas as pd
import numpy as np

import lightgbm as lgb

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, auc
from sklearn.model_selection import GridSearchCV

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
DataSet = pd.read_csv('train.csv', encoding = 'windows 1251', low_memory = False)

In [3]:
DataSet.head()

Unnamed: 0,Interval,Date,OrderDate,ClientID,ChannelID,OrderID,MaterialID,GroupID,Cluster,CancelFlag,OrderCnt,DeliveryType,prepay,count_edit
0,14-16.,03/10/2018,02/10/2018,93808186,2,90102063002,3328810.0,61.0,,0,1.0,Обычная доставка,0,1
1,14-16.,03/10/2018,02/10/2018,93808186,2,90102063002,3281258.0,30.0,,0,2.0,Обычная доставка,0,1
2,14-16.,03/10/2018,02/10/2018,93808186,2,90102063002,3210734.0,10.0,,0,1.0,Обычная доставка,0,1
3,14-16.,03/10/2018,02/10/2018,93808186,2,90102063002,3328848.0,61.0,,0,2.0,Обычная доставка,0,1
4,12-14.,07/10/2018,05/10/2018,94112406,2,90102091007,3347801.0,17.0,,0,10.0,Обычная доставка,0,1


## Feature Engineering

In [4]:
DataSet['DeliveryType'] = DataSet['DeliveryType'].map({'Обычная доставка': 0, 'Доставка День в День': 1})

In [5]:
SplitedInterval = DataSet['Interval'].str.split('-')

DataSet['FirstIntervalNumber'] = SplitedInterval.apply(lambda x: int(x[0]))
    
DataSet['SecondIntervalNumber'] = SplitedInterval.apply(lambda x: int(x[1][:-1]))

del DataSet['Interval']

In [6]:
DataSet['Date'] = pd.to_datetime(DataSet['Date'], format = '%d/%m/%Y')

DataSet['OrderDate'] = pd.to_datetime(DataSet['OrderDate'], format = '%d/%m/%Y')

In [7]:
DataSet['PlanMonth'] = DataSet['Date'].dt.month 

DataSet['PlanDay'] = DataSet['Date'].dt.day

DataSet['OrderMonth'] = DataSet['OrderDate'].dt.month 

DataSet['OrderDay'] = DataSet['OrderDate'].dt.day

DataSet['WeekDay'] = DataSet['Date'].dt.weekday

DataSet['OrderWeekDay'] = DataSet['OrderDate'].dt.weekday

DataSet['DeltaByDay'] = DataSet['PlanDay'] - DataSet['OrderDay']

In [8]:
DataSet['PlanMonth'] = DataSet['PlanMonth'].map({1: 1, 2:1, 3:2, 4:2, 5:2, 6:3, 7:3, 8:3, 9:4, 10:4, 11:4, 12:1})

In [9]:
DataSet['Season'] = DataSet['PlanMonth']

In [10]:
del DataSet['OrderMonth']

del DataSet['Date']

In [11]:
DataSet['Cluster'] = DataSet['Cluster'].map({
    
0: 3, 'HUB5': 1,
'HUB6': 2,'HUB17': 3,'HUB19': 4,'HUB1': 5,'HUB9' : 6,'HUB7': 7,'HUB22': 8,'HUB16': 9,'HUB8': 10,'HUB20': 11,
'HUB14': 12,'HUB2': 13,'HUB4': 14,'HUB21': 15,'HUB13': 16,'HUB10': 17,'HUB11': 18,'HUB3': 19,'HUB12': 20,'HUB18': 21,
'ZON1': 22,'HUB15': 23})

In [12]:
DataSet = DataSet.sort_values(by = ['ClientID', 'OrderDate', 'OrderID']) 

DataSet['NewFeature'] = DataSet[['ClientID', 'OrderDate']].duplicated(keep = 'last') 

DataSet['NewFeature'] = DataSet.NewFeature.apply(lambda x: 1 if (x == True) else 0) 

DataSet = DataSet.sort_values(by = ['OrderID'])

In [13]:
DataSet.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9023184 entries, 0 to 9023145
Data columns (total 22 columns):
OrderDate               datetime64[ns]
ClientID                int64
ChannelID               int64
OrderID                 int64
MaterialID              float64
GroupID                 float64
Cluster                 float64
CancelFlag              int64
OrderCnt                float64
DeliveryType            int64
prepay                  int64
count_edit              int64
FirstIntervalNumber     int64
SecondIntervalNumber    int64
PlanMonth               int64
PlanDay                 int64
OrderDay                int64
WeekDay                 int64
OrderWeekDay            int64
DeltaByDay              int64
Season                  int64
NewFeature              int64
dtypes: datetime64[ns](1), float64(4), int64(17)
memory usage: 1.5 GB


In [14]:
GropedDataSet = DataSet.groupby('OrderID')[DataSet.columns].first()

GropedDataSet['ClientByOrderCount'] = DataSet.groupby('OrderID')['ClientID'].count()

GropedDataSet['SumByOrderCnt'] = DataSet.groupby('OrderID')['OrderCnt'].sum()

GropedDataSet['SumByCountEdit'] = DataSet.groupby('OrderID')['count_edit'].sum()

In [15]:
GropedDataSet['DeltaCountEdit'] = GropedDataSet['ClientByOrderCount'] / GropedDataSet['SumByCountEdit']

GropedDataSet['DeltaOrderCnt'] = GropedDataSet['ClientByOrderCount'] / GropedDataSet['SumByOrderCnt']

In [16]:
GropedDataSet

Unnamed: 0_level_0,OrderDate,ClientID,ChannelID,OrderID,MaterialID,GroupID,Cluster,CancelFlag,OrderCnt,DeliveryType,...,WeekDay,OrderWeekDay,DeltaByDay,Season,NewFeature,ClientByOrderCount,SumByOrderCnt,SumByCountEdit,DeltaCountEdit,DeltaOrderCnt
OrderID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
90102063002,2018-10-02,93808186,2,90102063002,3328810.0,61.0,,0,1.0,0,...,2,1,1,4,1,4,6.0,4,1.0,0.666667
90102091007,2018-10-05,94112406,2,90102091007,3347802.0,17.0,,0,7.0,0,...,6,4,2,4,1,5,28.0,5,1.0,0.178571
90102092000,2018-10-05,93696397,2,90102092000,3332799.0,21.0,,0,1.0,1,...,4,4,0,4,1,10,16.0,10,1.0,0.625000
90102103017,2018-10-12,93696397,2,90102103017,3348064.0,63.0,,0,1.0,0,...,4,4,0,4,1,9,14.0,9,1.0,0.642857
90102104012,2018-10-15,93411902,2,90102104012,3325812.0,15.0,,1,3.0,0,...,2,0,2,4,1,26,29.0,26,1.0,0.896552
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98357278690,2018-12-30,93324410,2,98357278690,3045286.0,36.0,1.0,0,1.0,0,...,0,6,1,1,1,21,37.0,21,1.0,0.567568
98357278797,2018-12-30,94273838,17,98357278797,3307484.0,20.0,1.0,0,4.0,0,...,6,6,0,1,0,1,4.0,1,1.0,0.250000
98357279051,2018-12-30,93899170,2,98357279051,3118728.0,32.0,1.0,0,1.0,0,...,0,6,1,1,1,34,73.0,34,1.0,0.465753
98357281094,2018-12-30,91690087,2,98357281094,2013262.0,36.0,1.0,0,1.0,0,...,0,6,1,1,1,35,48.0,35,1.0,0.729167


In [17]:
X = GropedDataSet.drop(['CancelFlag', 'OrderDate'], axis = 1)

Y = GropedDataSet['CancelFlag']

In [18]:
from sklearn.model_selection import train_test_split

XTrain, XTest, YTrain, YTest = train_test_split(X, Y, test_size = 0.3, random_state = 101) # розбиваем на трейн и валидацию

In [19]:
X.fillna(-999, inplace = True)

In [20]:
X

Unnamed: 0_level_0,ClientID,ChannelID,OrderID,MaterialID,GroupID,Cluster,OrderCnt,DeliveryType,prepay,count_edit,...,WeekDay,OrderWeekDay,DeltaByDay,Season,NewFeature,ClientByOrderCount,SumByOrderCnt,SumByCountEdit,DeltaCountEdit,DeltaOrderCnt
OrderID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
90102063002,93808186,2,90102063002,3328810.0,61.0,-999.0,1.0,0,0,1,...,2,1,1,4,1,4,6.0,4,1.0,0.666667
90102091007,94112406,2,90102091007,3347802.0,17.0,-999.0,7.0,0,0,1,...,6,4,2,4,1,5,28.0,5,1.0,0.178571
90102092000,93696397,2,90102092000,3332799.0,21.0,-999.0,1.0,1,0,1,...,4,4,0,4,1,10,16.0,10,1.0,0.625000
90102103017,93696397,2,90102103017,3348064.0,63.0,-999.0,1.0,0,0,1,...,4,4,0,4,1,9,14.0,9,1.0,0.642857
90102104012,93411902,2,90102104012,3325812.0,15.0,-999.0,3.0,0,0,1,...,2,0,2,4,1,26,29.0,26,1.0,0.896552
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98357278690,93324410,2,98357278690,3045286.0,36.0,1.0,1.0,0,0,1,...,0,6,1,1,1,21,37.0,21,1.0,0.567568
98357278797,94273838,17,98357278797,3307484.0,20.0,1.0,4.0,0,1,1,...,6,6,0,1,0,1,4.0,1,1.0,0.250000
98357279051,93899170,2,98357279051,3118728.0,32.0,1.0,1.0,0,0,1,...,0,6,1,1,1,34,73.0,34,1.0,0.465753
98357281094,91690087,2,98357281094,2013262.0,36.0,1.0,1.0,0,0,1,...,0,6,1,1,1,35,48.0,35,1.0,0.729167


## Fitting LightGBM

In [22]:
Parameters = {'boosting_type': 'gbdt', 'max_depth' : -1, 'objective': 'binary', 'num_leaves': 64, 'learning_rate': 0.05,
          
'max_bin': 512, 'subsample_for_bin': 200, 'subsample': 1, 'subsample_freq': 1, 'colsample_bytree': 0.8, 
          
'reg_alpha': 5, 'reg_lambda': 10, 'min_split_gain': 0.5, 'min_child_weight': 1, 'min_child_samples': 5,

'scale_pos_weight': 1, 'num_class' : 1, 'metric' : 'roc'}


HyperParameters = {'learning_rate': [0.005], 'n_estimators': [40], 'num_leaves': [6,8,12,16], 'boosting_type' : ['gbdt'],
    
'objective' : ['binary'], 'random_state' : [501], 'colsample_bytree' : [0.65, 0.66], 'subsample' : [0.7,0.75],

'reg_alpha' : [1,1.2], 'reg_lambda' : [1,1.2,1.4]}

Model = lgb.LGBMClassifier(boosting_type = 'gbdt', objective = 'binary', n_jobs = -1, silent = True, 
                         
max_depth = Parameters['max_depth'], max_bin = Parameters['max_bin'], 
                         
subsample_for_bin = Parameters['subsample_for_bin'],

subsample = Parameters['subsample'], subsample_freq = Parameters['subsample_freq'], 
                         
min_split_gain = Parameters['min_split_gain'], min_child_weight = Parameters['min_child_weight'],
                         
min_child_samples = Parameters['min_child_samples'], scale_pos_weight = Parameters['scale_pos_weight'])

Model.get_params().keys()

ModelTuning = GridSearchCV(Model, HyperParameters, verbose = 4, cv = 4, n_jobs = -1) 

ModelTuning.fit(XTrain, YTrain)

Parameters['colsample_bytree'] = ModelTuning.best_params_['colsample_bytree']

Parameters['learning_rate'] = ModelTuning.best_params_['learning_rate']

Parameters['num_leaves'] = ModelTuning.best_params_['num_leaves']

Parameters['reg_alpha'] = ModelTuning.best_params_['reg_alpha']

Parameters['reg_lambda'] = ModelTuning.best_params_['reg_lambda']

Parameters['subsample'] = ModelTuning.best_params_['subsample']

print(ModelTuning.best_params_)

print(ModelTuning.best_score_)

Fitting 4 folds for each of 96 candidates, totalling 384 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   11.3s
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 213 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 384 out of 384 | elapsed:  5.1min finished


{'boosting_type': 'gbdt', 'colsample_bytree': 0.65, 'learning_rate': 0.005, 'n_estimators': 40, 'num_leaves': 6, 'objective': 'binary', 'random_state': 501, 'reg_alpha': 1, 'reg_lambda': 1, 'subsample': 0.7}
0.9326918818816804


In [None]:
print('Fitting with parameters: \n', Parameters)

In [27]:
NumberOfTreesList = [100, 400, 800, 1200, 1600, 2200, 3000]

for i in NumberOfTreesList:

    gbm = lgb.train(Parameters, lgb.Dataset(XTrain, label = YTrain), i)

    YPredicted = gbm.predict(XTest)
    
    print('Number of trees:', i)

    print("ROC - AUC score: ", roc_auc_score(YTest, YPredicted), '\n')

Number of trees: 100
ROC - AUC score:  0.6800417562120258 

Number of trees: 400
ROC - AUC score:  0.6934493173748922 

Number of trees: 800
ROC - AUC score:  0.6995415502291344 

Number of trees: 1200
ROC - AUC score:  0.7026771195294661 

Number of trees: 1600
ROC - AUC score:  0.7048669347043837 

Number of trees: 2200
ROC - AUC score:  0.7070664553681385 

Number of trees: 3000
ROC - AUC score:  0.708858243570285 

