In [1]:
import pandas as pd
import numpy as np
import warnings

warnings.filterwarnings('ignore')

from sklearn.utils import shuffle
from sklearn.ensemble import RandomForestClassifier


from sklearn.externals import joblib


In [2]:
# Directory prefix prepended to every CSV file name passed to pd.read_csv in this notebook.
pathBase = '../user_data/tmp_data/'

In [3]:
# Load the four candidate-order datasets from pathBase.
pickTwo= pd.read_csv(pathBase+'pickTwo.txt') # pickup data with two candidate orders
pickThree = pd.read_csv(pathBase+'pickThree.txt') # pickup data with three candidate orders
deliveryTwo = pd.read_csv(pathBase+'deliveryTwo.txt') # delivery data with two candidate orders
deliveryThree = pd.read_csv(pathBase+'deliveryThree.txt') # delivery data with three candidate orders

In [4]:
# Split every dataset into a training part and a held-out test part.

# Shuffle with a fixed seed: without one, every kernel restart yields a different
# split, so the printed scores and the saved models cannot be reproduced.
pickTwo = shuffle(pickTwo, random_state=0)
pickThree = shuffle(pickThree, random_state=0)
deliveryTwo = shuffle(deliveryTwo, random_state=0)
deliveryThree = shuffle(deliveryThree, random_state=0)

train_size =0.9 # fraction of rows used for training; the remaining 10% is the test set

# Number of leading (shuffled) rows that go into each training set.
pickTwo_trainSize = int(len(pickTwo)*train_size)
pickThree_trainSize = int(len(pickThree)*train_size)
deliveryTwo_trainSize = int(len(deliveryTwo)*train_size)
deliveryThree_trainSize = int(len(deliveryThree)*train_size)

# First *_trainSize rows -> train, everything after -> test.
pickTwo_train = pickTwo.iloc[:pickTwo_trainSize,:]
pickTwo_test = pickTwo.iloc[pickTwo_trainSize:,:]
pickThree_train = pickThree.iloc[:pickThree_trainSize,:]
pickThree_test =  pickThree.iloc[pickThree_trainSize:,:]
deliveryTwo_train = deliveryTwo.iloc[:deliveryTwo_trainSize,:]
deliveryTwo_test = deliveryTwo.iloc[deliveryTwo_trainSize:,:]
deliveryThree_train = deliveryThree.iloc[:deliveryThree_trainSize,:]
deliveryThree_test = deliveryThree.iloc[deliveryThree_trainSize:,:]

In [5]:
# Two-candidate-order data is split into two classes by whether the distance
# difference is 0; each class is modelled separately afterwards.
def DivideTwo(data):
    """Partition ``data`` on its '1-2距离差' column.

    Returns a ``(zero_rows, nonzero_rows)`` tuple: first the rows whose
    '1-2距离差' equals 0, then the rows where it does not.
    """
    gap = data['1-2距离差']
    same_distance = data[gap.eq(0)]
    different_distance = data[gap.ne(0)]
    return same_distance, different_distance

In [6]:
# Three-candidate-order data is split into three classes: all pairwise distance
# differences are 0, none of them is 0, or only some of them are 0.
def DivideThree(data):
    """Partition ``data`` by its three pairwise distance-difference columns.

    Reads the columns '1-2距离差', '1-3距离差' and '2-3距离差'.

    Returns:
        (data1, data2, data3) where
        data1 -- rows in which all three differences equal 0,
        data2 -- rows in which all three differences are non-zero,
        data3 -- every remaining row (only some of the differences are 0).

    The three frames are disjoint and together contain every row of ``data``,
    each keeping the original row order and index labels.
    """
    d12 = data['1-2距离差']
    d13 = data['1-3距离差']
    d23 = data['2-3距离差']

    all_zero = (d12 == 0) & (d13 == 0) & (d23 == 0)
    none_zero = (d12 != 0) & (d13 != 0) & (d23 != 0)

    data1 = data.loc[all_zero, :]
    data2 = data.loc[none_zero, :]
    # Select the remainder with a mask rather than append + drop_duplicates(keep=False):
    # DataFrame.append no longer exists in pandas 2.x, and the de-duplication trick
    # also discarded legitimate rows that happened to be repeated inside ``data``.
    data3 = data.loc[~(all_zero | none_zero), :]

    return data1,data2,data3

In [7]:
# Split each train/test set into its distance-difference classes
pickTwo_train1,pickTwo_train2 = DivideTwo(pickTwo_train)
pickTwo_test1,pickTwo_test2 = DivideTwo(pickTwo_test)
pickThree_train1,pickThree_train2,pickThree_train3 = DivideThree(pickThree_train)
pickThree_test1,pickThree_test2,pickThree_test3 = DivideThree(pickThree_test)
deliveryTwo_train1,deliveryTwo_train2 = DivideTwo(deliveryTwo_train)
deliveryTwo_test1,deliveryTwo_test2 = DivideTwo(deliveryTwo_test)
_,deliveryThree_train2,deliveryThree_train3= DivideThree(deliveryThree_train) # very few delivery rows have all three distance differences equal to 0, so that class is not modelled
_,deliveryThree_test2,deliveryThree_test3 = DivideThree(deliveryThree_test)

In [8]:
# Per the original analysis note: for two-order data, once the distance difference is
# x or more, picking the order purely by distance is already more than 95% accurate.
# Those rows are therefore excluded here and are meant to be decided by distance alone.
def Narrow(data,x):
    """Keep only the rows whose absolute '1-2距离差' is strictly below ``x``.

    Args:
        data: frame containing a '1-2距离差' column.
        x: distance-difference threshold; rows with |'1-2距离差'| >= x are dropped.

    Returns:
        The filtered frame (original row order and index labels preserved).

    NOTE(review): the previous condition kept the rows with |difference| >= x, i.e.
    exactly the rows the comment above says to exclude; the filter now matches the
    documented intent. Confirm against the inference code that consumes the models.
    """
    data = data.loc[data['1-2距离差'].abs() < x, :]
    return data

In [9]:
# Two-candidate data with a non-zero distance difference: apply the Narrow
# distance-difference threshold (1500 for pickup, 1000 for delivery).
pickTwo_train2 = Narrow(pickTwo_train2,1500)
pickTwo_test2 = Narrow(pickTwo_test2,1500)
deliveryTwo_train2 = Narrow(deliveryTwo_train2,1000)
# The test split must be derived from the test rows. It was previously built from
# deliveryTwo_train2, so the reported "test" score was measured on training data.
deliveryTwo_test2 = Narrow(deliveryTwo_test2,1000)

In [10]:
# Helper that builds and evaluates a random-forest model
def Model(train_data,test_data,columns,treeN,depth):
    """Fit a RandomForestClassifier and score it on both splits.

    Args:
        train_data: frame supplying the training features and the '标签' target.
        test_data: frame supplying the evaluation features and the '标签' target.
        columns: feature column names selected from both frames.
        treeN: forwarded as ``n_estimators``.
        depth: forwarded as ``max_depth``.

    Returns:
        (model, score_train, score_test): the fitted classifier followed by the
        result of ``model.score`` on the training data and on the test data.
    """
    target = '标签'
    features_train, labels_train = train_data[columns], train_data[target]
    features_test, labels_test = test_data[columns], test_data[target]

    # random_state is pinned so repeated fits on the same data give the same forest.
    forest = RandomForestClassifier(
        n_estimators=treeN,
        max_depth=depth,
        random_state=0,
    )
    forest.fit(features_train, labels_train)

    train_score = forest.score(features_train, labels_train)
    test_score = forest.score(features_test, labels_test)
    return forest, train_score, test_score

In [30]:
# Two-order pickup model, distance difference equal to 0
columns1 = ['1-2通知时间差','1-2可取单时间差','1-2承诺时间差']
model_pickTwo1,score_train,score_test = Model(pickTwo_train1,pickTwo_test1,columns1,100,3)
print('train:%.4f,test:%.4f'%(score_train,score_test))

In [31]:
# Save the fitted model_pickTwo1 to disk.
joblib.dump(model_pickTwo1,'../user_data/model_data/choose_model/RFpickSame2.pkl')

In [32]:
# Two-order pickup model, distance difference not equal to 0
columns1 = ['1-2理论时间差','1-2通知时间差','1-2可取单时间差','1-2承诺时间差']
model_pickTwo2,score_train,score_test = Model(pickTwo_train2,pickTwo_test2,columns1,100,5)
print('train:%.4f,test:%.4f'%(score_train,score_test))

In [33]:
# Save the fitted model_pickTwo2 to disk.
joblib.dump(model_pickTwo2,'../user_data/model_data/choose_model/RFpick2.pkl')

In [34]:
# Three-order pickup model, all distance differences equal to 0
columns1 = ['1-2通知时间差','1-3通知时间差','2-3通知时间差','1-2可取单时间差','1-3可取单时间差','2-3可取单时间差','1-2承诺时间差','1-3承诺时间差','2-3承诺时间差']
model_pickThree1,score_train,score_test = Model(pickThree_train1,pickThree_test1,columns1,100,5)
print('train:%.4f,test:%.4f'%(score_train,score_test))

In [35]:
# Save the fitted model_pickThree1 to disk.
joblib.dump(model_pickThree1,'../user_data/model_data/choose_model/RFpickSame3.pkl')

In [36]:
# Three-order pickup model, none of the distance differences equal to 0
columns1 = ['1-2理论时间差','1-3理论时间差','2-3理论时间差','1-2通知时间差','1-3通知时间差','2-3通知时间差','1-2可取单时间差','1-3可取单时间差','2-3可取单时间差','1-2承诺时间差','1-3承诺时间差','2-3承诺时间差']
model_pickThree2,score_train,score_test = Model(pickThree_train2,pickThree_test2,columns1,200,6)
print('train:%.4f,test:%.4f'%(score_train,score_test))

In [37]:
# Save the fitted model_pickThree2 to disk.
joblib.dump(model_pickThree2,'../user_data/model_data/choose_model/RFpick3.pkl')

In [38]:
# Three-order pickup model, only some of the distance differences equal to 0
columns1 = ['1-2理论时间差','1-3理论时间差','2-3理论时间差','1-2通知时间差','1-3通知时间差','2-3通知时间差','1-2可取单时间差','1-3可取单时间差','2-3可取单时间差','1-2承诺时间差','1-3承诺时间差','2-3承诺时间差']
model_pickThree3,score_train,score_test = Model(pickThree_train3,pickThree_test3,columns1,200,7)
print('train:%.4f,test:%.4f'%(score_train,score_test))

In [39]:
# Save the fitted model_pickThree3 to disk.
joblib.dump(model_pickThree3,'../user_data/model_data/choose_model/RFpickpartSame3.pkl')

In [40]:
# Two-order delivery model, distance difference equal to 0
columns1 = ['1-2通知时间差','1-2承诺时间差']
model_deliveryTwo1,score_train,score_test = Model(deliveryTwo_train1,deliveryTwo_test1,columns1,100,3)
print('train:%.4f,test:%.4f'%(score_train,score_test))

In [41]:
# Save the fitted model_deliveryTwo1 to disk.
joblib.dump(model_deliveryTwo1,'../user_data/model_data/choose_model/RFDESame2.pkl')

In [42]:
# Two-order delivery model, distance difference not equal to 0
columns1 = ['1-2理论时间差','1-2通知时间差','1-2可取单时间差','1-2承诺时间差']
model_deliveryTwo2,score_train,score_test = Model(deliveryTwo_train2,deliveryTwo_test2,columns1,100,5)
print('train:%.4f,test:%.4f'%(score_train,score_test))

In [43]:
# Save the fitted model_deliveryTwo2 to disk.
joblib.dump(model_deliveryTwo2,'../user_data/model_data/choose_model/RFDE2.pkl')

In [44]:
# Three-order delivery model, none of the distance differences equal to 0
# (trained on deliveryThree_train2 / deliveryThree_test2)
columns1 = ['1-2理论时间差','1-3理论时间差','2-3理论时间差','1-2承诺时间差','1-3承诺时间差','2-3承诺时间差']
model_deliveryThree2,score_train,score_test = Model(deliveryThree_train2,deliveryThree_test2,columns1,200,8)
print('train:%.4f,test:%.4f'%(score_train,score_test))

In [45]:
# Save the fitted model_deliveryThree2 to disk.
joblib.dump(model_deliveryThree2,'../user_data/model_data/choose_model/RFDE3.pkl')

In [46]:
# Three-order delivery model, only some of the distance differences equal to 0
columns1 = ['1-2理论时间差','1-3理论时间差','2-3理论时间差','1-2通知时间差','1-3通知时间差','2-3通知时间差','1-2可取单时间差','1-3可取单时间差','2-3可取单时间差','1-2承诺时间差','1-3承诺时间差','2-3承诺时间差']
model_deliveryThree3,score_train,score_test = Model(deliveryThree_train3,deliveryThree_test3,columns1,200,4)
print('train:%.4f,test:%.4f'%(score_train,score_test))

In [47]:
# Save the fitted model_deliveryThree3 to disk.
joblib.dump(model_deliveryThree3,'../user_data/model_data/choose_model/RFDEpartSame3.pkl')