In [2]:
import copy
import os
import warnings

import numpy as np
import pandas as pd

warnings.filterwarnings('ignore')

from sklearn.utils import shuffle
from sklearn.ensemble import RandomForestClassifier

# `sklearn.externals.joblib` was removed in scikit-learn 0.23; prefer the
# standalone package and fall back to the vendored copy on old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

pd.set_option('display.max_columns', None)


In [3]:
# Load the decision data set (parsed with pandas' default CSV settings).
data = pd.read_csv(filepath_or_buffer='../user_data/tmp_data/decisionData.txt')

In [4]:
# Split the data into a training set and a test set.
# A fixed random_state makes the shuffle -- and therefore the split and every
# accuracy reported further down -- reproducible across kernel restarts.
data = shuffle(data, random_state=0)
train_size = 0.9  # fraction used for training; the remaining 10% is the test set
data_trainSize = int(len(data) * train_size)
train_data = data.iloc[:data_trainSize, :]
test_data = data.iloc[data_trainSize:, :]

In [5]:
# Renumber the rows of both splits from 0, discarding the shuffled index.
train_data = train_data.reset_index(drop=True)
test_data = test_data.reset_index(drop=True)

In [6]:
# Keep only the rows whose consistency flag ('一致性') equals 1.
train_data = train_data[train_data['一致性'].eq(1)]
test_data = test_data[test_data['一致性'].eq(1)]

In [7]:
# Keep only the rows that have an order on both sides, i.e. where both
# '取单号' and '送单号' are non-zero, then renumber the rows from 0.
train_data = (
    train_data[train_data['取单号'].ne(0) & train_data['送单号'].ne(0)]
    .reset_index(drop=True)
)
test_data = (
    test_data[test_data['取单号'].ne(0) & test_data['送单号'].ne(0)]
    .reset_index(drop=True)
)

In [8]:
# Build the model input features and the label column.
def getDis(data):
    """Return a copy of ``data`` with the derived feature columns and the label.

    Added columns (each is a plain difference of two existing columns):
      * '距离差'         = '商家距离' - '客户距离'
      * '理论时间差'     = '理论取单时间' - '理论送单时间'
      * '承诺时间差'     = '取单承诺送达时间' - '送单承诺送达时间'
      * '承诺理论时间差' = '送单承诺送达时间' - '理论送单时间'
      * '可取理论时间差' = '取单可取时间' - '理论取单时间'
      * '可取单数'       = '最大负荷量' - '当前已取单数'
      * '标签'           = 0 where '动作' == 'PICKUP', otherwise 1

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing all the source columns listed above.

    Returns
    -------
    pandas.DataFrame
        A new frame; the frame passed in is left untouched.
    """
    # Work on a copy: callers pass filtered slices, and writing columns into
    # them mutates the caller's object and triggers SettingWithCopyWarning.
    data = data.copy()
    data['距离差'] = data['商家距离'] - data['客户距离']
    data['理论时间差'] = data['理论取单时间'] - data['理论送单时间']
    data['承诺时间差'] = data['取单承诺送达时间'] - data['送单承诺送达时间']
    data['承诺理论时间差'] = data['送单承诺送达时间'] - data['理论送单时间']
    data['可取理论时间差'] = data['取单可取时间'] - data['理论取单时间']
    data['可取单数'] = data['最大负荷量'] - data['当前已取单数']
    # Vectorised label: anything that is not exactly 'PICKUP' maps to 1.
    data['标签'] = data['动作'].ne('PICKUP').astype('int64')
    return data

In [9]:
# Derive the feature and label columns for the training split, then the test split.
train_data, test_data = getDis(train_data), getDis(test_data)

In [10]:
# Drop the rows whose distance difference is far away from zero.
def getOutRange(data,x):
    """Return the rows of ``data`` whose '距离差' lies in the closed interval [-x, x]."""
    within_band = data['距离差'].between(-x, x)  # inclusive on both ends
    return data[within_band]

In [12]:
# Per the original data analysis: when the pickup/delivery distance difference
# exceeds 1200, deciding between pickup and delivery purely by distance is more
# than 95% accurate, so those rows are handled by a direct rule and are removed
# here before modelling. The remaining rows are renumbered from 0.
train_data = getOutRange(train_data,1200).reset_index(drop=True)
test_data = getOutRange(test_data,1200).reset_index(drop=True)

In [13]:
# Partition the data by the value of `name` and fit one model per partition.
def divideModel(train,test,range1,range2,name,inputName,ntree,ndep):
    """Fit one RandomForestClassifier per value range of column ``name``.

    Partition ``i`` holds the rows with ``range1[i] <= row[name] <= range2[i]``
    (both ends inclusive). Each partition gets its own forest, trained on the
    ``inputName`` columns against the '标签' column.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Training and test frames; both need ``name``, every column in
        ``inputName`` and '标签'.
    range1, range2 : sequence of numbers
        Lower and upper bounds of the partitions, aligned by position.
    name : str
        Column used to assign rows to partitions.
    inputName : list of str
        Feature columns passed to the model.
    ntree, ndep : sequence of int
        ``n_estimators`` and ``max_depth`` for each partition's forest.

    Returns
    -------
    (float, float, list)
        Overall training accuracy, overall test accuracy (each partition's
        ``score`` weighted by its row count) and the fitted models in
        partition order. An accuracy is NaN when there were no rows to score.

    Raises
    ------
    ValueError
        If a partition contains no training rows.
    """
    modelList = list()  # fitted model of every partition, in order

    trainN = 0   # number of training rows scored
    testN = 0    # number of test rows scored
    trainRN = 0  # number of correctly predicted training rows
    testRN = 0   # number of correctly predicted test rows

    for i in range(len(range1)):
        low, high = range1[i], range2[i]
        train_part = train.loc[(train[name]>=low)&(train[name]<=high),:]
        test_part = test.loc[(test[name]>=low)&(test[name]<=high),:]

        if len(train_part) == 0:
            # Fail early with a message that names the offending partition.
            raise ValueError(
                'no training rows with %r in [%s, %s] (partition %d)'
                % (name, low, high, i)
            )

        x_train = train_part[inputName].values
        y_train = train_part['标签'].values

        model_RF = RandomForestClassifier(n_estimators=ntree[i],max_depth=ndep[i],random_state=0)
        model_RF.fit(x_train,y_train)
        modelList.append(model_RF)

        trainN += len(train_part)
        trainRN += model_RF.score(x_train,y_train)*len(train_part)

        # An empty test partition cannot be scored; it contributes nothing
        # to the overall test accuracy instead of aborting the whole run.
        if len(test_part) > 0:
            x_test = test_part[inputName].values
            y_test = test_part['标签'].values
            testN += len(test_part)
            testRN += model_RF.score(x_test,y_test)*len(test_part)

    train_accuracy = trainRN/trainN if trainN else float('nan')
    test_accuracy = testRN/testN if testN else float('nan')
    return train_accuracy,test_accuracy,modelList
    
        
        

In [16]:
# Lower bounds of the distance-difference intervals, one entry per model.
rg1 = [0,100,301,-1201,-399,-199,-99]
# Upper bounds of the same intervals, aligned by position with rg1.
rg2 =[100,300,1201,-400,-200,-100,0]
# NOTE(review): divideModel selects rows with `>=` and `<=`, so both ends are
# inclusive. A value of exactly 0 therefore lands in both [0,100] and [-99,0],
# and exactly 100 in both [0,100] and [100,300]. Non-integer values strictly
# between 300/301, -400/-399, -200/-199 or -100/-99 match no interval.
# Confirm whether this is intended before changing the bounds -- presumably the
# prediction side routes samples with the same table; verify against that code.
# Feature columns fed to every model.
columns = ['理论时间差','承诺时间差','可取理论时间差','可取单数']
# Number of trees and maximum depth for each model, aligned with rg1/rg2.
ntree = [50,50,50,50,50,50,50]
ndep = [3,4,4,5,5,5,5]

score_train,score_test,modelList=divideModel(train_data,test_data,rg1,rg2,'距离差',columns,ntree,ndep)


In [17]:
# Save the models, one file per partition: RFdecision_dis1.pkl, RFdecision_dis2.pkl, ...
model_dir = '../user_data/model_data/decision_model/'
# joblib.dump does not create missing directories; make sure the target exists
# so a fresh checkout does not fail after all the training work is done.
os.makedirs(model_dir, exist_ok=True)
for i in range(len(modelList)):
    joblib.dump(modelList[i], model_dir + 'RFdecision_dis' + str(i+1) + '.pkl')