In [1]:
import pandas as pd
import numpy as np
import warnings
import matplotlib
import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')
%matplotlib inline
import time
from sklearn.metrics import calinski_harabaz_score
import math
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.externals import joblib
# Make Chinese text render correctly in matplotlib figures.
plt.rcParams['font.sans-serif'] = ['KaiTi'] # set the default sans-serif font to KaiTi
plt.rcParams['axes.unicode_minus'] = False # stop the minus sign '-' from showing as a box in saved figures

In [2]:
def getDataHistory(data_path,IDlist):
    """Read the order history of every requested business district.

    For each ID the tab-separated file ``data_path + ID + '/order_history'``
    is loaded with pandas.

    Parameters
    ----------
    data_path : str
        Directory prefix; it is concatenated as-is, so it must end with a
        path separator.
    IDlist : iterable of str
        District IDs to load.

    Returns
    -------
    dict
        Maps each ID to the DataFrame read from its history file.
    """
    return {
        district: pd.read_csv(data_path + district + '/order_history', sep="\t")
        for district in IDlist
    }

In [3]:
def getShopModel(model_path,IDlist):
    """Load the saved merchant-clustering artefacts of every district.

    Per district ID two files under ``model_path`` are read:
    ``<ID>.pkl`` (the fitted model, via joblib) and ``<ID>zone.txt``
    (a CSV of cluster centres). ``zhongxin.txt`` is read once for all
    districts.

    Returns
    -------
    tuple
        ``(cla_dict, zhongxin_df, model_dict)``: cluster-centre frames by
        ID, the ``zhongxin.txt`` frame, and the models by ID.
    """
    models = {}
    centres = {}
    for district in IDlist:
        # joblib.load unpickles the file: only point this at trusted model files.
        models[district] = joblib.load(model_path + district + '.pkl')
        centres[district] = pd.read_csv(model_path + district + 'zone.txt')
    district_centres = pd.read_csv(model_path + 'zhongxin.txt')
    return centres, district_centres, models

In [4]:
def prosse(data):
    """Turn one district's order frame into the clustering input array.

    Rows with a repeated (srclatitude, srclongitude) pair are dropped from
    ``data`` IN PLACE, so the caller's frame is modified.

    Returns
    -------
    numpy.ndarray
        One ``[srclatitude, srclongitude]`` row per remaining row of ``data``.
    """
    data.drop_duplicates(subset=['srclatitude','srclongitude'],inplace=True)
    coordinate_pairs = zip(data['srclatitude'].values, data['srclongitude'].values)
    return np.array([[shop_lat, shop_lon] for shop_lat, shop_lon in coordinate_pairs])

In [5]:
def JUlie(data,n):
    """Cluster merchant coordinates into ``n`` groups with K-means.

    Parameters
    ----------
    data : numpy.ndarray
        Coordinate rows; boolean-mask indexing and ``mean(axis=0)`` are used on it.
    n : int
        Number of clusters.

    Returns
    -------
    tuple
        ``(class_mean, mean, kmeans)``: a list with the per-cluster mean of the
        member rows, the mean of all rows, and the fitted KMeans model
        (``random_state=1``).
    """
    overall_centre = data.mean(axis=0)
    model = KMeans(n_clusters =n,random_state=1)
    model.fit(data)
    # Mean of the rows assigned to each label, in label order 0..n-1.
    cluster_centres = [data[model.labels_ == label].mean(axis=0) for label in range(n)]
    return cluster_centres, overall_centre, model

In [6]:
def shopJUlie(datalist,IDlist,shopN,modelPath):
    """Fit and persist the merchant clustering of every business district.

    For each ID in ``IDlist`` the district frame is converted with
    ``prosse``, clustered with ``JUlie`` and three artefacts are written
    under ``modelPath``:

    * ``<ID>.pkl``      - the fitted model (joblib dump)
    * ``<ID>zone.txt``  - one-row CSV with columns ``zonelat<i>``/``zonelon<i>``
    * ``zhongxin.txt``  - CSV with the overall centre of every district

    Parameters
    ----------
    datalist : dict
        District ID -> order DataFrame (de-duplicated in place by ``prosse``).
    IDlist : iterable of str
        District IDs to process.
    shopN : dict
        District ID -> number of merchant clusters.
    modelPath : str
        Output directory prefix, concatenated as-is.
    """
    list_ID = []
    list_lat = []
    list_lon = []
    for ID in IDlist:
        n = shopN.get(ID)
        location = prosse(datalist.get(ID))
        cla_me, me, model = JUlie(location, n)
        joblib.dump(model, modelPath+ID+'.pkl')

        list_ID.append(ID)
        list_lat.append(me[0])
        list_lon.append(me[1])

        dict_class = {}
        for i in range(n):
            dict_class['zonelat'+str(i)] = [cla_me[i][0]]
            dict_class['zonelon'+str(i)] = [cla_me[i][1]]
        # Write the centres once per district; the file used to be rebuilt
        # and rewritten on every pass of the loop above.
        pd.DataFrame(dict_class).to_csv(modelPath+ID+'zone.txt',index=False,encoding='utf-8')

    dict_shangquzhongxin = {'商圈':list_ID,'lat':list_lat,'lon':list_lon}
    df = pd.DataFrame(dict_shangquzhongxin)
    # Use the modelPath argument (previously the global `model_path` was used,
    # which ignored the parameter and failed when that global was absent).
    df.to_csv(modelPath+'zhongxin.txt',index=False,encoding='utf-8')

In [7]:
def Ran(lat1,lng1,lat2,lng2):
    """Direction of point 1 as seen from point 2, as an angle in radians.

    With ``d_lat = lat1 - lat2`` and ``d_lon = lng1 - lng2`` the result is
    0 along +d_lat and grows towards +d_lon: pi/2 for pure +d_lon, pi for
    pure -d_lat, 3*pi/2 for pure -d_lon. Identical points give 0.
    """
    d_lat = lat1 - lat2
    d_lon = lng1 - lng2

    if d_lat < 0:
        # Half-plane with negative latitude difference: angles around pi.
        if d_lon > 0:
            return math.pi - math.atan(-d_lon / d_lat)
        if d_lon < 0:
            return math.pi + math.atan(d_lon / d_lat)
        return math.pi

    if d_lat > 0:
        # Half-plane with positive latitude difference: angles around 0 / 2*pi.
        if d_lon > 0:
            return math.atan(d_lon / d_lat)
        if d_lon < 0:
            return 2*math.pi - math.atan(-d_lon / d_lat)
        return 0

    # No latitude difference: the direction lies on the longitude axis.
    if d_lon > 0:
        return math.pi/2
    if d_lon < 0:
        return 3*math.pi/2
    return 0

In [8]:
def label_shop_use(datalist,model_dict):
    """Label every order with the merchant cluster predicted by its district model.

    For each district in ``datalist`` the frame's index is reset in place and
    a column ``'商家类别'`` is added holding
    ``model.predict([[srclatitude, srclongitude]])`` for every row.

    Parameters
    ----------
    datalist : dict
        District ID -> order DataFrame with ``srclatitude``/``srclongitude``.
        The frames are modified in place.
    model_dict : dict
        District ID -> fitted model exposing ``predict``.

    Returns
    -------
    dict
        The same ``datalist`` object, with labelled frames.
    """
    # Iterate the districts actually passed in rather than the notebook-global
    # `IDlist`, so the function no longer depends on hidden kernel state.
    for ID in list(datalist):
        data = datalist.get(ID)
        model = model_dict.get(ID)
        data.reset_index(drop=True,inplace=True)
        data['商家类别'] = 0
        if len(data) > 0:
            # One batched predict call instead of one call and one .loc write per row.
            coords = data[['srclatitude','srclongitude']].values
            data['商家类别'] = model.predict(coords)
        datalist.update({ID:data})
    return datalist

In [9]:
def usr_julei(data,usrN,shop_lat,shop_lon):
    """Cluster user destinations around one merchant centre by direction.

    Duplicate (dstlatitude, dstlongitude) rows are ignored (``data`` itself is
    not modified). For every remaining destination the angle returned by
    ``Ran(dstlatitude, dstlongitude, shop_lat, shop_lon)`` is computed and a
    1-D K-means with ``usrN`` clusters (``random_state=1``) is fitted on
    those angles.

    Parameters
    ----------
    data : pandas.DataFrame
        Orders with ``dstlatitude`` and ``dstlongitude`` columns.
    usrN : int
        Number of user clusters.
    shop_lat, shop_lon : float
        Coordinates of the merchant cluster centre.

    Returns
    -------
    KMeans
        The fitted model.
    """
    unique_dst = data.drop_duplicates(subset=['dstlatitude','dstlongitude'])
    # Build the angle feature directly. The previous version wrote floats cell
    # by cell into an int-initialised column of a drop_duplicates result.
    # NOTE(review): K-means on the raw angle treats values near 0 and near
    # 2*pi as far apart although they are neighbouring directions - confirm
    # this is acceptable.
    inputs = [
        [Ran(lat, lon, shop_lat, shop_lon)]
        for lat, lon in zip(unique_dst['dstlatitude'].values,
                            unique_dst['dstlongitude'].values)
    ]
    kmeans = KMeans(n_clusters =usrN,random_state=1)
    kmeans.fit(inputs)
    return kmeans

In [10]:
def shangquan(data,df_shop,usrN,modelPath,ID):
    """Fit and save one user-clustering model per merchant centre of a district.

    ``df_shop`` holds the centres in row 0 as column pairs
    ``zonelat<i>``/``zonelon<i>``; the number of centres is half its column
    count. Each model returned by ``usr_julei`` is dumped with joblib to
    ``modelPath + ID + str(i) + '.pkl'``.
    """
    centre_count = len(df_shop.columns) // 2
    for idx in range(centre_count):
        suffix = str(idx)
        centre_lat = df_shop.loc[0, 'zonelat' + suffix]
        centre_lon = df_shop.loc[0, 'zonelon' + suffix]
        fitted = usr_julei(data, usrN, centre_lat, centre_lon)
        joblib.dump(fitted, modelPath + ID + suffix + '.pkl')

In [11]:
def usr_model(datalist,IDlist,cla_dict,dict_user,modelPath):
    """Build the user-clustering models of every district in ``IDlist``.

    Looks up each district's order frame, merchant-centre frame and user
    cluster count and hands them to ``shangquan``, which fits and saves the
    models under ``modelPath``.
    """
    for district in IDlist:
        shangquan(
            datalist.get(district),
            cla_dict.get(district),
            dict_user.get(district),
            modelPath,
            district,
        )

In [12]:
def timePross(data):
    """Reduce a timestamp to its remainder modulo 86400 (seconds per day).

    Remainders above 18000 are replaced by the constant 140.
    NOTE(review): the meaning of the 18000 cut-off and of the replacement
    value 140 is not visible here - confirm with the data owner.
    """
    second_of_day = data % 86400
    return 140 if second_of_day > 18000 else second_of_day

In [13]:
def tongji(data):
    """Per-minute and cumulative order counts of one business district.

    Side effects (kept for backward compatibility): ``data['createtime']`` is
    overwritten with ``timePross`` of its values and ``data`` is sorted by it
    in place.

    Bin edges are ``min + 60*k`` for k = 1, 2, ... up to and including the
    first edge greater than the maximum time.

    Returns
    -------
    pandas.DataFrame
        Columns ``time`` (upper edge of each bin), ``newOrder`` (orders with
        previous edge < createtime <= edge, as float) and ``allOreder``
        (running total; the misspelt column name is kept because callers
        read it). An empty frame with these columns is returned when there
        are no timestamps.
    """
    data['createtime'] =data['createtime'].apply(timePross)
    data.sort_values(by=['createtime'],inplace=True)

    # Sorted, NaN-free timestamps (sort_values puts NaN last).
    times = data['createtime'].dropna().values
    if len(times) == 0:
        # Previously this case crashed with a KeyError on df.loc[0,'time'].
        return pd.DataFrame({'time': [], 'newOrder': [], 'allOreder': []})

    minTime = times[0]
    maxTime = times[-1]
    timelist = []
    step = 1
    edge = minTime
    while edge <= maxTime:
        edge = minTime + step*60
        step += 1
        timelist.append(edge)

    # Count of createtime <= edge for all edges in one pass over the sorted
    # values, replacing a full-frame scan per bin.
    cumulative = np.searchsorted(times, np.asarray(timelist), side='right')
    per_bin = np.diff(np.concatenate(([0], cumulative)))

    df = pd.DataFrame({'time':timelist})
    df['newOrder'] = per_bin.astype(float)
    df['allOreder'] = df['newOrder'].cumsum(axis=0)
    return df

In [34]:
def timeQun(data,IsOrderNumber,Number):
    """Split one district's day into ``Number`` consecutive time intervals.

    Parameters
    ----------
    data : pandas.DataFrame
        Orders with a ``createtime`` column; it is rewritten in place with
        ``timePross`` (directly, or through ``tongji``).
    IsOrderNumber : bool
        True  - choose boundaries so each interval holds about the same
                number of orders (uses the cumulative counts of ``tongji``).
        False - use boundaries at multiples of ``int((max-min)/Number)``.
    Number : int
        Number of intervals (at least 1).

    Returns
    -------
    list of list
        ``Number`` pairs ``[start, end]``; the first starts at 0, the last
        ends at 18000, and neighbouring intervals share their boundary.
    """
    timelist = [[] for _ in range(Number)] # one [start, end] list per interval
    timelist[0].append(0)
    if IsOrderNumber: # equal-order-count partition
        df = tongji(data)
        last_row = len(df) - 1
        orderAll = df.loc[last_row,'allOreder'] # total number of orders
        disOrder = int(orderAll /Number) # target number of orders per interval
        j = 0
        for i in range(Number-1):
            orderN = disOrder*(i+1)
            # Advance at least one row, then on until the running total
            # reaches the target.
            j += 1
            while j <= last_row and not (df.loc[j,'allOreder'] >= orderN):
                j += 1
            # Clamp to the last row: with fewer bins than intervals j used to
            # run past the end and df.loc[j,'time'] raised KeyError.
            j = min(j, last_row)
            # Named `boundary` (not `time`) to avoid shadowing the time module.
            boundary = df.loc[j,'time']
            timelist[i].append(boundary)
            timelist[i+1].append(boundary)
    else:
        data['createtime'] =data['createtime'].apply(timePross)
        # equal-duration partition
        start = data['createtime'].min()
        end = data['createtime'].max()
        distime = int((end-start)/Number) # interval length
        # NOTE(review): boundaries are multiples of distime and do not add
        # `start`; confirm whether `start + distime*(i+1)` was intended.
        for i in range(Number-1):
            timelist[i].append(distime*(i+1))
            timelist[i+1].append(distime*(i+1))

    timelist[Number-1].append(18000)
    return timelist

In [15]:
def timeAll(datalist,IDlist,Number_dict,ISorderNumber):
    """Compute the time partition of every district in ``IDlist``.

    Returns a dict mapping each ID to the interval list produced by
    ``timeQun`` for that district's frame and interval count.
    """
    return {
        district: timeQun(datalist.get(district), ISorderNumber, Number_dict.get(district))
        for district in IDlist
    }

In [16]:
def orderlimit(ID,savePath,orderNumber):
    """Write one district's order-capacity table to ``savePath + ID + 'b.txt'``.

    Parameters
    ----------
    ID : str
        District ID, used in the file name.
    savePath : str
        Output directory prefix, concatenated as-is.
    orderNumber : iterable
        One entry per time interval; each entry is an iterable with one
        indexable item per merchant centre, whose element ``[0]`` is written.

    The file gets one line per time interval with the values joined by '#'.
    """
    # `with` closes the file even if an entry is malformed and raises.
    with open(savePath+ID+'b.txt','w') as wt:
        for st in orderNumber: # each time interval
            wt.write('#'.join(str(i[0]) for i in st) + '\n')

In [17]:
def orderlimitAll(IDlist,savePath,orderNumber_dict):
    """Write the capacity file of every district in ``IDlist`` via ``orderlimit``."""
    for district in IDlist:
        limits = orderNumber_dict.get(district)
        orderlimit(district, savePath, limits)
        

In [18]:
def probability(data,time1):
    """Share of orders per merchant cluster within each time interval.

    Side effect: ``data['createtime']`` is overwritten in place with
    ``timePross`` of its values.

    Parameters
    ----------
    data : pandas.DataFrame
        Orders with ``createtime`` and the cluster label column ``'商家类别'``.
    time1 : iterable
        Intervals ``[start, end]``; an order belongs to an interval when
        ``start <= createtime < end``.

    Returns
    -------
    tuple
        ``(shop_pro, order_time)``: per interval the list of per-cluster
        shares, and per interval the total order count. An interval without
        orders gets a share of 0.0 for every cluster.
    """
    # NOTE(review): labels are compared against 0..shopN-1 where shopN is the
    # number of distinct labels present, i.e. labels are assumed contiguous.
    shopN = len(data['商家类别'].unique())
    shop_pro =[] # per interval: share of each merchant cluster
    order_time=[] # per interval: total number of orders
    data['createtime'] =data['createtime'].apply(timePross)
    for t in time1:
        in_window = data.loc[(data['createtime']>= t[0]) &(data['createtime']<t[1]),:]
        allLenth = len(in_window)
        order_time.append(allLenth)
        counts = [len(in_window.loc[in_window['商家类别']==i,:]) for i in range(shopN)]
        if allLenth:
            shop_pro.append([count/allLenth for count in counts])
        else:
            # An empty interval previously raised ZeroDivisionError.
            shop_pro.append([0.0]*shopN)
    return shop_pro,order_time


In [19]:
def getPro(data_dict,time_dict,IDlist):
    """Run ``probability`` for every district in ``IDlist``.

    Returns ``(shoppro_dict, order_dict)``: the per-interval cluster shares
    and the per-interval order totals, each keyed by district ID.
    """
    shoppro_dict, order_dict = {}, {}
    for district in IDlist:
        shares, totals = probability(data_dict.get(district), time_dict.get(district))
        shoppro_dict[district] = shares
        order_dict[district] = totals
    return shoppro_dict, order_dict

In [1]:
def riderLoad(dict_time,shopN_dict,dict_orderlimit,IDlist):
    """Build the rated pick-up capacity table of every district.

    Parameters
    ----------
    dict_time : dict
        District ID -> number of time intervals.
    shopN_dict : dict
        District ID -> number of merchant cluster centres.
    dict_orderlimit : dict
        District ID -> sequence with the rider capacity of each time interval.
    IDlist : iterable of str
        District IDs to process.

    Returns
    -------
    dict
        District ID -> nested list ``limit[interval][centre] == [capacity]``,
        the layout ``orderlimit`` reads (it writes element ``[0]`` of each
        innermost list).
    """
    rider_limit ={}
    for ID in IDlist:
        shopn = shopN_dict.get(ID)
        timen = dict_time.get(ID)
        # Capacities come from dict_orderlimit; the previous code read
        # dict_time here and then failed when indexing the resulting int.
        ordern = dict_orderlimit.get(ID)
        rider_limit[ID] = [[[ordern[i]] for _ in range(shopn)] for i in range(timen)]
    # The table was built but never returned, so callers received None.
    return rider_limit
        
    

In [20]:
def write_csv(zhongxin_df,cla,time1,shop,order,ID,model_path,usrn):
    """Write one district's summary table to ``model_path + ID + 'a.txt'``.

    One row per time interval with, in this column order: ``时间`` (interval
    end), ``概率<j>`` (order share of merchant centre j), ``zonelat<j>``,
    ``zonelon<j>`` (centre coordinates, repeated on every row), ``订单数``
    (orders in the interval), ``用户`` (``usrn``), ``中心lat``/``中心lon``
    (district centre).

    Parameters
    ----------
    zhongxin_df : pandas.DataFrame
        District centres with columns ``商圈``, ``lat``, ``lon``; the row whose
        ``商圈`` equals ``int(ID)`` is used.
    cla : pandas.DataFrame
        Merchant centres of this district in row 0 (``zonelat<j>``/``zonelon<j>``).
    time1 : sequence
        Time intervals ``[start, end]``.
    shop : sequence of sequences
        ``shop[i][j]`` is the share of centre j in interval i.
    order : sequence
        Order count per interval; its length sets the number of rows.
    ID : str
        District ID.
    model_path : str
        Output directory prefix, concatenated as-is.
    usrn : int
        Value written to the ``用户`` column.
    """
    tN = len(order) # number of time intervals
    sN = len(shop[0]) # number of merchant cluster centres

    column_pro = ['概率'+str(j) for j in range(sN)]
    column_lat = ['zonelat'+str(j) for j in range(sN)]
    column_lon = ['zonelon'+str(j) for j in range(sN)]

    # Build whole columns up front instead of creating int columns of zeros
    # and overwriting them cell by cell with floats.
    columns = {'时间': [time1[i][1] for i in range(tN)]}
    for j, name in enumerate(column_pro):
        columns[name] = [shop[i][j] for i in range(tN)]
    for name in column_lat + column_lon:
        columns[name] = [cla.loc[0,name]] * tN
    columns['订单数'] = list(order)

    # Pass the column order explicitly so it does not depend on dict ordering.
    df = pd.DataFrame(columns, columns=['时间'] + column_pro + column_lat + column_lon + ['订单数'])
    df['用户'] = usrn

    centre = zhongxin_df.loc[zhongxin_df['商圈']==int(ID)]
    df['中心lat'] = centre['lat'].values[0]
    df['中心lon'] = centre['lon'].values[0]

    df.to_csv(model_path+ID+'a.txt',index=False,encoding='utf-8')

In [21]:
# --- Paths -------------------------------------------------------------
data_path = './order_history_20200420/'  # where the raw order histories are read from
model_path = '../dispatch/dispatch-demo-py/demo/model/'  # where models and result files are saved

# --- Business districts ------------------------------------------------
IDlist = ['680507', '725011', '730221', '738333', '738432', '1048660']

# --- Per-district settings (positions follow IDlist) -------------------
shopN_dict = dict(zip(IDlist, [11, 9, 7, 9, 9, 9]))  # number of merchant clusters
dict_user = dict(zip(IDlist, [7, 9, 7, 9, 9, 9]))  # number of user clusters per merchant centre
dict_time = {district: 5 for district in IDlist}  # number of time intervals

# Rated number of orders a rider picks up, one value per time interval
shop1 = [1, 2, 2, 2, 2]
shop2 = [1, 2, 2, 3, 3]
shop3 = [1, 2, 2, 3, 3]
shop4 = [1, 2, 2, 3, 3]
shop5 = [1, 2, 2, 3, 3]
shop6 = [1, 2, 2, 3, 3]
dict_orderlimit = dict(zip(IDlist, [shop1, shop2, shop3, shop4, shop5, shop6]))  # rider capacity per district


In [None]:
# Merchant clustering: load every district's order history, then fit the
# merchant clusters and save the models and centre files under model_path.
dataDict = getDataHistory(data_path,IDlist)
shopJUlie(dataDict,IDlist,shopN_dict,model_path)

In [None]:
# User clustering: reload the raw histories, read back the merchant models and
# centre files saved by the merchant-clustering step, then fit and save one
# user model per merchant centre.
model_read = model_path
dataDict = getDataHistory(data_path,IDlist)
cla_dict,zhongxin,model_dict=getShopModel(model_read,IDlist)
usr_model(dataDict,IDlist,cla_dict,dict_user,model_read)

In [None]:
# Time-interval partition: reload the raw histories and split each district's
# day into dict_time[ID] intervals.
dataDict = getDataHistory(data_path,IDlist)
IsOrderNumber = True  # True selects the equal-order-count branch of timeQun
time_dict =timeAll(dataDict,IDlist,dict_time,IsOrderNumber)

In [None]:
# Rated rider capacity: build the per-district capacity tables and write one
# '<ID>b.txt' file per district under model_path.
rider_limit=riderLoad(dict_time,shopN_dict,dict_orderlimit,IDlist)
orderlimitAll(IDlist,model_path,rider_limit)

In [None]:
# Order share of each merchant cluster per time interval.
# Requires model_dict (user-clustering cell) and time_dict (time-partition cell).
dataDict = getDataHistory(data_path,IDlist)
# Label every order with its merchant cluster first
dataDict=label_shop_use(dataDict,model_dict)
shopp_dict,shop_order=getPro(dataDict,time_dict,IDlist)

In [None]:
# Save the results: one '<ID>a.txt' summary file per district, combining the
# outputs of the previous cells (zhongxin, cla_dict, time_dict, shopp_dict, shop_order).
for ID in IDlist:
    write_csv(zhongxin,cla_dict.get(ID),time_dict.get(ID),shopp_dict.get(ID),shop_order.get(ID),ID,model_path,dict_user.get(ID))            