In [2]:
import os
import numpy as np
import pandas as pd
import time
from utm import *
from tqdm import tqdm, tqdm_pandas
from osgeo import osr
import coordTransform

# 预设地址和其他全局变量
feature_file_name = 'Intergated-DATASET-D'
raw_data_path = 'D:/Working/PRP-2021/Data/Raw_Data/'
feature_dst_path = 'D:/Working/PRP-2021/Data/Processed_Data/feature_analysis/' + feature_file_name + '.csv'
day_begin = '06:00:00'
day_end = '23:00:00'


In [None]:
#在此处设置时间窗(单位为3秒)和空间网格的边长(WGS84坐标系)
# 滞留时间阈值，超过阈值视为无效订单
time_interval = 200
space_interval = 70

# 设置时间区间 读取原数据
# 时间区间: 减少单次的处理量
jar = []

for date in range(20161101,20161131):
    if not os.path.exists('D:/Working/PRP-2021/Data/Processed_Data/feature_analysis/' + f'feature_{date}.csv'):

        time1 = f'{date} {day_begin}'
        time2 = f'{date} {day_end}'
        stamp1 = time.mktime(time.strptime(time1, '%Y%m%d %H:%M:%S'))
        stamp2 = time.mktime(time.strptime(time2, '%Y%m%d %H:%M:%S'))

        #导入原地理数据
        print(f'正在导入数据：gps_{date}.csv')
        # df = pd.read_csv(raw_data_path + f'gps_{date}.csv', header=None, nrows=10)
        df = pd.read_csv(raw_data_path + f'gps_{date}.csv', header=None)
        df.columns = ['driver_ID', 'order_ID', 'timestamp', 'lon', 'lat']
        df.timestamp = df.timestamp + 8*3600
        print(f'已导入数据：gps_{date}.csv')
        
        ## 只取预设时间区间内的数据
        df = df[(df['timestamp'] >= stamp1)&(df['timestamp'] < stamp2)].reset_index(drop = True)

        # 将空间坐标转换为WGS-84(耗时会很长)
        print (f'{date} 正在生成WGS-84')
        xy = df[['lon','lat']].apply(lambda x: coordTransform.gcj02_to_wgs84(x[0],x[1])[:2], axis = 1)
        
        df['lon'] = [x[0] for x in xy]
        df['lat'] = [x[1] for x in xy]
        print (f'{date} 已生成WGS-84')

        # 再把WGS-84转换为UTM平面直角系(保留WGS-84数据)
        wgs84 = osr.SpatialReference()
        wgs84.ImportFromEPSG(4326)
        # 2.Pseudo-Mercator
        inp = osr.SpatialReference()
        inp.ImportFromEPSG(3857)
        # 3.定义坐标变换映射
        transformation = osr.CoordinateTransformation(wgs84, inp)
        # 4.转换原数据的坐标
        print(f'{date} 正在转换原数据坐标')
        
        # xy = df[['lon','lat']].apply(lambda x: transformation.TransformPoint(x[0],x[1])[:2], axis=1)
        xy = df[['lon','lat']].apply(lambda x: transformation.TransformPoint(x[1],x[0])[:2], axis=1)
        print(f'{date} 已转换原数据坐标')
        
        # 5.写入df
        df['x'] = [x[0] for x in xy]
        df['y'] = [x[1] for x in xy]
        print (f'{date} 已生成UTM, 当前数据条数：',len(df))

        # 时间窗划分
        df['time_id'] = df.timestamp.apply(lambda x: (x - stamp1)//time_interval)
        
        # 空间网格划分
        # 1.计算左边界和上边界，左右-x， 上下-y
        left = df['x'].min()
        down = df['y'].max()

        # 2.生成横向和纵向索引
        df['row_id'] = df['y'].apply(lambda y: (y - down)//space_interval)
        df['col_id'] = df['x'].apply(lambda x: (x - left)//space_interval)
        print (f'{date} 已生成时空索引, 当前数据条数：',len(df))

        df = df.dropna()
        print (f'{date} 已处理空值, 当前数据条数：',len(df))

        # 下面开始时空特征提取

        #1. 计算瞬时速度
        # 排序：先按司机排，同司机按订单排，同订单再按时间排
        print(f'{date} 正在计算瞬时速度')
        df = df.sort_values(by = ['driver_ID', 'order_ID', 'timestamp']).reset_index(drop = True)
        # 将订单id下移一行，用于判断前后数据是否属于同一订单
        df['orderFlag'] = df['order_ID'].shift(1)
        df['identi'] = (df['orderFlag'] == df['order_ID']) #一个由boolean构成的列，方便后面所有shift完成了之后再删除分界行
        # 将坐标，时间戳下移一行，匹配相应轨迹点
        df['x1'] = df['x'].shift(1)
        df['y1'] = df['y'].shift(1)
        df['timestamp1'] = df['timestamp'].shift(1)
        # 将不属于同一订单的轨迹点删除
        df = df[df['identi'] == True]
        # 计算相邻轨迹点之间的距离和相差时间
        # 距离采用欧式距离
        dist = np.sqrt(np.square(df['x'].values - df['x1'].values) + np.square(df['y'].values - df['y1'].values))
        ttime = df['timestamp'].values - df['timestamp1'].values
        # 计算速度
        df['speed'] = dist/ttime
        # 删除临时数据
        df = df.drop(columns = ['x1', 'y1', 'orderFlag', 'timestamp1', 'identi'])
        print(f'{date} 已生成速度')

        # 2.计算瞬时加速度
        print(f'{date} 正在计算加速度')
        df['speed1'] = df['speed'].shift(1)
        df['timestamp1'] = df['timestamp'].shift(1)
        df['identi'] = df['order_ID'].shift(1)
        df = df[df.identi == df.order_ID]
        df['acc'] = (df.speed - df.speed1)/(df.timestamp - df.timestamp1)
        df = df.drop(columns = ['speed1', 'timestamp1', 'identi'])
        print(f'{date} 已生成加速度')

        df = df.reset_index(drop = True)

        # 下面计算集体/网格平均特征

        # 1. 网格平均速度：先求每辆车在网格中的平均速度，然后求网格中所有个体平均速度的均值
        # 基于时空网格和估计id分组
        print(f'{date} 正在生成平均网格速度')
        orderGrouped = df.groupby(['row_id', 'col_id', 'time_id', 'order_ID'])
        # 网格在每个时刻（时间窗）的平均速度
        grouped_speed = orderGrouped.speed.mean().reset_index()
        grouped_speed = grouped_speed.groupby(['row_id', 'col_id', 'time_id'])
        grid_speed = grouped_speed.speed.mean()
        # 去除异常值
        grid_speed = grid_speed.clip(grid_speed.quantile(0.05), grid_speed.quantile(0.95))
        print(f'{date} 已生成网格平均速度')

        # 2. 网格平均加速度
        print(f'{date} 正在生成网格平均加速度')
        gridGrouped = df.groupby(['row_id', 'col_id', 'time_id'])
        grid_acc = gridGrouped.acc.mean()
        print(f'{date} 已生成网格平均加速度')

        # 3.网格浮动车流量
        print(f'{date} 正在生成网格浮动车数量')
        grouped_volume = orderGrouped.speed.last().reset_index() #每个时空网格中的每个order只保留一辆（用last（）来取）
        grouped_volume = grouped_volume.groupby(['row_id', 'col_id', 'time_id'])
        grid_volume = grouped_volume['speed'].size()
        grid_volume = grid_volume.clip(grid_volume.quantile(0.05), grid_volume.quantile(0.95))
        print(f'{date} 已生成网格浮动车数量')

        # 4.网格车速标准差
        print(f'{date} 正在生成网格车速标准差')
        grid_v_std = gridGrouped.speed.std(ddof=0)
        # 去除异常值
        grid_v_std = grid_v_std.clip(grid_v_std.quantile(0.05), grid_v_std.quantile(0.95))
        print(f'{date} 已生成网格车速标准差')

        # 5.网格平均停车次数
        print(f'{date} 正在生成网格平均停车次数')
        stopNum = gridGrouped.speed.agg(lambda x: (x==0).sum())
        grid_stop = pd.concat((stopNum, grid_volume), axis = 1)
        grid_stop['stopNum'] = stopNum.values/ grid_volume.values
        grid_stop = grid_stop['stopNum']
        grid_stop = grid_stop.clip(0, grid_stop.quantile(0.95))
        print(f'{date} 已生成网格平均停车次数')

        feature = pd.concat([grid_speed, grid_acc, grid_volume, grid_v_std, grid_stop], axis = 1).reset_index()
        feature.columns = ['row_id','col_id', 'time_id', 'aveSpeed', 'gridAcc', 'volume', 'speedStd', 'stopNum']
        print(f'{date} 已整理网格特征')
        feature.sort_values(['stopNum']).reset_index(drop=True)
        feature['date'] = date

        jar.append(feature)
        feature.to_csv('D:/Working/PRP-2021/Data/Processed_Data/feature_analysis/' + f'feature_{date}.csv')
        print(f'{date} 处理完毕，放入list')
    
    else:
        feature = pd.read_csv('D:/Working/PRP-2021/Data/Processed_Data/feature_analysis/' + f'feature_{date}.csv')
        jar.append(feature)
        print(f'feature_{date} 处理完毕，放入list')

integrated_feature = pd.concat(jar, axis=0)
integrated_feature.to_csv(feature_dst_path, index = None)
print(f'已生成{feature_file_name}')


feature_20161101 处理完毕，放入list
feature_20161102 处理完毕，放入list
feature_20161103 处理完毕，放入list
feature_20161104 处理完毕，放入list
feature_20161105 处理完毕，放入list
feature_20161106 处理完毕，放入list
feature_20161107 处理完毕，放入list
feature_20161108 处理完毕，放入list
feature_20161109 处理完毕，放入list
feature_20161110 处理完毕，放入list
feature_20161111 处理完毕，放入list
feature_20161112 处理完毕，放入list
正在导入数据：gps_20161113.csv
已导入数据：gps_20161113.csv
20161113 正在生成WGS-84
20161113 已生成WGS-84


0

0

20161113 正在转换原数据坐标
20161113 已转换原数据坐标
20161113 已生成UTM, 当前数据条数： 16084285
20161113 已生成时空索引, 当前数据条数： 16084285
20161113 已处理空值, 当前数据条数： 16084285
20161113 正在计算瞬时速度
20161113 已生成速度
20161113 正在计算加速度
20161113 已生成加速度
20161113 正在生成平均网格速度
20161113 已生成网格平均速度
20161113 正在生成网格平均加速度
20161113 已生成网格平均加速度
20161113 正在生成网格浮动车数量
20161113 已生成网格浮动车数量
20161113 正在生成网格车速标准差
20161113 已生成网格车速标准差
20161113 正在生成网格平均停车次数
20161113 已生成网格平均停车次数
20161113 已整理网格特征


Unnamed: 0,row_id,col_id,time_id,aveSpeed,gridAcc,volume,speedStd,stopNum
0,-139.0,5.0,37.0,10.876096,-0.313061,1,0.000000,0.0
1,-63.0,4.0,255.0,17.260940,0.258015,12,5.368125,0.0
2,-63.0,4.0,256.0,16.647926,0.286440,8,7.210090,0.0
3,-63.0,4.0,257.0,20.247715,0.021924,9,6.567525,0.0
4,-63.0,4.0,258.0,16.506853,0.224448,16,5.965453,0.0
...,...,...,...,...,...,...,...,...
1082643,-66.0,44.0,239.0,3.509932,0.141124,1,1.557366,3.7
1082644,-87.0,1.0,255.0,4.517272,0.072816,4,2.997880,3.7
1082645,-128.0,46.0,237.0,4.408657,-0.054583,16,3.432487,3.7
1082646,-128.0,46.0,229.0,5.154871,-0.070078,13,3.201678,3.7


20161113 处理完毕，放入list
正在导入数据：gps_20161114.csv
已导入数据：gps_20161114.csv
20161114 正在生成WGS-84
20161114 已生成WGS-84


0

0

20161114 正在转换原数据坐标
20161114 已转换原数据坐标
20161114 已生成UTM, 当前数据条数： 17717385
20161114 已生成时空索引, 当前数据条数： 17717385
20161114 已处理空值, 当前数据条数： 17717385
20161114 正在计算瞬时速度
20161114 已生成速度
20161114 正在计算加速度
20161114 已生成加速度
20161114 正在生成平均网格速度
20161114 已生成网格平均速度
20161114 正在生成网格平均加速度
20161114 已生成网格平均加速度
20161114 正在生成网格浮动车数量
20161114 已生成网格浮动车数量
20161114 正在生成网格车速标准差
20161114 已生成网格车速标准差
20161114 正在生成网格平均停车次数
20161114 已生成网格平均停车次数
20161114 已整理网格特征


Unnamed: 0,row_id,col_id,time_id,aveSpeed,gridAcc,volume,speedStd,stopNum
0,-139.0,5.0,37.0,11.668589,0.707762,1,0.000000,0.0000
1,-64.0,33.0,190.0,6.185028,-0.335971,3,1.972004,0.0000
2,-64.0,33.0,191.0,9.208976,-0.240074,1,3.758844,0.0000
3,-64.0,33.0,192.0,6.699042,0.234558,2,2.306629,0.0000
4,-64.0,33.0,194.0,7.444814,-0.071790,3,1.737909,0.0000
...,...,...,...,...,...,...,...,...
1088467,-129.0,46.0,94.0,3.470819,0.005145,1,2.759617,3.9375
1088468,-37.0,91.0,65.0,3.470819,-0.344272,1,4.190357,3.9375
1088469,-129.0,46.0,103.0,3.470819,-0.494086,1,1.682019,3.9375
1088470,-120.0,27.0,276.0,3.470819,-0.152711,1,0.741404,3.9375


20161114 处理完毕，放入list
正在导入数据：gps_20161115.csv
已导入数据：gps_20161115.csv
20161115 正在生成WGS-84
20161115 已生成WGS-84


0

0

20161115 正在转换原数据坐标
20161115 已转换原数据坐标
20161115 已生成UTM, 当前数据条数： 17566478
20161115 已生成时空索引, 当前数据条数： 17566478
20161115 已处理空值, 当前数据条数： 17566478
20161115 正在计算瞬时速度
20161115 已生成速度
20161115 正在计算加速度
20161115 已生成加速度
20161115 正在生成平均网格速度
20161115 已生成网格平均速度
20161115 正在生成网格平均加速度
20161115 已生成网格平均加速度
20161115 正在生成网格浮动车数量
20161115 已生成网格浮动车数量
20161115 正在生成网格车速标准差
20161115 已生成网格车速标准差
20161115 正在生成网格平均停车次数
20161115 已生成网格平均停车次数
20161115 已整理网格特征


Unnamed: 0,row_id,col_id,time_id,aveSpeed,gridAcc,volume,speedStd,stopNum
0,-139.0,5.0,38.0,11.815249,-0.577240,1,0.000000,0.0
1,-64.0,26.0,226.0,7.473412,-0.827262,1,0.734049,0.0
2,-64.0,26.0,229.0,7.735288,0.733287,1,0.000000,0.0
3,-64.0,26.0,230.0,5.109083,0.104107,1,0.581427,0.0
4,-64.0,26.0,234.0,6.446372,0.271838,1,0.274021,0.0
...,...,...,...,...,...,...,...,...
1083506,-123.0,49.0,213.0,3.504182,0.014850,2,1.246056,3.7
1083507,-81.0,75.0,106.0,3.504182,0.124682,1,1.961193,3.7
1083508,-81.0,75.0,88.0,3.504182,-0.154405,1,2.232226,3.7
1083509,-99.0,18.0,86.0,6.392409,-0.028075,8,4.787592,3.7


20161115 处理完毕，放入list
正在导入数据：gps_20161116.csv
已导入数据：gps_20161116.csv
20161116 正在生成WGS-84
20161116 已生成WGS-84


0

0

20161116 正在转换原数据坐标
20161116 已转换原数据坐标
20161116 已生成UTM, 当前数据条数： 18493371
20161116 已生成时空索引, 当前数据条数： 18493371
20161116 已处理空值, 当前数据条数： 18493371
20161116 正在计算瞬时速度
20161116 已生成速度
20161116 正在计算加速度
20161116 已生成加速度
20161116 正在生成平均网格速度
20161116 已生成网格平均速度
20161116 正在生成网格平均加速度
20161116 已生成网格平均加速度
20161116 正在生成网格浮动车数量
20161116 已生成网格浮动车数量
20161116 正在生成网格车速标准差
20161116 已生成网格车速标准差
20161116 正在生成网格平均停车次数
20161116 已生成网格平均停车次数
20161116 已整理网格特征


Unnamed: 0,row_id,col_id,time_id,aveSpeed,gridAcc,volume,speedStd,stopNum
0,-139.0,5.0,41.0,12.737183,-0.107840,1,0.000000,0.00
1,-64.0,69.0,180.0,18.125272,0.006417,10,4.008686,0.00
2,-64.0,69.0,181.0,15.756869,0.318484,18,4.259679,0.00
3,-64.0,69.0,182.0,15.691895,0.353209,16,4.630376,0.00
4,-64.0,69.0,183.0,16.167559,-0.118217,12,3.910953,0.00
...,...,...,...,...,...,...,...,...
1089385,-123.0,95.0,155.0,7.146759,-0.089867,4,4.120526,3.75
1089386,-123.0,95.0,154.0,3.591374,-0.182427,1,3.136268,3.75
1089387,-123.0,95.0,153.0,7.780208,-0.196974,3,3.866416,3.75
1089388,-85.0,50.0,252.0,3.591374,-0.067792,2,2.024118,3.75


20161116 处理完毕，放入list
正在导入数据：gps_20161117.csv
已导入数据：gps_20161117.csv
20161117 正在生成WGS-84
20161117 已生成WGS-84


0

0

20161117 正在转换原数据坐标
20161117 已转换原数据坐标
20161117 已生成UTM, 当前数据条数： 17681301
20161117 已生成时空索引, 当前数据条数： 17681301
20161117 已处理空值, 当前数据条数： 17681301
20161117 正在计算瞬时速度
20161117 已生成速度
20161117 正在计算加速度
20161117 已生成加速度
20161117 正在生成平均网格速度
20161117 已生成网格平均速度
20161117 正在生成网格平均加速度
20161117 已生成网格平均加速度
20161117 正在生成网格浮动车数量
20161117 已生成网格浮动车数量
20161117 正在生成网格车速标准差
20161117 已生成网格车速标准差
20161117 正在生成网格平均停车次数
20161117 已生成网格平均停车次数
20161117 已整理网格特征


Unnamed: 0,row_id,col_id,time_id,aveSpeed,gridAcc,volume,speedStd,stopNum
0,-139.0,5.0,40.0,6.650290,0.169125,1,0.000000,0.000
1,-64.0,6.0,299.0,8.476843,-0.042759,4,1.462128,0.000
2,-64.0,6.0,300.0,6.838518,0.617651,4,2.470529,0.000
3,-64.0,6.0,301.0,11.613019,0.346002,4,7.449077,0.000
4,-64.0,6.0,302.0,7.503581,0.303479,4,2.182665,0.000
...,...,...,...,...,...,...,...,...
1105103,-62.0,137.0,174.0,3.508963,-0.119251,1,0.977480,3.625
1105104,-62.0,137.0,172.0,3.508963,-0.235378,1,1.244128,3.625
1105105,-106.0,92.0,298.0,3.508963,-0.050021,1,2.320652,3.625
1105106,-70.0,45.0,250.0,3.508963,-0.105767,1,1.876061,3.625


20161117 处理完毕，放入list
正在导入数据：gps_20161118.csv
已导入数据：gps_20161118.csv
20161118 正在生成WGS-84
20161118 已生成WGS-84


0

0

20161118 正在转换原数据坐标
20161118 已转换原数据坐标
20161118 已生成UTM, 当前数据条数： 19881860
20161118 已生成时空索引, 当前数据条数： 19881860
20161118 已处理空值, 当前数据条数： 19881860
20161118 正在计算瞬时速度
20161118 已生成速度
20161118 正在计算加速度
20161118 已生成加速度
20161118 正在生成平均网格速度
20161118 已生成网格平均速度
20161118 正在生成网格平均加速度
20161118 已生成网格平均加速度
20161118 正在生成网格浮动车数量
20161118 已生成网格浮动车数量
20161118 正在生成网格车速标准差
20161118 已生成网格车速标准差
20161118 正在生成网格平均停车次数
20161118 已生成网格平均停车次数
20161118 已整理网格特征


Unnamed: 0,row_id,col_id,time_id,aveSpeed,gridAcc,volume,speedStd,stopNum
0,-139.0,5.0,38.0,8.851204,1.326331e-01,1,0.000000,0.000000
1,-64.0,63.0,299.0,11.099940,1.652092e-01,8,3.018096,0.000000
2,-64.0,63.0,300.0,11.865594,2.489627e-01,7,3.240044,0.000000
3,-64.0,63.0,301.0,14.440005,9.558836e-01,12,3.059789,0.000000
4,-64.0,63.0,302.0,12.024316,2.708022e-01,11,2.689350,0.000000
...,...,...,...,...,...,...,...,...
1114333,-45.0,63.0,199.0,3.519221,-2.176755e-01,1,1.221214,3.842105
1114334,-45.0,63.0,194.0,3.519221,-7.930164e-18,1,1.425968,3.842105
1114335,-116.0,106.0,208.0,3.519221,-4.814001e-01,1,1.819248,3.842105
1114336,-9.0,18.0,293.0,5.930882,2.272883e-01,5,3.829689,3.842105


20161118 处理完毕，放入list
正在导入数据：gps_20161119.csv


In [None]:
integrated_feature.drop(['Unnamed: 0'], axis=1)

In [None]:
integrated_feature.to_csv(feature_dst_path, index = None)