In [2]:
import os
import numpy as np
import pandas as pd
import time
from utm import *
from tqdm import tqdm, tqdm_pandas
from osgeo import osr
import coordTransform

# Preset locations and other notebook-wide settings.
# feature_file_name : base name (without extension) of the combined output CSV.
# raw_data_path     : folder holding the raw per-day GPS files.
# feature_dst_path  : full path of the combined output CSV.
# day_begin/day_end : daily time window, as HH:MM:SS strings.
feature_file_name = 'Intergated-DATASET-D'
raw_data_path = 'D:/Working/PRP-2021/Data/Raw_Data/'
feature_dst_path = f'D:/Working/PRP-2021/Data/Processed_Data/feature_analysis/{feature_file_name}.csv'
day_begin, day_end = '06:00:00', '23:00:00'


In [None]:
# Spatio-temporal binning parameters: time-window length and grid-cell edge length.
# NOTE(review): the original note states the time-window unit is "3 seconds" and that
# the grid edge is in the WGS84 coordinate system. The processing loop floor-divides
# the timestamp offset by time_interval and the projected x/y offsets by
# space_interval directly - TODO confirm the intended units.
# NOTE(review): the original also carried a remark about a dwell-time threshold for
# discarding invalid orders; no such threshold is applied in the visible code.
time_interval, space_interval = 200, 70

# Raw data is processed one day at a time to limit the volume handled per pass;
# the resulting per-day feature tables are accumulated in this list.
jar = list()

# Build one grid-level feature table per day (20161101 .. 20161130; the range end is
# exclusive) and collect the tables in `jar`. A day whose per-day CSV already exists
# is loaded from that cache instead of being recomputed.
feature_cache_dir = 'D:/Working/PRP-2021/Data/Processed_Data/feature_analysis/'

# The coordinate transformation does not depend on the day, so it is built once.
# Source: WGS-84 geographic (EPSG:4326). Target: Pseudo-Mercator (EPSG:3857).
# NOTE(review): the progress messages below call the projected coordinates "UTM",
# but EPSG:3857 is Pseudo-Mercator. Lengths in that projection are presumably not
# true ground metres at this latitude, which would scale the speeds and the grid
# size - confirm this is acceptable.
wgs84 = osr.SpatialReference()
wgs84.ImportFromEPSG(4326)
inp = osr.SpatialReference()
inp.ImportFromEPSG(3857)
transformation = osr.CoordinateTransformation(wgs84, inp)

for date in range(20161101,20161131):
    feature_cache_path = feature_cache_dir + f'feature_{date}.csv'

    if not os.path.exists(feature_cache_path):

        # Daily window boundaries as epoch-like numbers.
        # NOTE(review): time.mktime interprets the strings in the machine's local time
        # zone, while the GPS timestamps below get a fixed +8 h shift. Whether the two
        # line up depends on the time zone of the machine running this - confirm.
        time1 = f'{date} {day_begin}'
        time2 = f'{date} {day_end}'
        stamp1 = time.mktime(time.strptime(time1, '%Y%m%d %H:%M:%S'))
        stamp2 = time.mktime(time.strptime(time2, '%Y%m%d %H:%M:%S'))

        # Load the raw GPS records of the day (the file has no header row).
        print(f'正在导入数据：gps_{date}.csv')
        df = pd.read_csv(raw_data_path + f'gps_{date}.csv', header=None)
        df.columns = ['driver_ID', 'order_ID', 'timestamp', 'lon', 'lat']
        df.timestamp = df.timestamp + 8*3600
        print(f'已导入数据：gps_{date}.csv')

        # Keep only the records inside the preset daily window [stamp1, stamp2).
        df = df[(df['timestamp'] >= stamp1)&(df['timestamp'] < stamp2)].reset_index(drop = True)

        # Convert GCJ-02 coordinates to WGS-84 (slow: one Python call per record).
        # Iterating the two columns with zip avoids a row-wise DataFrame.apply and
        # positional indexing into a label-indexed row.
        print (f'{date} 正在生成WGS-84')
        wgs_points = [coordTransform.gcj02_to_wgs84(lon, lat)[:2]
                      for lon, lat in zip(df['lon'], df['lat'])]
        df['lon'] = [p[0] for p in wgs_points]
        df['lat'] = [p[1] for p in wgs_points]
        print (f'{date} 已生成WGS-84')

        # Project WGS-84 to planar x/y (the WGS-84 columns are kept).
        # TransformPoint is called with (lat, lon); the author had previously tried
        # (lon, lat) and switched. NOTE(review): the expected axis order presumably
        # depends on the installed GDAL version - confirm.
        print(f'{date} 正在转换原数据坐标')
        projected = [transformation.TransformPoint(lat, lon)[:2]
                     for lon, lat in zip(df['lon'], df['lat'])]
        print(f'{date} 已转换原数据坐标')

        df['x'] = [p[0] for p in projected]
        df['y'] = [p[1] for p in projected]
        print (f'{date} 已生成UTM, 当前数据条数：',len(df))

        # Time-window index: whole time_interval steps elapsed since the window start.
        df['time_id'] = (df['timestamp'] - stamp1) // time_interval

        # Spatial grid index. The origin is the minimum x and the MAXIMUM y of the
        # day's data, so row_id is <= 0 and col_id is >= 0.
        left = df['x'].min()
        down = df['y'].max()
        df['row_id'] = (df['y'] - down) // space_interval
        df['col_id'] = (df['x'] - left) // space_interval
        print (f'{date} 已生成时空索引, 当前数据条数：',len(df))

        df = df.dropna()
        print (f'{date} 已处理空值, 当前数据条数：',len(df))

        # ---- Per-record features -------------------------------------------------

        # 1. Instantaneous speed.
        # Sort by driver, then order, then time so that consecutive rows of one order
        # are adjacent; each row is then compared with the row directly above it.
        print(f'{date} 正在计算瞬时速度')
        df = df.sort_values(by = ['driver_ID', 'order_ID', 'timestamp']).reset_index(drop = True)
        same_order = (df['order_ID'].shift(1) == df['order_ID'])
        prev_x, prev_y, prev_ts = df['x'].shift(1), df['y'].shift(1), df['timestamp'].shift(1)
        # Euclidean distance between consecutive points divided by the time difference.
        # NOTE(review): two records of one order with the same timestamp give a zero
        # time difference and therefore inf/NaN speeds; they are not filtered here.
        dist = np.sqrt(np.square(df['x'] - prev_x) + np.square(df['y'] - prev_y))
        df['speed'] = dist / (df['timestamp'] - prev_ts)
        # Drop the first point of every order (its predecessor belongs to another
        # order). The explicit copy keeps later column assignments off a filtered view.
        df = df[same_order].copy()
        print(f'{date} 已生成速度')

        # 2. Instantaneous acceleration: speed change over the time difference, again
        # only between consecutive rows of the same order.
        print(f'{date} 正在计算加速度')
        same_order = (df['order_ID'].shift(1) == df['order_ID'])
        df['acc'] = (df['speed'] - df['speed'].shift(1)) / (df['timestamp'] - df['timestamp'].shift(1))
        df = df[same_order]
        print(f'{date} 已生成加速度')

        df = df.reset_index(drop = True)

        # ---- Grid-level (aggregate) features -------------------------------------

        # 1. Grid mean speed: average each order's speed inside a space-time cell
        # first, then average those per-order means over the cell.
        print(f'{date} 正在生成平均网格速度')
        orderGrouped = df.groupby(['row_id', 'col_id', 'time_id', 'order_ID'])
        grouped_speed = orderGrouped.speed.mean().reset_index()
        grouped_speed = grouped_speed.groupby(['row_id', 'col_id', 'time_id'])
        grid_speed = grouped_speed.speed.mean()
        # Limit outliers by clipping to the 5th-95th percentile range.
        grid_speed = grid_speed.clip(grid_speed.quantile(0.05), grid_speed.quantile(0.95))
        print(f'{date} 已生成网格平均速度')

        # 2. Grid mean acceleration over all records in the cell.
        print(f'{date} 正在生成网格平均加速度')
        gridGrouped = df.groupby(['row_id', 'col_id', 'time_id'])
        grid_acc = gridGrouped.acc.mean()
        print(f'{date} 已生成网格平均加速度')

        # 3. Floating-car volume: number of distinct orders per cell. last() reduces
        # every (cell, order) group to a single row before counting.
        print(f'{date} 正在生成网格浮动车数量')
        grouped_volume = orderGrouped.speed.last().reset_index()
        grouped_volume = grouped_volume.groupby(['row_id', 'col_id', 'time_id'])
        grid_volume = grouped_volume['speed'].size()
        grid_volume = grid_volume.clip(grid_volume.quantile(0.05), grid_volume.quantile(0.95))
        print(f'{date} 已生成网格浮动车数量')

        # 4. Population standard deviation (ddof=0) of speed per cell, clipped to the
        # 5th-95th percentile range.
        print(f'{date} 正在生成网格车速标准差')
        grid_v_std = gridGrouped.speed.std(ddof=0)
        grid_v_std = grid_v_std.clip(grid_v_std.quantile(0.05), grid_v_std.quantile(0.95))
        print(f'{date} 已生成网格车速标准差')

        # 5. Mean stop count: zero-speed records per cell divided by the cell volume.
        # The division is positional (.values), so it relies on both series listing
        # the cells in the same order. The divisor is the already clipped volume.
        print(f'{date} 正在生成网格平均停车次数')
        stopNum = gridGrouped.speed.agg(lambda x: (x==0).sum())
        grid_stop = pd.concat((stopNum, grid_volume), axis = 1)
        grid_stop['stopNum'] = stopNum.values/ grid_volume.values
        grid_stop = grid_stop['stopNum']
        grid_stop = grid_stop.clip(0, grid_stop.quantile(0.95))
        print(f'{date} 已生成网格平均停车次数')

        # Assemble the per-cell feature table for the day.
        feature = pd.concat([grid_speed, grid_acc, grid_volume, grid_v_std, grid_stop], axis = 1).reset_index()
        feature.columns = ['row_id','col_id', 'time_id', 'aveSpeed', 'gridAcc', 'volume', 'speedStd', 'stopNum']
        print(f'{date} 已整理网格特征')
        feature['date'] = date

        jar.append(feature)
        # Write the cache without the index so that a reloaded table has exactly the
        # same columns as a freshly computed one.
        feature.to_csv(feature_cache_path, index=False)
        print(f'{date} 处理完毕，放入list')

    else:
        feature = pd.read_csv(feature_cache_path)
        # Cache files written earlier with the index included carry an extra
        # 'Unnamed: 0' column; drop it so all tables in `jar` share one schema.
        feature = feature.drop(columns=['Unnamed: 0'], errors='ignore')
        jar.append(feature)
        print(f'feature_{date} 处理完毕，放入list')

# Stack the per-day feature tables row-wise and export the combined table
# to feature_dst_path without an index column.
integrated_feature = pd.concat(jar, axis='index')
integrated_feature.to_csv(path_or_buf=feature_dst_path, index=None)
print(f'已生成{feature_file_name}')


正在导入数据：gps_20161101.csv
已导入数据：gps_20161101.csv
20161101 正在生成WGS-84
20161101 已生成WGS-84


0

0

20161101 正在转换原数据坐标


0    [104.07260397862196, 30.729617641063843]
1    [104.07260398874016, 30.729397749048378]
2     [104.07251412961236, 30.72909798960918]
3    [104.07244424804763, 30.728678268441094]
4     [104.0724442701216, 30.728198503676833]
5     [104.07243430177668, 30.72781870021522]
6    [104.07236442739094, 30.727249052301808]
7     [104.07223462690926, 30.72693934028506]
8    [104.07204490933553, 30.726719648488814]
9     [104.07181525429485, 30.72644003035685]
dtype: object

Unnamed: 0,lon,lat
0,104.072604,30.729618
1,104.072604,30.729398
2,104.072514,30.729098
3,104.072444,30.728678
4,104.072444,30.728199
5,104.072434,30.727819
6,104.072364,30.727249
7,104.072235,30.726939
8,104.072045,30.72672
9,104.071815,30.72644


20161101 已转换原数据坐标


0     (11585309.280430213, 3597684.406647787)
1    (11585309.281556565, 3597655.9299446233)
2     (11585299.278484216, 3597617.110265155)
3    (11585291.499304015, 3597562.7554116645)
4    (11585291.501761278, 3597500.6250647185)
5     (11585290.392090198, 3597451.440094536)
6     (11585282.613709157, 3597377.670420278)
7    (11585268.164385634, 3597337.5627505435)
8      (11585247.04512193, 3597309.112766901)
9    (11585221.480039744, 3597272.9024449256)
dtype: object

20161101 已生成UTM, 当前数据条数： 16571199
20161101 已生成时空索引, 当前数据条数： 16571199
20161101 已处理空值, 当前数据条数： 16571199
20161101 正在计算瞬时速度
20161101 已生成速度
20161101 正在计算加速度
20161101 已生成加速度
20161101 正在生成平均网格速度
20161101 已生成网格平均速度
20161101 正在生成网格平均加速度
20161101 已生成网格平均加速度
20161101 正在生成网格浮动车数量
20161101 已生成网格浮动车数量
20161101 正在生成网格车速标准差
20161101 已生成网格车速标准差
20161101 正在生成网格平均停车次数
20161101 已生成网格平均停车次数
20161101 已整理网格特征


Unnamed: 0,row_id,col_id,time_id,aveSpeed,gridAcc,volume,speedStd,stopNum
0,-139.0,5.0,45.0,15.571961,-2.323410,1,0.000000,0.0
1,-64.0,69.0,69.0,20.348285,0.060206,7,4.651708,0.0
2,-64.0,69.0,70.0,21.670172,1.438952,6,4.604549,0.0
3,-64.0,69.0,71.0,17.414499,-0.627252,4,5.496352,0.0
4,-64.0,69.0,72.0,21.101985,-1.409720,3,1.235348,0.0
...,...,...,...,...,...,...,...,...
1069818,-132.0,30.0,273.0,3.455001,-0.071978,5,3.310671,4.0
1069819,-84.0,90.0,67.0,3.455001,-0.507051,1,1.460537,4.0
1069820,-132.0,30.0,276.0,3.455001,-0.032002,1,1.223452,4.0
1069821,-132.0,30.0,269.0,5.905002,-0.333561,4,3.072655,4.0


20161101 处理完毕，放入list
正在导入数据：gps_20161102.csv
已导入数据：gps_20161102.csv
20161102 正在生成WGS-84
20161102 已生成WGS-84


0

0

20161102 正在转换原数据坐标


0     [104.0980155493755, 30.711444609444808]
1     [104.09785557939023, 30.71119471333504]
2    [104.09762562478352, 30.710834864836645]
3    [104.09746565742418, 30.710594966547934]
4    [104.09736567893059, 30.710435035428265]
5    [104.09736567893059, 30.710435035428265]
6    [104.09736567893059, 30.710435035428265]
7     [104.09733568510973, 30.71039505235886]
8     [104.09729569418772, 30.71032508293797]
9     [104.09724570530042, 30.71024511767924]
dtype: object

Unnamed: 0,lon,lat
0,104.098016,30.711445
1,104.097856,30.711195
2,104.097626,30.710835
3,104.097466,30.710595
4,104.097366,30.710435
5,104.097366,30.710435
6,104.097366,30.710435
7,104.097336,30.710395
8,104.097296,30.710325
9,104.097246,30.710245


20161102 已转换原数据坐标


0    (11588138.083546756, 3595331.1613828954)
1     (11588120.275769452, 3595298.805159875)
2    (11588094.677339727, 3595252.2125919643)
3    (11588076.869854743, 3595221.1510685496)
4    (11588065.740299746, 3595200.4435680965)
5    (11588065.740299746, 3595200.4435680965)
6    (11588065.740299746, 3595200.4435680965)
7    (11588062.401402881, 3595195.2666608654)
8    (11588057.949633807, 3595186.2072014515)
9    (11588052.384896327, 3595175.8535148907)
dtype: object

20161102 已生成UTM, 当前数据条数： 16779960
20161102 已生成时空索引, 当前数据条数： 16779960
20161102 已处理空值, 当前数据条数： 16779960
20161102 正在计算瞬时速度
20161102 已生成速度
20161102 正在计算加速度
20161102 已生成加速度
20161102 正在生成平均网格速度
20161102 已生成网格平均速度
20161102 正在生成网格平均加速度
20161102 已生成网格平均加速度
20161102 正在生成网格浮动车数量
20161102 已生成网格浮动车数量
20161102 正在生成网格车速标准差
20161102 已生成网格车速标准差
20161102 正在生成网格平均停车次数
20161102 已生成网格平均停车次数
20161102 已整理网格特征


Unnamed: 0,row_id,col_id,time_id,aveSpeed,gridAcc,volume,speedStd,stopNum
0,-139.0,5.0,39.0,15.725984,-0.261754,1,0.000000,0.0
1,-64.0,69.0,232.0,16.713385,-0.222366,15,3.237986,0.0
2,-64.0,69.0,233.0,18.026514,0.148975,20,3.907184,0.0
3,-64.0,69.0,235.0,17.441805,0.029500,20,4.475669,0.0
4,-64.0,69.0,236.0,17.048346,0.363370,20,4.287352,0.0
...,...,...,...,...,...,...,...,...
1048119,-98.0,11.0,195.0,6.605736,0.032153,5,2.562818,4.0
1048120,-115.0,42.0,270.0,5.079351,0.005800,2,7.446557,4.0
1048121,-115.0,42.0,271.0,4.948507,-0.040225,4,3.154743,4.0
1048122,-11.0,114.0,218.0,3.416498,-0.056695,1,2.190703,4.0


20161102 处理完毕，放入list
正在导入数据：gps_20161103.csv
已导入数据：gps_20161103.csv
20161103 正在生成WGS-84
20161103 已生成WGS-84


0

0

20161103 正在转换原数据坐标


0    [104.04034630362229, 30.686713205641425]
1    [104.04058563065425, 30.686402794144303]
2     [104.04081498830611, 30.68609240737521]
3    [104.04104434656463, 30.685812008518784]
4    [104.04123381830861, 30.685571684254477]
5     [104.0414232914014, 30.685331361121644]
6    [104.04155293192709, 30.685161143330085]
7    [104.04164268353647, 30.685040993916648]
8    [104.04170251811205, 30.684960894449087]
9    [104.04173243499874, 30.684930840190432]
dtype: object

Unnamed: 0,lon,lat
0,104.040346,30.686713
1,104.040586,30.686403
2,104.040815,30.686092
3,104.041044,30.685812
4,104.041234,30.685572
5,104.041423,30.685331
6,104.041553,30.685161
7,104.041643,30.685041
8,104.041703,30.684961
9,104.041732,30.684931


20161103 已转换原数据坐标


0    (11581718.372475075, 3592129.3776265727)
1     (11581745.014238406, 3592089.196261606)
2     (11581770.54621542, 3592049.0182267413)
3      (11581796.07825997, 3592012.722090164)
4     (11581817.170158029, 3591981.613479707)
5    (11581838.262206236, 3591950.5050931415)
6    (11581852.693723543, 3591928.4714689124)
7      (11581862.684826996, 3591912.91890521)
8    (11581869.345581481, 3591902.5505584637)
9      (11581872.675914073, 3591898.66023551)
dtype: object

20161103 已生成UTM, 当前数据条数： 17039757
20161103 已生成时空索引, 当前数据条数： 17039757
20161103 已处理空值, 当前数据条数： 17039757
20161103 正在计算瞬时速度
20161103 已生成速度
20161103 正在计算加速度
20161103 已生成加速度
20161103 正在生成平均网格速度
20161103 已生成网格平均速度
20161103 正在生成网格平均加速度
20161103 已生成网格平均加速度
20161103 正在生成网格浮动车数量
20161103 已生成网格浮动车数量
20161103 正在生成网格车速标准差
20161103 已生成网格车速标准差
20161103 正在生成网格平均停车次数
20161103 已生成网格平均停车次数
20161103 已整理网格特征


Unnamed: 0,row_id,col_id,time_id,aveSpeed,gridAcc,volume,speedStd,stopNum
0,-139.0,5.0,45.0,12.339362,0.053492,2,3.739539,0.0
1,-64.0,49.0,143.0,12.508457,1.578744,1,0.000000,0.0
2,-64.0,49.0,144.0,11.214473,-0.001130,1,1.725317,0.0
3,-64.0,49.0,145.0,14.976904,-1.372631,1,0.000000,0.0
4,-64.0,49.0,147.0,12.077121,0.718560,1,0.431310,0.0
...,...,...,...,...,...,...,...,...
1059561,-79.0,53.0,245.0,4.777468,-0.086574,8,2.319195,4.0
1059562,-79.0,53.0,243.0,5.143713,-0.056621,12,2.558145,4.0
1059563,-79.0,53.0,242.0,3.720115,0.009337,11,1.669069,4.0
1059564,-79.0,53.0,272.0,3.677076,0.072207,12,2.772552,4.0


20161103 处理完毕，放入list
正在导入数据：gps_20161104.csv
已导入数据：gps_20161104.csv
20161104 正在生成WGS-84
20161104 已生成WGS-84


0

0

20161104 正在转换原数据坐标


0    [104.06748524342925, 30.656157542734263]
1    [104.06760503651954, 30.656317322524455]
2    [104.06767491630477, 30.656407195790347]
3     [104.06778472795267, 30.65654699767141]
4     [104.0679244885599, 30.656736741137127]
5    [104.06810418102309, 30.657006401370055]
6    [104.06821399044262, 30.657246161879094]
7    [104.06831381762657, 30.657465943785706]
8     [104.0684236281953, 30.657705705206354]
9    [104.06852345596653, 30.657935483599392]
dtype: object

Unnamed: 0,lon,lat
0,104.067485,30.656158
1,104.067605,30.656317
2,104.067675,30.656407
3,104.067785,30.656547
4,104.067924,30.656737
5,104.068104,30.657006
6,104.068214,30.657246
7,104.068314,30.657466
8,104.068424,30.657706
9,104.068523,30.657935


20161104 已转换原数据坐标


0     (11584739.465435054, 3588174.704413267)
1    (11584752.800740866, 3588195.3806943907)
2    (11584760.579722974, 3588207.0107466527)
3    (11584772.803899704, 3588225.1018341244)
4    (11584788.361979334, 3588249.6556631615)
5    (11584808.365252834, 3588284.5512363357)
6       (11584820.5891815, 3588315.577696345)
7     (11584831.701892786, 3588344.018871974)
8    (11584843.925949382, 3588375.0455974583)
9    (11584855.038726041, 3588404.7805248694)
dtype: object

20161104 已生成UTM, 当前数据条数： 18829835
20161104 已生成时空索引, 当前数据条数： 18829835
20161104 已处理空值, 当前数据条数： 18829835
20161104 正在计算瞬时速度
20161104 已生成速度
20161104 正在计算加速度
20161104 已生成加速度
20161104 正在生成平均网格速度
20161104 已生成网格平均速度
20161104 正在生成网格平均加速度
20161104 已生成网格平均加速度
20161104 正在生成网格浮动车数量
20161104 已生成网格浮动车数量
20161104 正在生成网格车速标准差
20161104 已生成网格车速标准差
20161104 正在生成网格平均停车次数
20161104 已生成网格平均停车次数
20161104 已整理网格特征


Unnamed: 0,row_id,col_id,time_id,aveSpeed,gridAcc,volume,speedStd,stopNum
0,-139.0,5.0,46.0,15.788312,1.496082,2,5.456512,0.000000
1,-64.0,5.0,298.0,11.158816,0.717452,3,1.539904,0.000000
2,-64.0,5.0,299.0,9.793044,-0.679573,2,1.867345,0.000000
3,-64.0,5.0,300.0,10.999356,-0.490480,3,3.984289,0.000000
4,-64.0,5.0,301.0,12.738917,-0.598160,4,1.108836,0.000000
...,...,...,...,...,...,...,...,...
1079112,-128.0,34.0,199.0,3.333103,0.005297,6,0.651654,4.055556
1079113,-128.0,34.0,200.0,3.333103,0.014602,6,1.293085,4.055556
1079114,-128.0,34.0,201.0,3.333103,0.004921,6,1.350436,4.055556
1079115,-128.0,34.0,231.0,6.304900,-0.085818,17,4.102796,4.055556


20161104 处理完毕，放入list
正在导入数据：gps_20161105.csv
已导入数据：gps_20161105.csv
20161105 正在生成WGS-84
20161105 已生成WGS-84


0

0

20161105 正在转换原数据坐标


0    [104.04103259407307, 30.725183406126337]
1    [104.04145138858635, 30.725512287152192]
2    [104.04187018924462, 30.725851168996382]
3    [104.04230894065384, 30.726190011657106]
4    [104.04275767118621, 30.726538833476056]
5    [104.04323632713823, 30.726877600427905]
6    [104.04373493763403, 30.727206336009168]
7    [104.04426347808753, 30.727495034197382]
8    [104.04485186867065, 30.727763622149634]
9     [104.04551008746122, 30.72800208718646]
dtype: object

Unnamed: 0,lon,lat
0,104.041033,30.725183
1,104.041451,30.725512
2,104.04187,30.725851
3,104.042309,30.72619
4,104.042758,30.726539
5,104.043236,30.726878
6,104.043735,30.727206
7,104.044263,30.727495
8,104.044852,30.727764
9,104.04551,30.728002


20161105 已转换原数据坐标


0     (11581794.769978592, 3597110.171919934)
1    (11581841.389970558, 3597152.7612025244)
2     (11581888.010646582, 3597196.645718445)
3     (11581936.852230038, 3597240.525314433)
4     (11581986.804684406, 3597285.697356965)
5      (11582040.08842125, 3597329.567461709)
6    (11582095.593487747, 3597372.1386577454)
7     (11582154.430341896, 3597409.525138635)
8      (11582219.929681994, 3597444.30743995)
9     (11582293.202262593, 3597475.188886832)
dtype: object

20161105 已生成UTM, 当前数据条数： 18899337


In [None]:
# Remove the leftover index column that appears when per-day tables are reloaded
# from CSV files that were saved with their index. The result is assigned back so
# the later export actually sees the change, and errors='ignore' keeps the cell
# from failing when the column is not present.
integrated_feature = integrated_feature.drop(columns=['Unnamed: 0'], errors='ignore')
integrated_feature

In [None]:
# Export the combined feature table to feature_dst_path, without an index column.
integrated_feature.to_csv(path_or_buf=feature_dst_path, index=None)