In [None]:

# df2 = pd.read_csv('F:/大学/第40期PRP/特征提取/1_feature_analysis/' + 'Intergated-DATASET-C' + '.csv')

In [7]:
import os
import numpy as np
import pandas as pd
import time
from utm import *
from tqdm import tqdm, tqdm_pandas
from osgeo import osr
import coordTransform

# 预设地址和其他全局变量
feature_file_name = 'Intergated-DATASET-D'
raw_data_path = 'D:\\Working\\PRP\\Data\\Raw_Data\\'
feature_dst_path = 'D:\\Working\\PRP\\Data\\Processed_Data\\' + feature_file_name + '.csv'
day_begin = '06:00:00'
day_end = '23:00:00'

#在此处设置时间窗(单位为3秒)和空间网格的边长(WGS84坐标系)
time_interval = 200
space_interval = 70
# 滞留时间阈值，超过阈值视为无效订单

# 设置时间区间 读取原数据
# 时间区间: 减少单次的处理量
jar = []

for date in range(20161108,20161116):
    if not os.path.exists('D:\\Working\\PRP\\Data\\Processed_Data\\' + f'feature_{date}.csv'):

        time1 = f'{date} {day_begin}'
        time2 = f'{date} {day_end}'
        stamp1 = time.mktime(time.strptime(time1, '%Y%m%d %H:%M:%S'))
        stamp2 = time.mktime(time.strptime(time2, '%Y%m%d %H:%M:%S'))

        print(f'正在导入数据：gps_{date}.csv')
        #导入原地理数据
        df = pd.read_csv(raw_data_path+f'gps_{date}.csv', header = None) #注意我此处使用的是移动硬盘的地址
        df.columns = ['driver_ID', 'order_ID', 'timestamp', 'lon', 'lat']
        df.timestamp = df.timestamp + 8*3600
        print(f'已导入数据：gps_{date}.csv')
        ## 只取预设时间区间内的数据
        df = df[(df['timestamp'] >= stamp1)&(df['timestamp'] < stamp2)].reset_index(drop = True)

        # 将空间坐标转换为WGS-84(耗时会很长)
        xy = df[['lon','lat']].apply(lambda x: coordTransform.gcj02_to_wgs84(x[0],x[1])[:2], axis = 1)
        df['lon'] = [x[0] for x in xy]
        df['lat'] = [x[1] for x in xy]
        print (f'{date} 已生成WGS-84')

        # 再把WGS-84转换为UTM平面直角系(保留WGS-84数据)
        wgs84 = osr.SpatialReference()
        wgs84.ImportFromEPSG(4326)
        # 2.Pseudo-Mercator
        inp = osr.SpatialReference()
        inp.ImportFromEPSG(3857)
        # 3.定义坐标变换映射
        transformation = osr.CoordinateTransformation(wgs84, inp)
        # 4.转换原数据的坐标
        xy = df[['lon','lat']].apply(lambda x: transformation.TransformPoint(x[0],x[1])[:2], axis = 1)
        # 5.写入df
        df['x'] = [x[0] for x in xy]
        df['y'] = [x[1] for x in xy]
        print (f'{date} 已生成UTM, 当前数据条数：',len(df))

        # 时间窗划分
        df['time_id'] = df.timestamp.apply(lambda x: (x - stamp1)//time_interval)

        # 空间网格划分
        # 1.计算左边界和上边界，左右-x， 上下-y
        left = df['x'].min()
        down = df['y'].max()

        # 2.生成横向和纵向索引
        df['row_id'] = df['y'].apply(lambda y: (y - down)//space_interval)
        df['col_id'] = df['x'].apply(lambda x: (x - left)//space_interval)

        print (f'{date} 已生成时空索引')

        df = df.dropna()
        print (f'{date} 已处理空值, 当前数据条数：',len(df))

        # 下面开始时空特征提取

        #1. 计算瞬时速度
        # 排序：先按司机排，同司机按订单排，同订单再按时间排
        df = df.sort_values(by = ['driver_ID', 'order_ID', 'timestamp']).reset_index(drop = True)
        # 将订单id下移一行，用于判断前后数据是否属于同一订单
        df['orderFlag'] = df['order_ID'].shift(1)
        df['identi'] = (df['orderFlag'] == df['order_ID']) #一个由boolean构成的列，方便后面所有shift完成了之后再删除分界行
        # 将坐标，时间戳下移一行，匹配相应轨迹点
        df['x1'] = df['x'].shift(1)
        df['y1'] = df['y'].shift(1)
        df['timestamp1'] = df['timestamp'].shift(1)
        # 将不属于同一订单的轨迹点删除
        df = df[df['identi'] == True]
        # 计算相邻轨迹点之间的距离和相差时间
        # 距离采用欧式距离
        dist = np.sqrt(np.square(df['x'].values - df['x1'].values) + np.square(df['y'].values - df['y1'].values))
        ttime = df['timestamp'].values - df['timestamp1'].values
        # 计算速度
        df['speed'] = dist/ttime
        # 删除临时数据
        df = df.drop(columns = ['x1', 'y1', 'orderFlag', 'timestamp1', 'identi'])
        print(f'{date} 已生成速度')

        # 2.计算瞬时加速度
        df['speed1'] = df['speed'].shift(1)
        df['timestamp1'] = df['timestamp'].shift(1)
        df['identi'] = df['order_ID'].shift(1)
        df = df[df.identi == df.order_ID]
        df['acc'] = (df.speed - df.speed1)/(df.timestamp - df.timestamp1)
        df = df.drop(columns = ['speed1', 'timestamp1', 'identi'])
        print(f'{date} 已生成加速度')

        df = df.reset_index(drop = True)

        # 下面计算集体/网格平均特征

        # 1. 网格平均速度：先求每辆车在网格中的平均速度，然后求网格中所有个体平均速度的军制
        # 基于时空网格和估计id分组
        orderGrouped = df.groupby(['row_id', 'col_id', 'time_id', 'order_ID'])
        # 网格在每个时刻（时间窗）的平均速度
        grouped_speed = orderGrouped.speed.mean().reset_index()
        grouped_speed = grouped_speed.groupby(['row_id', 'col_id', 'time_id'])
        grid_speed = grouped_speed.speed.mean()
        # 去除异常值
        grid_speed = grid_speed.clip(grid_speed.quantile(0.05), grid_speed.quantile(0.95))
        print(f'{date} 已生成网格平均速度')

        # 2. 网格平均加速度
        gridGrouped = df.groupby(['row_id', 'col_id', 'time_id'])
        grid_acc = gridGrouped.acc.mean()
        print(f'{date} 已生成网格平均加速度')

        # 3.网格浮动车流量
        grouped_volume = orderGrouped.speed.last().reset_index() #每个时空网格中的每个order只保留一辆（用last（）来取）
        grouped_volume = grouped_volume.groupby(['row_id', 'col_id', 'time_id'])
        grid_volume = grouped_volume['speed'].size()
        grid_volume = grid_volume.clip(grid_volume.quantile(0.05), grid_volume.quantile(0.95))
        print(f'{date} 已生成网格浮动车数量')

        # 4.网格车速标准差
        grid_v_std = gridGrouped.speed.std(ddof=0)
        # 去除异常值
        grid_v_std = grid_v_std.clip(grid_v_std.quantile(0.05), grid_v_std.quantile(0.95))
        print(f'{date} 已生成网格车速标准差')

        # 5.网格平均停车次数
        stopNum = gridGrouped.speed.agg(lambda x: (x==0).sum())
        grid_stop = pd.concat((stopNum, grid_volume), axis = 1)
        grid_stop['stopNum'] = stopNum.values/ grid_volume.values
        grid_stop = grid_stop['stopNum']
        grid_stop = grid_stop.clip(0, grid_stop.quantile(0.95))
        print(f'{date} 已生成网格平均停车次数')

        feature = pd.concat([grid_speed, grid_acc, grid_volume, grid_v_std, grid_stop], axis = 1).reset_index()
        feature.columns = ['row_id','col_id', 'time_id', 'aveSpeed', 'gridAcc', 'volume', 'speedStd', 'stopNum']
        print(f'{date} 已整理网格特征')
        feature.sort_values(['stopNum']).reset_index(drop=True)
        feature['date'] = date

        jar.append(feature)
        feature.to_csv('D:\\Working\\PRP\\Data\\Processed_Data\\' + f'feature_{date}.csv')
        print(f'{date} 处理完毕，放入list')
    
    else:
        feature = df.read_csv('D:\\Working\\PRP\\Data\\Processed_Data\\' + f'feature_{date}.csv')
        print(f'feature_{date} 处理完毕，放入list')

integrated_feature = pd.concat(jar, axis=0)
integrated_feature.to_csv(feature_dst_path, index = None)
print(f'已生成{feature_file_name}')


正在导入数据：gps_20161108.csv
已导入数据：gps_20161108.csv
20161108 已生成WGS-84
20161108 已生成UTM, 当前数据条数： 17848685
20161108 已生成时空索引
20161108 已处理空值, 当前数据条数： 0
20161108 已生成速度
20161108 已生成加速度
20161108 已生成网格平均速度
20161108 已生成网格平均加速度
20161108 已生成网格浮动车数量
20161108 已生成网格车速标准差
20161108 已生成网格平均停车次数
20161108 已整理网格特征
20161108 处理完毕，放入list
正在导入数据：gps_20161109.csv


FileNotFoundError: [Errno 2] No such file or directory: 'D:\\Working\\PRP\\Data\\Raw_Data\\gps_20161109.csv'