本notebook用于对原始GPS轨迹数据做基本的筛选和格式化：按order_id把轨迹点聚合成一条轨迹字符串，并筛掉相邻点时间差或经纬度差过大的订单。

In [1]:
import pandas as pd
from utils import Timer
from joblib import Parallel, delayed

# Shared stage timer: data_reformat calls timer.tik(label) before a stage
# and timer.tok() after it.
timer = Timer()

In [2]:
def data_reformat(file_path, t_diff_limit=20, lon_lat_diff_limit=0.005, n_jobs=6):
    """Aggregate raw GPS points into one trajectory per order, filter, and save.

    Reads a header-less CSV whose five columns are labelled
    ``name, order_id, time, lon, lat``, groups the rows by ``order_id`` and
    builds one ``"(lon lat),(lon lat),..."`` string per order. An order is
    kept only if its largest consecutive ``time`` difference is below
    *t_diff_limit* and the sum of its largest consecutive ``lon`` and ``lat``
    differences is below *lon_lat_diff_limit*. The surviving rows (index
    ``order_id``, columns ``traj`` and ``len``) are written as CSV to
    ``file_path + "_format"``.

    Args:
        file_path: Path of the input CSV; also the prefix of the output path.
        t_diff_limit: Exclusive upper bound on the largest consecutive
            ``time`` difference within an order (same unit as the ``time``
            column).
        lon_lat_diff_limit: Exclusive upper bound on the largest consecutive
            ``lon`` difference plus the largest consecutive ``lat``
            difference within an order.
        n_jobs: Number of joblib workers used for the per-order aggregation.

    Returns:
        None. Progress is printed and the result is written to disk.
    """
    timer.tik("read")
    # read_csv already returns a DataFrame; no extra constructor is needed.
    df = pd.read_csv(file_path, header=None)
    df.columns = ['name', 'order_id', 'time', 'lon', 'lat']  # lon: longitude, lat: latitude
    timer.tok()

    def group_concat(name, x: pd.DataFrame):
        """Summarise the rows of one order into a single Series."""
        # Join the formatted points in one pass instead of iterrows() + "+=",
        # which builds a Series per row and re-copies the growing string.
        points = zip(x['lon'].tolist(), x['lat'].tolist())
        traj_str = ",".join("({} {})".format(lon, lat) for lon, lat in points)
        # NOTE(review): diff() is taken in file order, so this presumably
        # assumes rows are already time-sorted within an order -- confirm.
        # NOTE(review): diff().max() is signed, so only the largest increase
        # is measured; a large decrease in lon/lat does not raise the value.
        # Confirm whether .abs() was intended.
        return pd.Series({'order_id': name,
                          'traj': traj_str,
                          'len': len(x),
                          "max_time_diff": x['time'].diff().max(),
                          'max_lon_diff': x['lon'].diff().max(),
                          'max_lat_diff': x['lat'].diff().max()})

    def applyParallel(df_groups, func, n=n_jobs):
        """Run ``func(name, group)`` for every group in parallel and stack the results."""
        res = Parallel(n_jobs=n)(delayed(func)(name, group) for name, group in df_groups)
        return pd.DataFrame(res)

    # group-apply
    timer.tik("group-apply")
    group_df = applyParallel(df.groupby("order_id"), group_concat)
    timer.tok()

    total = len(group_df)
    if total == 0:
        # With no groups the frame has no columns, so the filter below would
        # raise KeyError and the percentage would divide by zero.
        print("no orders found in {}, nothing written\n".format(file_path))
        return

    # filter
    # Orders with a single point have NaN diffs; NaN comparisons are False,
    # so those orders are dropped here as well.
    keep = (group_df['max_time_diff'] < t_diff_limit) & (
            group_df['max_lon_diff'] + group_df['max_lat_diff'] < lon_lat_diff_limit)
    f_group = group_df.loc[keep, ['order_id', 'traj', 'len']].set_index('order_id')
    print("剩{}/{}条，筛掉{}%".format(len(f_group), total, round(100 - 100 * len(f_group) / total)))

    # save
    f_group.to_csv(file_path + "_format")
    print("done\n")

In [5]:
# Process a single sample file. The commented-out loop below is the batch
# variant: it calls data_reformat once per day for days 16 through 30,
# building each path from the zero-padded day number.
data_reformat("data/1m_gps_20161101")
# for i in range(16, 31):
#     day = str(i).zfill(2)
#     print("day:{}".format(day))
#     file_path = r'data/cxwang@mail.xjtu.edu.cn_201611{}/gps_201611{}'.format(day, day)
#     data_reformat(file_path)

read start
read done:1.1561s
group-apply start
group-apply done:13.5368s
剩4142/5650条，筛掉27%
done

