In [2]:
import numpy as np
import json
import pandas as pd
from utils import Timer
from joblib import Parallel, delayed
from traj2grid import Traj2Grid
import traj_dist.distance as tdist
from parameters import *
import numpy as np


#### 读取数据


In [3]:
# Stopwatch reused by the later cells of this notebook.
timer = Timer()

# Run configuration: input locations and sizes.
nrows = 2000000
vocab_size = 400
file_path = "data/full/gps_20161101"
dict_path = "data/str_grid2idx_400_44612.json"

# The CSV has no header row, so the column names are supplied here.
csv_columns = ["name", "id", "time", "lon", "lat"]

# Read the first `nrows` records, keeping only the id, lon and lat columns.
timer.tik("read data")
df = pd.read_csv(
    file_path,
    names=csv_columns,
    usecols=["id", "lon", "lat"],
    nrows=nrows,
)
timer.tok("read {}".format(file_path))


read data start
read data/full/gps_20161101 done, 1.181s after read data start


1.1805870532989502

#### 去除超出范围的数据点


In [4]:
# Margin trimmed off every side of the lon/lat bounding box.
pad = 0.002

# Point count before filtering, used in the report line below.
l = len(df)
# Strict inequalities on both axes; a point must satisfy all four bounds.
lon_inside = df["lon"].gt(104.04214 + pad) & df["lon"].lt(104.12958 - pad)
lat_inside = df["lat"].gt(30.65294 + pad) & df["lat"].lt(30.72775 - pad)
df = df[lon_inside & lat_inside]
print(f"剩{len(df)}/{l}个点，筛掉{round(100 - 100 * len(df) / l)}%")


剩1895778/2000000个点，筛掉5%


#### GroupBy转换为1维点列

In [5]:
# Load the serialized grid -> index vocabulary. The context manager closes the
# file handle deterministically instead of leaving it to the garbage collector.
with open(dict_path) as dict_file:
    str_grid2idx = json.load(dict_file)
t2g = Traj2Grid(row_num, column_num, min_lon, min_lat, max_lon, max_lat)
# JSON object keys are always strings, so every key is evaluated back into a
# Python object before being used as a vocabulary key.
# NOTE(review): eval() executes arbitrary code; this is only acceptable while
# dict_path points to a trusted, locally generated file. If the keys are plain
# literals, ast.literal_eval would be the safe replacement - confirm and switch.
grid2idx = {eval(key): idx for key, idx in str_grid2idx.items()}
t2g.set_vocab(grid2idx)
timer.tok(f"load dict{dict_path}")


def group_concat(group: pd.DataFrame):
    """Collapse the points of one group into a single summary row.

    Args:
        group: DataFrame holding the points of one trajectory; it must provide
            "lon" and "lat" columns. Row order is taken as point order.

    Returns:
        pd.Series with the two results of ``t2g.convert1d`` ("trajs" and
        "origin_trajs"), the length of the converted sequence ("len") and the
        lon/lat extremes of the group ("max_lon", "max_lat", "min_lon",
        "min_lat").
    """
    # Pair the two coordinate columns directly. iterrows() would construct a
    # full Series for every single point, which is far slower and coerces each
    # row to one common dtype.
    origin_traj = list(zip(group["lon"], group["lat"]))
    traj_1d, coord_traj = t2g.convert1d(origin_traj)
    return pd.Series({
        "origin_trajs": coord_traj,
        "trajs": traj_1d,
        "len": len(traj_1d),
        "max_lon": group["lon"].max(),
        "max_lat": group["lat"].max(),
        "min_lon": group["lon"].min(),
        "min_lat": group["lat"].min(),
    })

# One task per "id" group; the group key itself is not needed by the worker.
group_tasks = (delayed(group_concat)(points) for _, points in df.groupby("id"))
res = Parallel(n_jobs=44)(group_tasks)
# From here on `df` holds one row per group instead of one row per point.
df = pd.DataFrame(res)
timer.tok("group-apply")


load dictdata/str_grid2idx_400_44612.json done, 15.079s after read data start
group-apply done, 113.32s after read data start


113.320148229599

#### 过滤0长度轨迹

In [6]:
# Keep only rows whose converted sequence has at least one element.
has_points = df["len"] > 0
dff = df[has_points]
print(f"剩{len(dff)}/{len(df)}条轨迹，筛掉{round(100 - 100 * len(dff) / len(df))}%")

剩10955/10955条轨迹，筛掉0%


#### 生成pair-wise轨迹距离矩阵


In [7]:
# Turn every trajectory into a NumPy array so it can be fed to the distance
# function.
origin_trajs = dff["origin_trajs"].to_list()
arr = list(map(np.array, origin_trajs))
length = len(arr)
# Square pairwise matrix, zero-initialized (the diagonal is never written).
dis_matrix = np.zeros(shape=(length, length))
# Metric applied to every trajectory pair.
dis_func = tdist.discret_frechet


def cal_dis(i, j, x, y, n):
    """Compute the distance for one (i, j) pair and occasionally log progress.

    Args:
        i, j: Indices of the pair; echoed back so the caller can place the
            value in the matrix.
        x, y: The two trajectories handed to ``dis_func``.
        n: Largest row index, used only to scale the progress percentage.

    Returns:
        Tuple ``(i, j, distance)``.
    """
    outcome = (i, j, dis_func(x, y))
    # Log once per 100 rows, on the pair directly below the diagonal.
    just_below_diagonal = (j + 1 == i)
    if just_below_diagonal and i % 100 == 1:
        percent = round((i * i) / (n * n) * 100, 2)
        timer.tok(f'{i}-{percent}%')
    return outcome

# Enumerate the strict lower triangle: every unordered pair exactly once.
pair_tasks = (
    delayed(cal_dis)(row, col, arr[row], arr[col], length - 1)
    for row in range(length)
    for col in range(row)
)
res = Parallel(n_jobs=44)(pair_tasks)
timer.tok("calculate distance")
# Mirror each value across the diagonal so the matrix is symmetric.
for i, j, dis in res:
    dis_matrix[i, j] = dis_matrix[j, i] = dis


1-0.0% done, 6698.401s after read data start
101-0.01% done, 6699.247s after read data start
201-0.03% done, 6699.943s after read data start
301-0.08% done, 6700.931s after read data start
401-0.13% done, 6702.526s after read data start
501-0.21% done, 6704.544s after read data start
601-0.3% done, 6706.832s after read data start
701-0.41% done, 6709.584s after read data start
801-0.53% done, 6712.513s after read data start
901-0.68% done, 6715.818s after read data start
1001-0.84% done, 6719.786s after read data start
1101-1.01% done, 6724.256s after read data start
1201-1.2% done, 6728.53s after read data start
1301-1.41% done, 6734.16s after read data start
1401-1.64% done, 6740.227s after read data start
1501-1.88% done, 6747.025s after read data start
1601-2.14% done, 6753.494s after read data start
1701-2.41% done, 6760.741s after read data start
1801-2.7% done, 6767.807s after read data start
1901-3.01% done, 6775.829s after read data start
2001-3.34% done, 6784.131s after read 

#### 生成 Train Dataset 第六步：保存

In [8]:
# True: also store the pairwise distance matrix and use the "_full" file name;
# False: store only the trajectories under the "_small" file name.
full = True

# Reuse the input file's base name under the output directory.
file_name = file_path.split("/")[-1]
save_path = "data/test/"
file_path = save_path + file_name
origin_trajs = dff["origin_trajs"].to_list()
dict_save = {'trajs': dff["trajs"].to_list(), 'origin_trajs': origin_trajs}
if full:
    dict_save["dis_matrix"] = dis_matrix.tolist()
variant = "full" if full else "small"
out_path = file_path + f"_{len(origin_trajs)}_{vocab_size}_dataset_{variant}.json"
# Write through a context manager so the file is flushed and closed as soon as
# the dump finishes, rather than whenever the handle gets garbage-collected.
with open(out_path, "w") as out_file:
    json.dump(dict_save, out_file)
timer.tok("save")


save done, 13763.64s after read data start


13763.640017032623