In [15]:
import numpy as np
import json
import pandas as pd
from utils import Timer
from joblib import Parallel, delayed
from traj2grid import Traj2Grid
import traj_dist.distance as tdist
from parameters import *
import numpy as np


#### 读取数据


In [16]:
# Stopwatch used throughout the notebook: tik() opens a named phase and
# tok() prints how long ago that phase started.
timer = Timer()

# --- run configuration ---------------------------------------------------
file_path = "data/full/gps_20161102"
dict_path = "data/str_grid2idx_400_44612.json"
nrows = 1_000_000
vocab_size = 400

# --- load the raw GPS points ---------------------------------------------
# `names` supplies labels for all five CSV columns; only id / lon / lat
# are actually loaded.
timer.tik("read data")
df = pd.read_csv(
    file_path,
    names=["name", "id", "time", "lon", "lat"],
    usecols=["id", "lon", "lat"],
    nrows=nrows,
)
timer.tok(f"read {file_path}")


read data start
read data/full/gps_20161102 done, 0.629s after read data start


0.6286721229553223

#### 去除超出范围的数据点


In [17]:
# Keep only the points that lie strictly inside the bounding box after it has
# been shrunk by `pad` on every side, then report how many points were dropped.
pad = 0.002

l = len(df)  # number of points before filtering, used in the report below
lon_inside = df["lon"].gt(104.04214 + pad) & df["lon"].lt(104.12958 - pad)
lat_inside = df["lat"].gt(30.65294 + pad) & df["lat"].lt(30.72775 - pad)
df = df[lon_inside & lat_inside]
print(f"剩{len(df)}/{l}个点，筛掉{round(100 - 100 * len(df) / l)}%")


剩947007/1000000个点，筛掉5%


#### GroupBy转换为1维点列

In [18]:
# Load the grid -> index vocabulary and attach it to the trajectory converter.
# The context manager closes the JSON file handle as soon as parsing is done
# (the previous bare open() left it to the garbage collector).
with open(dict_path) as dict_file:
    str_grid2idx = json.load(dict_file)

# NOTE(review): row_num, column_num and the lon/lat bounds are presumably
# provided by `from parameters import *` — confirm.
t2g = Traj2Grid(row_num, column_num, min_lon, min_lat, max_lon, max_lat)

# JSON object keys are always strings, so each key is evaluated back into a
# Python object before being used as a vocabulary key.
# NOTE(review): eval() executes arbitrary expressions. Only load dictionaries
# from a trusted source, or switch to ast.literal_eval if the keys are plain
# literals.
grid2idx = {eval(key): idx for key, idx in str_grid2idx.items()}
t2g.set_vocab(grid2idx)
timer.tok(f"load dict{dict_path}")


def group_concat(group: pd.DataFrame) -> pd.Series:
    """Collapse all GPS points of one group into a single summary row.

    Args:
        group: The rows of one ``id`` group. Must contain ``lon`` and ``lat``
            columns; rows are consumed in their existing order.

    Returns:
        A Series holding the two results of ``t2g.convert1d`` (``trajs`` and
        ``origin_trajs``), the length of the 1-D trajectory (``len``) and the
        min/max longitude and latitude of the raw points in the group.
    """
    # Pair the two columns directly instead of going through
    # DataFrame.iterrows(): this yields the same (lon, lat) tuples in the same
    # order without materialising a Series object for every single point.
    origin_traj = list(zip(group["lon"], group["lat"]))
    traj_1d, coord_traj = t2g.convert1d(origin_traj)
    return pd.Series({
        "origin_trajs": coord_traj,
        "trajs": traj_1d,
        "len": len(traj_1d),
        "max_lon": group["lon"].max(),
        "max_lat": group["lat"].max(),
        "min_lon": group["lon"].min(),
        "min_lat": group["lat"].min(),
    })


# group-apply: run group_concat once per "id" group through joblib, then
# stack the returned Series into one DataFrame (one row per id).
# Serial equivalent, kept for reference:
#     res = [group_concat(group) for name, group in df.groupby("id")]
group_jobs = (
    delayed(group_concat)(points)
    for _traj_id, points in df.groupby("id")
)
res = Parallel(n_jobs=44)(group_jobs)
df = pd.DataFrame(res)
timer.tok("group-apply")


load dictdata/str_grid2idx_400_44612.json done, 1.142s after read data start
group-apply done, 53.034s after read data start


53.033665895462036

##### 看看数据情况

In [19]:
# Preview the first rows of the per-id frame produced by the group-apply step.
df.head()

Unnamed: 0,origin_trajs,trajs,len,max_lon,max_lat,min_lon,min_lat
0,"[(104.04445, 30.69332), (104.04498, 30.69353),...","[1048, 1322, 1572, 1835, 1976, 2254, 2395, 252...",64,104.06135,30.69465,104.04445,30.69332
1,"[(104.05396, 30.66463), (104.05384, 30.66468),...","[6817, 6668, 6817, 6668, 6548, 6432, 6308, 616...",60,104.05748,30.67199,104.0518,30.66461
2,"[(104.10496, 30.655), (104.10526, 30.65519), (...","[34437, 34535, 34644, 41581, 41719, 41845, 420...",31,104.12569,30.65942,104.10496,30.655
3,"[(104.04668, 30.65522), (104.0465, 30.65552), ...","[2315, 2172, 2173, 2043, 1913, 1914, 1770, 163...",15,104.04668,30.6581,104.04424,30.65522
4,"[(104.12722, 30.71598), (104.12655, 30.71609),...","[44007, 43799, 43590, 43402, 43080, 42824, 426...",73,104.12722,30.72573,104.05997,30.71598


#### 过滤过长过短轨迹

In [20]:
# Keep the trajectories whose 1-D length is strictly between 0 and 999 and
# report how many were dropped.
traj_len = df["len"]
dff = df[traj_len.gt(0) & traj_len.lt(999)]
print(f"剩{len(dff)}/{len(df)}条轨迹，筛掉{round(100 - 100 * len(dff) / len(df))}%")


剩5507/5507条轨迹，筛掉0%


#### 生成pair-wise轨迹距离矩阵


In [21]:
# Inputs for the pairwise distance computation.
# Disabled: dff = dff.reset_index()
origin_trajs = dff["origin_trajs"].to_list()
arr = list(map(np.array, origin_trajs))        # one ndarray per trajectory
length = len(arr)
dis_matrix = np.zeros(shape=(length, length))  # filled after the parallel run
dis_func = tdist.discret_frechet


def cal_dis(i, j, x, y, n):
    """Compute ``dis_func(x, y)`` and return it tagged with its matrix position.

    Args:
        i: Row index of the pair; passed through to the result.
        j: Column index of the pair; passed through to the result.
        x: First trajectory handed to ``dis_func``.
        y: Second trajectory handed to ``dis_func``.
        n: Denominator for the progress percentage (``i * i / (n * n)``).

    Returns:
        The tuple ``(i, j, distance)``.
    """
    distance = dis_func(x, y)
    # Progress report: emitted only for the pair (i, i - 1), and only on
    # rows i = 1, 101, 201, ...
    on_reporting_row = i % 100 == 1
    if on_reporting_row and j == i - 1:
        timer.tok(f'{i}-{round((i * i) / (n * n) * 100, 2)}%')
    return i, j, distance

# One job per index pair (i, j) with j < i; the diagonal is never scheduled
# and keeps the 0.0 it was initialised with.
pair_jobs = (
    delayed(cal_dis)(i, j, arr[i], arr[j], length - 1)
    for i in range(length)
    for j in range(i)
)
res = Parallel(n_jobs=44)(pair_jobs)
timer.tok("calculate distance")

# Write every result into both triangles so the matrix is symmetric.
for row, col, value in res:
    dis_matrix[row, col] = dis_matrix[col, row] = value


1-0.0% done, 53.636s after read data start
101-0.03% done, 54.108s after read data start
201-0.13% done, 54.865s after read data start
301-0.3% done, 55.889s after read data start
401-0.53% done, 57.179s after read data start
501-0.83% done, 58.637s after read data start
601-1.19% done, 60.862s after read data start
701-1.62% done, 63.743s after read data start
801-2.12% done, 66.083s after read data start
901-2.68% done, 69.181s after read data start
1001-3.31% done, 72.728s after read data start
1101-4.0% done, 76.551s after read data start
1201-4.76% done, 80.975s after read data start
1301-5.58% done, 85.531s after read data start
1401-6.47% done, 90.04s after read data start
1501-7.43% done, 95.222s after read data start
1601-8.45% done, 100.961s after read data start
1701-9.54% done, 106.687s after read data start
1801-10.7% done, 112.859s after read data start
1901-11.92% done, 119.464s after read data start
2001-13.21% done, 126.313s after read data start
2101-14.56% done, 133.

In [22]:
# Display the filled pairwise distance matrix.
dis_matrix

array([[0.        , 0.03022509, 0.07414426, ..., 0.07057931, 0.01724014,
        0.02899181],
       [0.03022509, 0.        , 0.06970381, ..., 0.06220493, 0.04363249,
        0.02230574],
       [0.07414426, 0.06970381, 0.        , ..., 0.0285858 , 0.09136432,
        0.08519888],
       ...,
       [0.07057931, 0.06220493, 0.0285858 , ..., 0.        , 0.0753164 ,
        0.05807618],
       [0.01724014, 0.04363249, 0.09136432, ..., 0.0753164 , 0.        ,
        0.03990596],
       [0.02899181, 0.02230574, 0.08519888, ..., 0.05807618, 0.03990596,
        0.        ]])

#### 生成 Train Dataset 第六步：保存

In [23]:

# Save the trajectories and their pairwise distance matrix as one JSON file
# under `save_path`, named after the input file.
file_name = file_path.split("/")[-1]
save_path = "data/test/"
# NOTE: this rebinds `file_path` from the input file to the output prefix.
file_path = save_path + file_name
# Per-row argsort of the distances; in this cell only its row count is used
# (for the output file name).
sorted_index = np.argsort(dis_matrix, axis=1)
dict_save = {
    'trajs': dff["trajs"].to_list(),
    'origin_trajs': origin_trajs,
    "dis_matrix": dis_matrix.tolist(),
}
out_path = file_path + f"_{sorted_index.shape[0]}_{vocab_size}_dataset_full.json"
# The context manager flushes and closes the file even if json.dump raises;
# the previous bare open() never closed the handle explicitly.
# open() raises FileNotFoundError if the `save_path` directory does not exist.
with open(out_path, "w") as out_file:
    json.dump(dict_save, out_file)
timer.tok("save")


save done, 686.134s after read data start


686.134658575058