# Trajectory to Grids

In [114]:
import json

import pandas as pd

In [115]:
# Directory and base file name of the raw dataset ("<data_dir>/<data_name>.csv").
data_dir, data_name = '../data/porto-data', "porto"

# Size of one grid cell; converted to degrees by height2lat / width2lng
# (presumably kilometres, i.e. 100 m cells -- confirm).
grid_height = grid_width = 0.1

# Bounding box, in degrees, that a trajectory must stay inside to be kept.
boundary = dict(
    min_lat=41.140092,
    max_lat=41.185969,
    min_lng=-8.690261,
    max_lng=-8.549155,
)

In [116]:
def height2lat(height):
    """Convert a north-south distance into degrees of latitude.

    The distance is divided by 110.574, the approximate length of one
    degree of latitude in kilometres (so ``height`` is presumably given
    in km -- confirm against callers).
    """
    length_of_one_degree = 110.574
    return height / length_of_one_degree

def width2lng(width, lat=None):
    """Convert an east-west distance into degrees of longitude.

    Parameters
    ----------
    width : float
        East-west distance, in the unit of the 111.320 constant
        (presumably kilometres -- confirm against callers).
    lat : float, optional
        Latitude, in degrees, at which the conversion is done. The length
        of one degree of longitude shrinks with cos(latitude). When
        omitted, the legacy fixed factor 0.99974 is used, so existing
        callers get exactly the same result as before.

    Returns
    -------
    float
        The width expressed in degrees of longitude.
    """
    import math

    if lat is None:
        # NOTE(review): 0.99974 ~= cos(1.3 deg). That is far from the ~41.16 deg
        # latitude of the `boundary` used in this notebook (cos ~= 0.753), so
        # cells come out narrower on the ground than requested. Pass `lat` to
        # get a latitude-correct conversion; the default is kept so previously
        # generated grids stay reproducible.
        lat_factor = 0.99974
    else:
        lat_factor = math.cos(math.radians(lat))
    return width / 111.320 / lat_factor

def in_boundary(lat, lng, b):
    """Tell whether the point (lat, lng) lies strictly inside the box ``b``.

    ``b`` is a mapping with the keys 'min_lat', 'max_lat', 'min_lng' and
    'max_lng'. Points lying exactly on an edge count as outside.
    """
    lng_inside = b['min_lng'] < lng < b['max_lng']
    # The latitude test is only evaluated when the longitude test passed.
    return lng_inside and b['min_lat'] < lat < b['max_lat']

In [117]:
# Read the raw trip table: the first CSV line is the header and rows are
# indexed by the TRIP_ID column.
csv_path = "{}/{}.csv".format(data_dir, data_name)
trajectories = pd.read_csv(csv_path, header=0, index_col="TRIP_ID")

# Number of trips read (rows of the table).
total_traj_num = trajectories.shape[0]

In [118]:
# Preview the raw trip table loaded from the CSV.
trajectories

Unnamed: 0_level_0,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,MISSING_DATA,POLYLINE
TRIP_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1372636858620000589,C,,,20000589,1372636858,A,False,"[[-8.618643,41.141412],[-8.618499,41.141376],[..."
1372637303620000596,B,,7.0,20000596,1372637303,A,False,"[[-8.639847,41.159826],[-8.640351,41.159871],[..."
1372636951620000320,C,,,20000320,1372636951,A,False,"[[-8.612964,41.140359],[-8.613378,41.14035],[-..."
1372636854620000520,C,,,20000520,1372636854,A,False,"[[-8.574678,41.151951],[-8.574705,41.151942],[..."
1372637091620000337,C,,,20000337,1372637091,A,False,"[[-8.645994,41.18049],[-8.645949,41.180517],[-..."
...,...,...,...,...,...,...,...,...
1404171463620000698,C,,,20000698,1404171463,A,False,"[[-8.612469,41.14602],[-8.612487,41.145993],[-..."
1404171367620000670,C,,,20000670,1404171367,A,False,"[[-8.610138,41.140845],[-8.610174,41.140935],[..."
1388745716620000264,C,,,20000264,1388745716,A,False,[]
1404141826620000248,B,,12.0,20000248,1404141826,A,False,"[[-8.630712,41.154885],[-8.63073,41.154813],[-..."


In [119]:
# Number of rows (trips) read from the CSV.
total_traj_num

1710670

In [120]:
# Convert the grid cell size into latitude / longitude extents (degrees).
lat_size, lng_size = height2lat(grid_height), width2lng(grid_width)

# Trajectory length limits (number of points, both ends inclusive).
shortest, longest = 20, 1200

# Only the first `max_traj_num` rows are processed; progress is reported
# every `progress_every` rows.
max_traj_num = 100000
progress_every = 10000

processed_trajectories = []
# Only the POLYLINE column is needed, so iterate over it directly instead of
# materialising every row with iterrows().
polylines = trajectories["POLYLINE"].iloc[:max_traj_num]
for i, raw_polyline in enumerate(polylines):
    # Progress report.
    if i % progress_every == 0:
        print("Complete: {}; Total: {}".format(i, total_traj_num))

    # POLYLINE holds a JSON-style array of [lng, lat] pairs. It is parsed with
    # json.loads rather than eval, so text coming from the CSV is never
    # executed as Python code.
    polyline = json.loads(raw_polyline)

    # Skip trajectories that are too short or too long.
    if not shortest <= len(polyline) <= longest:
        continue

    grid_seq = []
    for lng, lat in polyline:
        # A single point outside the boundary discards the whole trajectory.
        if not in_boundary(lat, lng, boundary):
            break
        # Map the coordinate to its (row, column) grid cell.
        grid_i = int((lat - boundary['min_lat']) / lat_size)
        grid_j = int((lng - boundary['min_lng']) / lng_size)
        grid_seq.append((grid_i, grid_j))
    else:
        # Reached only when the loop did not break, i.e. every point was
        # inside the boundary.
        processed_trajectories.append(grid_seq)

print("Valid trajectory num:", len(processed_trajectories))

Complete: 0; Total: 1710670
Complete: 10000; Total: 1710670
Complete: 20000; Total: 1710670
Complete: 30000; Total: 1710670
Complete: 40000; Total: 1710670
Complete: 50000; Total: 1710670
Complete: 60000; Total: 1710670
Complete: 70000; Total: 1710670
Complete: 80000; Total: 1710670
Complete: 90000; Total: 1710670
Valid trajectory num: 66248


In [121]:
# Number of grid rows / columns needed to cover the bounding box: the span in
# degrees divided by the cell size, truncated, plus one for the partial cell.
lat_span = boundary['max_lat'] - boundary['min_lat']
lng_span = boundary['max_lng'] - boundary['min_lng']
lat_grid_num = 1 + int(lat_span / lat_size)
lng_grid_num = 1 + int(lng_span / lng_size)

grid_shape = (lat_grid_num, lng_grid_num)
print("Grid size:", grid_shape)

Grid size: (51, 158)


In [125]:
# Store the processed trajectories in a file, one trajectory per line, written
# as a bracketed, comma-separated list of grid indices.
# The `with` block closes (and therefore flushes) the file, so the complete
# content is on disk before anything reads it back.
with open("{}/processed_{}.csv".format(data_dir, data_name), 'w') as fout:
    for traj in processed_trajectories:
        # Flatten each (i, j) grid coordinate into the index i * lng_grid_num + j.
        grid_indices = [str(i * lng_grid_num + j) for i, j in traj]
        fout.write("[%s]\n" % ", ".join(grid_indices))

# Source and Destination

In [123]:
from collections import defaultdict

In [128]:
data_dir = '../data/porto-data'
data_name = "porto"
# A (source, destination) pair is kept only if it has at least this many
# trajectories.
min_sd_traj_num = 25
# Number of trajectories per kept pair written to the validation file.
test_traj_num = 5

# Read all processed trajectories; the handle is closed as soon as the lines
# are in memory.
with open("{}/processed_{}.csv".format(data_dir, data_name), 'r') as fin:
    f = fin.readlines()

# Map each (source, destination) grid-index pair to the raw lines of every
# trajectory that starts at `source` and ends at `destination`.
sd_cnt = defaultdict(list)
for eachline in f:
    # Ignore blank lines (e.g. a trailing empty line).
    if not eachline.strip():
        continue
    # Each line is a bracketed list of integers, which json.loads parses
    # without evaluating the text as Python code.
    traj = json.loads(eachline)
    s, d = traj[0], traj[-1]
    sd_cnt[(s, d)].append(eachline)

train_path = "{}/processed_{}_train.csv".format(data_dir, data_name)
val_path = "{}/processed_{}_val.csv".format(data_dir, data_name)
# Both outputs are opened in one `with` so they are flushed and closed even if
# writing fails part-way.
with open(train_path, 'w') as fout_train, open(val_path, 'w') as fout_test:
    for trajs in sd_cnt.values():
        # Keep only pairs with enough trajectories.
        if len(trajs) >= min_sd_traj_num:
            # The last `test_traj_num` lines go to the validation file and the
            # remaining n - test_traj_num to the training file.
            train_trajs, test_trajs = trajs[:-test_traj_num], trajs[-test_traj_num:]
            fout_train.writelines(train_trajs)
            fout_test.writelines(test_trajs)