# Read Data

In [1]:
import ast
import math

import pandas as pd
import torch
from torch.nn import functional as F

In [2]:
# Load the trajectory CSV into a DataFrame and display it as the cell output.
trajectories = pd.read_csv("../data/porto-data/processed_small_porto_with_cp.csv")
trajectories

Unnamed: 0,POLYLINE,CP_INDEX
0,"[[-8.618643, 41.141412], [-8.618499, 41.141376...","[0, 3, 6, 8, 11, 13, 15, 17, 19, 20, 22]"
1,"[[-8.612964, 41.140359], [-8.613378, 41.14035]...","[0, 3, 6, 9, 11, 13, 16, 18, 21, 24, 27, 29, 3..."
2,"[[-8.574678, 41.151951], [-8.574705, 41.151942...","[0, 3, 6, 8, 10, 12, 14, 16, 18, 21, 23, 26, 2..."
3,"[[-8.645994, 41.18049], [-8.645949, 41.180517]...","[0, 2, 4, 6, 8, 10, 12, 15, 17, 19, 21, 24, 26..."
4,"[[-8.615502, 41.140674], [-8.614854, 41.140926...","[0, 2, 5, 8, 11, 14, 16, 18, 20, 22, 25]"
...,...,...
89006,"[[-8.649297, 41.154309], [-8.650044, 41.154228...","[0, 2, 5, 8, 11, 14, 16, 19, 21, 23, 26, 28, 3..."
89007,"[[-8.6139, 41.141133], [-8.613891, 41.141115],...","[0, 3, 5, 8, 11, 13, 15, 17, 20, 23, 26, 29, 3..."
89008,"[[-8.606475, 41.144508], [-8.60652, 41.144517]...","[0, 3, 5, 7, 9, 11, 13, 16, 19, 22, 24, 26, 29..."
89009,"[[-8.604945, 41.149692], [-8.605368, 41.149773...","[0, 2, 4, 6, 8, 10, 13, 15, 18, 22, 24, 26, 28..."


# Data Preprocessing

In [37]:
# Size of one grid cell; height2lat / width2lng turn these into degrees.
# NOTE(review): presumably kilometres (0.1 -> 100 m cells) -- confirm.
grid_height = 0.1
grid_width = 0.1

# Bounding box in degrees; points on or outside its edges are rejected by in_boundary.
boundary = {
    'min_lat': 41.140092,
    'max_lat': 41.185969,
    'min_lng': -8.690261,
    'max_lng': -8.549155,
}

def height2lat(height):
    """Convert a north-south distance into degrees of latitude.

    The distance is divided by 110.574, i.e. one degree of latitude is
    treated as 110.574 distance units (presumably kilometres -- confirm).
    """
    units_per_lat_degree = 110.574
    return height / units_per_lat_degree


def width2lng(width, lat=None):
    """Convert an east-west distance into degrees of longitude.

    One degree of longitude is taken as 111.320 * cos(latitude) distance
    units (presumably kilometres -- confirm).

    Args:
        width: East-west distance to convert.
        lat: Optional latitude in degrees at which to evaluate the
            conversion. When omitted, the original hard-coded cosine factor
            0.99974 is used, so existing callers get identical results.

    Returns:
        The distance expressed in degrees of longitude.

    NOTE(review): 0.99974 is approximately cos(1.3 degrees), whereas the
    `boundary` used in this notebook lies near 41.16 degrees north
    (cos ~ 0.75). Confirm whether the default is intended, or pass `lat`.
    """
    units_per_degree_at_equator = 111.320
    if lat is None:
        cos_lat = 0.99974  # legacy default, kept for backward compatibility
    else:
        cos_lat = math.cos(math.radians(lat))
    # Same operation order as the original so the default result is bit-identical.
    return width / units_per_degree_at_equator / cos_lat


def in_boundary(lat, lng, b):
    """Tell whether the point (lat, lng) lies strictly inside bounding box `b`.

    `b` is a mapping with the keys 'min_lng', 'max_lng', 'min_lat' and
    'max_lat'. All comparisons are strict, so a point exactly on an edge is
    reported as outside. Longitude is tested first; latitude is only looked
    at when the longitude test passes.
    """
    lng_inside = b['min_lng'] < lng < b['max_lng']
    if not lng_inside:
        return lng_inside
    return b['min_lat'] < lat < b['max_lat']


def _polyline_to_grid_seq(polyline, lat_size, lng_size):
    """Return the (grid_i, grid_j) cell of every point, or None if any point is out of bounds."""
    grid_seq = []
    # Each polyline point is unpacked as (lng, lat), i.e. longitude first.
    for lng, lat in polyline:
        if not in_boundary(lat, lng, boundary):
            return None
        grid_i = int((lat - boundary['min_lat']) / lat_size)
        grid_j = int((lng - boundary['min_lng']) / lng_size)
        grid_seq.append((grid_i, grid_j))
    return grid_seq


def _write_subtrajectories(path, subtrajectories, lng_grid_num):
    """Write one bracketed, comma-separated line of flat cell ids per subtrajectory."""
    # `with` guarantees the file is closed even if a write fails.
    with open(path, 'w') as fout:
        for subtraj in subtrajectories:
            # Flat id = row * number_of_columns + column.
            cell_ids = (str(i * lng_grid_num + j) for i, j in subtraj)
            # join() also copes with an empty subtrajectory (written as "[]"),
            # which previously raised IndexError on traj[-1].
            fout.write("[%s]\n" % ", ".join(cell_ids))


def data_preprocess(
    input_path="../data/porto-data/processed_small_porto_with_cp.csv",
    output_path="../data/porto-data/processed_small_porto_subtrajectory.csv",
):
    """Map trajectories onto a grid, split them at change points and save the pieces.

    Every row of the input CSV provides a polyline (column "POLYLINE") and a
    list of change-point indices (column " CP_INDEX"). A trajectory is kept
    only if all of its points fall strictly inside the module-level
    `boundary`; kept trajectories are converted to sequences of
    (grid_i, grid_j) cells and cut into subtrajectories between consecutive
    change-point indices. Each subtrajectory is written to `output_path` as
    one line of flat cell ids (grid_i * lng_grid_num + grid_j).

    Progress and summary statistics are printed to stdout.

    Args:
        input_path: CSV file to read. Defaults to the path originally
            hard-coded in this function.
        output_path: File to (over)write with the subtrajectories. Defaults
            to the path originally hard-coded in this function.

    Returns:
        A tuple (processed_trajectories, processed_subtrajectories): the grid
        sequences of all kept trajectories, and the list of all their
        subtrajectories.

    NOTE(review): each subtrajectory is grid_seq[start:end], which excludes
    the point at `end`; the point at the final change-point index therefore
    appears in no subtrajectory. Confirm this is intended.
    """
    lat_size, lng_size = height2lat(grid_height), width2lng(grid_width)

    lat_grid_num = int((boundary['max_lat'] - boundary['min_lat']) / lat_size) + 1
    lng_grid_num = int((boundary['max_lng'] - boundary['min_lng']) / lng_size) + 1

    trajectories = pd.read_csv(input_path)
    processed_trajectories = []
    processed_subtrajectories = []

    total_traj_num = len(trajectories)
    # Iterating the two columns directly avoids building a Series per row
    # as DataFrame.iterrows() does.
    # NOTE(review): the " CP_INDEX" key keeps its leading space exactly as in
    # the original code; presumably the CSV header is written that way.
    rows = zip(trajectories["POLYLINE"], trajectories[" CP_INDEX"])
    for i, (polyline_text, cp_index_text) in enumerate(rows):

        if i % 10000 == 0:
            print("Complete: {}; Total: {}".format(i, total_traj_num))

        # ast.literal_eval only parses Python literals, so file content
        # cannot execute arbitrary code the way eval() would allow.
        polyline = ast.literal_eval(polyline_text)
        grid_seq = _polyline_to_grid_seq(polyline, lat_size, lng_size)
        if grid_seq is None:
            continue  # at least one point lies outside the boundary

        processed_trajectories.append(grid_seq)
        cp_index = ast.literal_eval(cp_index_text)
        for cp_start_index, cp_end_index in zip(cp_index, cp_index[1:]):
            processed_subtrajectories.append(grid_seq[cp_start_index:cp_end_index])

    print("Grid size:", (lat_grid_num, lng_grid_num))
    print("Total valid trajectory number:", len(processed_trajectories))
    print("Total subtrajectory number:", len(processed_subtrajectories))

    _write_subtrajectories(output_path, processed_subtrajectories, lng_grid_num)

    return processed_trajectories, processed_subtrajectories

In [38]:
# Run the preprocessing (reads the input CSV and rewrites the subtrajectory file)
# and keep both returned lists for inspection in the cells below.
processed_trajectories, processed_subtrajectories = data_preprocess()

Complete: 0; Total: 89011
Complete: 10000; Total: 89011
Complete: 20000; Total: 89011
Complete: 30000; Total: 89011
Complete: 40000; Total: 89011
Complete: 50000; Total: 89011
Complete: 60000; Total: 89011
Complete: 70000; Total: 89011
Complete: 80000; Total: 89011
Grid size: (51, 158)
Total valid trajectory number: 66247
Total subtrajectory number: 1257246


In [40]:
# Show the (grid_i, grid_j) cell sequence of the first trajectory that was kept.
processed_trajectories[0]

[(1, 79),
 (1, 79),
 (2, 77),
 (4, 75),
 (4, 73),
 (5, 70),
 (5, 69),
 (5, 66),
 (7, 64),
 (8, 65),
 (11, 67),
 (12, 68),
 (12, 68),
 (13, 68),
 (13, 68),
 (13, 68),
 (13, 66),
 (14, 64),
 (15, 65),
 (15, 66),
 (15, 66),
 (15, 66),
 (15, 66)]

In [None]:
# Preview only the first few subtrajectories; the full list is far too large
# to render as cell output (the recorded run reports 1,257,246 entries).
processed_subtrajectories[:5]