In [1]:
""" Libraries """
import os
import json
import time as tm
import pyproj
import pandas as pd
from tqdm.auto import tqdm
import nbimporter
from Core import json_numpy_serializer

# Notebook-wide display settings: floats are rendered with six decimals and
# frames are shown without row truncation.
pd.options.display.float_format = lambda value: '%.6f' % value
pd.options.display.max_rows = None
# Enable tqdm's pandas integration, using a common label for its progress bars.
tqdm.pandas(desc="Processing time")

In [2]:
""" Parameters """
# Every tunable setting lives in this dict so that the whole configuration can
# be serialized in one go at the end of the notebook.
para = {
    # Working folders
    "fold_test": "test",
    "fold_data": "data",
    "fold_para": "para",
    # Source (geographic) and target (projected) coordinate reference systems
    "geo_crs": "EPSG:4326",
    "pro_crs": "EPSG:4549",
    # Spatial extent, in degrees
    "latitude_0": 22.2,
    "latitude_n": 22.4,
    "longitude_0": 113.4,
    "longitude_n": 113.6,
    # Temporal extent, formatted as "%Y-%m-%d %H:%M:%S"
    "time_0": "2021-10-14 00:00:00",
    "time_n": "2021-10-15 00:00:00",
}

# Unpack the settings the following cells refer to by plain name.
fold_test, fold_para = para["fold_test"], para["fold_para"]
geo_crs, pro_crs = para["geo_crs"], para["pro_crs"]

# Lower bounds of the extent
latitude_0 = para["latitude_0"]
longitude_0 = para["longitude_0"]
time_0 = para["time_0"]

# Upper bounds of the extent
latitude_n = para["latitude_n"]
longitude_n = para["longitude_n"]
time_n = para["time_n"]

# Delete existing para files
# Every entry directly inside the folder is removed except real sub-directories
# (for example Jupyter's ".ipynb_checkpoints"): os.remove() raises on a
# directory, which used to abort the whole cell.
if os.path.exists(fold_para):
    for entry in os.listdir(fold_para):
        entry_path = os.path.join(fold_para, entry)
        if os.path.isdir(entry_path) and not os.path.islink(entry_path):
            continue  # leave nested folders untouched
        os.remove(entry_path)
else:
    os.makedirs(fold_para)

In [3]:
""" Data Sampling """
# Number of taxis copied into the sample: the first n groups in the order
# produced by groupby("taxiId"), i.e. ascending taxiId.
n_taxi = 1006
# Single output file shared by the delete step and the write loop.
path = "test.csv"

# Delete existing test file so the sample is always rebuilt from scratch
if os.path.exists(path):
    os.remove(path)

# Sample first n taxi
df = pd.read_csv(filepath_or_buffer="taxi.csv", sep=",", header=None)
df.rename(columns={0: "taxiId", 1: "time", 2: "latitude", 3: "longitude", 4: "state"}, inplace=True)
groups = df.groupby("taxiId")
# Never ask for more groups than exist; this is also the progress-bar total.
n_sample = min(n_taxi, groups.ngroups)
for i, (taxiId, group) in enumerate(tqdm(groups, total=n_sample)):
    if i >= n_sample:
        break
    # The file was removed above, so only the very first group writes the header.
    group.to_csv(path_or_buf=path, index=False, header=(i == 0), mode="a")

0it [00:00, ?it/s]

In [4]:
""" Re-encoding Functions """
# Projection Transformer
# Build both CRS objects first, then the transformer that maps between them.
# always_xy=True makes the transformer take and return coordinates in
# (longitude, latitude) / (x, y) order.
source_crs = pyproj.CRS(geo_crs)
target_crs = pyproj.CRS(pro_crs)
transformer = pyproj.Transformer.from_crs(source_crs, target_crs, always_xy=True)


# Re-encode the space
def reEncode_space(longitude, latitude):
    """Project one coordinate with the module-level ``transformer``.

    Args:
        longitude: Passed as the first argument to ``transformer.transform``.
        latitude: Passed as the second argument to ``transformer.transform``.

    Returns:
        The ``(x, y)`` pair produced by the transformer.
    """
    projected_x, projected_y = transformer.transform(longitude, latitude)
    return projected_x, projected_y


# Re-encode the time
def reEncode_time(time):
    """Convert a timestamp string into seconds since the epoch.

    Args:
        time: Text formatted as ``"%Y-%m-%d %H:%M:%S"``.

    Returns:
        The float returned by ``time.mktime``. Note that ``mktime`` interprets
        the parsed value in the local time zone of the machine running the
        notebook.

    Raises:
        ValueError: If ``time`` does not match the expected format.
    """
    parsed = tm.strptime(time, "%Y-%m-%d %H:%M:%S")
    return tm.mktime(parsed)


# Re-encode the state
def reEncode_state(state):
    """Collapse the raw state code into a binary flag.

    Args:
        state: Raw state value of a record.

    Returns:
        ``1`` when ``state == 3``, otherwise ``0``.
    """
    if state == 3:
        return 1
    return 0


# Re-encode the all
def reEncode_combine(longitude, latitude, time, state):
    """Re-encode one record into integer ``(x, y, t, s)`` values.

    Delegates to ``reEncode_space``, ``reEncode_time`` and ``reEncode_state``
    (in that order) and converts each result with ``int()``, which truncates
    any fractional part.

    Args:
        longitude: Forwarded to ``reEncode_space``.
        latitude: Forwarded to ``reEncode_space``.
        time: Forwarded to ``reEncode_time``.
        state: Forwarded to ``reEncode_state``.

    Returns:
        A 4-tuple of ints ``(x, y, t, s)``.
    """
    easting, northing = reEncode_space(longitude, latitude)
    timestamp = reEncode_time(time)
    flag = reEncode_state(state)
    return tuple(int(value) for value in (easting, northing, timestamp, flag))


# Space and time extent, re-encoded with the same functions as the records so
# that both can be compared directly.
# Lower bounds
x_0, y_0 = reEncode_space(longitude=longitude_0, latitude=latitude_0)
t_0 = reEncode_time(time=time_0)
# Upper bounds
x_n, y_n = reEncode_space(longitude=longitude_n, latitude=latitude_n)
t_n = reEncode_time(time=time_n)

In [5]:
""" Data Re-encoding and Reorganizing """
# Delete existing test files
# Every entry directly inside the folder is removed except real sub-directories
# (for example Jupyter's ".ipynb_checkpoints"): os.remove() raises on a
# directory, which used to abort the whole cell.
if os.path.exists(fold_test):
    for entry in os.listdir(fold_test):
        entry_path = os.path.join(fold_test, entry)
        if os.path.isdir(entry_path) and not os.path.islink(entry_path):
            continue  # leave nested folders untouched
        os.remove(entry_path)
else:
    os.makedirs(fold_test)

# Re-encode, filter and reorganize the data
# The sample is streamed in chunks; each chunk is re-encoded to integer
# (x, y, t, s), clipped to the space-time extent and appended to one CSV per taxi.
# Columns that reEncode_combine cannot handle when missing: it calls strptime on
# the time and int() on the projected coordinates, so one bad row would raise
# and abort the whole run.
required_columns = ["longitude", "latitude", "time"]
chunks = pd.read_csv(filepath_or_buffer="test.csv", sep=",", iterator=True, chunksize=50000)
for chunk in tqdm(chunks):
    # copy() so the new columns are added to an independent frame.
    chunk = chunk.dropna(subset=required_columns).copy()
    if chunk.empty:
        # Nothing to encode; the four-column assignment below needs at least one row.
        continue
    chunk[["x", "y", "t", "s"]] = chunk.apply(
        lambda row: reEncode_combine(row["longitude"], row["latitude"], row["time"], row["state"]),
        axis=1,
        result_type='expand',
    )
    # Keep only records inside the (inclusive) spatial and temporal extent.
    inside_space = (chunk['x'] >= x_0) & (chunk['x'] <= x_n) & (chunk['y'] >= y_0) & (chunk['y'] <= y_n)
    inside_time = (chunk['t'] >= t_0) & (chunk['t'] <= t_n)
    chunk = chunk[inside_space & inside_time]
    for taxiId, chunk_group in chunk.groupby("taxiId"):
        path = "{}/{}.csv".format(fold_test, taxiId)
        # A taxi can appear in several chunks: append, writing the header only once.
        chunk_group.to_csv(path_or_buf=path, index=False, header=not os.path.exists(path), mode="a", columns=['x', 'y', 't', 's'])

0it [00:00, ?it/s]

In [6]:
""" Data Cleaning """

# Drop na and duplicates
# Only regular "*.csv" files are processed; any other entry in the folder
# (such as a checkpoint directory) would make read_csv fail.
files = [
    name for name in os.listdir(fold_test)
    if name.endswith(".csv") and os.path.isfile(os.path.join(fold_test, name))
]
for file in tqdm(files):
    path = "{}/{}".format(fold_test, file)
    df = pd.read_csv(filepath_or_buffer=path, sep=",").dropna().drop_duplicates()
    if len(df) <= 1:
        # Files left with at most one record are deleted rather than rewritten.
        os.remove(path)
    else:
        df.to_csv(path_or_buf=path, index=False, mode='w')

  0%|          | 0/1000 [00:00<?, ?it/s]

In [7]:
""" Min and Max Calculating """

# Initialize min and max
# Start from +/- infinity so that the first file always replaces them.
t_min, x_min, y_min = float("inf"), float("inf"), float("inf")
x_max, y_max, t_max = float("-inf"), float("-inf"), float("-inf")

# Calculate min and max
# Only regular "*.csv" files are read; any other entry in the folder
# (such as a checkpoint directory) would make read_csv fail.
files = [
    name for name in os.listdir(fold_test)
    if name.endswith(".csv") and os.path.isfile(os.path.join(fold_test, name))
]
for file in tqdm(files):
    df = pd.read_csv(filepath_or_buffer="{}/{}".format(fold_test, file), sep=",")
    # The running value is passed first so that a NaN column result (empty
    # file) leaves it unchanged instead of replacing it.
    # Update minimums
    x_min = min(x_min, df["x"].min())
    y_min = min(y_min, df["y"].min())
    t_min = min(t_min, df["t"].min())
    # Update maximums
    x_max = max(x_max, df["x"].max())
    y_max = max(y_max, df["y"].max())
    t_max = max(t_max, df["t"].max())

# Record the observed bounds alongside the other parameters.
para["x_min"], para["y_min"], para["t_min"] = x_min, y_min, t_min
para["x_max"], para["y_max"], para["t_max"] = x_max, y_max, t_max

  0%|          | 0/1000 [00:00<?, ?it/s]

In [8]:
""" Parameters """
# Persist the parameter dict (settings plus the observed bounds) as JSON.
# json_numpy_serializer is used as the fallback for values the json module
# cannot encode on its own.
para_path = "para.json"
with open(para_path, mode="w") as handle:
    json.dump(para, handle, default=json_numpy_serializer)