In this notebook I preprocess the NYC taxi dataset, following the procedure used in [Dutordoir].

In [0]:
import pandas as pd
import numpy as np
from datetime import datetime

In [2]:
from google.colab import drive
import sys
import os
# Mount Google Drive and derive every project path from the mount point, so the
# paths do not depend on the notebook's current working directory.
mount_point = '/content/drive/'
drive.mount(mount_point)
root_path = os.path.join(mount_point, 'My Drive', 'Colab_Notebooks', 'normalizingflows')
trained_flows_folder = os.path.join(root_path, 'trained_flows')
dataset_folder = os.path.join(root_path, 'datasets')
# Allow modules stored under root_path to be imported from this notebook.
sys.path.append(root_path)

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


# Function definitions
Note: the code below is adapted from

https://github.com/hughsalimbeni/bayesian_benchmarks/blob/master/bayesian_benchmarks/data.py

In [0]:
def rescale(x, a, b):
    """Linearly map ``x`` from the interval ``a`` onto the interval ``b``.

    Args:
        x: Value to rescale, expressed on the source interval.
        a: Two-element sequence ``[a_min, a_max]`` describing the source interval.
        b: Two-element sequence ``[b_min, b_max]`` describing the target interval.

    Returns:
        ``b_min + (b_max - b_min) * (x - a_min) / (a_max - a_min)``.
    """
    # Shift by a[0] so source intervals that do not start at 0 are handled
    # correctly; when a[0] == 0 the result is identical to the old formula.
    return b[0] + (b[1] - b[0]) * (x - a[0]) / (a[1] - a[0])


def convert_to_day_minute(d, days_in_cycle=6):
    """Encode a datetime's weekday and time of day as angles in radians.

    Args:
        d: ``datetime`` to encode.
        days_in_cycle: Upper end of the weekday range that is mapped onto a
            full turn. The default of 6 keeps the existing behaviour, in which
            Monday (weekday 0) maps to angle 0 and Sunday (weekday 6) maps to
            2*pi, so the two days become indistinguishable once sin/cos are
            taken. Pass 7 to give every weekday a distinct angle.

    Returns:
        Tuple ``(day_of_week, time_of_day)``: the weekday rescaled from
        ``[0, days_in_cycle]`` to ``[0, 2*pi]`` and the minute of the day
        rescaled from ``[0, 24 * 60]`` to ``[0, 2*pi]``.
    """
    day_of_week = rescale(float(d.weekday()), [0, days_in_cycle], [0, 2 * np.pi])
    minute_of_day = d.hour * 60 + d.minute
    time_of_day = rescale(minute_of_day, [0, 24 * 60], [0, 2 * np.pi])
    return day_of_week, time_of_day

def process_time(pickup_datetime, dropoff_datetime):
    """Turn a pickup/dropoff timestamp pair into time features.

    Args:
        pickup_datetime: Pickup timestamp string in ``%Y-%m-%d %H:%M:%S`` format.
        dropoff_datetime: Dropoff timestamp string in the same format.

    Returns:
        List ``[pickup_day_of_week, pickup_time_of_day, dropoff_day_of_week,
        dropoff_time_of_day, duration]``. The first four entries come from
        ``convert_to_day_minute``; ``duration`` is the number of seconds between
        pickup and dropoff.
    """
    timestamp_format = "%Y-%m-%d %H:%M:%S"
    start = datetime.strptime(pickup_datetime, timestamp_format)
    end = datetime.strptime(dropoff_datetime, timestamp_format)

    features = [*convert_to_day_minute(start), *convert_to_day_minute(end)]
    features.append((end - start).total_seconds())
    return features

# Preprocess data

In [0]:
# Folder that holds the taxi dataset, and the raw trip CSV inside it.
taxi_dir = os.path.join(dataset_folder, 'NYC_yellow_taxi')
file_path = os.path.join(taxi_dir, 'yellow_tripdata.csv')

In [0]:
# Load the raw trip records into a DataFrame.
df = pd.read_csv(file_path)

In [8]:
# Inspect the inferred column dtypes (the datetime columns are read as strings).
df.dtypes

id                     object
vendor_id               int64
pickup_datetime        object
dropoff_datetime       object
passenger_count         int64
pickup_longitude      float64
pickup_latitude       float64
dropoff_longitude     float64
dropoff_latitude      float64
store_and_fwd_flag     object
trip_duration           int64
dtype: object

In [9]:
# Filtering thresholds.
x_bounds = [-74.04, -73.75]  # accepted longitude range
y_bounds = [40.62, 40.86]    # accepted latitude range
too_close_radius = 0.00001   # threshold on the squared pickup-dropoff distance
min_duration = 30            # seconds
max_duration = 3 * 3600      # seconds
name = 'nytaxi'

# Positional layout of `data` (one row per trip):
#   0 id, 1 vendor_id, 2 pickup_datetime, 3 dropoff_datetime, 4 passenger_count,
#   5 pickup_longitude, 6 pickup_latitude, 7 dropoff_longitude, 8 dropoff_latitude,
#   9 store_and_fwd_flag, 10 trip_duration
data = df.values

pickup_lon, pickup_lat = data[:, 5], data[:, 6]
dropoff_lon, dropoff_lat = data[:, 7], data[:, 8]

pickup_loc = np.array((pickup_lon, pickup_lat)).T
dropoff_loc = np.array((dropoff_lon, dropoff_lat)).T

# `ind` flags the trips that survive every filter below.
ind = np.ones(len(data), dtype=bool)
for coordinate, (lower, upper) in ((pickup_lon, x_bounds), (pickup_lat, y_bounds),
                                   (dropoff_lon, x_bounds), (dropoff_lat, y_bounds)):
    ind[coordinate < lower] = False
    ind[coordinate > upper] = False

print('discarding {} out of bounds {} {}'.format(np.count_nonzero(~ind), x_bounds, y_bounds))

# Trips whose pickup and dropoff points nearly coincide.
squared_dist = (pickup_lon - dropoff_lon) ** 2 + (pickup_lat - dropoff_lat) ** 2
early_stop = squared_dist < too_close_radius
ind[early_stop] = False
print('discarding {} trip less than {} gp dist'.format(np.count_nonzero(early_stop),
                                                        too_close_radius ** 0.5))

# Per-trip time features: columns 0-1 pickup, 2-3 dropoff, 4 duration in seconds.
times = np.array([process_time(*stamps) for stamps in data[:, 2:4]])
pickup_time, dropoff_time, duration = times[:, :2], times[:, 2:4], times[:, 4]

short_journeys = duration < min_duration
ind[short_journeys] = False
print('discarding {} less than {}s journeys'.format(np.count_nonzero(short_journeys), min_duration))

long_journeys = duration > max_duration
ind[long_journeys] = False
print('discarding {} more than {}h journeys'.format(np.count_nonzero(long_journeys),
                                                     max_duration / 3600.))

# Keep only the rows that passed all filters.
pickup_loc, dropoff_loc = pickup_loc[ind, :], dropoff_loc[ind, :]
pickup_time, dropoff_time = pickup_time[ind, :], dropoff_time[ind, :]
duration = duration[ind]

print('{} total rejected journeys'.format(np.count_nonzero(~ind)))

discarding 14125 out of bounds [-74.04, -73.75] [40.62, 40.86]
discarding 23089 trip less than 0.0031622776601683794 gp dist
discarding 4782 less than 30s journeys
discarding 2112 more than 3.0h journeys
38576 total rejected journeys


In [0]:
# Encode the two pickup-time angles (column 0: day of week, column 1: time of
# day) by their sine and cosine.
dow_angle, tod_angle = pickup_time[:, 0], pickup_time[:, 1]
pickup_sc = np.column_stack([np.sin(dow_angle), np.cos(dow_angle),
                             np.sin(tod_angle), np.cos(tod_angle)])

# Features: pickup location, dropoff location, pickup-time encoding.
X = np.concatenate([pickup_loc, dropoff_loc, pickup_sc], axis=1).astype(float)
# Target: trip duration as a single column.
Y = duration.reshape(-1, 1).astype(float)

In [11]:
# Sanity check: one row per retained trip, eight feature columns.
X.shape

(1420068, 8)

In [0]:
# Column names, listed in the same order as the columns of X.
cols = [
    'pickup_longitude', 'pickup_latitude',
    'dropoff_longitude', 'dropoff_latitude',
    'pickup_dow_sin', 'pickup_dow_cos',
    'pickup_tod_sin', 'pickup_tod_cos',
]
processed_df = pd.DataFrame(data=X, columns=cols)

In [13]:
# Display the processed feature table.
processed_df

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,pickup_dow_sin,pickup_dow_cos,pickup_tod_sin,pickup_tod_cos
0,-73.982155,40.767937,-73.964630,40.765602,0.000000e+00,1.0,-0.987688,-0.156434
1,-73.980415,40.738564,-73.999481,40.731152,-2.449294e-16,1.0,0.186524,0.982450
2,-73.979027,40.763939,-74.005333,40.710087,8.660254e-01,0.5,0.108867,-0.994056
3,-74.010040,40.719971,-74.012268,40.706718,8.660254e-01,-0.5,-0.920505,0.390731
4,-73.973053,40.793209,-73.972923,40.782520,-8.660254e-01,0.5,-0.382683,-0.923880
...,...,...,...,...,...,...,...,...
1420063,-73.982201,40.745522,-73.994911,40.740170,-8.660254e-01,-0.5,-0.386711,-0.922201
1420064,-74.000946,40.747379,-73.970184,40.796547,-2.449294e-16,1.0,0.915311,-0.402747
1420065,-73.959129,40.768799,-74.004433,40.707371,-8.660254e-01,-0.5,0.969231,-0.246153
1420066,-73.982079,40.749062,-73.974632,40.757107,8.660254e-01,0.5,-0.857167,-0.515038


In [0]:
# Destination for the full processed dataset, stored next to the raw CSV.
save_file = os.path.join(taxi_dir, 'processed_nyc_taxi.csv')

In [0]:
# Write the features without the row index so no extra index column is saved.
processed_df.to_csv(save_file, index=False)

In [17]:
# Read the file back and display it to check that it was written as expected.
load_test = pd.read_csv(save_file)
load_test

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,pickup_dow_sin,pickup_dow_cos,pickup_tod_sin,pickup_tod_cos
0,-73.982155,40.767937,-73.964630,40.765602,0.000000e+00,1.0,-0.987688,-0.156434
1,-73.980415,40.738564,-73.999481,40.731152,-2.449294e-16,1.0,0.186524,0.982450
2,-73.979027,40.763939,-74.005333,40.710087,8.660254e-01,0.5,0.108867,-0.994056
3,-74.010040,40.719971,-74.012268,40.706718,8.660254e-01,-0.5,-0.920505,0.390731
4,-73.973053,40.793209,-73.972923,40.782520,-8.660254e-01,0.5,-0.382683,-0.923880
...,...,...,...,...,...,...,...,...
1420063,-73.982201,40.745522,-73.994911,40.740170,-8.660254e-01,-0.5,-0.386711,-0.922201
1420064,-74.000946,40.747379,-73.970184,40.796547,-2.449294e-16,1.0,0.915311,-0.402747
1420065,-73.959129,40.768799,-74.004433,40.707371,-8.660254e-01,-0.5,0.969231,-0.246153
1420066,-73.982079,40.749062,-73.974632,40.757107,8.660254e-01,0.5,-0.857167,-0.515038


In [0]:
# Draw a 400,000-row random subset; the fixed random_state makes the draw repeatable.
processed_df_small = processed_df.sample(n=400000, random_state=42)


In [0]:
# Destination for the subsampled dataset.
save_file_small = os.path.join(taxi_dir, 'processed_nyc_taxi_small.csv')

In [0]:
# Save the subset without its index (the sampled rows keep their original labels otherwise).
processed_df_small.to_csv(save_file_small, index=False)

In [0]:
# Reload the subset from disk; the cells below build the train/test data from this frame.
loaded_df = pd.read_csv(save_file_small)

In [0]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import torch
from torch.utils.data import DataLoader

In [0]:
# Columns that are standardised and fed to the data loaders below.
obs_cols = [
    'pickup_longitude',
    'pickup_latitude',
]
# Remaining columns; in this notebook they are only used to fit context_scaler.
context_cols = [
    'dropoff_longitude',
    'dropoff_latitude',
    'pickup_dow_sin',
    'pickup_dow_cos',
    'pickup_tod_sin',
    'pickup_tod_cos',
]
# Number of rows per batch produced by the data loaders.
batch_size = 50_000

In [0]:
# Split the row labels 80/20 into train and test sets; the fixed random_state keeps the split repeatable.
train_idx, test_idx = train_test_split(loaded_df.index, test_size=0.2, random_state=42)

In [0]:
# Fit the observation scaler on the training rows only.
obs_scaler = StandardScaler().fit(loaded_df.loc[train_idx, obs_cols])


In [0]:
# Fit the context scaler on the training rows only.
# NOTE(review): context_scaler is fitted here but never applied in the visible
# notebook; confirm whether the context columns should also be scaled and loaded.
context_scaler = StandardScaler().fit(loaded_df.loc[train_idx, context_cols])

In [0]:

# Standardise the training observations and convert them to a float tensor.
scaled_train_data = torch.tensor(obs_scaler.transform(loaded_df.loc[train_idx, obs_cols])).float()


In [0]:
# Apply the scaler fitted on the training rows to the test observations.
scaled_test_data = torch.tensor(obs_scaler.transform(loaded_df.loc[test_idx, obs_cols])).float()

In [0]:
# Training batches are reshuffled on every pass over the data.
train_dataloader = DataLoader(scaled_train_data, batch_size=batch_size, shuffle=True)

In [0]:
# Test batches keep the stored order (no shuffling requested).
test_dataloader = DataLoader(scaled_test_data, batch_size=batch_size)