In [1]:
import pandas as pd
import numpy as np

import os

In [3]:
# Read data from trajectory files

ts = []
names = ['taxi', 'time', 'longitude', 'latitude']

for root, _, filenames in os.walk('t-drive/original/taxi_log_2008_by_id/'):
    for filename in filenames:
        if filename.endswith('.txt'):
            path = os.path.join(root, filename)
            
            t = pd.read_csv(path, names=names, index_col=False)
            
            t['time'] = pd.to_datetime(t['time'], format = '%Y-%m-%d %H:%M:%S')
            
            ts.append(t)

df = pd.concat(ts, ignore_index=True)

In [4]:
df.shape

(17662984, 4)

In [5]:
df.head()

Unnamed: 0,taxi,time,longitude,latitude
0,3644,2008-02-02 13:40:59,116.37497,39.85789
1,3644,2008-02-02 13:42:11,116.37542,39.85764
2,3644,2008-02-02 13:42:49,116.37727,39.85775
3,3644,2008-02-02 13:42:59,116.37746,39.85787
4,3644,2008-02-02 13:44:00,116.37749,39.85817


In [6]:
df['taxi'].unique().shape

(10336,)

In [7]:
df['time'].describe()

count                17662984
unique                 525939
top       2008-02-08 14:16:12
freq                      381
first     2008-02-02 13:30:44
last      2008-02-08 17:39:19
Name: time, dtype: object

In [8]:
# Change the way the dataset is indexed

df_reindexed = df.set_index(['taxi', 'time'])
df.sort_index(inplace=True)

In [9]:
df_reindexed.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,longitude,latitude
taxi,time,Unnamed: 2_level_1,Unnamed: 3_level_1
3644,2008-02-02 13:40:59,116.37497,39.85789
3644,2008-02-02 13:42:11,116.37542,39.85764
3644,2008-02-02 13:42:49,116.37727,39.85775
3644,2008-02-02 13:42:59,116.37746,39.85787
3644,2008-02-02 13:44:00,116.37749,39.85817
3644,2008-02-02 13:45:00,116.37744,39.85812
3644,2008-02-02 13:45:30,116.37744,39.85802
3644,2008-02-02 13:46:20,116.37754,39.85815
3644,2008-02-02 13:47:01,116.37757,39.85817
3644,2008-02-02 13:47:01,116.37757,39.85817


In [10]:
# Save the dataset to .pkl

df_reindexed.to_pickle('t-drive/t-drive.pkl')