In [1]:
import pandas as pd
import numpy as np

import os

In [2]:
# Read data from trajectory files

ts = []
names = ['latitude', 'longitude', 'occupied', 'time']

for root, _, filenames in os.walk('./datasets/cabspotting/'):
    for filename in filenames:
        if filename.startswith('new_'):
            path = os.path.join(root, filename)
            taxi = filename[4:-4]
            
            t = pd.read_csv(path, names=names, skiprows=6, index_col=False, sep=' ')
    
            t['time'] = pd.to_datetime(t['time'], unit='s')
            t['taxi'] = taxi
            
            ts.append(t)

df = pd.concat(ts, ignore_index=True)

In [3]:
df.shape

(11216739, 5)

In [4]:
df.head()

Unnamed: 0,latitude,longitude,occupied,time,taxi
0,37.58817,-122.3575,0,2008-06-09 18:38:09,adkavy
1,37.58696,-122.34099,0,2008-06-09 18:37:37,adkavy
2,37.58176,-122.3258,0,2008-06-09 18:36:36,adkavy
3,37.57136,-122.31451,0,2008-06-09 18:35:42,adkavy
4,37.56099,-122.30327,0,2008-06-09 18:34:58,adkavy


In [5]:
df['taxi'].unique().shape

(536,)

In [9]:
df['time'].describe()

count                11216739
unique                2023173
top       2008-05-24 07:03:55
freq                      414
first     2008-05-17 10:00:04
last      2008-06-10 09:18:30
Name: time, dtype: object

In [14]:
# Change the way the dataset is indexed

df_reindexed = df.set_index(['taxi', 'time'])
df.sort_index(inplace=True)

In [15]:
df_reindexed.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,latitude,longitude,occupied
taxi,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
adkavy,2008-06-09 18:38:09,37.58817,-122.3575,0
adkavy,2008-06-09 18:37:37,37.58696,-122.34099,0
adkavy,2008-06-09 18:36:36,37.58176,-122.3258,0
adkavy,2008-06-09 18:35:42,37.57136,-122.31451,0
adkavy,2008-06-09 18:34:58,37.56099,-122.30327,0
adkavy,2008-06-09 18:34:16,37.55003,-122.29296,0
adkavy,2008-06-09 18:33:10,37.53917,-122.28291,0
adkavy,2008-06-09 18:32:22,37.52851,-122.27257,0
adkavy,2008-06-09 18:31:33,37.5182,-122.2609,0
adkavy,2008-06-09 18:30:46,37.50827,-122.24961,0


In [16]:
# Save the dataset to .pkl

df_reindexed.to_pickle('./datasets/cabspotting.pkl')