In [1]:
import os
import glob
from zipfile import ZipFile

import numpy as np

import pandas as pd
pd.set_option('display.max_columns', 500)

import vaex

from tqdm import tqdm_notebook as tqdm


In [2]:
# Columns to read
columns = ['Total_amount', 'Dropoff_longitude', 'Passenger_count', 'VendorID', 'Extra', 
           'Tip_amount','Pickup_longitude', 'Store_and_fwd_flag', 'Payment_type',
           'MTA_tax', 'Fare_amount', 'Pickup_latitude', 'Dropoff_latitude', 'Tolls_amount',
           'Trip_distance', 'Improvement_surcharge', 'RateCodeID', 'tpep_dropoff_datetime', 
           'tpep_pickup_datetime']

# Force dtypes for these columns - some choices are made to optimize
dtypes = {
'VendorID': np.int32,
'tpep_pickup_datetime': np.int16,
'tpep_dropoff_datetime': np.int16,
'Passenger_count': np.int16,
'Trip_distance': np.int16,
'Pickup_longitude': np.int32,
'Pickup_latitude': np.int32,
'RateCodeID': np.int32,
'Store_and_fwd_flag': np.int16,
'Dropoff_longitude': np.int32,
'Dropoff_latitude': np.int32,
'Payment_type': np.object,
'Fare_amount': np.int8,
'Extra': np.int8,
'MTA_tax': np.int8,
'Improvement_surcharge': np.int8,
'Tip_amount': np.int8,
'Tolls_amount': np.int8,
'Total_amount': np.int8,    
}

# Setup renaming dictionary
rename_dict = {
        'tpep_pickup_datetime' : 'PickupDateTime',
        'tpep_dropoff_datetime': 'DropoffDateTime',
        'Passenger_count': 'Passenger#',
        'Pickup_longitude' : 'PickupLong',
        'Pickup_latitude': 'Pickuplat',
        'Dropoff_longitude': 'DropoffLong',
        'Dropoff_latitude': 'DropoffLat',
}



In [3]:
# Set up the list of zip files to be opened and converted
zip_list = np.sort(np.array(glob.glob('yellowtd_2009_1.zip')))[::-1]

# The output directory
output_dir = './yellowtd_2009_1/hdf5/'

In [4]:
# The magic happens here:
for file in tqdm(zip_list, leave=False, desc='Converting csv to hdf5..'):
    # Setting up the files, and directories
    zip_file = ZipFile(file)
    output_file = file.split('/')[-1][:-3] + 'hdf5'
    output = output_dir + output_file
    
    # Check if a converted file already exists: if it does then skip it,
    if (os.path.exists(output) and os.path.isfile(output)):
        pass
    else:
        # Importing the data into pandas
        pandas_df = [pd.read_csv(zip_file.open(text_file.filename),
                                 encoding='latin',
                                 usecols=None,
                                 dtype=dtypes,)
                    for text_file in zip_file.infolist()
                    if text_file.filename.endswith('.csv')][0]
        # Rename some columns to match the more well known dataset from
        # http://stat-computing.org/dataexpo/2009/the-data.html
        pandas_df.rename(columns=rename_dict, inplace=True)
                 
        # Importing the data from pandas to vaex
        vaex_df = vaex.from_pandas(pandas_df, copy_index=False)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for file in tqdm(zip_list, leave=False, desc='Converting csv to hdf5..'):


HBox(children=(HTML(value='Converting csv to hdf5..'), FloatProgress(value=0.0, max=1.0), HTML(value='')))

In [8]:
import re
import glob
import vaex
import numpy as np

def tryint(s):
    try:
        return int(s)
    except:
        return s
    
def alphanum_key(s):
    """Turn a string into a list of string and number chunks.
        "z23a" -> ["z", 23, "a"]
    """
    return [ tryint(c) for c in re.split('([0-9]+)', s) ]

In [6]:
hdf5_list = glob.glob('./yellowtd_2009_1/hdf5/')
hdf5_list.sort(key=alphanum_key)
hdf5_list = np.array(hdf5_list)



In [7]:
# This is an important step
master_df = vaex.open_many(hdf5_list)

# exporting
master_df.export_hdf5(path='./yellowtd_2009_1/hdf5/', progress=True)

IndexError: list index out of range