In [79]:
import pandas as pd
import h5py
import numpy as np
from collections import namedtuple
import csv

filename = "/home/erynqian/10701/19F10701_Project/testData/assignment1_data-1.csv"
hdf5_filename = "/home/erynqian/10701/19F10701_Project/testData/assignment1_data-1.hdf5"
taxi_zone_lookup_chart = "/home/erynqian/10701/19F10701_Project/taxi _zone_lookup.csv"
datafields = ["VendorID","tpep_pickup_datetime","tpep_dropoff_datetime","passenger_count","PULocationID","DOLocationID","payment_type"]

Data = namedtuple("Data", ["start_hour", "date", "day_of_week", "isHoliday", 
                           "start_zone_latitude", "start_zone_longitude", 
                           "end_zone_latitude", "end_zone_longitude", "distance", "ETA"])


In [52]:
"""Initialize taxi zone lookup dictionary"""
taxi_zone_lookup = {} # Key: LocationID; Value: ["Borough","Zone","service_zone"]
with open(taxi_zone_lookup_chart) as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    first = 0
    for row in csv_reader:
        if first == 0:
            first += 1
        else:
            locationID, description = int(row[0]), row[1:]
            taxi_zone_lookup[locationID] = description

# TEST
print(taxi_zone_lookup[20], taxi_zone_lookup[259])


['Bronx', 'Belmont', 'Boro Zone'] ['Bronx', 'Woodlawn/Wakefield', 'Boro Zone']


In [76]:
class ParsedData:

    def __init__(self, row):
        self.row = row
    
    def start_hour(self):
        start_time = self.row[1].split(' ')[1]
        start_hour = start_time.split(':')[0]
        return int(start_hour)
    
    def date(self):
        """represent date as the kth day of the year; return k as int"""
        date = self.row[1].split(' ')[0]
        y,m,d = date.split('-')
        days_in_feb = 29 if int(y) % 4 == 0 else 28
        days_in_month = {1:31, 2:days_in_feb, 3:31, 4:30, 5:31, 6:30, 7:31, 8:31, 9:30, 10:31, 11:30, 12:31}
        return sum([days_in_month[i] for i in range(1, int(m))]) + int(d)

    def day_of_week(self):
        date = self.row[1].split(' ')[0]
        y,m,d = [int(i) for i in date.split('-')]
        t = [ 0, 3, 2, 5, 0, 3, 5, 1, 4, 6, 2, 4 ] 
        y -= m < 3
        return (( y + int(y / 4) - int(y / 100) + int(y / 400) + t[m - 1] + d) % 7) 

    def isHoliday(self):
        """    
        National holidays only
        reference: https://www.officeholidays.com/countries/usa/2017
                   https://www.officeholidays.com/countries/usa/2018
        """
        holidays2017 = [[1,2], [1,16], [5,29], [7,4], [9,4], [11,23], [12,25]]
        holidays2018 = [[1,1], [1,15], [5,28], [7,4], [9,3], [11,22], [12,25]]
        date = self.row[1].split(' ')[0]
        y,m,d = [int(i) for i in date.split('-')]
        holidays = holidays2017 if y == 2017 else holidays2018
        for h in holidays:
            if m == h[0] and d == h[1]:
                return 1
        return 0
    
    def zone_index_to_coordinates(self, index):
        #TODO: match zone index to (latitude, longitude)
        pass

    def start_and_end(self):
        """return 4 coordinates: start_latitude, start_longitude, end_latitude, end_longitude"""
        pickup, dropoff = int(self.row[4]), int(self.row[5])
        # print(taxi_zone_lookup[pickup], taxi_zone_lookup[dropoff])
        #TODO: uncomment the following return statement when zone_index_to_coordiantes is done
        # return self.zone_index_to_coordinates(pickup) + self.zone_index_to_coordiates(dropoff)
        return [-1, -1, -1, -1]

    def distance(self):
        #TODO: compute distance
        return float('inf')

    def ETA(self):
        """return ETA in min"""
        start = self.row[1].split(' ')[1].split(':')
        end = self.row[2].split(' ')[1].split(':')
        hour_diff = int(end[0]) - int(start[0])
        min_diff = int(end[1]) - int(start[1])
        sec_diff = int(end[1]) - int(start[1])
        return hour_diff * 60 + min_diff + round(sec_diff / 60, 3)

    def data(self):
        return [self.start_hour(), self.date(), self.day_of_week(), self.isHoliday()] + \
                self.start_and_end() + \
               [self.distance(), self.ETA()]


# TEST
entry = ['1', '2017-7-30 00:20:56', '2017-7-30 00:48:20', '1', '138', '265', '2']
d = ParsedData(entry)
print(d.data())

[0, 211, 0, 0, -1, -1, -1, -1, inf, 28.467]


In [90]:
"""parse csv"""
df = pd.read_csv(filename, sep=',')
file_len = len(df)
data_len = len(Data._fields)

with h5py.File(hdf5_filename, "w") as f:
    dset = f.create_dataset("mydataset", (file_len, data_len), dtype='f')

    with open(filename) as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        r = -1
        for row in csv_reader:
            if r == -1: #skip first line
                r += 1
                continue
            else:
                parsedData = ParsedData(row)
                dset[r, :] = parsedData.data()
                r += 1

# TEST
with h5py.File(hdf5_filename, "r") as f:
    dset = f["mydataset"]
    print(dset.shape)
    print(dset[:10])