In [1]:
import pandas as pd
import h5py
import numpy as np
from collections import namedtuple
import csv
import requests


taxi_zone_lookup_chart = "/home/erynqian/10701/19F10701_Project/taxi _zone_lookup.csv"
taxi_zone_geolocation_file = "/home/erynqian/10701/19F10701_Project/taxi_zone_lookup.hdf5"
datafields = ["VendorID","tpep_pickup_datetime","tpep_dropoff_datetime","passenger_count","PULocationID","DOLocationID","payment_type"]
key = "6fW8tAG2L3VWCJYeKQ0IwgBzJNBJpoDZ"

Data = namedtuple("Data", ["start_hour", "date", "day_of_week", "isHoliday", 
                           "start_zone_latitude", "start_zone_longitude", 
                           "end_zone_latitude", "end_zone_longitude", "distance", "ETA"])


In [2]:
"""Fetch taxi zone geo coordinates from online API"""
# with open(taxi_zone_lookup_chart) as csv_file:
#     csv_reader = csv.reader(csv_file, delimiter=',')
#     first = 0
#     for row in csv_reader:
#         if first == 0:
#             first += 1
#         else:
#             locationID, location = int(row[0]), row[1:]
#             location = location[0] + ", " + location[1]
#             url = "http://open.mapquestapi.com/geocoding/v1/address?key=" + key + "&location=" + location
#             response = requests.get(url)
#             taxi_zone_lookup[locationID] = response.json()["results"][0]["locations"][0]["latLng"]

"""Store geo coordinates to hdf5 file"""
# df = pd.read_csv(taxi_zone_lookup_chart, sep=',')
# with h5py.File(taxi_zone_geolocation_file, "w") as f:
#     dset = f.create_dataset("taxi_zone_geolocation", (len(df), 3), dtype='f')
#     for i in range(1, len(df)+1):
#         dset[i-1] = [i, taxi_zone_lookup[i]["lat"], taxi_zone_lookup[i]["lng"]]

"""Look up latitude and longitude of the taxi zone of given index"""
def lookup_taxi_zone(index):
    with h5py.File(taxi_zone_geolocation_file, "r") as f:
        dset = f["taxi_zone_geolocation"]
        _, lat, lng = dset[index-1]
        return lat, lng


# TEST
# for i in range(1, 266):
#     print(i, lookup_taxi_zone(i))
# print(lookup_taxi_zone(265))
# print(lookup_taxi_zone(12))


In [3]:
class ParsedData:

    def __init__(self, row):
        self.row = row
    
    def start_hour(self):
        start_time = self.row[1].split(' ')[1]
        start_hour = start_time.split(':')[0]
        return int(start_hour)
    
    def date(self):
        """represent date as the kth day of the year; return k as int"""
        date = self.row[1].split(' ')[0]
        y,m,d = date.split('-')
        days_in_feb = 29 if int(y) % 4 == 0 else 28
        days_in_month = {1:31, 2:days_in_feb, 3:31, 4:30, 5:31, 6:30, 7:31, 8:31, 9:30, 10:31, 11:30, 12:31}
        return sum([days_in_month[i] for i in range(1, int(m))]) + int(d)

    def day_of_week(self):
        date = self.row[1].split(' ')[0]
        y,m,d = [int(i) for i in date.split('-')]
        t = [ 0, 3, 2, 5, 0, 3, 5, 1, 4, 6, 2, 4 ] 
        y -= m < 3
        return (( y + int(y / 4) - int(y / 100) + int(y / 400) + t[m - 1] + d) % 7) 

    def isHoliday(self):
        """    
        National holidays only
        reference: https://www.officeholidays.com/countries/usa/2017
                   https://www.officeholidays.com/countries/usa/2018
        """
        holidays2017 = [[1,2], [1,16], [5,29], [7,4], [9,4], [11,23], [12,25]]
        holidays2018 = [[1,1], [1,15], [5,28], [7,4], [9,3], [11,22], [12,25]]
        date = self.row[1].split(' ')[0]
        y,m,d = [int(i) for i in date.split('-')]
        holidays = holidays2017 if y == 2017 else holidays2018
        for h in holidays:
            if m == h[0] and d == h[1]:
                return 1
        return 0

    def start_end_distance(self):
        """return 4 coordinates: start_latitude, start_longitude, end_latitude, end_longitude"""
        pickup, dropoff = int(self.row[4]), int(self.row[5])
        PU_lat, PU_lng = lookup_taxi_zone(pickup)
        DO_lat, DO_lng = lookup_taxi_zone(dropoff)
        distance = (PU_lat - DO_lat) ** 2 + (PU_lng - DO_lng) ** 2
        return [PU_lat, PU_lng, DO_lat, DO_lng, distance]

    def ETA(self):
        """return ETA in min"""
        start = self.row[1].split(' ')[1].split(':')
        end = self.row[2].split(' ')[1].split(':')
        hour_diff = int(end[0]) - int(start[0])
        min_diff = int(end[1]) - int(start[1])
        sec_diff = int(end[1]) - int(start[1])
        return hour_diff * 60 + min_diff + round(sec_diff / 60, 3)

    def data(self):
        return [self.start_hour(), self.date(), self.day_of_week(), self.isHoliday()] + \
                self.start_end_distance() + [self.ETA()]


# TEST
entry = ['1', '2017-07-30 00:27:25', '2017-07-30 00:39:09', '1', '170', '48', '1']
d = ParsedData(entry)
for field, data in zip(Data._fields, d.data()):
    print(field, ":", data)

start_hour : 0
date : 211
day_of_week : 0
isHoliday : 0
start_zone_latitude : 40.748158
start_zone_longitude : -73.97875
end_zone_latitude : 40.757324
end_zone_longitude : -73.995415
distance : 0.0003616708709159866
ETA : 12.2


In [4]:
"""parse csv"""
filename = "/home/erynqian/10701/19F10701_Project/testData/assignment1_data-1.csv"

def parse_csv(filename):
    print("processing", filename.split('/')[-1], "...")
    df = pd.read_csv(filename, sep=',')
    file_len = len(df)
    data_len = len(Data._fields)

    hdf5_filename = filename.split('.')[0] + '.hdf5'
    with h5py.File(hdf5_filename, "w") as f:
        dset = f.create_dataset("mydataset", (file_len, data_len), dtype='f')
        parsedData = 0

        with open(filename) as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=',')
            r = -1
            for row in csv_reader:
                if r == -1: #skip first line
                    r += 1
                    continue
                else:
                    parsedData = ParsedData(row)
                    dset[r, :] = parsedData.data()
                    r += 1
                if r % 10000 == 0:
                    print(r, "rows processed.")

In [6]:
files = ["/home/erynqian/10701/19F10701_Project/testData/assignment1_data-1.csv",
"/home/erynqian/10701/19F10701_Project/testData/assignment1_data-90.csv",
"/home/erynqian/10701/19F10701_Project/testData/assignment1_data-140.csv",
"/home/erynqian/10701/19F10701_Project/testData/assignment1_data-629.csv",
"/home/erynqian/10701/19F10701_Project/testData/assignment1_data-664.csv"]

# for filename in files:
#     parse_csv(filename)

In [9]:
# TEST
for filename in files:
    hdf5_filename = filename.split('.')[0] + '.hdf5'
    with h5py.File(hdf5_filename, "r") as f:
        dset = f["mydataset"]
        print(hdf5_filename)
        print(dset.shape)
        print(dset[:3])
        print(dset[-3:])

/home/erynqian/10701/19F10701_Project/testData/assignment1_data-1.hdf5
(99999, 10)
[[   0.        211.          0.          0.         39.78373  -100.445885
    42.71777   -78.967255  469.94016    28.467   ]
 [   0.        211.          0.          0.         39.78373  -100.445885
    40.772015  -73.93027   704.0547     18.3     ]
 [   0.        211.          0.          0.         39.54194  -119.7895
    39.78373  -100.445885  374.23383     3.05    ]]
[[ 1.20000000e+01  2.11000000e+02  0.00000000e+00  0.00000000e+00
   4.07556648e+01 -7.40005493e+01  4.07870445e+01 -7.39754181e+01
   1.61626399e-03  1.32170000e+01]
 [ 1.20000000e+01  2.11000000e+02  0.00000000e+00  0.00000000e+00
   4.07556648e+01 -7.40005493e+01  4.07481575e+01 -7.39787521e+01
   5.31476981e-04  1.32329998e+01]
 [ 1.20000000e+01  2.11000000e+02  0.00000000e+00  0.00000000e+00
   3.95419388e+01 -1.19789497e+02  3.97837296e+01 -1.00445885e+02
   3.74233826e+02  1.01669998e+01]]
/home/erynqian/10701/19F10701_Project/tes