In [1]:
import os
import urllib
import zipfile
import itertools
import gc
from pathlib import Path
import numpy as np
import pandas as pd
from scipy import stats
import dask.array as da
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
import shapefile
from shapely.geometry import Polygon
from descartes.patch import PolygonPatch
from matplotlib import pyplot as plt
from matplotlib import style
import seaborn as sns
# Apply matplotlib's 'ggplot' style sheet (sets the global plotting defaults).
style.use('ggplot')

# Data lives in the `data` directory next to the current working directory's
# parent. `.parent` gives the same result as `.parents[0]` but does not raise
# IndexError when the working directory is the filesystem root.
PARENT_DIR = Path().resolve().parent
DATA_DIR = PARENT_DIR / 'data'
DATA_URL = 'https://s3.amazonaws.com/nyc-tlc/trip+data/'

# Selection of trip files this notebook works with.
data_name = 'tripdata'
companies = ['yellow']
years = ['2017']
months = ['03', '06', '11']

# Short tags used to build the Parquet dataset name.
# De-duplicate with dict.fromkeys (keeps the listed order) rather than set():
# the iteration order of a set of strings can change between interpreter runs,
# which would make the generated name irreproducible once more than one
# company or year is listed.
cos = ''.join(c[0] for c in dict.fromkeys(companies))    # first letter of each company
yrs = ''.join(y[-2:] for y in dict.fromkeys(years))      # last two digits of each year
mos = ''.join(months)

# e.g. <DATA_DIR>/y_17_030611 for the selection above.
parquet_name = DATA_DIR / '_'.join([cos, yrs, mos])

lookup_name = DATA_DIR / 'taxi_zone_lookup.csv'
shapefile_name = DATA_DIR / 'taxi_zones.zip'

# Column groups of the trip records, by kind.
numerical_cols = ['passenger_count', 'trip_distance', 'fare_amount',
                  'extra', 'mta_tax', 'tip_amount', 'tolls_amount',
                  'improvement_surcharge', 'total_amount']

categorical_cols = ['VendorID', 'RatecodeID',
                    'store_and_fwd_flag', 'payment_type']

location_cols = ['PULocationID', 'DOLocationID']

datetime_cols = ['tpep_pickup_datetime', 'tpep_dropoff_datetime']

## Exploration of location data

In [2]:
def get_lat_lon(sf):
    """Return one representative point per shape of a shapefile reader.

    The point is the midpoint of the shape's bounding box (the average of
    ``bbox[0]``/``bbox[2]`` and of ``bbox[1]``/``bbox[3]``), not a true
    centroid.

    NOTE(review): the columns are named ``longitude``/``latitude``, but the
    values are whatever units the shapefile stores; the output displayed in
    this notebook (~9e5, ~2e5) is not in degrees -- presumably projected
    coordinates; confirm the CRS before using them as lat/lon.

    Parameters
    ----------
    sf : shapefile reader
        Object exposing ``fields`` and ``shapeRecords()``; its records must
        contain a ``LocationID`` field.

    Returns
    -------
    pandas.DataFrame
        One row per shape record with columns ``LocationID``, ``longitude``
        (bbox x midpoint) and ``latitude`` (bbox y midpoint). Empty, with the
        same columns, when the reader has no shape records.

    Raises
    ------
    KeyError
        If the reader has no ``LocationID`` field.
    """
    # Look the field position up on the reader that was passed in instead of
    # relying on a notebook-level global, so the function works regardless of
    # which cells have been run. The first entry of `sf.fields` is skipped
    # because it has no slot in the records (in pyshp it is the deletion flag).
    field_index = {field[0]: pos for pos, field in enumerate(sf.fields[1:])}
    loc_idx = field_index['LocationID']

    content = []
    for sr in sf.shapeRecords():
        bbox = sr.shape.bbox
        x = (bbox[0] + bbox[2]) / 2
        y = (bbox[1] + bbox[3]) / 2
        content.append((sr.record[loc_idx], x, y))

    return pd.DataFrame(content, columns=["LocationID", "longitude", "latitude"])

In [3]:
# Open the taxi-zone shapefile and collect its attribute field names
# (the first entry of sf.fields is skipped).
sf = shapefile.Reader(str(DATA_DIR / 'taxi_zones.shp'))
fields_name = [spec[0] for spec in sf.fields[1:]]

# Field name -> position of that field inside each record.
shp_dic = {name: position for position, name in enumerate(fields_name)}

# One {field name: value} dict per record.
attributes = sf.records()
shp_attr = [dict(zip(fields_name, values)) for values in attributes]

# Attribute table plus the per-zone coordinates from get_lat_lon,
# matched on LocationID.
df_loc = pd.DataFrame(shp_attr).join(
    get_lat_lon(sf).set_index("LocationID"),
    on="LocationID",
)
df_loc.head()

Unnamed: 0,OBJECTID,Shape_Leng,Shape_Area,zone,LocationID,borough,longitude,latitude
0,1,0.116357,0.000782,Newark Airport,1,EWR,936681.7,190522.130278
1,2,0.43347,0.004866,Jamaica Bay,2,Queens,1033536.0,161853.9823
2,3,0.084341,0.000314,Allerton/Pelham Gardens,3,Bronx,1027136.0,254730.010849
3,4,0.043567,0.000112,Alphabet City,4,Manhattan,990424.0,203100.040432
4,5,0.092146,0.000498,Arden Heights,5,Staten Island,932133.2,139954.541936


In [None]:
%%time
# Open the Parquet dataset at `parquet_name` (built in the config cell) as a
# dask dataframe.
df = dd.read_parquet(parquet_name)