In [1]:
# Import libraries
from utils import get_taxi_data
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sb
import geopandas
from meteostat import Point, Daily, Stations
import requests
from datetime import datetime

In [6]:
def import_taxi_zones(path):
    # import taxi zones from NYC Taxi & Limousine Commision, and set index
    zones = geopandas.read_file(path)
    zones = zones.set_index("OBJECTID")
    zones.index.rename("zone",inplace=True)
    
    # change coordinate reference system to New York specific
    # source: https://epsg.io/?q=new+york
    zones.to_crs("EPSG:32118")

    # find the centroid of each zone
    zones['centroid'] = zones.centroid
    
    # save zones within Manhattan
    manhattan = zones.loc[zones.borough.str.contains("Manhattan")].LocationID.to_numpy()
    
    return zones, manhattan

In [3]:
def count_pickup_dropoff(zones,df,column):
    # find all dates for which there is data
    dates = df.tpep_pickup_datetime.dt.date.unique()
    
    # initialise dataframe for drop-off and pick-up location counts for each number of days 
    freq = pd.DataFrame(np.zeros((len(dates),len(zones))),dtype=int,index=dates,
                        columns=zones.index)
    
    # count the number of drop-offs or pick-ups in each grid for each day
    for i in range(len(dates)):
        # count values for each location for each day
        freq.loc[i,:] = df.loc[df.tpep_pickup_datetime.dt.date == dates[i],column].value_counts()
    
    # fill NaN with zero, as value_counts cannot count zero
    freq.fillna(0,inplace=True)
    
    return freq

In [45]:
def calculate_daily_median(zones,df,column):
    # find all dates for which there is data
    dates = df.tpep_pickup_datetime.dt.date.unique()
    
    # initialise dataframe for median values for each number of days 
    median = pd.DataFrame(np.zeros((len(dates),len(zones))),dtype=int,index=dates,
                        columns=zones.index)
    
    # calculate median of column in each grid for each day
    for i in range(len(dates)):
        for j in range(len(zones)):
            # count values for each location for each day
            mask = (df.tpep_pickup_datetime.dt.date == dates[i])&(df.PULocationID == zones.index[j])
            median.loc[i,j] = df.loc[mask,column].median()
    
    return freq

In [17]:
# import taxi zones data
path = "data/taxi_zones/taxi_zones.shp"
zones, manhattan = import_taxi_zones(path)

In [46]:
# Taxi data import arguments
desired_columns = ['tpep_pickup_datetime','tpep_dropoff_datetime','trip_distance',
                   'fare_amount','PULocationID','DOLocationID'] #'passenger_count'
desired_year = '2019'
vehicle_type = 'yellow'
path = "~/Documents/PythonProject/data/taxi_data/"

# Import data for all months of 2019
for m in range(1,2):
    # convert month to two-digit format 
    if m > 9:
        month = f"{m}"
    else:
        month = f"0{m}"
    
    # import the taxi data with specified arguments
    print(f"Importing data for month {m}")
    df = get_taxi_data(desired_year, month, vehicles=vehicle_type, columns=desired_columns, save=True, localpath=path)
    
    # drop data for pick-ups outside manhattan
    df.drop(df[~df.PULocationID.isin(manhattan)].index, inplace=True)
    
    # drop data with pick-ups outside given month
    df.drop(df.loc[df.tpep_pickup_datetime < np.datetime64(f"2019-{month}-01 00:00:00")].index, inplace=True)
    df.drop(df.loc[df.tpep_pickup_datetime > np.datetime64(f"2019-{month}-31 23:59:59")].index, inplace=True)
    
    # find the daily drop-off and pick-up counts per zone
    print(f"Counting drop-off and pick-ups for month {m}")
    pickup_count = count_pickup_dropoff(zones,df,"PULocationID")
    dropoff_count = count_pickup_dropoff(zones,df,"DOLocationID")
    
    # calculate daily median per zone
    print(f"Calculating medians for month {m}")
    distance_median = calculate_daily_median(zones,df,"trip_distance")
    fare_median = calculate_daily_median(zones,df,"fare_amount")
    
    # concatenate data to form a single dataframe of all data
#     if m == 1:
#         df = import_df
#     else:
#         df = pd.concat((df,import_df),axis=0)
    
# # show dataframe memory usage
df.info()

Importing data for month 1
Counting drop-off and pick-ups for month 1
Calculating medians for month 1


KeyboardInterrupt: 

In [None]:
df.tpep_pickup_datetime.dt.date[0]

In [36]:
df.loc[df.tpep_pickup_datetime.dt.date == ,column].median()

Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,fare_amount,PULocationID,DOLocationID
0,2019-01-01 00:46:40,2019-01-01 00:53:20,1.0,1.50,7.00,151,239
1,2019-01-01 00:59:47,2019-01-01 01:18:59,1.0,2.60,14.00,239,246
7,2019-01-01 00:21:28,2019-01-01 00:28:37,1.0,1.30,6.50,163,229
8,2019-01-01 00:32:01,2019-01-01 00:45:39,1.0,3.70,13.50,229,7
9,2019-01-01 00:57:32,2019-01-01 01:09:32,2.0,2.10,10.00,141,234
...,...,...,...,...,...,...,...
7696608,2019-01-31 23:44:05,2019-02-01 00:17:23,,17.94,0.00,142,21
7696611,2019-01-31 23:14:00,2019-01-31 23:29:00,,5.45,28.95,41,136
7696612,2019-01-31 23:37:20,2019-02-01 00:10:43,,10.24,0.00,142,95
7696613,2019-01-31 23:28:00,2019-01-31 23:50:50,,12.43,48.80,48,213


In [None]:
pickup_count

In [None]:
start = datetime(2019,1,1)
end = datetime(2019,12,31)

data = Daily(newyork,start,end)
data = data.fetch()

data.loc[data.isna().any(axis=1)]

In [None]:
stations = Stations().nearby(lat,long).fetch()