This notebook adds extra features to the counted trip data.  
At the end of the project, it will be merged into the main file.

In [None]:
import geopandas as gpd
import pandas as pd
import os

In [None]:
# get the center of each zone
def get_nyczones_center(filepath):
    '''
    Load the zone shapefile and attach the centre point of every zone.

    input: filepath - path to the zone shapefile (anything gpd.read_file accepts)
    output: GeoDataFrame reprojected to WGS84 (EPSG:4326) with three extra columns:
            center     - centroid point of the zone geometry (WGS84)
            center_lat - latitude of the centroid
            center_lon - longitude of the centroid
    '''
    # Read shapefile into geopandas dataframe
    nyczones = gpd.read_file(filepath)

    source_crs = nyczones.crs
    if source_crs is not None and source_crs.is_projected:
        # Centroids are planar computations: take them in the projected source
        # CRS first and only then reproject the resulting points to WGS84.
        # Doing it the other way round (centroid of lat/lon polygons) is
        # distorted and makes geopandas emit a "geographic CRS" warning.
        centers = nyczones['geometry'].centroid.to_crs(epsg=4326)
        nyczones = nyczones.to_crs(epsg=4326)  # Convert to WGS84
    else:
        # Source is already geographic (or has no CRS, in which case to_crs
        # raises as before): nothing better is available than the centroid
        # of the WGS84 geometry.
        nyczones = nyczones.to_crs(epsg=4326)  # Convert to WGS84
        centers = nyczones['geometry'].centroid

    # store the center of each zone and its coordinates
    nyczones['center'] = centers
    nyczones['center_lat'] = centers.y
    nyczones['center_lon'] = centers.x

    return nyczones

In [None]:
# Read in the counted trip data and add the features we need
def _attach_zone_center(trips, zone_centers, id_column, prefix):
    '''Left-join the zone centre coordinates onto trips for one location-id column.'''
    lookup = zone_centers.rename(columns={
        'LocationID': id_column,
        'center_lat': prefix + '_lat',
        'center_lon': prefix + '_lon',
    })
    return trips.merge(lookup, on=id_column, how='left')


def add_features(folder_path, zones_path='../data/taxi_zones/taxi_zones.shp'):
    '''
    input: folder_path - the path to the folder containing the counted data (.csv files)
           zones_path - the path to the taxi zone shapefile used to look up the
                        zone centers (defaults to the project's taxi_zones.shp)
    output: one DataFrame holding the rows of every .csv file in the folder
            (files are read in sorted filename order), where PULocationID and
            DOLocationID are replaced by PULocation_lat / PULocation_lon and
            DOLocation_lat / DOLocation_lon. Nothing is written to disk.
    raises: FileNotFoundError - if the folder contains no .csv file
    '''
    # get other features
    ## the center longitude and latitude of OD zones
    nyczones = get_nyczones_center(zones_path)
    zone_centers = nyczones[['LocationID', 'center_lat', 'center_lon']]
    # A LocationID appearing more than once in the lookup would multiply the
    # trip rows in the left merges below, so keep a single center per id.
    zone_centers = zone_centers.drop_duplicates(subset='LocationID')

    # read in the counted data from the folder
    frames = []
    # sorted() makes the row order reproducible; os.listdir order is arbitrary
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".csv"):
            continue
        df = pd.read_csv(os.path.join(folder_path, filename))

        # merge the lat and lon of the center of the PULocation and DOLocation
        df = _attach_zone_center(df, zone_centers, 'PULocationID', 'PULocation')
        df = _attach_zone_center(df, zone_centers, 'DOLocationID', 'DOLocation')
        df = df.drop(columns=['PULocationID', 'DOLocationID'])

        # TODO: merge the weather data

        frames.append(df)

    if not frames:
        # Previously this case surfaced as an UnboundLocalError on `return df`.
        raise FileNotFoundError(f'No .csv files found in {folder_path!r}')

    # Previously only the last file read was returned; return all of them.
    return pd.concat(frames, ignore_index=True)
            

In [None]:
# Test the visualization tool on the counted data
# Create the test dataset

def create_test_data(df, day, hour, output_dir='../data/processed_nyc_data/test'):
    '''
    input: df - the counted data with the added features; must contain the columns
                PULocation_lon, PULocation_lat, DOLocation_lon, DOLocation_lat,
                day, hour and passenger_turnover
           day - the day of the data to keep
           hour - the hour of the data to keep
           output_dir - the folder the test file is written to (created if missing)
    output: saves the rows matching day and hour, without the day and hour columns,
            to <output_dir>/df<day>_<hour>.csv and returns a success message
    '''
    # only keep the columns needed by the visualization tool
    columns = ['PULocation_lon', 'PULocation_lat', 'DOLocation_lon', 'DOLocation_lat',
               'day', 'hour', 'passenger_turnover']
    df = df[columns]

    # select the rows of the requested day and hour
    df_day_hour = df[(df['day'] == day) & (df['hour'] == hour)]

    # day and hour are constant in the selection, so drop them from the output
    df_day_hour = df_day_hour.drop(columns=['day', 'hour'])

    # make sure the target folder exists; to_csv does not create it
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # save the dataset
    output_path = os.path.join(output_dir, f'df{day}_{hour}.csv')
    df_day_hour.to_csv(output_path, index=False)

    return f'Success create test data for day {day} and hour {hour}'


In [None]:
# Example usage: enrich the counted 2018 data with the zone-center features,
# then export the slice for day 11, hour 8 as a test file.
df = add_features(folder_path='../data/processed_nyc_data/2018/')
create_test_data(df=df, day=11, hour=8)
