This notebook adds extra features to the counted trip data.   
At the end of the project it will be merged into the main file.

In [None]:
import geopandas as gpd
import pandas as pd
import os

In [None]:
# get the center of each zone
def get_nyczones_center(filepath):
    '''
    input: filepath - path to the zones file (anything geopandas.read_file accepts)
    output: the zones as a GeoDataFrame in WGS84 (EPSG:4326) with three added
            columns: 'center' (centroid point), 'center_lat' and 'center_lon'
    '''
    # Read shapefile into geopandas dataframe
    zones = gpd.read_file(filepath)

    if zones.crs is not None and zones.crs.is_projected:
        # Centroids computed on lon/lat degrees are distorted (geopandas warns
        # about this), so take them in the file's own projected CRS and
        # reproject only the resulting points to WGS84.
        centers = zones['geometry'].centroid.to_crs(epsg=4326)
        zones = zones.to_crs(epsg=4326)
    else:
        # The file has no projected CRS to work in: convert first and take the
        # centroid in degrees. to_crs raises if the file carries no CRS at all.
        zones = zones.to_crs(epsg=4326)
        centers = zones['geometry'].centroid

    # Store the centroid and its coordinates (y is latitude, x is longitude)
    zones['center'] = centers
    zones['center_lat'] = centers.y
    zones['center_lon'] = centers.x

    return zones

In [21]:
def _merge_zone_center(df, centers, id_column, prefix):
    '''
    Left-join the zone-center coordinates onto df for one location-ID column.

    input: df - dataframe containing id_column
           centers - dataframe with columns LocationID, center_lat, center_lon
           id_column - name of the zone-ID column in df to join on
           prefix - prefix of the two new columns (<prefix>_lat, <prefix>_lon)
    output: a new dataframe with the two coordinate columns added and the
            LocationID join column removed
    '''
    merged = df.merge(centers, left_on=id_column, right_on='LocationID', how='left')
    merged = merged.rename(columns={'center_lat': prefix + '_lat',
                                    'center_lon': prefix + '_lon'})
    return merged.drop(columns=['LocationID'])


# Read in the counted trip data and add the features we need
def add_features(folder_path,
                 zones_path='../data/taxi_zones/taxi_zones.shp',
                 weather_dir='../data/weather_data/csv/merged/'):
    '''
    input: folder_path - the path to the folder containing the counted data
           zones_path - the path to the taxi zone shapefile
           weather_dir - the path to the folder containing the weather csv files
    output: saves the counted data with the added features; the output path is
            the input path with 'countdata' replaced by 'mergedata'.
            Files whose path does not contain 'countdata' are skipped.
    raises: KeyError if no weather table was loaded for a counted file's year
    '''
    # get other features
    ## the center longitude and latitude of OD zones
    nyczones = get_nyczones_center(zones_path)
    # Select the join columns once instead of on every merge
    centers = nyczones[['LocationID', 'center_lat', 'center_lon']]

    ## the weather data
    weather = {}
    for filename in sorted(os.listdir(weather_dir)):
        if filename.endswith(".csv"):
            weather_data = pd.read_csv(os.path.join(weather_dir, filename))
            # the year is the key; the slice assumes a fixed filename layout
            # (13-character prefix, 2 characters before '.csv') - TODO confirm
            weather[filename[13:-6]] = weather_data

    # read in the counted data from the folder
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".csv"):
            continue

        file_path = os.path.join(folder_path, filename)
        out_path = file_path.replace('countdata', 'mergedata')
        if out_path == file_path:
            # Nothing to rename: saving would overwrite the input. This also
            # keeps already-merged outputs in the same folder from being
            # processed again on a re-run.
            print('skipped', file_path)
            continue

        df = pd.read_csv(file_path)

        # merge the lat and lon of the center of the PULocation and DOLocation
        df = _merge_zone_center(df, centers, 'PULocationID', 'PULocation')
        df = _merge_zone_center(df, centers, 'DOLocationID', 'DOLocation')
        df = df.drop(columns=['PULocationID', 'DOLocationID'])

        # merge the weather data
        # the year is the key; assumes the name ends in '<year>' + 3 characters
        # + '.csv' - TODO confirm
        year = filename[-11:-7]
        if year not in weather:
            raise KeyError('no weather data loaded for key %r (needed by %s)'
                           % (year, filename))
        df = df.merge(weather[year], left_on=['day', 'hour'],
                      right_on=['date', 'time'], how='left')
        df = df.drop(columns=['date', 'time'])

        # save the data; create the target folder first in case the
        # 'countdata' -> 'mergedata' replacement changed the directory part
        out_dir = os.path.dirname(out_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        df.to_csv(out_path, index=False)
        print('saved', out_path)

    print('Done!')
            

In [None]:
# Add the zone-center coordinates and weather columns to every counted-trip
# csv file in the processed data folder.
add_features('../data/processed_nyc_data/')